In [None]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_date, date_format, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType

In [None]:
# Run parameters for this load.
container_name = "lmsproject"   # storage container that holds the landing folder
workspace = "fabric_DEV"        # Fabric workspace used in the OneLake target path
# Partition to ingest. NOTE(review): 2025-12-29 was noted next to this value,
# presumably an earlier/alternative run date - confirm before removing.
processing_date = "2026-01-01"

In [None]:
# Storage account and top-level folder used to build the landing (source) path.
landing_folder = "landing"
storage_account_name = "delakehouse"

In [None]:
# Source location: the landing folder partition for the configured processing date,
# addressed through the storage account's abfss (ADLS Gen2) endpoint.
partition_path = f"Processing_Date={processing_date}"
abfs_path = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
    f"/{landing_folder}/{partition_path}"
)


In [None]:
# Target location: the student table under Tables/ of the lms_LH_Bronze lakehouse,
# addressed through the OneLake abfss endpoint of the configured workspace.
student_table_name = "student_table"
target_path = (
    f"abfss://{workspace}@onelake.dfs.fabric.microsoft.com"
    f"/lms_LH_Bronze.Lakehouse/Tables/{student_table_name}"
)


In [None]:
# Explicit schema for the student CSV: (column name, Spark type) in declaration order.
# Every column is declared nullable.
student_csv_columns = [
    ("Student_ID", StringType),
    ("Name", StringType),
    ("Age", IntegerType),
    ("Gender", StringType),
    ("Grade_Level", StringType),
    ("Course_ID", StringType),
    ("Course_Name", StringType),
    ("Enrollment_Date", StringType),
    ("Completion_Date", StringType),
    ("Status", StringType),
    ("Final_Grade", StringType),
    ("Attendance_Rate", DoubleType),
    ("Time_Spent_on_Course_hrs", DoubleType),
    ("Assignments_Completed", IntegerType),
    ("Quizzes_Completed", IntegerType),
    ("Forum_Posts", IntegerType),
    ("Messages_Sent", IntegerType),
    ("Quiz_Average_Score", DoubleType),
    ("Assignment_Scores", StringType),
    ("Assignment_Average_Score", DoubleType),
    ("Project_Score", DoubleType),
    ("Extra_Credit", DoubleType),
    ("Overall_Performance", DoubleType),
    ("Feedback_Score", DoubleType),
    ("Parent_Involvement", StringType),
    ("Demographic_Group", StringType),
    ("Internet_Access", StringType),
    ("Learning_Disabilities", StringType),
    ("Preferred_Learning_Style", StringType),
    ("Language_Proficiency", StringType),
    ("Participation_Rate", StringType),
    ("Completion_Time_Days", IntegerType),
    ("Performance_Score", DoubleType),
    ("Course_Completion_Rate", DoubleType),
]

# Instantiate a fresh type object per field, exactly one StructField per entry above.
schema = StructType(
    [StructField(column_name, column_type(), True) for column_name, column_type in student_csv_columns]
)

# Load the landing partition using the explicit schema; files carry a header row.
# NOTE(review): with a user-supplied schema Spark's CSV reader maps columns by
# position by default (enforceSchema) - confirm the file's column order matches.
df = spark.read.csv(path=abfs_path, schema=schema, header=True)

In [None]:
# Load the landed rows into the Delta table stored at `target_path`.
#
# * If no Delta table exists at that path yet, the rows are written as a new table.
# * Otherwise the rows are upserted with MERGE, matching on (Student_ID, Course_ID):
#   matched rows get every non-key column refreshed, unmatched rows are inserted.
#
# In both cases Processing_Date is set to the configured `processing_date`, so a
# backfill or re-run for an earlier partition is stamped with that partition's date
# rather than the day the notebook happened to run.
if not DeltaTable.isDeltaTable(spark, target_path):
    print(f"table does not exists, creating new table: {student_table_name}")
    # Use the run parameter (a 'yyyy-MM-dd' string) - the same value the MERGE branch writes.
    df = df.withColumn("Processing_Date", lit(processing_date))
    df.write.format("delta").mode("overwrite").save(target_path)
    print("table created")
else:
    print(f"updating table: {student_table_name}")
    df.createOrReplaceTempView("student_table_temp")

    merge_keys = ["Student_ID", "Course_ID"]
    # Derive the column lists from the DataFrame so the SQL cannot drift from the schema.
    # Processing_Date is excluded here because it is always written from `processing_date`
    # below (df may already carry it if the create branch ran earlier in this session).
    source_columns = [name for name in df.columns if name != "Processing_Date"]

    match_condition = " AND ".join(f"target.{name} = source.{name}" for name in merge_keys)
    update_assignments = ",\n                ".join(
        f"target.{name} = source.{name}" for name in source_columns if name not in merge_keys
    )
    insert_columns = ", ".join(source_columns)
    insert_values = ", ".join(f"source.{name}" for name in source_columns)

    # Address the target by path (delta.`...`), i.e. the very location that was checked
    # with isDeltaTable above, instead of relying on a catalog name that resolves
    # against whichever lakehouse is attached to the session.
    sql_statement = f"""MERGE INTO delta.`{target_path}` AS target
        USING student_table_temp AS source
        ON {match_condition}
        WHEN MATCHED THEN
            UPDATE SET
                {update_assignments},
                target.Processing_Date = '{processing_date}'
        WHEN NOT MATCHED THEN
            INSERT ({insert_columns}, Processing_Date)
            VALUES ({insert_values}, '{processing_date}')"""
    # MERGE returns a small result set with row-count metrics; display it.
    spark.sql(sql_statement).show()