Ingestion & Time Fields

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
spark = SparkSession.builder.appName("CourseAnalytics").getOrCreate()

# Load with schema inference
inferred_df = spark.read.option("header", True).csv("/Volumes/workspace/default/shared/course_enrollements.csv")
print("Inferred Schema:")
inferred_df.printSchema()

# Manual schema 
manual_schema = StructType([StructField("EnrollID", StringType(), True),
    StructField("UserID", StringType(), True),
    StructField("CourseID", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("CompletionDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", IntegerType(), True)])

# Load with manual schema
df = spark.read.option("header", True).schema(manual_schema).csv("/Volumes/workspace/default/shared/course_enrollements.csv")

# Add DaysToComplete column
df = df.withColumn("DaysToComplete", when(col("CompletionDate").isNotNull(),datediff(col("CompletionDate"), col("EnrollDate"))).otherwise(None))
display(df)

Inferred Schema:
root
 |-- EnrollID: string (nullable = true)
 |-- UserID: string (nullable = true)
 |-- CourseID: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: string (nullable = true)
 |-- CompletionDate: string (nullable = true)
 |-- ProgressPercent: string (nullable = true)
 |-- Rating: string (nullable = true)



EnrollID,UserID,CourseID,CourseName,Category,EnrollDate,CompletionDate,ProgressPercent,Rating,DaysToComplete
E001,U001,C001,Python Basics,Programming,2024-04-01,2024-04-10,100,4.0,9.0
E002,U002,C002,Excel for Finance,Productivity,2024-04-02,,45,,
E003,U001,C003,ML with PySpark,Data Science,2024-04-03,,30,,
E004,U003,C001,Python Basics,Programming,2024-04-04,2024-04-20,100,5.0,16.0
E005,U004,C004,Digital Marketing,Marketing,2024-04-05,2024-04-16,100,4.0,11.0


2. User Learning Path Progress

In [0]:
from pyspark.sql.window import Window

# Group by UserID : count of courses enrolled
courses_per_user = df.groupBy("UserID") \
    .agg(count("*").alias("CoursesEnrolled")) \
    .orderBy("CoursesEnrolled", ascending=False)
display(courses_per_user)

#Avg progress % across all enrollments
avg_progress_per_user = df.groupBy("UserID") \
    .agg(avg("ProgressPercent").alias("AvgProgressPercentage")) \
    .orderBy("AvgProgressPercentage", ascending=False)
display(avg_progress_per_user)

# Add completion flag
df_with_flag = df.withColumn("IsCompleted", when(col("ProgressPercent") == 100, True).otherwise(False))

display(df_with_flag.select("EnrollID", "UserID", "CourseName", "ProgressPercent", "IsCompleted"))

UserID,CoursesEnrolled
U001,2
U002,1
U004,1
U003,1


UserID,AvgProgressPercentage
U003,100.0
U004,100.0
U001,65.0
U002,45.0


EnrollID,UserID,CourseName,ProgressPercent,IsCompleted
E001,U001,Python Basics,100,True
E002,U002,Excel for Finance,45,False
E003,U001,ML with PySpark,30,False
E004,U003,Python Basics,100,True
E005,U004,Digital Marketing,100,True


3. Engagement Scoring

In [0]:
#Create a score: ProgressPercent * Rating (if not null)
#Replace null Rating with 0 before computing
df = df.withColumn("EngagementScore",col("ProgressPercent") * coalesce(col("Rating"), lit(0)))
display(df.select("EnrollID", "UserID", "CourseName", "ProgressPercent", "Rating", "EngagementScore"))

EnrollID,UserID,CourseName,ProgressPercent,Rating,EngagementScore
E001,U001,Python Basics,100,4.0,400
E002,U002,Excel for Finance,45,,0
E003,U001,ML with PySpark,30,,0
E004,U003,Python Basics,100,5.0,500
E005,U004,Digital Marketing,100,4.0,400


4. Identify Drop-offs

In [0]:
#Filter all records with ProgressPercent < 50 and CompletionDate is null
dropouts = df.filter((col("ProgressPercent") < 50) & (col("CompletionDate").isNull()))
# Create dropout view
dropouts.createOrReplaceTempView("Dropouts")
print("Dropout records:")
spark.sql("SELECT * FROM Dropouts").show()

Dropout records:
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|          NULL|      false|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|          NULL|      false|              0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+




5. Joins with Metadata

In [0]:
catalog_df = spark.read.option("header", True).csv("/Volumes/workspace/default/shared/course_catalog.csv")
display(catalog_df)

# Join to find average progress per instructor
joined_df = df.join(catalog_df, "CourseID", "left")
instructor_progress = joined_df.groupBy("Instructor").agg(avg("ProgressPercent").alias("AvgProgress"),count("*").alias("TotalEnrollments")).orderBy("AvgProgress", ascending=False)

#Show who teaches the most enrolled course
most_enrolled = joined_df.groupBy("CourseID", "CourseName", "Instructor").count() \
    .orderBy("count", ascending=False).limit(1)
print("Instructor progress:")
display(instructor_progress)
print("\nMost enrolled course:")
display(most_enrolled)

CourseID,Instructor,DurationHours,Level
C001,Abdullah Khan,8,Beginner
C002,Sana Gupta,5,Beginner
C003,Ibrahim Khan,10,Intermediate
C004,Zoya Sheikh,6,Beginner


Instructor progress:


Instructor,AvgProgress,TotalEnrollments
Zoya Sheikh,100.0,1
Abdullah Khan,100.0,2
Sana Gupta,45.0,1
Ibrahim Khan,30.0,1



Most enrolled course:


CourseID,CourseName,Instructor,count
C001,Python Basics,Abdullah Khan,2


6. Delta Lake Practice

In [0]:
#Save as Delta Table enrollments_delta
df.write.mode("overwrite").format("delta").save("/Volumes/workspace/default/shared/enrollments_delta")

# Update: Set all ratings to 5 where Course = 'Python Basics'
from delta.tables import DeltaTable
delta_table = DeltaTable.forPath(spark, "/Volumes/workspace/default/shared/enrollments_delta")
delta_table.update(condition="CourseName = 'Python Basics'",set={"Rating": "5"})

#Delete: All rows where ProgressPercent = 0
delta_table.delete("ProgressPercent = 0")
delta_table.history().show()

+-------+--------------------+----------------+--------------------+---------+--------------------+----+--------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|          userId|            userName|operation| operationParameters| job|notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+----------------+--------------------+---------+--------------------+----+--------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      7|2025-06-19 06:56:...|5873923491206719|ahmedashiq2k17@gm...| OPTIMIZE|{predicate -> [],...|NULL|    NULL|0619-053659-98ecx...|          5|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      6| 2025-06-19 06:56:48|5873923491206719|ahmedashiq2k17@gm

7. Window Functions

In [0]:
#Use dense_rank() to rank courses by number of enrollments
window_spec_courses = Window.orderBy(col("enroll_count").desc())
course_ranks = df.groupBy("CourseID", "CourseName").count() \
    .withColumnRenamed("count", "enroll_count") \
    .withColumn("Rank", dense_rank().over(window_spec_courses))
print("Course rankings:")
display(course_ranks)

#lead() to find next course by each user (sorted by EnrollDate)
window_spec_user = Window.partitionBy("UserID").orderBy("EnrollDate")
user_course_path = df.withColumn("NextCourse", lead("CourseName", 1).over(window_spec_user))
print("\nUser learning paths:")
display(user_course_path.select("UserID", "CourseName", "NextCourse"))

Course rankings:




CourseID,CourseName,enroll_count,Rank
C001,Python Basics,2,1
C003,ML with PySpark,1,2
C002,Excel for Finance,1,2
C004,Digital Marketing,1,2



User learning paths:


UserID,CourseName,NextCourse
U001,Python Basics,ML with PySpark
U001,ML with PySpark,
U002,Excel for Finance,
U003,Python Basics,
U004,Digital Marketing,


8. SQL Logic for Dashboard Views

In [0]:
df.createOrReplaceTempView("enrollments")

# Daily enrollments view
spark.sql("""create or replace TEMP view daily_enrollments as select EnrollDate,count(*) AS enrollments_count
from enrollments
group by EnrollDate order by EnrollDate""")

print("Daily enrollments:")
spark.sql("SELECT * FROM daily_enrollments").show()

# Category performance view
spark.sql("""create or replace TEMP view category_performance as
select Category,avg(ProgressPercent) AS avg_progress, avg(Rating) AS avg_rating
from enrollments
group by Category order by avg_rating desc""")

print("\nCategory performance:")
spark.sql("SELECT * FROM category_performance").show()

# Top 3 courses view
spark.sql(""" create or replace TEMP view top_3_courses as
select CourseName,count(*) AS enrollments, avg(Rating) as avg_rating
from enrollments
group by CourseName order by enrollments desc limit 3""")

print("\nTop 3 courses:")
spark.sql("SELECT * FROM top_3_courses").show()

Daily enrollments:
+----------+-----------------+
|EnrollDate|enrollments_count|
+----------+-----------------+
|2024-04-01|                1|
|2024-04-02|                1|
|2024-04-03|                1|
|2024-04-04|                1|
|2024-04-05|                1|
+----------+-----------------+


Category performance:
+------------+------------+----------+
|    Category|avg_progress|avg_rating|
+------------+------------+----------+
| Programming|       100.0|       4.5|
|   Marketing|       100.0|       4.0|
|Data Science|        30.0|      NULL|
|Productivity|        45.0|      NULL|
+------------+------------+----------+


Top 3 courses:
+-----------------+-----------+----------+
|       CourseName|enrollments|avg_rating|
+-----------------+-----------+----------+
|    Python Basics|          2|       4.5|
|Excel for Finance|          1|      NULL|
|  ML with PySpark|          1|      NULL|
+-----------------+-----------+----------+



9. Time Travel

In [0]:
# Time travel to version before updates
original_data = spark.read.format("delta") \
    .option("versionAsOf", 0) \
    .load("/Volumes/workspace/default/shared/enrollments_delta")

print("Original data before updates:")
display(original_data)

Original data before updates:


EnrollID,UserID,CourseID,CourseName,Category,EnrollDate,CompletionDate,ProgressPercent,Rating,DaysToComplete,IsCompleted,EngagementScore
E001,U001,C001,Python Basics,Programming,2024-04-01,2024-04-10,100,4.0,9.0,True,400
E002,U002,C002,Excel for Finance,Productivity,2024-04-02,,45,,,False,0
E003,U001,C003,ML with PySpark,Data Science,2024-04-03,,30,,,False,0
E004,U003,C001,Python Basics,Programming,2024-04-04,2024-04-20,100,5.0,16.0,True,500
E005,U004,C004,Digital Marketing,Marketing,2024-04-05,2024-04-16,100,4.0,11.0,True,400


Export

In [0]:
#Write to JSON, partitioned by Category
df.write.partitionBy("Category") \
    .format("json") \
    .save("/Volumes/workspace/default/shared/course_enrollments_json")

# Create summary DataFrame
summary_df = df.groupBy("CourseName").agg(count("*").alias("TotalEnrollments"),avg("Rating").alias("AvgRating"),avg("ProgressPercent").alias("AvgProgress")).orderBy("TotalEnrollments", ascending=False)

# Save summary as Parquet
summary_df.write.format("parquet") \
    .save("/Volumes/workspace/default/shared/course_summary_parquet")
print("Summary statistics:")
display(summary_df)

Summary statistics:


CourseName,TotalEnrollments,AvgRating,AvgProgress
Python Basics,2,4.5,100.0
ML with PySpark,1,,30.0
Excel for Finance,1,,45.0
Digital Marketing,1,4.0,100.0
