In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark


In [0]:
#1. Load the data with schema inference enabled
df_infer = spark.read.option("header", "true").option("inferSchema", "true") \
    .csv("file:/Workspace/Shared/course_enrollments.csv")

df_infer.show()
df_infer.printSchema()


+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|
+------------+-----------+--------------------+-----------+-----

In [0]:
#2. Manually define schema and compare both approaches
from pyspark.sql.types import *

manual_schema = StructType([
    StructField("EnrollmentID", StringType(), True),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("Status", StringType(), True)
])

df_manual = spark.read.option("header", "true").schema(manual_schema) \
    .csv("file:/Workspace/Shared/course_enrollments.csv")

df_manual.show()
df_manual.printSchema()

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|
+------------+-----------+--------------------+-----------+-----

In [0]:
#3. Filter records where ProgressPercent < 50
df_filtered = df_infer.filter(df_infer.ProgressPercent < 50)
df_filtered.show()

+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+



In [0]:
#4. Replace null ratings with average rating
from pyspark.sql.functions import avg, when, col

avg_rating = df_infer.select(avg("Rating")).first()[0]

df_filled = df_infer.withColumn("Rating", when(col("Rating").isNull(), avg_rating).otherwise(col("Rating")))
df_filled.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|          

In [0]:
#5. Add column IsActive → 1 if Status is Active, else 0
df_active = df_filled.withColumn("IsActive", when(col("Status") == "Active", 1).otherwise(0))
df_active.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|       1|
|      E

In [0]:
#6. Find average progress by course
df_active.groupBy("CourseName").avg("ProgressPercent").show()

+--------------------+--------------------+
|          CourseName|avg(ProgressPercent)|
+--------------------+--------------------+
|Data Analysis wit...|               100.0|
|         Java Basics|                 0.0|
|Machine Learning 101|                60.0|
|Python for Beginners|                85.0|
| Power BI Essentials|                30.0|
+--------------------+--------------------+



In [0]:
#7. Get count of students in each course category
df_active.groupBy("Category").count().show()

+-----------+-----+
|   Category|count|
+-----------+-----+
|Programming|    3|
|         AI|    1|
|  Analytics|    2|
+-----------+-----+



In [0]:
#8. Identify the most enrolled course
from pyspark.sql.functions import desc

df_active.groupBy("CourseName").count().orderBy(desc("count")).limit(1).show()

+--------------------+-----+
|          CourseName|count|
+--------------------+-----+
|Python for Beginners|    2|
+--------------------+-----+



In [0]:
#9. Load course_details.csv
df_course_details = spark.read.option("header", "true").option("inferSchema", "true") \
    .csv("file:/Workspace/Shared/course_details.csv")

df_course_details.show()

+--------------------+-------------+----------+
|          CourseName|DurationWeeks|Instructor|
+--------------------+-------------+----------+
|Python for Beginners|            4|    Rakesh|
|Data Analysis wit...|            3|    Anjali|
| Power BI Essentials|            5|     Rekha|
|         Java Basics|            6|     Manoj|
|Machine Learning 101|            8|     Samir|
+--------------------+-------------+----------+



In [0]:
#10. Join course_enrollments with course_details
df_joined = df_active.join(df_course_details, on="CourseName", how="left")
df_joined.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| I

In [0]:
#11. Rank students in each course based on ProgressPercent
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

windowSpec = Window.partitionBy("CourseName").orderBy(desc("ProgressPercent"))

df_ranked = df_joined.withColumn("Rank", rank().over(windowSpec))
df_ranked.select("StudentName", "CourseName", "ProgressPercent", "Rank").show()

+-----------+--------------------+---------------+----+
|StudentName|          CourseName|ProgressPercent|Rank|
+-----------+--------------------+---------------+----+
|     Simran|Data Analysis wit...|            100|   1|
|       Neha|         Java Basics|              0|   1|
|       Zara|Machine Learning 101|             60|   1|
|     Aakash| Power BI Essentials|             30|   1|
|    Ibrahim|Python for Beginners|             90|   1|
|     Aditya|Python for Beginners|             80|   2|
+-----------+--------------------+---------------+----+



In [0]:
#12. Get lead and lag of EnrollDate by Category
from pyspark.sql.functions import lead, lag

windowCat = Window.partitionBy("Category").orderBy("EnrollDate")

df_lead_lag = df_joined.withColumn("NextEnrollDate", lead("EnrollDate").over(windowCat)) \
    .withColumn("PrevEnrollDate", lag("EnrollDate").over(windowCat))

df_lead_lag.select("EnrollmentID", "Category", "EnrollDate", "NextEnrollDate", "PrevEnrollDate").show()

+------------+-----------+----------+--------------+--------------+
|EnrollmentID|   Category|EnrollDate|NextEnrollDate|PrevEnrollDate|
+------------+-----------+----------+--------------+--------------+
|      ENR005|         AI|2024-05-17|          NULL|          NULL|
|      ENR002|  Analytics|2024-05-12|    2024-05-13|          NULL|
|      ENR003|  Analytics|2024-05-13|          NULL|    2024-05-12|
|      ENR001|Programming|2024-05-10|    2024-05-15|          NULL|
|      ENR004|Programming|2024-05-15|    2024-05-18|    2024-05-10|
|      ENR006|Programming|2024-05-18|          NULL|    2024-05-15|
+------------+-----------+----------+--------------+--------------+



In [0]:
#13. Pivot data to show total enrollments by Category and Status
df_pivot = df_joined.groupBy("Category").pivot("Status").count()
df_pivot.show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|     NULL|    NULL|
|  Analytics|     1|        1|    NULL|
+-----------+------+---------+--------+



In [0]:
#14. Extract year and month from EnrollDate
from pyspark.sql.functions import year, month

df_with_date = df_joined.withColumn("EnrollYear", year("EnrollDate")) \
                        .withColumn("EnrollMonth", month("EnrollDate"))

df_with_date.select("EnrollmentID", "EnrollDate", "EnrollYear", "EnrollMonth").show()

+------------+----------+----------+-----------+
|EnrollmentID|EnrollDate|EnrollYear|EnrollMonth|
+------------+----------+----------+-----------+
|      ENR001|2024-05-10|      2024|          5|
|      ENR002|2024-05-12|      2024|          5|
|      ENR003|2024-05-13|      2024|          5|
|      ENR004|2024-05-15|      2024|          5|
|      ENR005|2024-05-17|      2024|          5|
|      ENR006|2024-05-18|      2024|          5|
+------------+----------+----------+-----------+



In [0]:
#15. Drop rows where Status is null or empty
df_cleaned = df_with_date.filter((col("Status").isNotNull()) & (col("Status") != ""))
df_cleaned.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|      2024|          5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|      2024|          5|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|   

In [0]:
#16. Remove duplicate enrollments using dropDuplicates()
df_no_duplicates = df_cleaned.dropDuplicates(["EnrollmentID"])
df_no_duplicates.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|            6|     Manoj|      2024|          5|
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|      2024|          5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|   

In [0]:
#17. Write final cleaned DataFrame to:
# CSV 
df_no_duplicates.write.mode("overwrite") \
    .option("header", "true") \
    .csv("file:/Workspace/Shared/final_course_output_csv")

# JSON 
df_no_duplicates.write.mode("overwrite") \
    .json("file:/Workspace/Shared/final_course_output_json")

# Parquet 
df_no_duplicates.write.mode("overwrite") \
    .option("compression", "snappy") \
    .parquet("file:/Workspace/Shared/final_course_output_parquet")