In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("OnlineCourseAnalysis")
    .getOrCreate()
)

# Bare expression so the notebook displays the session summary.
spark

**Data Loading**

In [6]:
#1. Load the enrollments CSV and let Spark infer each column's type from the data.
enrollments_reader = spark.read.options(header=True, inferSchema=True)
df_inferred = enrollments_reader.csv("course_enrollments.csv")
df_inferred.printSchema()

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)



In [13]:
#2. Manually define the schema and compare it with the inferred one.
# Column names and types are declared up front, so Spark does not need the
# extra pass over the file that inferSchema=True performs.
manual_schema = StructType([
    StructField("EnrollmentID", StringType(), True),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("Status", StringType(), True)
])
df_manual = spark.read.csv("course_enrollments.csv", header=True, schema=manual_schema)

# Compare schemas: each heading is followed by the schema of the matching DataFrame.
print("Inferred schema types:")
df_inferred.printSchema()

print("\nManual schema types:")
df_manual.printSchema()

Inferred schema types:
root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)


Manual schema types:
root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)



**Filtering and Transformation**

In [15]:
#3. Keep only the enrollments whose ProgressPercent is below 50.
below_half = df_manual["ProgressPercent"] < 50
low = df_manual.where(below_half)
low.show()

+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+



In [17]:
#4. Replace null ratings with the average rating.
# avg() ignores nulls; it yields None when no row has a rating at all.
avg_rating = df_manual.select(avg("Rating")).first()[0]
if avg_rating is None:
    # Nothing to average, and na.fill() rejects None, so leave the data unchanged.
    df_fill = df_manual
else:
    df_fill = df_manual.na.fill(avg_rating, subset=["Rating"])
df_fill.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|          

In [18]:
#5. Add IsActive: 1 when Status is exactly "Active", otherwise 0.
is_active_flag = when(col("Status") == "Active", 1).otherwise(0)
df_with_active = df_fill.withColumn("IsActive", is_active_flag)
df_with_active.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|       1|
|      E

**Aggregations & Metrics**

In [20]:
#6. Average ProgressPercent per course.
progress_by_course = df_with_active.groupBy("CourseName")
avg_progress = progress_by_course.agg(avg("ProgressPercent").alias("AvgProgress"))
avg_progress.show()

+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
| Power BI Essentials|       30.0|
+--------------------+-----------+



In [21]:
#7. Number of enrollment rows in each course category, largest first.
category_count = (
    df_with_active
    .groupBy("Category")
    .count()
    .orderBy(col("count").desc())
)
category_count.show()

+-----------+-----+
|   Category|count|
+-----------+-----+
|Programming|    3|
|  Analytics|    2|
|         AI|    1|
+-----------+-----+



In [22]:
#8. Identify the most enrolled course.
# Courses tied on count are ordered by name so that limit(1) returns the
# same row on every run instead of an arbitrary one.
most_enrolled = (
    df_with_active
    .groupBy("CourseName")
    .count()
    .orderBy(col("count").desc(), col("CourseName").asc())
    .limit(1)
)
most_enrolled.show()

+--------------------+-----+
|          CourseName|count|
+--------------------+-----+
|Python for Beginners|    2|
+--------------------+-----+



**Joins**

In [25]:
#9. Load the second CSV, course_details.csv, with inferred column types.
course_details = spark.read.options(header=True, inferSchema=True).csv("course_details.csv")
course_details.show()

+--------------------+-------------+----------+
|          CourseName|DurationWeeks|Instructor|
+--------------------+-------------+----------+
|Python for Beginners|            4|    Rakesh|
|Data Analysis wit...|            3|    Anjali|
| Power BI Essentials|            5|     Rekha|
|         Java Basics|            6|     Manoj|
|Machine Learning 101|            8|     Samir|
+--------------------+-------------+----------+



In [26]:
#10. Join enrollments with course_details on CourseName to add duration and instructor.
# how="left" keeps every enrollment row even when no course detail matches.
join_df = df_with_active.join(course_details, on="CourseName", how="left")
join_df.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| I

**Window Functions**

In [27]:
#11. Rank students within each course by ProgressPercent, highest first.
# rank() gives tied students the same rank and leaves a gap after them.
window_spec = Window.partitionBy("CourseName").orderBy(col("ProgressPercent").desc())
ranked_df = join_df.withColumn("Rank", rank().over(window_spec))
ranked_df.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|            6|     Manoj|   1|
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|      

In [28]:
#12. For each row, fetch the next and previous EnrollDate within its Category.
# Rows with no following / preceding row in their category get NULL.
category_window = Window.partitionBy("Category").orderBy("EnrollDate")
next_enroll = lead("EnrollDate").over(category_window)
prev_enroll = lag("EnrollDate").over(category_window)
date_diff_df = (
    ranked_df
    .withColumn("NextEnrollment", next_enroll)
    .withColumn("PrevEnrollment", prev_enroll)
)
date_diff_df.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|NextEnrollment|PrevEnrollment|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|          NULL|          NULL|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|    2024-05-13|          NULL|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|          

**Pivoting & Formatting**

In [29]:
#13. Pivot: one row per Category, one column per Status, cells hold enrollment counts.
status_counts = date_diff_df.groupBy("Category").pivot("Status").count()
# Category/Status combinations with no rows come back as null; show them as 0.
pivot_df = status_counts.fillna(0)
pivot_df.show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|        0|       0|
|  Analytics|     1|        1|       0|
+-----------+------+---------+--------+



In [30]:
#14. Derive EnrollYear and EnrollMonth from EnrollDate.
enroll_date = col("EnrollDate")
date_extracted = (
    date_diff_df
    .withColumn("EnrollYear", year(enroll_date))
    .withColumn("EnrollMonth", month(enroll_date))
)
date_extracted.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|NextEnrollment|PrevEnrollment|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----------+-----------+
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|          NULL|          NULL|      2024|          5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|    2024-05-13|    

**Cleaning and Deduplication**

In [31]:
#15. Drop rows where Status is null or empty.
# trim() makes whitespace-only values count as empty too.
has_status = col("Status").isNotNull() & (trim(col("Status")) != "")
drop_df = date_extracted.filter(has_status)
drop_df.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|NextEnrollment|PrevEnrollment|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----------+-----------+
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|          NULL|          NULL|      2024|          5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|    2024-05-13|    

In [32]:
#16. Remove duplicate enrollments: keep a single row per EnrollmentID.
dedup_keys = ["EnrollmentID"]
dupe_df = drop_df.dropDuplicates(dedup_keys)
dupe_df.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|NextEnrollment|PrevEnrollment|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----------+-----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|   2|    2024-05-15|          NULL|      2024|          5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|    2024-05-13|    

**Export**

In [33]:
#17. Write the final cleaned DataFrame in three formats, replacing any existing output.
# CSV (overwrite mode), with a header row
dupe_df.write.mode("overwrite").option("header", True).csv("final_enrollments_csv")
# JSON (overwrite mode)
dupe_df.write.mode("overwrite").json("final_enrollments_json")
# Parquet (overwrite mode, snappy compression)
dupe_df.write.mode("overwrite").option("compression", "snappy").parquet("final_enrollments_parquet")