In [21]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("June16Assignment1").getOrCreate()
spark

In [22]:
course_enrollments_df=spark.read.csv("course_enrollments.csv",header=True,inferSchema=True)
course_enrollments_df.show()
course_details_df=spark.read.csv("course_details.csv",header=True,inferSchema=True)
course_details_df.show()

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|
+------------+-----------+--------------------+-----------+-----

In [23]:
# Filtering and Transformation
# 3. Filter records where ProgressPercent < 50 .
course_enrollments_df.filter(course_enrollments_df.ProgressPercent<50).show()
# 4. Replace null ratings with average rating.
from pyspark.sql.functions import avg
average_rating = course_enrollments_df.select(avg("Rating")).collect()[0][0]
course_enrollments_df_filled = course_enrollments_df.fillna({"Rating": average_rating})
course_enrollments_df_filled.show()
# 5. Add column IsActive → 1 if Status is Active, else 0.
from pyspark.sql import *
from pyspark.sql.functions import *
course_enrollments_df.withColumn("IsActive",when(course_enrollments_df.Status=="Active",1).otherwise(0)).show()

+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|      ENR001|     Aditya|Python for Begin

In [24]:
# Aggregations & Metrics
# 6. Find average progress by course.
course_enrollments_df.groupBy('CourseName').agg({'ProgressPercent': 'avg'}).show()
# 7. Get count of students in each course category.
course_enrollments_df.groupBy('Category').agg({'StudentName': 'count'}).show()
# 8. Identify the most enrolled course.
course_enrollments_df.groupBy('CourseName').agg({'StudentName': 'count'}).orderBy('count(StudentName)', ascending=False).show()

+--------------------+--------------------+
|          CourseName|avg(ProgressPercent)|
+--------------------+--------------------+
|Data Analysis wit...|               100.0|
|         Java Basics|                 0.0|
|Machine Learning 101|                60.0|
|Python for Beginners|                85.0|
| Power BI Essentials|                30.0|
+--------------------+--------------------+

+-----------+------------------+
|   Category|count(StudentName)|
+-----------+------------------+
|Programming|                 3|
|         AI|                 1|
|  Analytics|                 2|
+-----------+------------------+

+--------------------+------------------+
|          CourseName|count(StudentName)|
+--------------------+------------------+
|Python for Beginners|                 2|
|Data Analysis wit...|                 1|
|         Java Basics|                 1|
|Machine Learning 101|                 1|
| Power BI Essentials|                 1|
+--------------------+-------------

In [25]:
# 10. Join course_enrollments with course_details to include duration and instructor.
course_enrollments_df.join(course_details_df , course_enrollments_df.CourseName == course_details_df.CourseName).show()

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+--------------------+-------------+----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|          CourseName|DurationWeeks|Instructor|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+--------------------+-------------+----------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|Python for Beginners|            4|    Rakesh|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|Data Analysis wit...|            3|    Anjali|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active| Power BI Essentials|            5|     Rekha|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inacti

In [26]:
# Window Functions
# 11. Rank students in each course based on ProgressPercent .
from pyspark.sql.window import Window
from pyspark.sql.functions import *
windowSpec = Window.partitionBy("CourseName").orderBy(desc("ProgressPercent"))
course_enrollments_df.withColumn("rank",rank().over(windowSpec)).show()
# 12. Get lead and lag of EnrollDate by Category.
from pyspark.sql.window import Window
from pyspark.sql.functions import *
windowSpec = Window.partitionBy("Category").orderBy("EnrollDate")

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+----+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|rank|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+----+
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|   1|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|   1|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|   1|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|   1|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|   1|
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|   2|
+------------+-----

In [27]:
# Pivoting & Formatting
# 13. Pivot data to show total enrollments by Category and Status.
course_enrollments_df.groupBy('Category').pivot('Status').agg({'StudentName': 'count'}).show()
# 14. Extract year and month from EnrollDate .
from pyspark.sql.functions import *
course_enrollments_df.withColumn("year",year("EnrollDate")).withColumn("month",month("EnrollDate")).show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|     NULL|    NULL|
|  Analytics|     1|        1|    NULL|
+-----------+------+---------+--------+

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+----+-----+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|year|month|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+----+-----+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|2024|    5|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|2024|    5|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|2024|    5|
|      ENR004|       Neha|   

In [28]:
# Cleaning and Deduplication
# 15. Drop rows where Status is null or empty.
course_enrollments_df.na.drop(subset=["Status"]).show()
# 16. Remove duplicate enrollments using dropDuplicates() .
course_enrollments_df.dropDuplicates().show()

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|
+------------+-----------+--------------------+-----------+-----

In [29]:
# Export
# 17. Write the final cleaned DataFrame to:
# CSV (overwrite mode)
# JSON (overwrite mode)
# Parquet (snappy compression)
course_enrollments_df.write.mode("overwrite").csv("course_enrollments_csv")
course_enrollments_df.write.mode("overwrite").json("course_enrollments.json")
course_enrollments_df.write.mode("overwrite").parquet("course_enrollments.parquet",compression="snappy")