In [None]:
from pyspark.sql import SparkSession
# Build the session used by every cell below; the builder chain is wrapped in
# parentheses so no backslash line continuations are needed.
spark = (
    SparkSession.builder
    .appName("Multi_Category_Exercise")
    .getOrCreate()
)

In [None]:
# Raw order records, one tuple per order in the column order
# (order_id, customer_id, city, category, product, amount, order_date, status).
# Every field is text (or None); several rows are deliberately dirty.
orders_data = [
    # Trailing whitespace ("Delhi ", "Mobile ") and three different date layouts.
    ("ORD001", "C001", "Delhi ", "Electronics", "Laptop", "45000", "2024-01-05", "Completed"),
    ("ORD002", "C002", "Mumbai", "Electronics", "Mobile ", "32000", "05/01/2024", "Completed"),
    ("ORD003", "C003", "Bangalore", "Electronics", "Tablet", "30000", "2024/01/06", "Completed"),
    # Unusable amounts: empty string, non-numeric text, and None.
    ("ORD004", "C004", "Delhi", "Electronics", "Laptop", "", "2024-01-07", "Cancelled"),
    ("ORD005", "C005", "Chennai", "Electronics", "Mobile", "invalid", "2024-01-08", "Completed"),
    ("ORD006", "C006", "Mumbai", "Home", "Mixer", None, "2024-01-08", "Completed"),
    # Day-first date written with dashes.
    ("ORD007", "C001", "Delhi", "Electronics", "Laptop", "47000", "09-01-2024", "Completed"),
    ("ORD008", "C007", "Bangalore", "Home", "Vacuum", "28000", "2024-01-09", "Completed"),
    ("ORD009", "C002", "Mumbai", "Electronics", "Laptop", "55000", "2024-01-10", "Completed"),
    ("ORD010", "C008", "Delhi", "Home", "AirPurifier", "38000", "2024-01-10", "Completed"),
    ("ORD011", "C009", "Mumbai", "Home", "Vacuum", "29000", "2024-01-11", "Completed"),
    ("ORD012", "C010", "Bangalore", "Electronics", "Mobile", "33000", "2024-01-11", "Completed"),
    ("ORD013", "C003", "Bangalore", "Home", "Mixer", "21000", "2024-01-12", "Completed"),
    ("ORD014", "C004", "Delhi", "Electronics", "Tablet", "26000", "2024-01-12", "Completed"),
    ("ORD015", "C005", "Chennai", "Electronics", "Laptop", "62000", "2024-01-13", "Completed"),
    ("ORD016", "C006", "Mumbai", "Home", "AirPurifier", "40000", "2024-01-13", "Completed"),
    ("ORD017", "C007", "Bangalore", "Electronics", "Laptop", "51000", "2024-01-14", "Completed"),
    ("ORD018", "C008", "Delhi", "Home", "Vacuum", "31000", "2024-01-14", "Completed"),
    ("ORD019", "C009", "Mumbai", "Electronics", "Tablet", "29000", "2024-01-15", "Completed"),
    ("ORD020", "C010", "Bangalore", "Electronics", "Laptop", "54000", "2024-01-15", "Completed"),
    # Exact duplicate of the previous row.
    ("ORD020", "C010", "Bangalore", "Electronics", "Laptop", "54000", "2024-01-15", "Completed"),
]

# PHASE 1 — SCHEMA & INGESTION

1. Define an explicit schema
2. Create a DataFrame using the schema
3. Print and verify schema

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Every column is ingested as a nullable string, so the schema is generated
# from the ordered list of column names instead of spelling out each field.
column_names = [
    "order_id", "customer_id", "city", "category",
    "product", "amount", "order_date", "status",
]
schema = StructType([StructField(name, StringType(), True) for name in column_names])

# Build the raw DataFrame and print its schema for verification.
df = spark.createDataFrame(orders_data, schema=schema)
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



# PHASE 2 — DATA CLEANING

4. Trim all string columns
5. Standardize `city`, `category`, and `product` values
6. Convert amount to IntegerType
7. Handle invalid, empty, and null amount values
8. Convert order_date into DateType (handle multiple formats)
9. Remove duplicate order_id records
10. Keep only Completed orders

In [None]:
from pyspark.sql.functions import col, trim, when, regexp_replace, expr, coalesce, to_date, lower

# Canonical spelling for each categorical column. Matching is case-insensitive,
# so a value such as "delhi" or "LAPTOP" is rewritten to the form listed here.
CANONICAL_VALUES = {
    "city": ["Delhi", "Mumbai", "Bangalore", "Chennai"],
    "category": ["Electronics", "Home"],
    "product": ["Laptop", "Mobile", "Tablet", "Mixer", "Vacuum", "AirPurifier"],
}


def _standardize(column_name, canonical_values):
    """Return a Column that rewrites case-insensitive matches to their canonical spelling.

    Args:
        column_name: name of the string column to standardize.
        canonical_values: non-empty list of the accepted spellings.

    Returns:
        A Column expression; values matching none of ``canonical_values``
        (including nulls) pass through unchanged.
    """
    source = col(column_name)
    lowered = lower(source)
    first, *rest = canonical_values
    mapping = when(lowered == first.lower(), first)
    for value in rest:
        mapping = mapping.when(lowered == value.lower(), value)
    return mapping.otherwise(source)


# Trim every column first (all columns are strings at this point) so the
# comparisons below are not defeated by stray whitespace such as "Delhi ".
df = df.select([trim(col(name)).alias(name) for name in df.columns])

# Standardize city, category and product values.
for column_name, canonical_values in CANONICAL_VALUES.items():
    df = df.withColumn(column_name, _standardize(column_name, canonical_values))

# Convert amount to Integer. Only text made up entirely of digits is cast;
# null, empty, "invalid" and any other non-numeric text become null (a `when`
# without `otherwise` yields null), so the cast never sees unparseable input.
df = df.withColumn(
    "amount",
    when(col("amount").rlike("^[0-9]+$"), col("amount").cast(IntegerType())),
)

# Convert order_date to a date. Separators are unified first (/ becomes -),
# then the layout is detected by pattern: four-digit-first values are parsed as
# yyyy-MM-dd, two-digit-first values as day-first dd-MM-yyyy, anything else
# becomes null.
normalized_date = regexp_replace(col("order_date"), "/", "-")
df = df.withColumn(
    "order_date",
    when(
        normalized_date.rlike("^[0-9]{4}-[0-9]{2}-[0-9]{2}$"),
        to_date(normalized_date, "yyyy-MM-dd"),
    ).when(
        normalized_date.rlike("^[0-9]{2}-[0-9]{2}-[0-9]{4}$"),
        to_date(normalized_date, "dd-MM-yyyy"),
    ),
)

# Remove duplicate order_ids (one row is kept per order_id).
df = df.dropDuplicates(["order_id"])

# Keep only Completed orders that have a usable amount and date.
df_cleaned = df.filter(
    (col("status") == "Completed")
    & col("amount").isNotNull()
    & col("order_date").isNotNull()
)

df_cleaned.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|AirPurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

# PHASE 3 — DATA VALIDATION

11. Count records before and after cleaning
12. Verify no nulls in `order_id`, `amount`, and `order_date`
13. Confirm correct data types

In [None]:
# Task 11: record counts before and after cleaning. The "before" figure is the
# number of raw input tuples, because `df` has already been overwritten by the
# cleaning steps.
records_before = len(orders_data)
records_after = df_cleaned.count()
print(f"Records before cleaning: {records_before}")
print(f"Records after cleaning: {records_after}")

# Task 12: verify there are no nulls in the key columns. The DataFrame is NOT
# re-filtered here: filtering nulls out immediately before counting them would
# make the check pass no matter what the cleaning step produced.
for column_name in ("order_id", "amount", "order_date"):
    null_count = df_cleaned.filter(col(column_name).isNull()).count()
    print(f"Null check - {column_name}: {null_count}")
    assert null_count == 0, f"{column_name} still contains nulls after cleaning"

# Task 13: confirm data types, both visually and with assertions.
df_cleaned.printSchema()
actual_types = dict(df_cleaned.dtypes)
assert actual_types["amount"] == "int", f"amount is {actual_types['amount']}, expected int"
assert actual_types["order_date"] == "date", f"order_date is {actual_types['order_date']}, expected date"

Records after cleaning: 17
Null check - order_id: 0
Null check - amount: 0
Null check - order_date: 0
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)



# PHASE 4 — ANALYTICS & AGGREGATIONS

14. Total revenue per city
15. Total revenue per category
16. Total revenue per product
17. Average order value per city
18. Identify top 3 products by revenue

In [None]:
# Spark's aggregate functions are reached through the `F` alias. A bare `sum`
# would resolve to Python's built-in, which cannot aggregate a column name, and
# `avg` is not defined at all unless it is imported.
from pyspark.sql import functions as F


def _revenue_by(dimension):
    """Return total revenue of the cleaned orders grouped by ``dimension``.

    Args:
        dimension: name of the column to group by.

    Returns:
        DataFrame with the columns ``dimension`` and ``total_revenue``.
    """
    return df_cleaned.groupBy(dimension).agg(F.sum("amount").alias("total_revenue"))


# Total revenue per city
revenue_by_city = _revenue_by("city")
revenue_by_city.orderBy(F.col("total_revenue").desc()).show()

# Total revenue per category
revenue_by_category = _revenue_by("category")
revenue_by_category.orderBy(F.col("total_revenue").desc()).show()

# Total revenue per product
revenue_by_product = _revenue_by("product")
revenue_by_product.orderBy(F.col("total_revenue").desc()).show()

# Average order value per city
avg_order_by_city = df_cleaned.groupBy("city").agg(F.avg("amount").alias("avg_order_value"))
avg_order_by_city.orderBy(F.col("avg_order_value").desc()).show()

# Top 3 products by revenue
top_3_products = revenue_by_product.orderBy(F.col("total_revenue").desc()).limit(3)
top_3_products.show()


+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|       217000|
|    Delhi|       187000|
|   Mumbai|       185000|
|  Chennai|        62000|
+---------+-------------+

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|Electronics|       464000|
|       Home|       187000|
+-----------+-------------+

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|     Laptop|       314000|
|     Vacuum|        88000|
|     Tablet|        85000|
|AirPurifier|        78000|
|     Mobile|        65000|
|      Mixer|        21000|
+-----------+-------------+

+---------+------------------+
|     city|   avg_order_value|
+---------+------------------+
|  Chennai|           62000.0|
|    Delhi|           37400.0|
|   Mumbai|           37000.0|
|Bangalore|36166.666666666664|
+---------+------------------+

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       3140

# PHASE 5 — WINDOW FUNCTIONS

19. Rank cities by total revenue
20. Rank products within each category by revenue
21. Identify the top product per category

In [None]:
# Rank cities by total revenue
window_city = Window.orderBy(col("total_revenue").desc())
cities_ranked = revenue_by_city.withColumn("rank", rank().over(window_city))
print("Cities Ranked by Revenue:")
cities_ranked.show()

# Rank products within each category by revenue
product_category_revenue = df_cleaned.groupBy("category", "product").agg(sum("amount").alias("total_revenue"))
window_category = Window.partitionBy("category").orderBy(col("total_revenue").desc())
products_ranked = product_category_revenue.withColumn("rank", dense_rank().over(window_category))
print("Products Ranked within Category:")
products_ranked.orderBy("category", "rank").show()

# Top product per category
top_product_per_category = products_ranked.filter(col("rank") == 1)
print("Top Product per Category:")
top_product_per_category.show()

Cities Ranked by Revenue:
+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|Bangalore|       217000|   1|
|    Delhi|       187000|   2|
|   Mumbai|       185000|   3|
|  Chennai|        62000|   4|
+---------+-------------+----+

Products Ranked within Category:
+-----------+-----------+-------------+----+
|   category|    product|total_revenue|rank|
+-----------+-----------+-------------+----+
|Electronics|     Laptop|       314000|   1|
|Electronics|     Tablet|        85000|   2|
|Electronics|     Mobile|        65000|   3|
|       Home|     Vacuum|        88000|   1|
|       Home|AirPurifier|        78000|   2|
|       Home|      Mixer|        21000|   3|
+-----------+-----------+-------------+----+

Top Product per Category:
+-----------+-------+-------------+----+
|   category|product|total_revenue|rank|
+-----------+-------+-------------+----+
|Electronics| Laptop|       314000|   1|
|       Home| Vacuum|        88000|   1|
+---------

# PHASE 6 — PERFORMANCE AWARENESS

22. Cache the cleaned DataFrame
23. Run multiple aggregations and observe behavior
24. Use explain(True) to inspect shuffle and execution plan
25. Repartition data by city and explain why

In [None]:
# Cache the cleaned DataFrame
# `F.sum` is Spark's aggregate; a bare `sum` would be Python's built-in and
# fail when handed a column name.
from pyspark.sql import functions as F

# Cache the cleaned DataFrame. cache() is lazy: nothing is stored until the
# first action below runs.
df_cleaned.cache()

# Run multiple aggregations; count() populates the cache and the revenue
# aggregation can then read from it instead of recomputing the cleaning steps.
print(f"Total orders: {df_cleaned.count()}")
print(f"Total revenue: {df_cleaned.agg(F.sum('amount')).collect()[0][0]}")

# Explain execution plan (parsed, analyzed, optimized and physical plans)
df_cleaned.explain(True)

# Repartition by city: rows are hash-partitioned on `city`, so all rows of one
# city end up in the same partition. Presumably this is meant to benefit later
# per-city aggregations or joins; the repartition itself costs one shuffle.
df_repartitioned = df_cleaned.repartition("city")
print(f"\nRepartitioned by city. New partition count: {df_repartitioned.rdd.getNumPartitions()}")

# Release the cache once the experiments are done
df_cleaned.unpersist()


Total orders: 17
Total revenue: 651000
== Parsed Logical Plan ==
'Filter 'and('isNotNull('amount), 'isNotNull('order_date))
+- Filter (isnotnull(amount#20560) AND isnotnull(order_date#20562))
   +- Filter (status#20556 = Completed)
      +- Deduplicate [order_id#20549]
         +- Project [order_id#20549, customer_id#20550, city#20557, category#20558, product#20559, amount#20560, order_date#20562, status#20556]
            +- Project [order_id#20549, customer_id#20550, city#20557, category#20558, product#20559, amount#20560, CASE WHEN RLIKE(order_date_temp#20561, ^[0-9]{4}-[0-9]{2}-[0-9]{2}$) THEN to_date(order_date_temp#20561, Some(yyyy-MM-dd), Some(Etc/UTC), true) WHEN RLIKE(order_date_temp#20561, ^[0-9]{2}-[0-9]{2}-[0-9]{4}$) THEN to_date(order_date_temp#20561, Some(dd-MM-yyyy), Some(Etc/UTC), true) ELSE cast(null as date) END AS order_date#20562, status#20556, order_date_temp#20561]
               +- Project [order_id#20549, customer_id#20550, city#20557, category#20558, product#20

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: int, order_date: date, status: string]

# PHASE 7 — FILE FORMAT OUTPUT

26. Write cleaned order-level data to Parquet
27. Write aggregated analytics to ORC
28. Read both back and validate schema

In [None]:
# Write cleaned data to Parquet
# Output locations, named once so the write and the read-back cannot drift apart.
cleaned_orders_path = "output/cleaned_orders.parquet"
city_analytics_path = "output/city_analytics.orc"

# Order-level data goes to Parquet, replacing any previous run's output.
cleaned_orders_writer = df_cleaned.write.mode("overwrite")
cleaned_orders_writer.parquet(cleaned_orders_path)
print("Cleaned data written to Parquet")

# Aggregated per-city revenue goes to ORC, likewise overwriting.
city_analytics_writer = revenue_by_city.write.mode("overwrite")
city_analytics_writer.orc(city_analytics_path)
print("City analytics written to ORC")

# Read both files back and print their schemas to validate the round trip.
parquet_df = spark.read.parquet(cleaned_orders_path)
orc_df = spark.read.orc(city_analytics_path)

parquet_df.printSchema()
orc_df.printSchema()

Cleaned data written to Parquet
City analytics written to ORC
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)

root
 |-- city: string (nullable = true)
 |-- total_revenue: long (nullable = true)



# PHASE 8 — DEBUGGING CHECK

29. Explain why this line is incorrect:
df = df.filter(df.amount > 30000).show()
30. Write the corrected version

29. `.show()` only prints the rows and returns `None`, so `df` is rebound to `None` instead of a DataFrame. The filtered DataFrame is lost and cannot be used in any later step.

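30. Corrected version — assign the filtered DataFrame first, then call `.show()` as a separate statement:

```python
df = df.filter(df.amount > 30000)
df.show()
```

Alternatively, use a new name so the original `df` stays available:

```python
df_filtered = df.filter(df.amount > 30000)
df_filtered.show()
```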