In [13]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MyApp") \
    .getOrCreate()

# Dataset

In [23]:
raw_orders = [
    ("ORD001","C001","Ravi"," Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("ORD002","C002","Sneha","Mumbai"," Mobile ","Electronics","32000","05/01/2024","Completed"),
    ("ORD003","C003","Aman","Bangalore","Laptop","Electronics","55000","2024/01/06","Completed"),
    ("ORD004","C004","Pooja","Delhi","Tablet"," Electronics ","","2024-01-07","Cancelled"),
    ("ORD005","C005","Neha","Chennai","Laptop","Electronics","48000","invalid_date","Completed"),
    ("ORD006","C006","Rahul","Mumbai","Mobile","Electronics",None,"2024-01-08","Completed"),
    ("ORD007","C007","Kiran","Bangalore","Tablet","Electronics","30000","2024-01-08","Completed"),
    ("ORD008","C008","Amit","Delhi","Laptop","electronics","45000","2024-01-09","Completed"),
    ("ORD009","C009","Priya"," Pune","Mobile","Electronics","28000","09-01-2024","Completed"),
    ("ORD010","C010","Suresh","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("ORD010","C010","Suresh","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("ORD011","C011","Meena","Chennai","Tablet","Electronics","31000","2024-01-11","Completed"),
    ("ORD012","C012","Arjun","Delhi","Mobile","Electronics","27000","2024/01/11","Completed"),
    ("ORD013","C013","Nikhil","Bangalore","Laptop","Electronics","60000","2024-01-12","Completed"),
    ("ORD014","C014","Rohit","Mumbai","Mobile","Electronics","invalid_price","2024-01-12","Completed"),
    ("ORD015","C015","Anita","Delhi","Tablet","Electronics","29000","2024-01-13","Completed"),
    ("ORD016","C016","Vikas","Chennai","Laptop","Electronics","52000","2024-01-13","Completed"),
    ("ORD017","C017","Sunita","Mumbai","Mobile","Electronics","33000","2024-01-14","Completed"),
    ("ORD018","C018","Deepak","Bangalore","Laptop","Electronics","58000","2024-01-14","Completed"),
    ("ORD019","C019","Pallavi","Delhi","Mobile","Electronics","26000","2024-01-15","Completed"),
    ("ORD020","C020","Manish","Mumbai","Tablet","Electronics","34000","2024-01-15","Completed")
]

In [24]:
from datetime import datetime

def parse_date(date_str):
    if not date_str or date_str.strip().lower() == "invalid_date":
        return None
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%Y/%m/%d", "%d-%m-%Y"):
        try:
            return datetime.strptime(date_str.strip(), fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None


In [33]:
def parse_price(price_str):
    try:
        return int(price_str)
    except (TypeError, ValueError):
        return None

In [34]:
preprocessed = []
for row in raw_orders:
    order_id, cust_id, cust_name, city, product, category, price, order_date, status = row
    preprocessed.append((
        order_id,
        cust_id,
        cust_name,
        city,
        product,
        category,
        parse_price(price),
        parse_date(order_date),
        status
    ))

In [35]:
columns = ["OrderID","CustomerID","CustomerName","City","Product","Category","Price","OrderDate","Status"]
df = spark.createDataFrame(preprocessed, columns)

In [36]:
df.show()

+-------+----------+------------+---------+--------+-------------+-----+----------+---------+
|OrderID|CustomerID|CustomerName|     City| Product|     Category|Price| OrderDate|   Status|
+-------+----------+------------+---------+--------+-------------+-----+----------+---------+
| ORD001|      C001|        Ravi|   Delhi |  Laptop|  Electronics|45000|2024-01-05|Completed|
| ORD002|      C002|       Sneha|   Mumbai| Mobile |  Electronics|32000|2024-01-05|Completed|
| ORD003|      C003|        Aman|Bangalore|  Laptop|  Electronics|55000|2024-01-06|Completed|
| ORD004|      C004|       Pooja|    Delhi|  Tablet| Electronics | NULL|2024-01-07|Cancelled|
| ORD005|      C005|        Neha|  Chennai|  Laptop|  Electronics|48000|      NULL|Completed|
| ORD006|      C006|       Rahul|   Mumbai|  Mobile|  Electronics| NULL|2024-01-08|Completed|
| ORD007|      C007|       Kiran|Bangalore|  Tablet|  Electronics|30000|2024-01-08|Completed|
| ORD008|      C008|        Amit|    Delhi|  Laptop|  electr

#Trim space and normailize data

In [37]:
from pyspark.sql.functions import col, trim, lower, upper, regexp_replace, when
from pyspark.sql.functions import to_date
from pyspark.sql.types import IntegerType

In [38]:
df = df.withColumn("City", trim(col("City"))) \
       .withColumn("Product", trim(col("Product"))) \
       .withColumn("Category", upper(trim(col("Category"))))

 # Standardize Date formats

In [39]:
df.show()
df.printSchema()

+-------+----------+------------+---------+-------+-----------+-----+----------+---------+
|OrderID|CustomerID|CustomerName|     City|Product|   Category|Price| OrderDate|   Status|
+-------+----------+------------+---------+-------+-----------+-----+----------+---------+
| ORD001|      C001|        Ravi|    Delhi| Laptop|ELECTRONICS|45000|2024-01-05|Completed|
| ORD002|      C002|       Sneha|   Mumbai| Mobile|ELECTRONICS|32000|2024-01-05|Completed|
| ORD003|      C003|        Aman|Bangalore| Laptop|ELECTRONICS|55000|2024-01-06|Completed|
| ORD004|      C004|       Pooja|    Delhi| Tablet|ELECTRONICS| NULL|2024-01-07|Cancelled|
| ORD005|      C005|        Neha|  Chennai| Laptop|ELECTRONICS|48000|      NULL|Completed|
| ORD006|      C006|       Rahul|   Mumbai| Mobile|ELECTRONICS| NULL|2024-01-08|Completed|
| ORD007|      C007|       Kiran|Bangalore| Tablet|ELECTRONICS|30000|2024-01-08|Completed|
| ORD008|      C008|        Amit|    Delhi| Laptop|ELECTRONICS|45000|2024-01-09|Completed|

#CLEANING & TRANSFORMATION TASKS

In [40]:
df = df.toDF(
    "order_id",
    "customer_id",
    "customer_name",
    "city",
    "product",
    "category",
    "price",
    "order_date",
    "status"
)

In [42]:
from pyspark.sql.functions import col, round, when

df = df.withColumn("price_with_tax", round(col("price") * 1.18, 2))

In [43]:
df = df.withColumn(
    "price_category",
    when(col("price") < 30000, "Low")
    .when((col("price") >= 30000) & (col("price") < 50000), "Medium")
    .otherwise("High")
)

In [44]:
df.show(truncate=False)

+--------+-----------+-------------+---------+-------+-----------+-----+----------+---------+--------------+--------------+
|order_id|customer_id|customer_name|city     |product|category   |price|order_date|status   |price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+---------+--------------+--------------+
|ORD001  |C001       |Ravi         |Delhi    |Laptop |ELECTRONICS|45000|2024-01-05|Completed|53100.0       |Medium        |
|ORD002  |C002       |Sneha        |Mumbai   |Mobile |ELECTRONICS|32000|2024-01-05|Completed|37760.0       |Medium        |
|ORD003  |C003       |Aman         |Bangalore|Laptop |ELECTRONICS|55000|2024-01-06|Completed|64900.0       |High          |
|ORD004  |C004       |Pooja        |Delhi    |Tablet |ELECTRONICS|NULL |2024-01-07|Cancelled|NULL          |High          |
|ORD005  |C005       |Neha         |Chennai  |Laptop |ELECTRONICS|48000|NULL      |Completed|56640.0       |Medium        |
|ORD006 

#Create order_year , order_month

In [47]:
df = df.withColumn("order_year", col("order_date").substr(1,4)) \
       .withColumn("order_month", col("order_date").substr(6,2))



In [48]:
df.show()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+---------+--------------+--------------+----------+-----------+
|order_id|customer_id|customer_name|     city|product|   category|price|order_date|   status|price_with_tax|price_category|order_year|order_month|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+---------+--------------+--------------+----------+-----------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|ELECTRONICS|45000|2024-01-05|Completed|       53100.0|        Medium|      2024|         01|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|ELECTRONICS|32000|2024-01-05|Completed|       37760.0|        Medium|      2024|         01|
|  ORD003|       C003|         Aman|Bangalore| Laptop|ELECTRONICS|55000|2024-01-06|Completed|       64900.0|          High|      2024|         01|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|ELECTRONICS| NULL|2024-01-07|Cancelled|          NULL|          

# Aggregate total revenue per city

In [49]:
revenue_per_city = df.groupBy("City").agg(sum("Price").alias("total_revenue"))

In [50]:
revenue_per_city.show()

+---------+-------------+
|     City|total_revenue|
+---------+-------------+
|Bangalore|       203000|
|  Chennai|       131000|
|   Mumbai|       209000|
|     Pune|        28000|
|    Delhi|       172000|
+---------+-------------+



# Aggregate total revenue per product

In [51]:
revenue_per_product = df.groupBy("Product").agg(sum("Price").alias("total_revenue"))

In [52]:
revenue_per_product.show()

+-------+-------------+
|Product|total_revenue|
+-------+-------------+
| Laptop|       473000|
| Mobile|       146000|
| Tablet|       124000|
+-------+-------------+



#Identify top 3 cities by revenue

In [53]:
from pyspark.sql.functions import desc
top3_cities = revenue_per_city.orderBy(desc("total_revenue")).limit(3)

In [54]:
top3_cities.show()

+---------+-------------+
|     City|total_revenue|
+---------+-------------+
|   Mumbai|       209000|
|Bangalore|       203000|
|    Delhi|       172000|
+---------+-------------+



# Identify products with average price above threshold

In [55]:
from pyspark.sql.functions import avg
threshold = 40000
products_above_threshold = df.groupBy("Product") \
                             .agg(avg("Price").alias("avg_price")) \
                             .filter(col("avg_price") > threshold)

# Write cleaned data to Parquet

In [56]:
df.write.mode("overwrite").parquet("cleaned_orders_parquet")

# Read Parquet back and verify schema

In [57]:
df_parquet = spark.read.parquet("cleaned_orders_parquet")
df_parquet.printSchema()
df_parquet.show(truncate=False)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- price_with_tax: double (nullable = true)
 |-- price_category: string (nullable = true)
 |-- order_year: string (nullable = true)
 |-- order_month: string (nullable = true)

+--------+-----------+-------------+---------+-------+-----------+-----+----------+---------+--------------+--------------+----------+-----------+
|order_id|customer_id|customer_name|city     |product|category   |price|order_date|status   |price_with_tax|price_category|order_year|order_month|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+---------+--------------+--------------+----------+-----------+
|ORD010  |C010    

#Write the same data to ORC


In [62]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark

import os, findspark
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
findspark.init()

# no. of Partitions

In [70]:
print("Number of partitions:", df.rdd.getNumPartitions())

Number of partitions: 2


# Repartition before writing

In [71]:
df_repart = df.repartition(4)

df_repart.write.mode("overwrite").parquet("/content/cleaned_orders_parquet")

df_repart.write.mode("overwrite").orc("/content/cleaned_orders_orc")

# Compare file counts between Parquet and ORC

In [72]:
import os

parquet_files = os.listdir("/content/cleaned_orders_parquet")
orc_files = os.listdir("/content/cleaned_orders_orc")

print("Parquet file count:", len(parquet_files))
print("ORC file count:", len(orc_files))

Parquet file count: 10
ORC file count: 10


# Run pipeline

In [73]:
df_repart.explain(True)

== Parsed Logical Plan ==
Repartition 4, true
+- Project [order_id#265, customer_id#266, customer_name#267, city#268, product#269, category#270, price#271L, order_date#272, status#273, price_with_tax#274, price_category#275, order_year#312, substr(order_date#272, 6, 2) AS order_month#313]
   +- Project [order_id#265, customer_id#266, customer_name#267, city#268, product#269, category#270, price#271L, order_date#272, status#273, price_with_tax#274, price_category#275, substr(order_date#272, 1, 4) AS order_year#312]
      +- Project [order_id#265, customer_id#266, customer_name#267, city#268, product#269, category#270, price#271L, order_date#272, status#273, price_with_tax#274, CASE WHEN (price#271L < cast(30000 as bigint)) THEN Low WHEN ((price#271L >= cast(30000 as bigint)) AND (price#271L < cast(50000 as bigint))) THEN Medium ELSE High END AS price_category#275]
         +- Project [order_id#265, customer_id#266, customer_name#267, city#268, product#269, category#270, price#271L, orde