In [0]:
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import explode, col, sum as _sum, count, when, avg
spark


In [0]:

data = [
    Row(OrderID=101, Customer="Ali", Items=[{"Product":"Laptop", "Qty":1}, {"Product":"Mouse", "Qty":2}], Region="Asia", Amount=1200.0),
    Row(OrderID=102, Customer="Zara", Items=[{"Product":"Tablet", "Qty":1}], Region="Europe", Amount=650.0),
    Row(OrderID=103, Customer="Mohan", Items=[{"Product":"Phone", "Qty":2}, {"Product":"Charger", "Qty":1}], Region="Asia", Amount=890.0),
    Row(OrderID=104, Customer="Sara", Items=[{"Product":"Desk", "Qty":1}], Region="US", Amount=450.0)
]

df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)


+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



In [0]:
#1 Flatten the Items array using explode()
from pyspark.sql.functions import explode,col

df_exploded = df_sales.withColumn("Item", explode("Items")) \
                      .select("OrderID", "Customer", "Region", "Amount", col("Item.Product").alias("Product"), col("Item.Qty").alias("Qty"))
df_exploded.show()


+-------+--------+------+------+-------+---+
|OrderID|Customer|Region|Amount|Product|Qty|
+-------+--------+------+------+-------+---+
|    101|     Ali|  Asia|1200.0| Laptop|  1|
|    101|     Ali|  Asia|1200.0|  Mouse|  2|
|    102|    Zara|Europe| 650.0| Tablet|  1|
|    103|   Mohan|  Asia| 890.0|  Phone|  2|
|    103|   Mohan|  Asia| 890.0|Charger|  1|
|    104|    Sara|    US| 450.0|   Desk|  1|
+-------+--------+------+------+-------+---+



In [0]:
#2 Count total quantity sold per product
df_exploded.groupBy("Product").agg(_sum("Qty").alias("TotalQtySold")).show()


+-------+------------+
|Product|TotalQtySold|
+-------+------------+
| Laptop|         1.0|
|  Mouse|         2.0|
| Tablet|         1.0|
|  Phone|         2.0|
|Charger|         1.0|
|   Desk|         1.0|
+-------+------------+



In [0]:
#3 Count number of orders per region
df_sales.groupBy("Region").agg(count("*").alias("OrderCount")).show()


+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+



In [0]:
#4 Create HighValueOrder column
from pyspark.sql.functions import when

df_sales_flagged = df_sales.withColumn("HighValueOrder", when(col("Amount") > 1000, "Yes").otherwise("No"))
df_sales_flagged.select("OrderID", "Amount", "HighValueOrder").show()


+-------+------+--------------+
|OrderID|Amount|HighValueOrder|
+-------+------+--------------+
|    101|1200.0|           Yes|
|    102| 650.0|            No|
|    103| 890.0|            No|
|    104| 450.0|            No|
+-------+------+--------------+



In [0]:
#5 Add ShippingZone based on Region
df_sales_zoned = df_sales.withColumn("ShippingZone",
    when(col("Region") == "Asia", "Zone A")
    .when(col("Region") == "Europe", "Zone B")
    .when(col("Region") == "US", "Zone C")
    .otherwise("Other"))

df_sales_zoned.select("OrderID", "Region", "ShippingZone").show()


+-------+------+------------+
|OrderID|Region|ShippingZone|
+-------+------+------------+
|    101|  Asia|      Zone A|
|    102|Europe|      Zone B|
|    103|  Asia|      Zone A|
|    104|    US|      Zone C|
+-------+------+------------+



In [0]:
#6 Register as Temporary View
df_sales.createOrReplaceTempView("sales_view")


In [0]:
#7 SQL query on sales_view
spark.sql("""
    SELECT Region, COUNT(*) as OrderCount, AVG(Amount) as AvgAmount
    FROM sales_view
    GROUP BY Region
""").show()


+------+----------+---------+
|Region|OrderCount|AvgAmount|
+------+----------+---------+
|  Asia|         2|   1045.0|
|Europe|         1|    650.0|
|    US|         1|    450.0|
+------+----------+---------+



In [0]:
#8 Save as Permanent Table
df_sales.write.mode("overwrite").saveAsTable("permanent_sales_view")
#to check 
spark.sql("SELECT * FROM permanent_sales_view").show()


+-------+--------+--------------------+------+------+
|OrderID|Customer|               Items|Region|Amount|
+-------+--------+--------------------+------+------+
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|
+-------+--------+--------------------+------+------+



In [0]:
#9 SQL query to filter orders with more than 1 item
spark.sql("""
    SELECT OrderID, Customer, Size(Items) AS ItemCount
    FROM sales_view
    WHERE SIZE(Items) > 1
""").show()


+-------+--------+---------+
|OrderID|Customer|ItemCount|
+-------+--------+---------+
|    101|     Ali|        2|
|    103|   Mohan|        2|
+-------+--------+---------+



In [0]:
#10 SQL to get customers with Amount > 800
spark.sql("""
    SELECT Customer, Amount
    FROM sales_view
    WHERE Amount > 800
""").show()


+--------+------+
|Customer|Amount|
+--------+------+
|     Ali|1200.0|
|   Mohan| 890.0|
+--------+------+



In [0]:
#11 Save exploded DataFrame as partitioned Parquet
df_exploded.write.mode("overwrite").partitionBy("Region").parquet("dbfs:/FileStore/sales_data/parquet_partitioned")


In [0]:
#To download the parquet file
import shutil
import os
import uuid

# 1. Define local temp dir for the copy
local_temp_parquet = f"/tmp/sales_parquet_partitioned_{uuid.uuid4()}"
dbfs_parquet_source = "/dbfs/FileStore/sales_data/parquet_partitioned"

# 2. Recursively copy the full partitioned folder (with subdirectories)
shutil.copytree(dbfs_parquet_source, local_temp_parquet)

# 3. Zip the folder
local_zip_path = "/tmp/sales_parquet_partitioned.zip"
shutil.make_archive(local_zip_path.replace(".zip", ""), 'zip', local_temp_parquet)

# 4. Move zip to DBFS for download
dbfs_zip_dest = "/dbfs/FileStore/sales_data/sales_parquet_partitioned.zip"
shutil.copy(local_zip_path, dbfs_zip_dest)


'/dbfs/FileStore/sales_data/sales_parquet_partitioned.zip'

In [0]:
#12 Read parquet and group by Product
df_parquet = spark.read.parquet("/tmp/sales_parquet_partitioned")
df_parquet.groupBy("Product").agg(_sum("Qty").alias("TotalQty")).show()


+-------+--------+
|Product|TotalQty|
+-------+--------+
|  Phone|     2.0|
|Charger|     1.0|
| Laptop|     1.0|
|  Mouse|     2.0|
| Tablet|     1.0|
|   Desk|     1.0|
+-------+--------+

