In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
spark = SparkSession.builder \
    .appName("Set 1") \
    .getOrCreate()

spark
data = [
Row(OrderID=101, Customer="Ali", Items=[{"Product":"Laptop", "Qty":1},
{"Product":"Mouse", "Qty":2}], Region="Asia", Amount=1200.0),
Row(OrderID=102, Customer="Zara", Items=[{"Product":"Tablet", "Qty":1}],
Region="Europe", Amount=650.0),
Row(OrderID=103, Customer="Mohan", Items=[{"Product":"Phone", "Qty":2},
{"Product":"Charger", "Qty":1}], Region="Asia", Amount=890.0),
Row(OrderID=104, Customer="Sara", Items=[{"Product":"Desk", "Qty":1}],
Region="US", Amount=450.0)
]
df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)

+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



# **Working with JSON & Nested Fields**

**1)Flatten the Items array using explode()**

In [4]:
from pyspark.sql.functions import explode, col
df_exploded = df_sales.select("OrderID", "Customer", "Region", "Amount", explode("Items").alias("Item")
).select("OrderID", "Customer", "Region", "Amount",col("Item.Product").alias("Product"),col("Item.Qty").alias("Qty"))

df_exploded.show()

+-------+--------+------+------+-------+---+
|OrderID|Customer|Region|Amount|Product|Qty|
+-------+--------+------+------+-------+---+
|    101|     Ali|  Asia|1200.0| Laptop|  1|
|    101|     Ali|  Asia|1200.0|  Mouse|  2|
|    102|    Zara|Europe| 650.0| Tablet|  1|
|    103|   Mohan|  Asia| 890.0|  Phone|  2|
|    103|   Mohan|  Asia| 890.0|Charger|  1|
|    104|    Sara|    US| 450.0|   Desk|  1|
+-------+--------+------+------+-------+---+



**2) Count total quantity sold per product**

In [5]:
prod_qty = df_exploded.groupBy("Product").agg(sum("Qty").alias("Total_Qty"))
prod_qty.show()

+-------+---------+
|Product|Total_Qty|
+-------+---------+
| Laptop|      1.0|
|  Mouse|      2.0|
| Tablet|      1.0|
|   Desk|      1.0|
|  Phone|      2.0|
|Charger|      1.0|
+-------+---------+



**3) Count number of orders per region**

In [7]:
from pyspark.sql.functions import count
orders_reg = df_sales.groupBy("Region").agg(count("OrderID").alias("Order_Count"))
orders_reg.show()

+------+-----------+
|Region|Order_Count|
+------+-----------+
|Europe|          1|
|  Asia|          2|
|    US|          1|
+------+-----------+



# **Using when and otherwise**

**4. Create a new column HighValueOrder :**

**"Yes" if Amount > 1000**

**"No" otherwise**

In [8]:
from pyspark.sql.functions import when, lit
df_sales = df_sales.withColumn("HighValueOrder",when(col("Amount") > 1000, lit("Yes")).otherwise(lit("No")))
df_sales.show()

+-------+--------+--------------------+------+------+--------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|
+-------+--------+--------------------+------+------+--------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|            No|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|            No|
+-------+--------+--------------------+------+------+--------------+



**5. Add a column ShippingZone :**

**Asia → "Zone A", Europe → "Zone B", US → "Zone C"**

In [9]:
df_sales = df_sales.withColumn("ShippingZone",when(col("Region") == "Asia", lit("Zone A"))
.when(col("Region") == "Europe", lit("Zone B")).when(col("Region") == "US", lit("Zone C")).otherwise(lit("Unknown")))
df_sales.show()

+-------+--------+--------------------+------+------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------+------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|      Zone A|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|            No|      Zone B|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|      Zone A|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|            No|      Zone C|
+-------+--------+--------------------+------+------+--------------+------------+



# **Temporary & Permanent Views**

**6. Register df_sales as a temporary view named sales_view .**

In [10]:
df_sales.createOrReplaceTempView("sales_view")

**7. Write a SQL query to:**

**Count orders by Region**

**Find average amount per region**

In [11]:
# Count orders by Region
spark.sql("select region, count(*) as Order_Count from sales_view group by Region").show()

# Find average amount per region
spark.sql("select region, avg(amount) as Avg_Amount from sales_view group by Region").show()

+------+-----------+
|region|Order_Count|
+------+-----------+
|Europe|          1|
|  Asia|          2|
|    US|          1|
+------+-----------+

+------+----------+
|region|Avg_Amount|
+------+----------+
|Europe|     650.0|
|  Asia|    1045.0|
|    US|     450.0|
+------+----------+



**8. Create a permanent view using saveAsTable() .**

In [12]:
df_sales.write.saveAsTable("sale_view")

# **SQL Queries via Spark**

In [13]:
spark.sql("SELECT Region, COUNT(*) as OrderCount FROM sales_view GROUP BY Region").show()

+------+----------+
|Region|OrderCount|
+------+----------+
|Europe|         1|
|  Asia|         2|
|    US|         1|
+------+----------+



**9. Use SQL to filter all orders with more than 1 item.**

In [14]:
spark.sql("select OrderID, Customer, size(Items) as ItemCount from sales_view where size(Items) > 1").show()

+-------+--------+---------+
|OrderID|Customer|ItemCount|
+-------+--------+---------+
|    101|     Ali|        2|
|    103|   Mohan|        2|
+-------+--------+---------+



**10. Use SQL to extract customer names where Amount > 800.**

In [15]:
spark.sql("select customer from sales_view where Amount > 800").show()

+--------+
|customer|
+--------+
|     Ali|
|   Mohan|
+--------+



# **Saving as Parquet and Reading Again**

**11. Save the exploded product-level DataFrame as a partitioned Parquet file by region .**

In [16]:
df_exploded.write.partitionBy("Region").parquet("sales_partitioned.parquet")

**12. Read the parquet back and perform a group-by on Product .**

In [17]:
df_parquet = spark.read.parquet("sales_partitioned.parquet")
df_parquet.groupBy("Product").agg(count("*").alias("Count")).show()

+-------+-----+
|Product|Count|
+-------+-----+
|  Phone|    1|
| Laptop|    1|
|Charger|    1|
|  Mouse|    1|
|   Desk|    1|
| Tablet|    1|
+-------+-----+

