In [1]:
!pip install pyspark



# Dataset Loading

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName("Food Delivery Analytics") \
.getOrCreate()

In [3]:
orders_data = [
("O001","North","Delhi","Rest-01","Pizza","2024-02-01",450,35),
("O002","North","Delhi","Rest-01","Burger","2024-02-01",250,25),
("O003","North","Chandigarh","Rest-02","Pasta","2024-02-02",350,30),
("O004","South","Bangalore","Rest-03","Pizza","2024-02-01",500,40),
("O005","South","Chennai","Rest-04","Burger","2024-02-02",220,20),
("O006","South","Bangalore","Rest-03","Pasta","2024-02-03",380,32),
("O007","East","Kolkata","Rest-05","Pizza","2024-02-01",420,38),
("O008","East","Kolkata","Rest-05","Burger","2024-02-02",260,26),
("O009","East","Patna","Rest-06","Pasta","2024-02-03",300,28),
("O010","West","Mumbai","Rest-07","Pizza","2024-02-01",520,42),
("O011","West","Mumbai","Rest-07","Burger","2024-02-02",280,27),
("O012","West","Pune","Rest-08","Pasta","2024-02-03",340,31),
("O013","North","Delhi","Rest-01","Pizza","2024-02-04",480,37),
("O014","South","Chennai","Rest-04","Pizza","2024-02-04",510,41),
("O015","East","Patna","Rest-06","Burger","2024-02-04",240,24),
("O016","West","Pune","Rest-08","Pizza","2024-02-04",500,39),
("O017","North","Chandigarh","Rest-02","Burger","2024-02-05",260,26),
("O018","South","Bangalore","Rest-03","Burger","2024-02-05",290,29),
("O019","East","Kolkata","Rest-05","Pasta","2024-02-05",360,33),
("O020","West","Mumbai","Rest-07","Pasta","2024-02-05",390,34),
("O021","North","Delhi","Rest-01","Pasta","2024-02-06",370,30),
("O022","South","Chennai","Rest-04","Pasta","2024-02-06",330,29),
("O023","East","Patna","Rest-06","Pizza","2024-02-06",460,36),
("O024","West","Pune","Rest-08","Burger","2024-02-06",270,26)
]

In [4]:
columns = [
"order_id","region","city","restaurant_id",
"food_item","order_date","amount","delivery_time_min"
]

In [5]:
df_orders = spark.createDataFrame(orders_data, columns)
df_orders.show(5)
df_orders.printSchema()

+--------+------+----------+-------------+---------+----------+------+-----------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+----------+-------------+---------+----------+------+-----------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|
+--------+------+----------+-------------+---------+----------+------+-----------------+
only showing top 5 rows
root
 |-- order_id: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- restaurant_id: string (nullable = true)
 |-- food_i

# Select only order_id , region , food_item , amount

In [6]:
df_orders.select("order_id" , "region" , "food_item" , "amount").show()

+--------+------+---------+------+
|order_id|region|food_item|amount|
+--------+------+---------+------+
|    O001| North|    Pizza|   450|
|    O002| North|   Burger|   250|
|    O003| North|    Pasta|   350|
|    O004| South|    Pizza|   500|
|    O005| South|   Burger|   220|
|    O006| South|    Pasta|   380|
|    O007|  East|    Pizza|   420|
|    O008|  East|   Burger|   260|
|    O009|  East|    Pasta|   300|
|    O010|  West|    Pizza|   520|
|    O011|  West|   Burger|   280|
|    O012|  West|    Pasta|   340|
|    O013| North|    Pizza|   480|
|    O014| South|    Pizza|   510|
|    O015|  East|   Burger|   240|
|    O016|  West|    Pizza|   500|
|    O017| North|   Burger|   260|
|    O018| South|   Burger|   290|
|    O019|  East|    Pasta|   360|
|    O020|  West|    Pasta|   390|
+--------+------+---------+------+
only showing top 20 rows


#Rename amount to order_value

In [10]:
df_orders.withColumnRenamed("amount", "oredr_value").show()

+--------+------+----------+-------------+---------+----------+-----------+-----------------+
|order_id|region|      city|restaurant_id|food_item|order_date|oredr_value|delivery_time_min|
+--------+------+----------+-------------+---------+----------+-----------+-----------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|        450|               35|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|        250|               25|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|        350|               30|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|        500|               40|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|        220|               20|
|    O006| South| Bangalore|      Rest-03|    Pasta|2024-02-03|        380|               32|
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|        420|               38|
|    O008|  East|   Kolkata|      Rest-05|   Burger|2024-02-

# Create a new column amount_in_hundreds

In [12]:
from pyspark.sql.functions import col, expr, dayofmonth

df_orders.select("order_id", "region", "food_item", col("amount").alias("order_value")).show()

+--------+------+---------+-----------+
|order_id|region|food_item|order_value|
+--------+------+---------+-----------+
|    O001| North|    Pizza|        450|
|    O002| North|   Burger|        250|
|    O003| North|    Pasta|        350|
|    O004| South|    Pizza|        500|
|    O005| South|   Burger|        220|
|    O006| South|    Pasta|        380|
|    O007|  East|    Pizza|        420|
|    O008|  East|   Burger|        260|
|    O009|  East|    Pasta|        300|
|    O010|  West|    Pizza|        520|
|    O011|  West|   Burger|        280|
|    O012|  West|    Pasta|        340|
|    O013| North|    Pizza|        480|
|    O014| South|    Pizza|        510|
|    O015|  East|   Burger|        240|
|    O016|  West|    Pizza|        500|
|    O017| North|   Burger|        260|
|    O018| South|   Burger|        290|
|    O019|  East|    Pasta|        360|
|    O020|  West|    Pasta|        390|
+--------+------+---------+-----------+
only showing top 20 rows


# Rename amount to order_value

In [13]:
df_orders.select("order_id", "region", "food_item", col("amount").alias("order_value")).show()

+--------+------+---------+-----------+
|order_id|region|food_item|order_value|
+--------+------+---------+-----------+
|    O001| North|    Pizza|        450|
|    O002| North|   Burger|        250|
|    O003| North|    Pasta|        350|
|    O004| South|    Pizza|        500|
|    O005| South|   Burger|        220|
|    O006| South|    Pasta|        380|
|    O007|  East|    Pizza|        420|
|    O008|  East|   Burger|        260|
|    O009|  East|    Pasta|        300|
|    O010|  West|    Pizza|        520|
|    O011|  West|   Burger|        280|
|    O012|  West|    Pasta|        340|
|    O013| North|    Pizza|        480|
|    O014| South|    Pizza|        510|
|    O015|  East|   Burger|        240|
|    O016|  West|    Pizza|        500|
|    O017| North|   Burger|        260|
|    O018| South|   Burger|        290|
|    O019|  East|    Pasta|        360|
|    O020|  West|    Pasta|        390|
+--------+------+---------+-----------+
only showing top 20 rows


# Create a new column amount_in_hundreds

In [14]:
df_orders.withColumn("amount_in_hundreds", col("amount")/100).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+------------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|amount_in_hundreds|
+--------+------+----------+-------------+---------+----------+------+-----------------+------------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|               4.5|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|               2.5|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|               3.5|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|               5.0|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|               2.2|
|    O006| South| Bangalore|      Rest-03|    Pasta|2024-02-03|   380|               32|               3.8|
|    O007|  East|   Kolkata|

# Select distinct combinations of region and food_item

In [15]:
df_orders.select("region", "food_item").distinct().show()

+------+---------+
|region|food_item|
+------+---------+
|  West|   Burger|
|  East|    Pizza|
|  West|    Pizza|
| North|    Pizza|
| South|    Pizza|
|  East|   Burger|
| North|    Pasta|
|  East|    Pasta|
| North|   Burger|
| South|    Pasta|
| South|   Burger|
|  West|    Pasta|
+------+---------+



# Reorder columns in a logical reporting format

In [18]:
df_orders.select("region","city","restaurant_id","order_id","food_item",
                 col("amount").alias("order_value"),"delivery_time_min","order_date").show()

+------+----------+-------------+--------+---------+-----------+-----------------+----------+
|region|      city|restaurant_id|order_id|food_item|order_value|delivery_time_min|order_date|
+------+----------+-------------+--------+---------+-----------+-----------------+----------+
| North|     Delhi|      Rest-01|    O001|    Pizza|        450|               35|2024-02-01|
| North|     Delhi|      Rest-01|    O002|   Burger|        250|               25|2024-02-01|
| North|Chandigarh|      Rest-02|    O003|    Pasta|        350|               30|2024-02-02|
| South| Bangalore|      Rest-03|    O004|    Pizza|        500|               40|2024-02-01|
| South|   Chennai|      Rest-04|    O005|   Burger|        220|               20|2024-02-02|
| South| Bangalore|      Rest-03|    O006|    Pasta|        380|               32|2024-02-03|
|  East|   Kolkata|      Rest-05|    O007|    Pizza|        420|               38|2024-02-01|
|  East|   Kolkata|      Rest-05|    O008|   Burger|        

# Create a column order_day extracted from order_date

In [19]:
df_orders.withColumn("order_day", dayofmonth(col("order_date"))).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+---------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|order_day|
+--------+------+----------+-------------+---------+----------+------+-----------------+---------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|        1|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|        1|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|        2|
|    O004| South| Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|        1|
|    O005| South|   Chennai|      Rest-04|   Burger|2024-02-02|   220|               20|        2|
|    O006| South| Bangalore|      Rest-03|    Pasta|2024-02-03|   380|               32|        3|
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|        1|
|    O008|

# Filter orders where amount > 400

In [21]:
df_orders.filter(df_orders.amount>400).show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O001| North|    Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-----

# Filter only Pizza orders

In [23]:
df_orders.filter(df_orders.food_item=="Pizza").show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O001| North|    Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-----

# Filter orders from Delhi and Mumbai

In [25]:
df_orders.filter((df_orders.city=="Delhi") | (df_orders.city=="Mumbai")).show()

+--------+------+------+-------------+---------+----------+------+-----------------+
|order_id|region|  city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+------+-------------+---------+----------+------+-----------------+
|    O001| North| Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O002| North| Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|
|    O010|  West|Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O011|  West|Mumbai|      Rest-07|   Burger|2024-02-02|   280|               27|
|    O013| North| Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O020|  West|Mumbai|      Rest-07|    Pasta|2024-02-05|   390|               34|
|    O021| North| Delhi|      Rest-01|    Pasta|2024-02-06|   370|               30|
+--------+------+------+-------------+---------+----------+------+-----------------+



# Filter orders with delivery time greater than 35 minutes


In [27]:
df_orders.filter(col("delivery_time_min") > 35).show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-------------+---------+----------+------+-----------------+



# Apply multiple conditions using AND and OR (e.g., Pizza orders with amount > 400 OR delivery_time > 40)

In [29]:
df_orders.filter(((col("food_item") == "Pizza") & (col("amount") > 400)) | (col("delivery_time_min") > 40)).show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O001| North|    Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-----

# Apply filters in different ways

In [30]:
df_orders.createOrReplaceTempView("orders")

spark.sql("""
SELECT * FROM orders
WHERE food_item = 'Pizza' AND amount > 400
""").show()

+--------+------+---------+-------------+---------+----------+------+-----------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|
+--------+------+---------+-------------+---------+----------+------+-----------------+
|    O001| North|    Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|               42|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|
|    O016|  West|     Pune|      Rest-08|    Pizza|2024-02-04|   500|               39|
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|
+--------+------+---------+-----

# Apply filters in different orders and compare

In [31]:

df_orders.filter(col("amount") > 400).filter(col("food_item") == "Pizza").explain(True)

df_orders.filter(col("food_item") == "Pizza").filter(col("amount") > 400).explain(True)

== Parsed Logical Plan ==
'Filter '`=`('food_item, Pizza)
+- Filter (amount#6L > cast(400 as bigint))
   +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
order_id: string, region: string, city: string, restaurant_id: string, food_item: string, order_date: string, amount: bigint, delivery_time_min: bigint
Filter (food_item#4 = Pizza)
+- Filter (amount#6L > cast(400 as bigint))
   +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Optimized Logical Plan ==
Filter ((isnotnull(amount#6L) AND isnotnull(food_item#4)) AND ((amount#6L > 400) AND (food_item#4 = Pizza)))
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Physical Plan ==
*(1) Filter ((isnotnull(amount#6L) AND isnotnull(food_item#4)) AND ((amount#6L > 400) AND (

# Pipeline with select → filter → derived column

In [32]:
pipeline = df_orders \
    .select("order_id","region","food_item","amount") \
    .filter(col("amount") > 300) \
    .withColumn("amount_in_hundreds", col("amount")/100)

In [33]:
pipeline.count()

15

# Check number of partitions


In [34]:
df_orders.rdd.getNumPartitions()

2

# Repartition into 4 partitions

In [35]:
df4 = df_orders.repartition(4)

# Coalesce into 1 partition


In [36]:
df1 = df_orders.coalesce(1)

# Write repartitioned data to Parquet

In [37]:
df4.write.mode("overwrite").parquet("orders_repart")

# Write coalesced data to Parquet


In [38]:
df1.write.mode("overwrite").parquet("orders_coalesce")

# Total revenue per region

In [40]:
from pyspark.sql.functions import sum, avg, max, min, col
df_orders.groupBy("region").agg(sum("amount").alias("total_revenue")).show()

+------+-------------+
|region|total_revenue|
+------+-------------+
| South|         2230|
|  East|         2040|
|  West|         2300|
| North|         2160|
+------+-------------+



# Average order amount per food item

In [41]:
df_orders.groupBy("food_item").agg(avg("amount").alias("avg_amount")).show()

+---------+----------+
|food_item|avg_amount|
+---------+----------+
|   Burger|    258.75|
|    Pizza|     480.0|
|    Pasta|     352.5|
+---------+----------+



# Maximum order amount per city

In [42]:
df_orders.groupBy("city").agg(max("amount").alias("max_amount")).show()

+----------+----------+
|      city|max_amount|
+----------+----------+
| Bangalore|       500|
|     Patna|       460|
|   Chennai|       510|
|    Mumbai|       520|
|   Kolkata|       420|
|      Pune|       500|
|     Delhi|       480|
|Chandigarh|       350|
+----------+----------+



 Minimum delivery time per restaurant

In [43]:

df_orders.groupBy("restaurant_id").agg(min("delivery_time_min").alias("min_delivery")).show()


+-------------+------------+
|restaurant_id|min_delivery|
+-------------+------------+
|      Rest-01|          25|
|      Rest-06|          24|
|      Rest-04|          20|
|      Rest-03|          29|
|      Rest-02|          26|
|      Rest-08|          26|
|      Rest-07|          27|
|      Rest-05|          26|
+-------------+------------+



# Count number of orders per region

In [45]:
 from pyspark.sql.functions import count

 df_orders.groupBy("region").agg(count("*").alias("order_count")).show()


+------+-----------+
|region|order_count|
+------+-----------+
| South|          6|
|  East|          6|
|  West|          6|
| North|          6|
+------+-----------+



#Total revenue per restaurant

In [46]:
df_orders.groupBy("restaurant_id").agg(sum("amount").alias("total_revenue")).show()

+-------------+-------------+
|restaurant_id|total_revenue|
+-------------+-------------+
|      Rest-01|         1550|
|      Rest-06|         1000|
|      Rest-04|         1060|
|      Rest-03|         1170|
|      Rest-02|          610|
|      Rest-08|         1110|
|      Rest-07|         1190|
|      Rest-05|         1040|
+-------------+-------------+



# Region + food item wise total revenue

In [47]:
df_orders.groupBy("region","food_item").agg(sum("amount").alias("total_revenue")).show()

+------+---------+-------------+
|region|food_item|total_revenue|
+------+---------+-------------+
|  West|   Burger|          550|
|  East|    Pizza|          880|
|  West|    Pizza|         1020|
| North|    Pizza|          930|
| South|    Pizza|         1010|
|  East|   Burger|          500|
| North|    Pasta|          720|
|  East|    Pasta|          660|
| North|   Burger|          510|
| South|    Pasta|          710|
| South|   Burger|          510|
|  West|    Pasta|          730|
+------+---------+-------------+



#City wise average delivery time

In [48]:
df_orders.groupBy("city").agg(avg("delivery_time_min").alias("avg_delivery_time")).show()

+----------+------------------+
|      city| avg_delivery_time|
+----------+------------------+
| Bangalore|33.666666666666664|
|     Patna|29.333333333333332|
|   Chennai|              30.0|
|    Mumbai|34.333333333333336|
|   Kolkata|32.333333333333336|
|      Pune|              32.0|
|     Delhi|             31.75|
|Chandigarh|              28.0|
+----------+------------------+



# Identify regions with revenue above a threshold (e.g., 1500)

In [49]:
df_orders.groupBy("region").agg(sum("amount").alias("total_revenue")) \
    .filter(col("total_revenue") > 1500).show()

+------+-------------+
|region|total_revenue|
+------+-------------+
| South|         2230|
|  East|         2040|
|  West|         2300|
| North|         2160|
+------+-------------+



Use explain(True) and identify shuffle operators

In [50]:

df_orders.groupBy("region").agg(sum("amount")).explain(True)

== Parsed Logical Plan ==
'Aggregate ['region], ['region, unresolvedalias('sum('amount))]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Analyzed Logical Plan ==
region: string, sum(amount): bigint
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#647L]
+- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Optimized Logical Plan ==
Aggregate [region#1], [region#1, sum(amount#6L) AS sum(amount)#647L]
+- Project [region#1, amount#6L]
   +- LogicalRDD [order_id#0, region#1, city#2, restaurant_id#3, food_item#4, order_date#5, amount#6L, delivery_time_min#7L], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[region#1], functions=[sum(amount#6L)], output=[region#1, sum(amount)#647L])
   +- Exchange hashpartitioning(region#1, 200), ENSURE_REQUIREMENTS, [plan_id=846]
      +- HashAg

# Compute running total of revenue per region ordered by date

In [51]:
from pyspark.sql import Window
from pyspark.sql.functions import sum, rank, dense_rank, row_number, col
window_region_date = Window.partitionBy("region").orderBy("order_date") \
                           .rowsBetween(Window.unboundedPreceding, Window.currentRow)
df_orders.withColumn("running_total", sum("amount").over(window_region_date)).show()


+--------+------+----------+-------------+---------+----------+------+-----------------+-------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|running_total|
+--------+------+----------+-------------+---------+----------+------+-----------------+-------------+
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|          420|
|    O008|  East|   Kolkata|      Rest-05|   Burger|2024-02-02|   260|               26|          680|
|    O009|  East|     Patna|      Rest-06|    Pasta|2024-02-03|   300|               28|          980|
|    O015|  East|     Patna|      Rest-06|   Burger|2024-02-04|   240|               24|         1220|
|    O019|  East|   Kolkata|      Rest-05|    Pasta|2024-02-05|   360|               33|         1580|
|    O023|  East|     Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|         2040|
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|   


# Rank orders by amount within each region


In [52]:
window_region_amount = Window.partitionBy("region").orderBy(col("amount").desc())
df_orders.withColumn("rank_in_region", rank().over(window_region_amount)).show()


+--------+------+----------+-------------+---------+----------+------+-----------------+--------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|rank_in_region|
+--------+------+----------+-------------+---------+----------+------+-----------------+--------------+
|    O023|  East|     Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|             1|
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|             2|
|    O019|  East|   Kolkata|      Rest-05|    Pasta|2024-02-05|   360|               33|             3|
|    O009|  East|     Patna|      Rest-06|    Pasta|2024-02-03|   300|               28|             4|
|    O008|  East|   Kolkata|      Rest-05|   Burger|2024-02-02|   260|               26|             5|
|    O015|  East|     Patna|      Rest-06|   Burger|2024-02-04|   240|               24|             6|
|    O013| North|     Delhi|      Rest-01|    Pizza|2024-02-04| 

# Assign row numbers per restaurant based on delivery time


In [53]:
window_restaurant_delivery = Window.partitionBy("restaurant_id").orderBy("delivery_time_min")
df_orders.withColumn("row_num", row_number().over(window_restaurant_delivery)).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+-------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|row_num|
+--------+------+----------+-------------+---------+----------+------+-----------------+-------+
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|      1|
|    O021| North|     Delhi|      Rest-01|    Pasta|2024-02-06|   370|               30|      2|
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|      3|
|    O013| North|     Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|      4|
|    O017| North|Chandigarh|      Rest-02|   Burger|2024-02-05|   260|               26|      1|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|      2|
|    O018| South| Bangalore|      Rest-03|   Burger|2024-02-05|   290|               29|      1|
|    O006| South| Bangalore|  

# Dense rank food items per region by revenue


In [54]:
window_region_food = Window.partitionBy("region").orderBy(col("amount").desc())
df_orders.withColumn("dense_rank_food", dense_rank().over(window_region_food)).show()


+--------+------+----------+-------------+---------+----------+------+-----------------+---------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|dense_rank_food|
+--------+------+----------+-------------+---------+----------+------+-----------------+---------------+
|    O023|  East|     Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|              1|
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|              2|
|    O019|  East|   Kolkata|      Rest-05|    Pasta|2024-02-05|   360|               33|              3|
|    O009|  East|     Patna|      Rest-06|    Pasta|2024-02-03|   300|               28|              4|
|    O008|  East|   Kolkata|      Rest-05|   Burger|2024-02-02|   260|               26|              5|
|    O015|  East|     Patna|      Rest-06|   Burger|2024-02-04|   240|               24|              6|
|    O013| North|     Delhi|      Rest-01|    Pizza|202

# Identify top 2 highest value orders per region

In [56]:
df_orders.withColumn("rank_in_region", rank().over(window_region_amount)) \
         .filter(col("rank_in_region") <= 2).show()


+--------+------+---------+-------------+---------+----------+------+-----------------+--------------+
|order_id|region|     city|restaurant_id|food_item|order_date|amount|delivery_time_min|rank_in_region|
+--------+------+---------+-------------+---------+----------+------+-----------------+--------------+
|    O023|  East|    Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|             1|
|    O007|  East|  Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|             2|
|    O013| North|    Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|             1|
|    O001| North|    Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|             2|
|    O014| South|  Chennai|      Rest-04|    Pizza|2024-02-04|   510|               41|             1|
|    O004| South|Bangalore|      Rest-03|    Pizza|2024-02-01|   500|               40|             2|
|    O010|  West|   Mumbai|      Rest-07|    Pizza|2024-02-01|   520|    


# Compare rank, dense_rank, and row_number outputs


In [57]:
df_orders.withColumn("rank", rank().over(window_region_amount)) \
         .withColumn("dense_rank", dense_rank().over(window_region_amount)) \
         .withColumn("row_number", row_number().over(window_region_amount)).show()


+--------+------+----------+-------------+---------+----------+------+-----------------+----+----------+----------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|rank|dense_rank|row_number|
+--------+------+----------+-------------+---------+----------+------+-----------------+----+----------+----------+
|    O023|  East|     Patna|      Rest-06|    Pizza|2024-02-06|   460|               36|   1|         1|         1|
|    O007|  East|   Kolkata|      Rest-05|    Pizza|2024-02-01|   420|               38|   2|         2|         2|
|    O019|  East|   Kolkata|      Rest-05|    Pasta|2024-02-05|   360|               33|   3|         3|         3|
|    O009|  East|     Patna|      Rest-06|    Pasta|2024-02-03|   300|               28|   4|         4|         4|
|    O008|  East|   Kolkata|      Rest-05|   Burger|2024-02-02|   260|               26|   5|         5|         5|
|    O015|  East|     Patna|      Rest-06|   Burger|2024-02-04|   240|  

# Cumulative delivery time per restaurant

In [58]:
window_restaurant = Window.partitionBy("restaurant_id").orderBy("order_date") \
                          .rowsBetween(Window.unboundedPreceding, Window.currentRow)
df_orders.withColumn("cum_delivery_time", sum("delivery_time_min").over(window_restaurant)).show()

+--------+------+----------+-------------+---------+----------+------+-----------------+-----------------+
|order_id|region|      city|restaurant_id|food_item|order_date|amount|delivery_time_min|cum_delivery_time|
+--------+------+----------+-------------+---------+----------+------+-----------------+-----------------+
|    O001| North|     Delhi|      Rest-01|    Pizza|2024-02-01|   450|               35|               35|
|    O002| North|     Delhi|      Rest-01|   Burger|2024-02-01|   250|               25|               60|
|    O013| North|     Delhi|      Rest-01|    Pizza|2024-02-04|   480|               37|               97|
|    O021| North|     Delhi|      Rest-01|    Pasta|2024-02-06|   370|               30|              127|
|    O003| North|Chandigarh|      Rest-02|    Pasta|2024-02-02|   350|               30|               30|
|    O017| North|Chandigarh|      Rest-02|   Burger|2024-02-05|   260|               26|               56|
|    O004| South| Bangalore|      Res