# Dataset 1

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import col, trim, when
from datetime import datetime


In [2]:
spark = SparkSession.builder.appName("DeliveryDataClean").getOrCreate()

In [3]:

delivery_data = [
    ("DLV001","Delhi ","D001","Delivered","120","2024-01-05 10:30"),
    ("DLV002","Mumbai","D002","Delivered","90","05/01/2024 11:00"),
    ("DLV003","Bangalore","D003","In Transit","200","2024/01/06 09:45"),
    ("DLV004","Delhi","D004","Cancelled","","2024-01-07 14:00"),
    ("DLV005","Chennai","D002","Delivered","invalid","2024-01-08 16:20"),
    ("DLV006","Mumbai","D005","Delivered",None,"2024-01-08 18:10"),
    ("DLV007","Delhi","D001","Delivered","140","09-01-2024 12:30"),
    ("DLV008","Bangalore","D003","Delivered","160","2024-01-09 15:45"),
    ("DLV009","Mumbai","D004","Delivered","110","2024-01-10 13:20"),
    ("DLV009","Mumbai","D004","Delivered","110","2024-01-10 13:20")
]


In [4]:

def parse_datetime(dt_str):
    formats = ["%Y-%m-%d %H:%M", "%d/%m/%Y %H:%M", "%Y/%m/%d %H:%M", "%d-%m-%Y %H:%M"]
    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except:
            continue
    return None


In [5]:

cleaned_data = []
for row in delivery_data:
    delivery_id, city, driver_id, status, time_minutes, ts = row
    city = city.strip()

    try:
        time_minutes = int(time_minutes)
    except:
        time_minutes = None

    ts = parse_datetime(ts)

    cleaned_data.append((delivery_id, city, driver_id, status, time_minutes, ts))


In [6]:

schema = StructType([
    StructField("delivery_id", StringType(), False),
    StructField("city", StringType(), True),
    StructField("driver_id", StringType(), True),
    StructField("status", StringType(), True),
    StructField("delivery_time_minutes", IntegerType(), True),
    StructField("delivery_timestamp", TimestampType(), True)
])


In [7]:

df = spark.createDataFrame(cleaned_data, schema)

df = df.dropDuplicates()

df.show(truncate=False)

+-----------+---------+---------+----------+---------------------+-------------------+
|delivery_id|city     |driver_id|status    |delivery_time_minutes|delivery_timestamp |
+-----------+---------+---------+----------+---------------------+-------------------+
|DLV003     |Bangalore|D003     |In Transit|200                  |2024-01-06 09:45:00|
|DLV002     |Mumbai   |D002     |Delivered |90                   |2024-01-05 11:00:00|
|DLV001     |Delhi    |D001     |Delivered |120                  |2024-01-05 10:30:00|
|DLV007     |Delhi    |D001     |Delivered |140                  |2024-01-09 12:30:00|
|DLV008     |Bangalore|D003     |Delivered |160                  |2024-01-09 15:45:00|
|DLV009     |Mumbai   |D004     |Delivered |110                  |2024-01-10 13:20:00|
|DLV005     |Chennai  |D002     |Delivered |NULL                 |2024-01-08 16:20:00|
|DLV004     |Delhi    |D004     |Cancelled |NULL                 |2024-01-07 14:00:00|
|DLV006     |Mumbai   |D005     |Delivered 

In [8]:
delivery_data=df

In [23]:
df.show()

+-----------+---------+---------+----------+---------------------+-------------------+
|delivery_id|     city|driver_id|    status|delivery_time_minutes| delivery_timestamp|
+-----------+---------+---------+----------+---------------------+-------------------+
|     DLV003|Bangalore|     D003|In Transit|                  200|2024-01-06 09:45:00|
|     DLV002|   Mumbai|     D002| Delivered|                   90|2024-01-05 11:00:00|
|     DLV001|    Delhi|     D001| Delivered|                  120|2024-01-05 10:30:00|
|     DLV007|    Delhi|     D001| Delivered|                  140|2024-01-09 12:30:00|
|     DLV008|Bangalore|     D003| Delivered|                  160|2024-01-09 15:45:00|
|     DLV009|   Mumbai|     D004| Delivered|                  110|2024-01-10 13:20:00|
|     DLV005|  Chennai|     D002| Delivered|                 NULL|2024-01-08 16:20:00|
|     DLV004|    Delhi|     D004| Cancelled|                 NULL|2024-01-07 14:00:00|
|     DLV006|   Mumbai|     D005| Delivered

# Dataset 2

In [9]:
driver_data = [
    ("D001","Ravi","Senior"),
    ("D002","Amit","Junior"),
    ("D003","Sneha","Senior"),
    ("D004","Karan","Junior"),
    ("D005","Neha","Senior")
]


In [10]:
schema = StructType([
    StructField("driver_id", StringType(), False),
    StructField("driver_name", StringType(), True),
    StructField("driver_level", StringType(), True)
])


In [11]:
df_driver = spark.createDataFrame(driver_data, schema)


In [13]:
from pyspark.sql.functions import trim, initcap

df_driver = (
    df_driver
    .withColumn("driver_id", trim(col("driver_id")))
    .withColumn("driver_name", trim(col("driver_name")))
    .withColumn("driver_level", initcap(trim(col("driver_level"))))  # ensures "Senior"/"Junior"
)


In [14]:
df_driver.show(truncate=False)


+---------+-----------+------------+
|driver_id|driver_name|driver_level|
+---------+-----------+------------+
|D001     |Ravi       |Senior      |
|D002     |Amit       |Junior      |
|D003     |Sneha      |Senior      |
|D004     |Karan      |Junior      |
|D005     |Neha       |Senior      |
+---------+-----------+------------+



# Dataset 3

In [15]:
city_zone_data = [
    ("Delhi","North"),
    ("Mumbai","West"),
    ("Bangalore","South"),
    ("Chennai","South")
]

In [16]:
schema = StructType([
    StructField("city", StringType(), False),
    StructField("zone", StringType(), True)
])


In [17]:
df_city_zone = spark.createDataFrame(city_zone_data, schema)


In [18]:
df_city_zone = (
    df_city_zone
    .withColumn("city", trim(col("city")))
    .withColumn("zone", initcap(trim(col("zone")))
)


In [19]:
df_city_zone.show(truncate=False)

+---------+-----+
|city     |zone |
+---------+-----+
|Delhi    |North|
|Mumbai   |West |
|Bangalore|South|
|Chennai  |South|
+---------+-----+



# Keep only delivered orders

In [24]:
df_delivered = df.filter(col("status") == "Delivered")

In [25]:
df_delivered.show()

+-----------+---------+---------+---------+---------------------+-------------------+
|delivery_id|     city|driver_id|   status|delivery_time_minutes| delivery_timestamp|
+-----------+---------+---------+---------+---------------------+-------------------+
|     DLV002|   Mumbai|     D002|Delivered|                   90|2024-01-05 11:00:00|
|     DLV001|    Delhi|     D001|Delivered|                  120|2024-01-05 10:30:00|
|     DLV007|    Delhi|     D001|Delivered|                  140|2024-01-09 12:30:00|
|     DLV008|Bangalore|     D003|Delivered|                  160|2024-01-09 15:45:00|
|     DLV009|   Mumbai|     D004|Delivered|                  110|2024-01-10 13:20:00|
|     DLV005|  Chennai|     D002|Delivered|                 NULL|2024-01-08 16:20:00|
|     DLV006|   Mumbai|     D005|Delivered|                 NULL|2024-01-08 18:10:00|
+-----------+---------+---------+---------+---------------------+-------------------+



# Remove cancelled and in-transit deliveries

In [26]:

before_count = df.count()

df_delivered = df.filter(col("status") == "Delivered")

after_count = df_delivered.count()

print("Record count before filtering:", before_count)
print("Record count after filtering:", after_count)


df_delivered.show(truncate=False)

Record count before filtering: 9
Record count after filtering: 7
+-----------+---------+---------+---------+---------------------+-------------------+
|delivery_id|city     |driver_id|status   |delivery_time_minutes|delivery_timestamp |
+-----------+---------+---------+---------+---------------------+-------------------+
|DLV002     |Mumbai   |D002     |Delivered|90                   |2024-01-05 11:00:00|
|DLV001     |Delhi    |D001     |Delivered|120                  |2024-01-05 10:30:00|
|DLV007     |Delhi    |D001     |Delivered|140                  |2024-01-09 12:30:00|
|DLV008     |Bangalore|D003     |Delivered|160                  |2024-01-09 15:45:00|
|DLV009     |Mumbai   |D004     |Delivered|110                  |2024-01-10 13:20:00|
|DLV005     |Chennai  |D002     |Delivered|NULL                 |2024-01-08 16:20:00|
|DLV006     |Mumbai   |D005     |Delivered|NULL                 |2024-01-08 18:10:00|
+-----------+---------+---------+---------+---------------------+----------

# Join delivery data with driver master

In [28]:
from pyspark.sql.functions import broadcast


df_enriched = df_delivered.join(
    broadcast(df_driver),
    on="driver_id",
    how="left"
)


# Join enriched data with city zone lookup

In [29]:
df_final = df_enriched.join(
    broadcast(df_city_zone),
    on="city",
    how="left"
)


# What is happening?

In [30]:
df_final.explain(True)


== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Project [driver_id#2, delivery_id#0, city#1, status#3, delivery_time_minutes#4, delivery_timestamp#5, driver_name#31, driver_level#32]
:  +- Join LeftOuter, (driver_id#2 = driver_id#30)
:     :- Filter (status#3 = Delivered)
:     :  +- Deduplicate [city#1, driver_id#2, delivery_time_minutes#4, delivery_id#0, status#3, delivery_timestamp#5]
:     :     +- LogicalRDD [delivery_id#0, city#1, driver_id#2, status#3, delivery_time_minutes#4, delivery_timestamp#5], false
:     +- ResolvedHint (strategy=broadcast)
:        +- Project [driver_id#30, driver_name#31, initcap(trim(driver_level#27, None)) AS driver_level#32]
:           +- Project [driver_id#30, trim(driver_name#26, None) AS driver_name#31, driver_level#27]
:              +- Project [trim(driver_id#25, None) AS driver_id#30, driver_name#26, driver_level#27]
:                 +- LogicalRDD [driver_id#25, driver_name#26, driver_level#27], false
+- ResolvedHint (strategy

# Average delivery time per city

In [32]:
from pyspark.sql import functions as F

avg_city = df_final.groupBy("city").agg(
    F.avg("delivery_time_minutes").alias("avg_delivery_time")
)
avg_city.show()


+---------+-----------------+
|     city|avg_delivery_time|
+---------+-----------------+
|Bangalore|            160.0|
|  Chennai|             NULL|
|   Mumbai|            100.0|
|    Delhi|            130.0|
+---------+-----------------+



# Average delivery time per driver

In [33]:
avg_driver = df_final.groupBy("driver_id", "driver_name").agg(
    F.avg("delivery_time_minutes").alias("avg_delivery_time")
)
avg_driver.show()

+---------+-----------+-----------------+
|driver_id|driver_name|avg_delivery_time|
+---------+-----------+-----------------+
|     D003|      Sneha|            160.0|
|     D004|      Karan|            110.0|
|     D005|       Neha|             NULL|
|     D002|       Amit|             90.0|
|     D001|       Ravi|            130.0|
+---------+-----------+-----------------+



# Rank drivers by performance within each city

In [36]:
from pyspark.sql.window import Window

window_city = Window.partitionBy("city").orderBy(F.avg("delivery_time_minutes").over(Window.partitionBy("city","driver_id")))
ranked_drivers = avg_driver.join(df_final.select("driver_id","city").distinct(), "driver_id") \
    .withColumn("rank_in_city", F.rank().over(Window.partitionBy("city").orderBy("avg_delivery_time")))
ranked_drivers.show()


+---------+-----------+-----------------+---------+------------+
|driver_id|driver_name|avg_delivery_time|     city|rank_in_city|
+---------+-----------+-----------------+---------+------------+
|     D003|      Sneha|            160.0|Bangalore|           1|
|     D002|       Amit|             90.0|  Chennai|           1|
|     D001|       Ravi|            130.0|    Delhi|           1|
|     D005|       Neha|             NULL|   Mumbai|           1|
|     D002|       Amit|             90.0|   Mumbai|           2|
|     D004|      Karan|            110.0|   Mumbai|           3|
+---------+-----------+-----------------+---------+------------+



# Fastest driver per zone

In [37]:
window_zone = Window.partitionBy("zone").orderBy("avg_delivery_time")
fastest_driver_zone = avg_driver.join(df_final.select("driver_id","zone").distinct(), "driver_id") \
    .withColumn("rank_in_zone", F.rank().over(window_zone)) \
    .filter(col("rank_in_zone") == 1)
fastest_driver_zone.show()


+---------+-----------+-----------------+-----+------------+
|driver_id|driver_name|avg_delivery_time| zone|rank_in_zone|
+---------+-----------+-----------------+-----+------------+
|     D001|       Ravi|            130.0|North|           1|
|     D002|       Amit|             90.0|South|           1|
|     D005|       Neha|             NULL| West|           1|
+---------+-----------+-----------------+-----+------------+



# Identify top 2 drivers per city

In [38]:

top2_drivers_city = ranked_drivers.filter(col("rank_in_city") <= 2)
top2_drivers_city.show()


+---------+-----------+-----------------+---------+------------+
|driver_id|driver_name|avg_delivery_time|     city|rank_in_city|
+---------+-----------+-----------------+---------+------------+
|     D003|      Sneha|            160.0|Bangalore|           1|
|     D002|       Amit|             90.0|  Chennai|           1|
|     D001|       Ravi|            130.0|    Delhi|           1|
|     D005|       Neha|             NULL|   Mumbai|           1|
|     D002|       Amit|             90.0|   Mumbai|           2|
+---------+-----------+-----------------+---------+------------+



# Caching

In [39]:
from pyspark.storagelevel import StorageLevel

df_final.cache()
df_final.count()
avg_city.cache().count()
avg_driver.cache().count()

5

# Compare Execution Plans

In [40]:

df_final.explain(True)

df_final.cache().count()
df_final.explain(True)


avg_driver.cache().count()
avg_driver.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Project [driver_id#2, delivery_id#0, city#1, status#3, delivery_time_minutes#4, delivery_timestamp#5, driver_name#31, driver_level#32]
:  +- Join LeftOuter, (driver_id#2 = driver_id#30)
:     :- Filter (status#3 = Delivered)
:     :  +- Deduplicate [city#1, driver_id#2, delivery_time_minutes#4, delivery_id#0, status#3, delivery_timestamp#5]
:     :     +- LogicalRDD [delivery_id#0, city#1, driver_id#2, status#3, delivery_time_minutes#4, delivery_timestamp#5], false
:     +- ResolvedHint (strategy=broadcast)
:        +- Project [driver_id#30, driver_name#31, initcap(trim(driver_level#27, None)) AS driver_level#32]
:           +- Project [driver_id#30, trim(driver_name#26, None) AS driver_name#31, driver_level#27]
:              +- Project [trim(driver_id#25, None) AS driver_id#30, driver_name#26, driver_level#27]
:                 +- LogicalRDD [driver_id#25, driver_name#26, driver_level#27], false
+- ResolvedHint (strategy

# Repartition by city to reduce shuffles

In [41]:

df_city_partitioned = df_final.repartition("city")

df_city_partitioned.write.mode("overwrite").parquet("/path/analytics/by_city/")

# File formats

In [42]:
df_delivered.write.mode("overwrite").parquet("/path/clean/deliveries_parquet/")

avg_city.write.mode("overwrite").orc("/path/analytics/avg_city_orc/")
avg_driver.write.mode("overwrite").orc("/path/analytics/avg_driver_orc/")