In [1]:
import os

# Show the directory the kernel was started from, so that any relative paths
# used later in the notebook can be interpreted correctly.
current_dir = os.getcwd()
print(current_dir)



h:\pyspark_SQL_python_advanced-coding_interview


## Calculate the number of delayed orders for each delivery partner. An order is considered delayed if the actual delivery time exceeds the predicted delivery time.

# PySpark

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from datetime import datetime
from pyspark.sql.functions import *


# Start (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName("SwiggyOrders").getOrCreate()

# Column layout of the orders table; every column is nullable.
# `predicted_time` is compared against a duration in minutes further down.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("orderid", IntegerType()),
        ("custid", IntegerType()),
        ("city", StringType()),
        ("del_partner", StringType()),
        ("order_time", TimestampType()),
        ("deliver_time", TimestampType()),
        ("predicted_time", IntegerType()),
    ]
])


def _ts(hour, minute=0, day=18):
    """Return a December 2024 timestamp (seconds fixed at 0) for the sample rows."""
    return datetime(2024, 12, day, hour, minute, 0)


# Sample orders: (orderid, custid, city, del_partner, order_time, deliver_time, predicted_time)
data = [
    (1, 101, "Mumbai", "Partner A", _ts(10), _ts(11, 30), 60),
    (2, 102, "Delhi", "Partner A", _ts(9), _ts(10), 45),
    (3, 103, "Pune", "Partner A", _ts(15), _ts(15, 30), 30),
    (4, 104, "Mumbai", "Partner A", _ts(14), _ts(14, 50), 45),
    (5, 105, "Bangalore", "Partner B", _ts(8), _ts(8, 29), 30),
    # NOTE(review): order 6 is placed on Dec 13 but delivered on Dec 18 (about
    # five days later). The Spark SQL sample later in this notebook uses
    # 13:00 -> 14:00 on Dec 18 for the same order, so day and hour may have
    # been swapped here -- confirm before relying on Partner B's count.
    (6, 106, "Hyderabad", "Partner B", _ts(10, day=13), _ts(10, 40), 70),
    (7, 107, "Kolkata", "Partner B", _ts(10), _ts(10, 40), 45),
    (8, 109, "Chennai", "Partner C", _ts(7), _ts(7, 40), 30),
    (9, 110, "Delhi", "Partner C", _ts(12), _ts(13), 50),
    (10, 111, "Hyderabad", "Partner C", _ts(16), _ts(16, 45), 30),
]

# Materialise the rows as a DataFrame with the explicit schema and preview it.
df = spark.createDataFrame(data, schema)
df.show()


+-------+------+---------+-----------+-------------------+-------------------+--------------+
|orderid|custid|     city|del_partner|         order_time|       deliver_time|predicted_time|
+-------+------+---------+-----------+-------------------+-------------------+--------------+
|      1|   101|   Mumbai|  Partner A|2024-12-18 10:00:00|2024-12-18 11:30:00|            60|
|      2|   102|    Delhi|  Partner A|2024-12-18 09:00:00|2024-12-18 10:00:00|            45|
|      3|   103|     Pune|  Partner A|2024-12-18 15:00:00|2024-12-18 15:30:00|            30|
|      4|   104|   Mumbai|  Partner A|2024-12-18 14:00:00|2024-12-18 14:50:00|            45|
|      5|   105|Bangalore|  Partner B|2024-12-18 08:00:00|2024-12-18 08:29:00|            30|
|      6|   106|Hyderabad|  Partner B|2024-12-13 10:00:00|2024-12-18 10:40:00|            70|
|      7|   107|  Kolkata|  Partner B|2024-12-18 10:00:00|2024-12-18 10:40:00|            45|
|      8|   109|  Chennai|  Partner C|2024-12-18 07:00:00|20

In [12]:
# Add the actual delivery duration, in minutes, to every order.
from pyspark.sql.functions import unix_timestamp, col

# Both timestamps are converted to epoch seconds; their difference is the
# elapsed delivery time in seconds.
delivered_at_seconds = unix_timestamp(col("deliver_time"))
ordered_at_seconds = unix_timestamp(col("order_time"))
elapsed_seconds = delivered_at_seconds - ordered_at_seconds

# Dividing by 60 yields (possibly fractional) minutes.
df = df.withColumn("actual_delivery_time_minutes", elapsed_seconds / 60)

df.show()


+-------+------+---------+-----------+-------------------+-------------------+--------------+----------------------------+
|orderid|custid|     city|del_partner|         order_time|       deliver_time|predicted_time|actual_delivery_time_minutes|
+-------+------+---------+-----------+-------------------+-------------------+--------------+----------------------------+
|      1|   101|   Mumbai|  Partner A|2024-12-18 10:00:00|2024-12-18 11:30:00|            60|                        90.0|
|      2|   102|    Delhi|  Partner A|2024-12-18 09:00:00|2024-12-18 10:00:00|            45|                        60.0|
|      3|   103|     Pune|  Partner A|2024-12-18 15:00:00|2024-12-18 15:30:00|            30|                        30.0|
|      4|   104|   Mumbai|  Partner A|2024-12-18 14:00:00|2024-12-18 14:50:00|            45|                        50.0|
|      5|   105|Bangalore|  Partner B|2024-12-18 08:00:00|2024-12-18 08:29:00|            30|                        29.0|
|      6|   106|

In [13]:
# Flag each order: 1 when the actual duration is strictly greater than the
# predicted duration, 0 in every other case (including an exact match).
took_longer_than_predicted = col("actual_delivery_time_minutes") > col("predicted_time")
delay_flag = when(took_longer_than_predicted, 1).otherwise(0)

df = df.withColumn("is_delayed", delay_flag)
df.show()

+-------+------+---------+-----------+-------------------+-------------------+--------------+----------------------------+----------+
|orderid|custid|     city|del_partner|         order_time|       deliver_time|predicted_time|actual_delivery_time_minutes|is_delayed|
+-------+------+---------+-----------+-------------------+-------------------+--------------+----------------------------+----------+
|      1|   101|   Mumbai|  Partner A|2024-12-18 10:00:00|2024-12-18 11:30:00|            60|                        90.0|         1|
|      2|   102|    Delhi|  Partner A|2024-12-18 09:00:00|2024-12-18 10:00:00|            45|                        60.0|         1|
|      3|   103|     Pune|  Partner A|2024-12-18 15:00:00|2024-12-18 15:30:00|            30|                        30.0|         0|
|      4|   104|   Mumbai|  Partner A|2024-12-18 14:00:00|2024-12-18 14:50:00|            45|                        50.0|         1|
|      5|   105|Bangalore|  Partner B|2024-12-18 08:00:00|2024

In [14]:
# Count delayed orders per delivery partner.
#
# The Spark aggregate is referenced through an explicit module alias instead of
# a bare `sum`: a bare `sum` only resolves to the Spark function when a wildcard
# import of pyspark.sql.functions has shadowed Python's builtin `sum`, which is
# fragile and hard to read.
from pyspark.sql import functions as F

delayed_orders_per_partner = (
    df.groupBy("del_partner")
    # `is_delayed` is a 0/1 flag, so summing it counts the delayed orders.
    .agg(F.sum("is_delayed").alias("delayed_orders_count"))
    # groupBy gives no ordering guarantee; sort so the displayed table is
    # reproducible across runs.
    .orderBy("del_partner")
)

delayed_orders_per_partner.show()

+-----------+--------------------+
|del_partner|delayed_orders_count|
+-----------+--------------------+
|  Partner A|                   3|
|  Partner B|                   1|
|  Partner C|                   3|
+-----------+--------------------+



# Spark SQL 

In [18]:
# Statement that (re)creates the `swiggy_orders` temporary view from literal
# rows; the first SELECT supplies the column names for the whole UNION ALL.
#
# NOTE(review): this sample has 12 rows and is not the same data as the 10-row
# DataFrame built in the PySpark section (e.g. order 6 runs 13:00 -> 14:00 here),
# so the per-partner counts of the two sections are not expected to match --
# confirm that this difference is intended.
create_view_sql = """
CREATE OR REPLACE TEMP VIEW swiggy_orders AS
SELECT 1 AS orderid, 101 AS custid, 'Mumbai' AS city, 'Partner A' AS del_partner, CAST('2024-12-18 10:00:00' AS TIMESTAMP) AS order_time, CAST('2024-12-18 11:30:00' AS TIMESTAMP) AS deliver_time, 60 AS predicted_time UNION ALL
SELECT 2, 102, 'Pune', 'Partner A', CAST('2024-12-18 09:00:00' AS TIMESTAMP), CAST('2024-12-18 10:00:00' AS TIMESTAMP), 45 UNION ALL
SELECT 3, 103, 'Delhi', 'Partner A', CAST('2024-12-18 15:00:00' AS TIMESTAMP), CAST('2024-12-18 15:30:00' AS TIMESTAMP), 30 UNION ALL
SELECT 4, 104, 'Mumbai', 'Partner A', CAST('2024-12-18 14:00:00' AS TIMESTAMP), CAST('2024-12-18 14:50:00' AS TIMESTAMP), 45 UNION ALL
SELECT 5, 105, 'Bangalore', 'Partner B', CAST('2024-12-18 08:00:00' AS TIMESTAMP), CAST('2024-12-18 08:29:00' AS TIMESTAMP), 30 UNION ALL
SELECT 6, 106, 'Hyderabad', 'Partner B', CAST('2024-12-18 13:00:00' AS TIMESTAMP), CAST('2024-12-18 14:00:00' AS TIMESTAMP), 70 UNION ALL
SELECT 7, 107, 'Kolkata', 'Partner B', CAST('2024-12-18 10:00:00' AS TIMESTAMP), CAST('2024-12-18 10:40:00' AS TIMESTAMP), 45 UNION ALL
SELECT 8, 108, 'Delhi', 'Partner B', CAST('2024-12-18 18:00:00' AS TIMESTAMP), CAST('2024-12-18 18:30:00' AS TIMESTAMP), 40 UNION ALL
SELECT 9, 109, 'Chennai', 'Partner C', CAST('2024-12-18 07:00:00' AS TIMESTAMP), CAST('2024-12-18 07:40:00' AS TIMESTAMP), 30 UNION ALL
SELECT 10, 110, 'Mumbai', 'Partner C', CAST('2024-12-18 12:00:00' AS TIMESTAMP), CAST('2024-12-18 13:00:00' AS TIMESTAMP), 50 UNION ALL
SELECT 11, 111, 'Delhi', 'Partner C', CAST('2024-12-18 09:00:00' AS TIMESTAMP), CAST('2024-12-18 09:35:00' AS TIMESTAMP), 30 UNION ALL
SELECT 12, 112, 'Hyderabad', 'Partner C', CAST('2024-12-18 16:00:00' AS TIMESTAMP), CAST('2024-12-18 16:45:00' AS TIMESTAMP), 30
"""
spark.sql(create_view_sql)

# Read the view back to confirm it was registered with the expected rows.
query = spark.sql("SELECT * FROM swiggy_orders")
query.show()


+-------+------+---------+-----------+-------------------+-------------------+--------------+
|orderid|custid|     city|del_partner|         order_time|       deliver_time|predicted_time|
+-------+------+---------+-----------+-------------------+-------------------+--------------+
|      1|   101|   Mumbai|  Partner A|2024-12-18 10:00:00|2024-12-18 11:30:00|            60|
|      2|   102|     Pune|  Partner A|2024-12-18 09:00:00|2024-12-18 10:00:00|            45|
|      3|   103|    Delhi|  Partner A|2024-12-18 15:00:00|2024-12-18 15:30:00|            30|
|      4|   104|   Mumbai|  Partner A|2024-12-18 14:00:00|2024-12-18 14:50:00|            45|
|      5|   105|Bangalore|  Partner B|2024-12-18 08:00:00|2024-12-18 08:29:00|            30|
|      6|   106|Hyderabad|  Partner B|2024-12-18 13:00:00|2024-12-18 14:00:00|            70|
|      7|   107|  Kolkata|  Partner B|2024-12-18 10:00:00|2024-12-18 10:40:00|            45|
|      8|   108|    Delhi|  Partner B|2024-12-18 18:00:00|20

In [20]:
# Delayed orders per partner, computed with Spark SQL on `swiggy_orders`.
#
# The query is a pipeline of two CTEs, each building on the previous one:
#   1. CalculatedDeliveryTime - adds the actual delivery duration in minutes.
#   2. DelayedOrders          - flags orders whose duration exceeds the prediction.
# Previously the first CTE was defined but never referenced, and the second one
# repeated the same duration expression against the base view.
res = spark.sql("""
-- Calculate actual delivery time in minutes
WITH CalculatedDeliveryTime AS (
    SELECT *,
           (unix_timestamp(deliver_time) - unix_timestamp(order_time)) / 60 AS actual_delivery_time_minutes
    FROM swiggy_orders
),

-- Determine if the order is delayed (strictly longer than predicted)
DelayedOrders AS (
    SELECT *,
           CASE WHEN actual_delivery_time_minutes > predicted_time THEN 1 ELSE 0 END AS is_delayed
    FROM CalculatedDeliveryTime
)

-- Calculate delayed orders per partner; sorted so the output is reproducible
SELECT del_partner, SUM(is_delayed) AS delayed_orders_count
FROM DelayedOrders
GROUP BY del_partner
ORDER BY del_partner
""")

res.show()


+-----------+--------------------+
|del_partner|delayed_orders_count|
+-----------+--------------------+
|  Partner A|                   3|
|  Partner B|                   0|
|  Partner C|                   4|
+-----------+--------------------+

