![](https://media.licdn.com/dms/image/v2/D5622AQEQJ3u_ECwEzA/feedshare-shrink_2048_1536/B56Zuz4Cy3HcAk-/0/1768249372070?e=1770249600&v=beta&t=mas2waJIQx2xRTIxJUlK_55097wUCjR1yrL30W_WMZ0)

In [0]:
%python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.functions import to_timestamp

# Sample vehicle crossings: one dict per transaction. crossing_time starts out as a
# "yyyy-MM-dd HH:mm" string and is converted to a timestamp once the frame exists.
data_dict = [
    {
        "txn_id": txn_id,
        "vehicle_no": vehicle_no,
        "vehicle_type": vehicle_type,
        "crossing_time": crossing_time,
    }
    for txn_id, vehicle_no, vehicle_type, crossing_time in [
        (1, "DL01AA1111", "car", "2026-01-10 08:00"),
        (2, "DL01AA1111", "car", "2026-01-10 11:00"),
        (3, "DL02BB2222", "truck", "2026-01-10 09:00"),
        (4, "DL02BB2222", "truck", "2026-01-10 16:00"),
        (5, "DL03CC3333", "bus", "2026-01-10 10:00"),
        (6, "DL03CC3333", "bus", "2026-01-10 12:00"),
        (7, "DL04DD4444", "motorcycle", "2026-01-10 11:00"),
        (8, "DL05EE5555", "car", "2026-01-10 14:00"),
    ]
]

# Every column is declared non-nullable; crossing_time is loaded as a plain string.
schema1 = StructType(
    [
        StructField(field_name, field_type, False)
        for field_name, field_type in (
            ("txn_id", IntegerType()),
            ("vehicle_no", StringType()),
            ("vehicle_type", StringType()),
            ("crossing_time", StringType()),
        )
    ]
)

# df1 holds the raw rows; df2 is the same frame with crossing_time cast to a timestamp.
df1 = spark.createDataFrame(data_dict, schema=schema1)
df2 = df1.withColumn("crossing_time", to_timestamp("crossing_time"))


In [0]:
%python
from pyspark.sql.window import Window
from pyspark.sql.functions import lag,timestamp_diff,when, isnull,col,sum
# For each vehicle, walk its crossings in chronological order and look one row back.
vehicle_window = Window.partitionBy("vehicle_no").orderBy("crossing_time")
previous_crossing = lag("crossing_time", 1).over(vehicle_window)

# prev_crossing_time is NULL for a vehicle's first crossing, so time_dff (the gap in
# minutes between the previous and the current crossing) is NULL there as well.
df3 = (
    df2
    .withColumn("prev_crossing_time", previous_crossing)
    .withColumn(
        "time_dff",
        timestamp_diff("minute", "prev_crossing_time", "crossing_time"),
    )
)

# Amount per crossing, by vehicle type:
#   - gap of at most 240 minutes since the previous crossing -> lower amount
#   - gap above 240 minutes, or no previous crossing (NULL)   -> higher amount
# Only car, truck and bus are listed and there is no ELSE branch, so any other
# vehicle_type (e.g. the motorcycle row in the sample data) gets a NULL amount.
amount_case_sql = """
    CASE
        WHEN vehicle_type = 'car'
             AND time_dff <= 240 THEN 20
        WHEN vehicle_type = 'car'
             AND (time_dff > 240 OR time_dff IS NULL) THEN 40

        WHEN vehicle_type = 'truck'
             AND time_dff <= 240 THEN 40
        WHEN vehicle_type = 'truck'
             AND (time_dff > 240 OR time_dff IS NULL) THEN 80

        WHEN vehicle_type = 'bus'
             AND time_dff <= 240 THEN 30
        WHEN vehicle_type = 'bus'
             AND (time_dff > 240 OR time_dff IS NULL) THEN 70
    END AS amount
    """
df4 = df3.selectExpr("*", amount_case_sql)

# Grand total first (the sum ignores NULL amounts), then the per-crossing detail.
total_amount = df4.groupBy().sum("amount")
display(total_amount)

display(df4)


Write an SQL query to generate a call summary with the following rules:
1. Calls between two persons are bidirectional:
   (10 → 20) and (20 → 10) should be treated as the same pair.
2. Always display the smaller person ID as Person1 and the larger person ID as Person2.
3. For each unique person pair, calculate:
   - `call_count` → total number of calls between the two persons
   - `total_duration` → sum of all call durations between the two persons

![image_1770741749647.png](./image_1770741749647.png "image_1770741749647.png")

In [0]:
%sql
-- Call log used by the call-summary query: one row per call, with the caller
-- (from_id), the callee (to_id) and the call duration.
-- CREATE OR REPLACE (instead of CREATE ... IF NOT EXISTS) makes this cell safe to
-- re-run: the table is rebuilt from scratch each time, so the INSERT below never
-- appends a second copy of the sample rows and inflates the counts/sums.
CREATE OR REPLACE TABLE input_table (
    from_id   INT,
    to_id     INT,
    duration  INT
)
USING DELTA;

INSERT INTO input_table VALUES
(10, 20, 58),
(20, 10, 12),
(10, 30, 20),
(30, 40, 200),
(30, 40, 300),
(40, 30, 500);


In [0]:
%sql
-- Call summary per unordered person pair.
-- Step 1: normalise each call so the smaller ID is always Person1 and the larger
--         ID is always Person2; (10 -> 20) and (20 -> 10) then land in the same group.
--         LEAST/GREATEST skip NULL arguments; the IDs are expected to be non-NULL.
-- Step 2: aggregate per pair, returning both the number of calls and the summed
--         duration, with the output columns named as the task requires.
WITH normalized_calls AS (
    SELECT
        LEAST(from_id, to_id)    AS Person1,
        GREATEST(from_id, to_id) AS Person2,
        duration
    FROM input_table
)
SELECT
    Person1,
    Person2,
    COUNT(*)      AS call_count,
    SUM(duration) AS total_duration
FROM normalized_calls
GROUP BY Person1, Person2
-- Explicit ordering keeps the displayed result stable between runs.
ORDER BY Person1, Person2;

![image_1770742413574.png](./image_1770742413574.png "image_1770742413574.png")

In [0]:
%sql
-- Order-level input for the revenue ranking query: one row per order, carrying the
-- customer, the platform, the product stack and the order revenue.
-- CREATE OR REPLACE (instead of CREATE ... IF NOT EXISTS) makes this cell safe to
-- re-run: the table is rebuilt each time, so the INSERT below cannot append a
-- duplicate copy of the sample rows and double the per-customer revenue.
CREATE OR REPLACE TABLE orders_input (
    order_id        INT,
    customer_id     STRING,
    customer_name   STRING,
    platform        STRING,
    product_stack   STRING,
    revenue         INT
)
USING DELTA;

INSERT INTO orders_input VALUES
(101, 'C001', 'Alice',   'AWS',   'Data Engineering', 12000),
(102, 'C002', 'Bob',     'Azure', 'SQL Analytics',    18000),
(103, 'C001', 'Alice',   'AWS',   'Python ETL',        8000),
(104, 'C003', 'Charlie', 'GCP',   'BigQuery',         25000),
(105, 'C002', 'Bob',     'Azure', 'Power BI',          7000),
(106, 'C004', 'Diana',   'AWS',   'Snowflake',        30000),
(107, 'C003', 'Charlie', 'GCP',   'Data Pipelines',    5000);



In [0]:
%sql
-- Sum revenue per customer, then rank customers by that total, highest first.
-- RANK() gives tied totals the same rank and skips the following rank(s).
WITH customer_totals AS (
    SELECT
        customer_id,
        customer_name,
        SUM(revenue) AS total_revenue
    FROM orders_input
    GROUP BY customer_id, customer_name
)
SELECT
    customer_id,
    customer_name,
    total_revenue,
    RANK() OVER (ORDER BY total_revenue DESC) AS revenue_rank
FROM customer_totals;