In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Smart City Traffic Analytics") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

#PHASE 1

In [4]:
# 1. Read traffic_data.csv as StringType
schema = StructType([
    StructField("sensor_id", StringType(), True),
    StructField("location", StringType(), True),
    StructField("road_name", StringType(), True),
    StructField("vehicle_count", StringType(), True),
    StructField("avg_speed", StringType(), True),
    StructField("temperature", StringType(), True),
    StructField("timestamp", StringType(), True),
    StructField("status", StringType(), True)
])

df_raw = spark.read.csv("traffic_data_large.csv", header=True, schema=schema)

In [5]:
# 2. Print schema and count records
print("\nSchema:")
df_raw.printSchema()
print(f"\nTotal Records: {df_raw.count()}")


Schema:
root
 |-- sensor_id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- road_name: string (nullable = true)
 |-- vehicle_count: string (nullable = true)
 |-- avg_speed: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- status: string (nullable = true)


Total Records: 500000


In [6]:
# 3. Identify data quality issues
print("\nData Quality Issues Identified:")
print("- vehicle_count has 'invalid' values")
print("- avg_speed has missing/empty values")
print("- timestamp in different formats (yyyy-MM-dd, dd/MM/yyyy, yyyy/MM/dd)")
print("- INACTIVE sensors present")
print("- temperature column is irrelevant")

df_raw.show(10, truncate=False)


Data Quality Issues Identified:
- vehicle_count has 'invalid' values
- avg_speed has missing/empty values
- timestamp in different formats (yyyy-MM-dd, dd/MM/yyyy, yyyy/MM/dd)
- INACTIVE sensors present
- temperature column is irrelevant
+---------+---------+---------------+-------------+---------+-----------+-------------------+--------+
|sensor_id|location |road_name      |vehicle_count|avg_speed|temperature|timestamp          |status  |
+---------+---------+---------------+-------------+---------+-----------+-------------------+--------+
|S105     |Chennai  |OMR            |invalid      |NULL     |39         |12/01/2026 06:00:00|INACTIVE|
|S113     |Chennai  |Mount Road     |103          |73.5     |36         |2026-01-12 06:00:05|ACTIVE  |
|S228     |Delhi    |Janpath        |16           |20.0     |35         |2026-01-12 06:00:10|ACTIVE  |
|S160     |Bangalore|MG Road        |27           |27.1     |32         |2026-01-12 06:00:15|ACTIVE  |
|S252     |Mumbai   |Western Express|115

#PHASE 2

In [7]:
# 1. Trim all string columns
df_trimmed = df_raw.select([trim(col(c)).alias(c) for c in df_raw.columns])

In [8]:
# 2. Clean vehicle_count
df_cleaned = df_trimmed.withColumn(
    "vehicle_count_clean",
    when(
        (col("vehicle_count").isNull()) |
        (col("vehicle_count") == "") |
        (col("vehicle_count") == "invalid"),
        lit(None)
    ).otherwise(col("vehicle_count").cast(IntegerType()))
)

In [9]:
# 3. Clean avg_speed
df_cleaned = df_cleaned.withColumn(
    "avg_speed_clean",
    when(
        (col("avg_speed").isNull()) |
        (col("avg_speed") == ""),
        lit(None)
    ).otherwise(col("avg_speed").cast(DoubleType()))
)

In [13]:
# 4. Parse timestamp - supporting multiple formats
df_cleaned = df_cleaned.withColumn(
    "event_time",
    coalesce(
        try_to_timestamp(col("timestamp"), lit("yyyy-MM-dd HH:mm:ss")),
        try_to_timestamp(col("timestamp"), lit("dd/MM/yyyy HH:mm:ss")),
        try_to_timestamp(col("timestamp"), lit("yyyy/MM/dd HH:mm:ss"))
    )
)

In [14]:
# 5. Keep original timestamp for audit (already have it)
print("\nCleaned Data Sample:")
df_cleaned.select("sensor_id", "location", "road_name", "vehicle_count_clean",
                  "avg_speed_clean", "event_time", "status").show(10, truncate=False)


Cleaned Data Sample:
+---------+---------+---------------+-------------------+---------------+-------------------+--------+
|sensor_id|location |road_name      |vehicle_count_clean|avg_speed_clean|event_time         |status  |
+---------+---------+---------------+-------------------+---------------+-------------------+--------+
|S105     |Chennai  |OMR            |NULL               |NULL           |2026-01-12 06:00:00|INACTIVE|
|S113     |Chennai  |Mount Road     |103                |73.5           |2026-01-12 06:00:05|ACTIVE  |
|S228     |Delhi    |Janpath        |16                 |20.0           |2026-01-12 06:00:10|ACTIVE  |
|S160     |Bangalore|MG Road        |27                 |27.1           |2026-01-12 06:00:15|ACTIVE  |
|S252     |Mumbai   |Western Express|115                |59.3           |2026-01-12 06:00:20|ACTIVE  |
|S134     |Kolkata  |EM Bypass      |13                 |23.6           |2026-01-12 06:00:25|ACTIVE  |
|S246     |Delhi    |Janpath        |81            

#PHASE 3

In [15]:
# 1. Count invalid vehicle_count rows
invalid_vehicle_count = df_cleaned.filter(col("vehicle_count_clean").isNull()).count()
print(f"\nInvalid vehicle_count rows: {invalid_vehicle_count}")


Invalid vehicle_count rows: 49873


In [16]:
# 2. Count invalid timestamp rows
invalid_timestamp = df_cleaned.filter(col("event_time").isNull()).count()
print(f"Invalid timestamp rows: {invalid_timestamp}")

Invalid timestamp rows: 4853


In [17]:
# 3. Remove rows where status != "ACTIVE"
df_valid = df_cleaned.filter(col("status") == "ACTIVE")

In [18]:
# 4. Validate row counts
print(f"\nRows before filtering: {df_cleaned.count()}")
print(f"Rows after filtering (ACTIVE only): {df_valid.count()}")
print(f"Rows removed (INACTIVE): {df_cleaned.count() - df_valid.count()}")

# Rename cleaned columns
df_final = df_valid.select(
    "sensor_id",
    "location",
    "road_name",
    col("vehicle_count_clean").alias("vehicle_count"),
    col("avg_speed_clean").alias("avg_speed"),
    "temperature",
    col("timestamp").alias("original_timestamp"),
    "event_time",
    "status"
)


Rows before filtering: 500000
Rows after filtering (ACTIVE only): 475000
Rows removed (INACTIVE): 25000


#PHASE 4

In [19]:
# 1. Average speed per location
print("\n1. Average Speed per Location:")
avg_speed_location = df_final.groupBy("location") \
    .agg(round(avg("avg_speed"), 2).alias("avg_speed")) \
    .orderBy("location")
avg_speed_location.show()


1. Average Speed per Location:
+---------+---------+
| location|avg_speed|
+---------+---------+
|Bangalore|    47.46|
|  Chennai|     47.6|
|    Delhi|    47.62|
|Hyderabad|    47.54|
|  Kolkata|    47.43|
|   Mumbai|    47.47|
|     Pune|    47.42|
+---------+---------+



In [20]:
# 2. Total vehicle count per road
print("\n2. Total Vehicle Count per Road:")
total_vehicles_road = df_final.groupBy("road_name") \
    .agg(sum("vehicle_count").alias("total_vehicles")) \
    .orderBy(desc("total_vehicles"))
total_vehicles_road.show()


2. Total Vehicle Count per Road:
+---------------+--------------+
|      road_name|total_vehicles|
+---------------+--------------+
|  Outer Ring Rd|       1339365|
| Hitech City Rd|       1338486|
|           NH48|       1335420|
|      Howrah Rd|       1334512|
|Western Express|       1334351|
|       GST Road|       1333073|
|      EM Bypass|       1331117|
|     Mount Road|       1329511|
|  Gachibowli Rd|       1328605|
|      Ring Road|       1327408|
|Eastern Express|       1325865|
|    Madhapur Rd|       1324233|
|        FC Road|       1322292|
|  University Rd|       1322004|
|  Whitefield Rd|       1320360|
|            OMR|       1317171|
|      Link Road|       1316848|
|    Park Street|       1310784|
|        Janpath|       1303498|
|        MG Road|       1303485|
+---------------+--------------+
only showing top 20 rows


In [21]:
# 3. Peak traffic time per location
print("\n3. Peak Traffic Time per Location:")
peak_traffic = df_final.groupBy("location", "event_time") \
    .agg(sum("vehicle_count").alias("total_vehicles")) \
    .withColumn("rank", row_number().over(Window.partitionBy("location").orderBy(desc("total_vehicles")))) \
    .filter(col("rank") == 1) \
    .select("location", "event_time", "total_vehicles") \
    .orderBy("location")
peak_traffic.show(truncate=False)


3. Peak Traffic Time per Location:
+---------+----------+--------------+
|location |event_time|total_vehicles|
+---------+----------+--------------+
|Bangalore|NULL      |42531         |
|Chennai  |NULL      |38697         |
|Delhi    |NULL      |38574         |
|Hyderabad|NULL      |41108         |
|Kolkata  |NULL      |36491         |
|Mumbai   |NULL      |38168         |
|Pune     |NULL      |34848         |
+---------+----------+--------------+



In [22]:
# 4. Roads with lowest average speed (most congestion)
print("\n4. Roads with Lowest Average Speed (Most Congested):")
congested_roads = df_final.groupBy("road_name") \
    .agg(round(avg("avg_speed"), 2).alias("avg_speed")) \
    .orderBy("avg_speed")
congested_roads.show()


4. Roads with Lowest Average Speed (Most Congested):
+---------------+---------+
|      road_name|avg_speed|
+---------------+---------+
|      EM Bypass|    47.31|
|      Link Road|    47.37|
|        FC Road|    47.41|
|  Whitefield Rd|    47.41|
|        MG Road|    47.42|
|      Howrah Rd|    47.42|
|  University Rd|    47.43|
|       Nagar Rd|    47.43|
|       GST Road|    47.44|
|  Gachibowli Rd|    47.46|
|Eastern Express|    47.49|
| Hitech City Rd|    47.51|
|  Outer Ring Rd|    47.53|
|            OMR|    47.55|
|Western Express|    47.56|
|      Ring Road|    47.56|
|    Park Street|    47.58|
|           NH48|    47.61|
|    Madhapur Rd|    47.64|
|        Janpath|    47.68|
+---------------+---------+
only showing top 20 rows


#PHASE 5

In [23]:
# 1. Rank roads by congestion (lowest speed)
print("\n1. Rank Roads by Congestion (Lowest Speed):")
window_spec_speed = Window.orderBy("avg_speed")
roads_with_congestion_rank = df_final.groupBy("road_name") \
    .agg(round(avg("avg_speed"), 2).alias("avg_speed")) \
    .withColumn("congestion_rank", dense_rank().over(window_spec_speed))
roads_with_congestion_rank.orderBy("congestion_rank").show()


1. Rank Roads by Congestion (Lowest Speed):
+---------------+---------+---------------+
|      road_name|avg_speed|congestion_rank|
+---------------+---------+---------------+
|      EM Bypass|    47.31|              1|
|      Link Road|    47.37|              2|
|        FC Road|    47.41|              3|
|  Whitefield Rd|    47.41|              3|
|        MG Road|    47.42|              4|
|      Howrah Rd|    47.42|              4|
|  University Rd|    47.43|              5|
|       Nagar Rd|    47.43|              5|
|       GST Road|    47.44|              6|
|  Gachibowli Rd|    47.46|              7|
|Eastern Express|    47.49|              8|
| Hitech City Rd|    47.51|              9|
|  Outer Ring Rd|    47.53|             10|
|            OMR|    47.55|             11|
|Western Express|    47.56|             12|
|      Ring Road|    47.56|             12|
|    Park Street|    47.58|             13|
|           NH48|    47.61|             14|
|    Madhapur Rd|    47.64|    

In [24]:
# 2. For each location, rank roads by vehicle_count
print("\n2. Rank Roads by Vehicle Count per Location:")
window_spec_location = Window.partitionBy("location").orderBy(desc("total_vehicles"))
roads_by_location = df_final.groupBy("location", "road_name") \
    .agg(sum("vehicle_count").alias("total_vehicles")) \
    .withColumn("rank", row_number().over(window_spec_location))
roads_by_location.orderBy("location", "rank").show()


2. Rank Roads by Vehicle Count per Location:
+---------+---------------+--------------+----+
| location|      road_name|total_vehicles|rank|
+---------+---------------+--------------+----+
|Bangalore|  Outer Ring Rd|       1339365|   1|
|Bangalore|  Whitefield Rd|       1320360|   2|
|Bangalore|        MG Road|       1303485|   3|
|  Chennai|       GST Road|       1333073|   1|
|  Chennai|     Mount Road|       1329511|   2|
|  Chennai|            OMR|       1317171|   3|
|    Delhi|           NH48|       1335420|   1|
|    Delhi|      Ring Road|       1327408|   2|
|    Delhi|        Janpath|       1303498|   3|
|Hyderabad| Hitech City Rd|       1338486|   1|
|Hyderabad|  Gachibowli Rd|       1328605|   2|
|Hyderabad|    Madhapur Rd|       1324233|   3|
|  Kolkata|      Howrah Rd|       1334512|   1|
|  Kolkata|      EM Bypass|       1331117|   2|
|  Kolkata|    Park Street|       1310784|   3|
|   Mumbai|Western Express|       1334351|   1|
|   Mumbai|Eastern Express|       1325865|

In [25]:
# 3. Identify top 3 congested roads per location
print("\n3. Top 3 Congested Roads per Location:")
window_spec_congestion = Window.partitionBy("location").orderBy("avg_speed")
top_congested = df_final.groupBy("location", "road_name") \
    .agg(round(avg("avg_speed"), 2).alias("avg_speed")) \
    .withColumn("rank", row_number().over(window_spec_congestion)) \
    .filter(col("rank") <= 3) \
    .orderBy("location", "rank")
top_congested.show()


3. Top 3 Congested Roads per Location:
+---------+---------------+---------+----+
| location|      road_name|avg_speed|rank|
+---------+---------------+---------+----+
|Bangalore|  Whitefield Rd|    47.41|   1|
|Bangalore|        MG Road|    47.42|   2|
|Bangalore|  Outer Ring Rd|    47.53|   3|
|  Chennai|       GST Road|    47.44|   1|
|  Chennai|            OMR|    47.55|   2|
|  Chennai|     Mount Road|    47.82|   3|
|    Delhi|      Ring Road|    47.56|   1|
|    Delhi|           NH48|    47.61|   2|
|    Delhi|        Janpath|    47.68|   3|
|Hyderabad|  Gachibowli Rd|    47.46|   1|
|Hyderabad| Hitech City Rd|    47.51|   2|
|Hyderabad|    Madhapur Rd|    47.64|   3|
|  Kolkata|      EM Bypass|    47.31|   1|
|  Kolkata|      Howrah Rd|    47.42|   2|
|  Kolkata|    Park Street|    47.58|   3|
|   Mumbai|      Link Road|    47.37|   1|
|   Mumbai|Eastern Express|    47.49|   2|
|   Mumbai|Western Express|    47.56|   3|
|     Pune|        FC Road|    47.41|   1|
|     Pune|   

#PHASE 6

In [26]:
# Create window for lag function
window_spec_time = Window.partitionBy("sensor_id").orderBy("event_time")

df_anomaly = df_final.withColumn("prev_speed", lag("avg_speed").over(window_spec_time)) \
    .withColumn("prev_vehicle_count", lag("vehicle_count").over(window_spec_time))

# 1. Detect sudden drop in avg_speed (> 20% drop)
print("\n1. Sudden Drop in Average Speed (>20% drop):")
speed_drops = df_anomaly.filter(
    (col("prev_speed").isNotNull()) &
    (col("avg_speed") < col("prev_speed") * 0.8)
).select("sensor_id", "location", "road_name", "event_time",
         "prev_speed", "avg_speed",
         round((col("prev_speed") - col("avg_speed")) / col("prev_speed") * 100, 2).alias("drop_percentage"))
speed_drops.show()


1. Sudden Drop in Average Speed (>20% drop):
+---------+---------+---------------+-------------------+----------+---------+---------------+
|sensor_id| location|      road_name|         event_time|prev_speed|avg_speed|drop_percentage|
+---------+---------+---------------+-------------------+----------+---------+---------------+
|     S100|Bangalore|  Whitefield Rd|               NULL|      75.0|     23.0|          69.33|
|     S100|Bangalore|  Whitefield Rd|               NULL|      75.2|     59.3|          21.14|
|     S100|  Kolkata|      EM Bypass|               NULL|      59.3|     42.7|          27.99|
|     S100|Bangalore|  Outer Ring Rd|               NULL|      42.7|     17.3|          59.48|
|     S100|    Delhi|        Janpath|               NULL|      59.6|     37.2|          37.58|
|     S100|     Pune|       Nagar Rd|               NULL|      45.6|     15.5|          66.01|
|     S100|Bangalore|  Outer Ring Rd|               NULL|      53.5|     25.5|          52.34|
|   

In [27]:
# 2. Detect sudden spikes in vehicle_count (> 50% increase)
print("\n2. Sudden Spikes in Vehicle Count (>50% increase):")
vehicle_spikes = df_anomaly.filter(
    (col("prev_vehicle_count").isNotNull()) &
    (col("vehicle_count") > col("prev_vehicle_count") * 1.5)
).select("sensor_id", "location", "road_name", "event_time",
         "prev_vehicle_count", "vehicle_count",
         round((col("vehicle_count") - col("prev_vehicle_count")) / col("prev_vehicle_count") * 100, 2).alias("spike_percentage"))
vehicle_spikes.show()


2. Sudden Spikes in Vehicle Count (>50% increase):
+---------+---------+---------------+-------------------+------------------+-------------+----------------+
|sensor_id| location|      road_name|         event_time|prev_vehicle_count|vehicle_count|spike_percentage|
+---------+---------+---------------+-------------------+------------------+-------------+----------------+
|     S100|Bangalore|  Whitefield Rd|               NULL|                59|          110|           86.44|
|     S100|Bangalore|  Outer Ring Rd|               NULL|                34|           89|          161.76|
|     S100|Bangalore|        MG Road|               NULL|                34|          110|          223.53|
|     S100|  Kolkata|    Park Street|               NULL|                31|           90|          190.32|
|     S100|     Pune|        FC Road|               NULL|                70|          109|           55.71|
|     S100|    Delhi|      Ring Road|               NULL|                20|        

#PHASE 7

In [28]:
# 1. Check number of partitions
print(f"\n1. Number of partitions in df_final: {df_final.rdd.getNumPartitions()}")



1. Number of partitions in df_final: 2


In [29]:
# 2. Use explain(True) on congestion queries
print("\n2. Execution Plan for Congestion Query:")
congestion_query = df_final.groupBy("road_name").agg(avg("avg_speed").alias("avg_speed"))
congestion_query.explain(True)


2. Execution Plan for Congestion Query:
== Parsed Logical Plan ==
'Aggregate ['road_name], ['road_name, 'avg('avg_speed) AS avg_speed#529]
+- Project [sensor_id#54, location#55, road_name#56, vehicle_count_clean#62 AS vehicle_count#219, avg_speed_clean#63 AS avg_speed#220, temperature#59, timestamp#60 AS original_timestamp#221, event_time#95, status#61]
   +- Filter (status#61 = ACTIVE)
      +- Project [sensor_id#54, location#55, road_name#56, vehicle_count#57, avg_speed#58, temperature#59, timestamp#60, status#61, vehicle_count_clean#62, avg_speed_clean#63, coalesce(try_to_timestamp(timestamp#60, Some(yyyy-MM-dd HH:mm:ss), TimestampType, Some(Etc/UTC), false), try_to_timestamp(timestamp#60, Some(dd/MM/yyyy HH:mm:ss), TimestampType, Some(Etc/UTC), false), try_to_timestamp(timestamp#60, Some(yyyy/MM/dd HH:mm:ss), TimestampType, Some(Etc/UTC), false)) AS event_time#95]
         +- Project [sensor_id#54, location#55, road_name#56, vehicle_count#57, avg_speed#58, temperature#59, timestam

In [30]:
# 3. Repartition by location
df_repartitioned = df_final.repartition("location")
print(f"\n3. Number of partitions after repartitioning: {df_repartitioned.rdd.getNumPartitions()}")


3. Number of partitions after repartitioning: 3


In [35]:
# 4. Cache cleaned DataFrame
df_cached = df_final.cache()
df_cached.count()  # Trigger caching
print("\n4. DataFrame cached successfully")


4. DataFrame cached successfully


In [32]:
# 5. Compare execution plans
print("\n5. Execution Plan After Caching:")
df_cached.groupBy("road_name").agg(avg("avg_speed").alias("avg_speed")).explain(True)


5. Execution Plan After Caching:
== Parsed Logical Plan ==
'Aggregate ['road_name], ['road_name, 'avg('avg_speed) AS avg_speed#843]
+- Project [sensor_id#54, location#55, road_name#56, vehicle_count_clean#62 AS vehicle_count#219, avg_speed_clean#63 AS avg_speed#220, temperature#59, timestamp#60 AS original_timestamp#221, event_time#95, status#61]
   +- Filter (status#61 = ACTIVE)
      +- Project [sensor_id#54, location#55, road_name#56, vehicle_count#57, avg_speed#58, temperature#59, timestamp#60, status#61, vehicle_count_clean#62, avg_speed_clean#63, coalesce(try_to_timestamp(timestamp#60, Some(yyyy-MM-dd HH:mm:ss), TimestampType, Some(Etc/UTC), false), try_to_timestamp(timestamp#60, Some(dd/MM/yyyy HH:mm:ss), TimestampType, Some(Etc/UTC), false), try_to_timestamp(timestamp#60, Some(yyyy/MM/dd HH:mm:ss), TimestampType, Some(Etc/UTC), false)) AS event_time#95]
         +- Project [sensor_id#54, location#55, road_name#56, vehicle_count#57, avg_speed#58, temperature#59, timestamp#60, s

#PHASE 8

In [36]:
# 1. Convert cleaned DataFrame to RDD
rdd = df_final.rdd

In [37]:
# 2a. Total vehicle count using reduce
print("\n1. Total Vehicle Count using RDD reduce:")
total_vehicles_rdd = rdd.map(lambda row: row.vehicle_count if row.vehicle_count else 0) \
    .reduce(lambda a, b: a + b)
print(f"Total vehicles: {total_vehicles_rdd}")

# 2b. Count of records per location using map-reduce
print("\n2. Count of Records per Location using RDD map-reduce:")
location_counts = rdd.map(lambda row: (row.location, 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .collect()
for location, count in sorted(location_counts):
    print(f"{location}: {count}")


1. Total Vehicle Count using RDD reduce:
Total vehicles: 27793397

2. Count of Records per Location using RDD map-reduce:
Bangalore: 67658
Chennai: 67919
Delhi: 68014
Hyderabad: 68165
Kolkata: 67978
Mumbai: 68054
Pune: 67212


# 3. Explain why DataFrames are better
Catalyst Optimizer: Automatic query optimization
Tungsten Execution Engine: Faster execution with code generation
Schema: Type safety and better memory management
High-level APIs: Easier to write and maintain
Language agnostic: Same performance across Python, Scala, Java
Built-in functions: Rich library of optimized functions

#PHASE 9

In [38]:
# 1. Sort roads by highest congestion (lowest speed)
print("\n1. Roads Sorted by Highest Congestion:")
sorted_congestion = df_final.groupBy("road_name") \
    .agg(round(avg("avg_speed"), 2).alias("avg_speed")) \
    .orderBy("avg_speed")
sorted_congestion.show()


1. Roads Sorted by Highest Congestion:
+---------------+---------+
|      road_name|avg_speed|
+---------------+---------+
|      EM Bypass|    47.31|
|      Link Road|    47.37|
|        FC Road|    47.41|
|  Whitefield Rd|    47.41|
|        MG Road|    47.42|
|      Howrah Rd|    47.42|
|  University Rd|    47.43|
|       Nagar Rd|    47.43|
|       GST Road|    47.44|
|  Gachibowli Rd|    47.46|
|Eastern Express|    47.49|
| Hitech City Rd|    47.51|
|  Outer Ring Rd|    47.53|
|            OMR|    47.55|
|Western Express|    47.56|
|      Ring Road|    47.56|
|    Park Street|    47.58|
|           NH48|    47.61|
|    Madhapur Rd|    47.64|
|        Janpath|    47.68|
+---------------+---------+
only showing top 20 rows


In [39]:

# 2. Create two sets
print("\n2. Creating Sets:")
low_speed_roads = df_final.filter(col("avg_speed") < 25) \
    .select("road_name").distinct()
print("Roads with avg_speed < 25:")
low_speed_roads.show()

high_vehicle_roads = df_final.filter(col("vehicle_count") > 60) \
    .select("road_name").distinct()
print("Roads with vehicle_count > 60:")
high_vehicle_roads.show()


2. Creating Sets:
Roads with avg_speed < 25:
+---------------+
|      road_name|
+---------------+
|      EM Bypass|
|        FC Road|
|Western Express|
|    Madhapur Rd|
|  Whitefield Rd|
|           NH48|
|      Link Road|
|        MG Road|
|  University Rd|
|    Park Street|
|       GST Road|
|      Howrah Rd|
|Eastern Express|
|  Gachibowli Rd|
|        Janpath|
| Hitech City Rd|
|      Ring Road|
|  Outer Ring Rd|
|       Nagar Rd|
|     Mount Road|
+---------------+
only showing top 20 rows
Roads with vehicle_count > 60:
+---------------+
|      road_name|
+---------------+
|Western Express|
|    Madhapur Rd|
|      EM Bypass|
|        FC Road|
|  Whitefield Rd|
|           NH48|
|        MG Road|
|      Link Road|
|       GST Road|
|    Park Street|
|  Gachibowli Rd|
|Eastern Express|
|  University Rd|
|      Howrah Rd|
|     Mount Road|
|        Janpath|
| Hitech City Rd|
|       Nagar Rd|
|      Ring Road|
|            OMR|
+---------------+
only showing top 20 rows


In [40]:
# 3a. Roads in both sets (intersection)
print("\n3a. Roads in Both Sets (Intersection):")
both_sets = low_speed_roads.intersect(high_vehicle_roads)
both_sets.show()

# 3b. Roads in only one set (symmetric difference)
print("\n3b. Roads in Only One Set (Symmetric Difference):")
only_low_speed = low_speed_roads.subtract(high_vehicle_roads)
only_high_vehicle = high_vehicle_roads.subtract(low_speed_roads)
one_set_only = only_low_speed.union(only_high_vehicle)
one_set_only.show()


3a. Roads in Both Sets (Intersection):
+---------------+
|      road_name|
+---------------+
|      EM Bypass|
|        FC Road|
|Western Express|
|    Madhapur Rd|
|  Whitefield Rd|
|           NH48|
|      Link Road|
|        MG Road|
|  University Rd|
|    Park Street|
|       GST Road|
|      Howrah Rd|
|Eastern Express|
|  Gachibowli Rd|
|        Janpath|
| Hitech City Rd|
|      Ring Road|
|  Outer Ring Rd|
|       Nagar Rd|
|     Mount Road|
+---------------+
only showing top 20 rows

3b. Roads in Only One Set (Symmetric Difference):
+---------+
|road_name|
+---------+
+---------+



#PHASE 10

In [41]:
# 1. Write cleaned traffic data to Parquet (partitioned by location)
print("\n1. Writing cleaned data to Parquet (partitioned by location)...")
df_final.write.mode("overwrite") \
    .partitionBy("location") \
    .parquet("traffic_data_cleaned.parquet")
print("   ✓ Parquet files written successfully")


1. Writing cleaned data to Parquet (partitioned by location)...
   ✓ Parquet files written successfully


In [43]:
import pyspark.sql.functions as F
# 2. Writing congestion analytics to ORC
print("\n2. Writing congestion analytics to ORC...")
congestion_analytics = df_final.groupBy("location", "road_name") \
    .agg(
        F.round(F.avg("avg_speed"), 2).alias("avg_speed"),
        F.sum("vehicle_count").alias("total_vehicles"),
        F.count("*").alias("record_count")
    )
congestion_analytics.write.mode("overwrite").orc("congestion_analytics.orc")
print("   ✓ ORC files written successfully")


2. Writing congestion analytics to ORC...
   ✓ ORC files written successfully


In [44]:
# 3. Read back and validate
print("\n3. Reading back and validating...")
df_parquet = spark.read.parquet("traffic_data_cleaned.parquet")
print(f"   Parquet records read: {df_parquet.count()}")
print(f"   Original records: {df_final.count()}")
print(f"   ✓ Validation: {'PASS' if df_parquet.count() == df_final.count() else 'FAIL'}")

df_orc = spark.read.orc("congestion_analytics.orc")
print(f"\n   ORC records read: {df_orc.count()}")
print(f"   Original analytics records: {congestion_analytics.count()}")
print(f"   ✓ Validation: {'PASS' if df_orc.count() == congestion_analytics.count() else 'FAIL'}")


3. Reading back and validating...
   Parquet records read: 475000
   Original records: 475000
   ✓ Validation: PASS

   ORC records read: 21
   Original analytics records: 21
   ✓ Validation: PASS


In [45]:
# Stop Spark session when done
spark.stop()
print("\nSpark session stopped.")


Spark session stopped.
