In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("SmartCityTrafficAnalysis") \
    .getOrCreate()
spark

<pyspark.sql.connect.session.SparkSession at 0x7f6a895a14d0>

Data Ingestion & Schema Analysis

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Load with schema inference
inferred_df = spark.read.option("header", True).csv("/Volumes/workspace/default/shared/traffic_logs.csv")
print("Inferred Schema:")
inferred_df.printSchema()

# Manual schema definition
manual_schema = StructType([
    StructField("LogID", StringType(), True),
    StructField("VehicleID", StringType(), True),
    StructField("EntryPoint", StringType(), True),
    StructField("ExitPoint", StringType(), True),
    StructField("EntryTime", TimestampType(), True),
    StructField("ExitTime", TimestampType(), True),
    StructField("VehicleType", StringType(), True),
    StructField("SpeedKMH", IntegerType(), True),
    StructField("TollPaid", IntegerType(), True)
])

# Load with manual schema
manual_df = spark.read.option("header", True).schema(manual_schema).csv("/Volumes/workspace/default/shared/traffic_logs.csv")
print("\nManual Schema:")
manual_df.printSchema()

Inferred Schema:
root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: string (nullable = true)
 |-- ExitTime: string (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: string (nullable = true)
 |-- TollPaid: string (nullable = true)


Manual Schema:
root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid: integer (nullable = true)



Derived Column Creation

In [0]:
# Calculate trip duration and overspeed flag
calc_df = manual_df.withColumn("TripDurationMinutes", round((col("ExitTime").cast("long")-col("EntryTime").cast("long")) / 60, 2)).withColumn("IsOverspeed", col("SpeedKMH") > 60)

display(calc_df)

LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed
L001,V001,GateA,GateC,2024-05-01T08:01:00.000Z,2024-05-01T08:20:00.000Z,Car,60,50,19.0,False
L002,V002,GateB,GateC,2024-05-01T08:10:00.000Z,2024-05-01T08:45:00.000Z,Truck,45,100,35.0,False
L003,V003,GateA,GateD,2024-05-01T09:00:00.000Z,2024-05-01T09:18:00.000Z,Bike,55,30,18.0,False
L004,V004,GateC,GateD,2024-05-01T09:15:00.000Z,2024-05-01T09:35:00.000Z,Car,80,50,20.0,True
L005,V005,GateB,GateA,2024-05-01T10:05:00.000Z,2024-05-01T10:40:00.000Z,Bus,40,70,35.0,False


3. Vehicle Behavior Aggregations

In [0]:
# Average speed per VehicleType
avg_speed = calc_df.groupBy("VehicleType") \
    .agg(avg("SpeedKMH").alias("Avg_Speed_KMH")) \
    .orderBy("Avg_Speed_KMH", ascending=False)
display(avg_speed)

# Total toll collected per gate
toll_by_gate = calc_df.groupBy("EntryPoint") \
    .agg(sum("TollPaid").alias("Total_Toll_Collected")) \
    .orderBy("Total_Toll_Collected", ascending=False)
display(toll_by_gate)

# Most used ExitPoint
exit_usage = calc_df.groupBy("ExitPoint") \
    .count() \
    .orderBy("count", ascending=False) \
    .limit(1)
display(exit_usage)


VehicleType,Avg_Speed_KMH
Car,70.0
Bike,55.0
Truck,45.0
Bus,40.0


EntryPoint,Total_Toll_Collected
GateB,170
GateA,80
GateC,50


ExitPoint,count
GateD,2


4. Window Functions

In [0]:
from pyspark.sql.window import Window

# Rank vehicles by speed within VehicleType
window_spec = Window.partitionBy("VehicleType").orderBy(col("SpeedKMH").desc())
speed_ranked = calc_df.withColumn("SpeedRank", rank().over(window_spec))
display(speed_ranked)

# Find last exit time for each vehicle
window_spec_vehicle = Window.partitionBy("VehicleID").orderBy("ExitTime")
lag_df = calc_df.withColumn("LastExitTime", lag("ExitTime", 1).over(window_spec_vehicle))
display(lag_df)

LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed,SpeedRank
L003,V003,GateA,GateD,2024-05-01T09:00:00.000Z,2024-05-01T09:18:00.000Z,Bike,55,30,18.0,False,1
L005,V005,GateB,GateA,2024-05-01T10:05:00.000Z,2024-05-01T10:40:00.000Z,Bus,40,70,35.0,False,1
L004,V004,GateC,GateD,2024-05-01T09:15:00.000Z,2024-05-01T09:35:00.000Z,Car,80,50,20.0,True,1
L001,V001,GateA,GateC,2024-05-01T08:01:00.000Z,2024-05-01T08:20:00.000Z,Car,60,50,19.0,False,2
L002,V002,GateB,GateC,2024-05-01T08:10:00.000Z,2024-05-01T08:45:00.000Z,Truck,45,100,35.0,False,1


LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed,LastExitTime
L001,V001,GateA,GateC,2024-05-01T08:01:00.000Z,2024-05-01T08:20:00.000Z,Car,60,50,19.0,False,
L002,V002,GateB,GateC,2024-05-01T08:10:00.000Z,2024-05-01T08:45:00.000Z,Truck,45,100,35.0,False,
L003,V003,GateA,GateD,2024-05-01T09:00:00.000Z,2024-05-01T09:18:00.000Z,Bike,55,30,18.0,False,
L004,V004,GateC,GateD,2024-05-01T09:15:00.000Z,2024-05-01T09:35:00.000Z,Car,80,50,20.0,True,
L005,V005,GateB,GateA,2024-05-01T10:05:00.000Z,2024-05-01T10:40:00.000Z,Bus,40,70,35.0,False,


Session Segmentation

In [0]:
# Group by VehicleID and analyze trips
window_spec_session = Window.partitionBy("VehicleID").orderBy("EntryTime")
session_df = calc_df.withColumn("TimeSinceLastTripMinutes",round((col("EntryTime").cast("long") - lag("ExitTime", 1).over(window_spec_session).cast("long")) / 60, 2))

display(session_df)

LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed,TimeSinceLastTripMinutes
L001,V001,GateA,GateC,2024-05-01T08:01:00.000Z,2024-05-01T08:20:00.000Z,Car,60,50,19.0,False,
L002,V002,GateB,GateC,2024-05-01T08:10:00.000Z,2024-05-01T08:45:00.000Z,Truck,45,100,35.0,False,
L003,V003,GateA,GateD,2024-05-01T09:00:00.000Z,2024-05-01T09:18:00.000Z,Bike,55,30,18.0,False,
L004,V004,GateC,GateD,2024-05-01T09:15:00.000Z,2024-05-01T09:35:00.000Z,Car,80,50,20.0,True,
L005,V005,GateB,GateA,2024-05-01T10:05:00.000Z,2024-05-01T10:40:00.000Z,Bus,40,70,35.0,False,


Anomaly Detection

In [0]:
# High speed short trips
speed_anomalies = calc_df.filter(
    (col("SpeedKMH") > 70) & 
    (col("TripDurationMinutes") < 10))
display(speed_anomalies)

# Low toll for long trips
toll_anomalies = calc_df.filter(
    (col("TollPaid") < 50) & 
    (col("TripDurationMinutes") > 30))
display(toll_anomalies)

# Suspicious backtracking
backtracking = calc_df.filter(
    col("ExitPoint") < col("EntryPoint"))
display(backtracking)

LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed


LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed


LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed
L005,V005,GateB,GateA,2024-05-01T10:05:00.000Z,2024-05-01T10:40:00.000Z,Bus,40,70,35.0,False


Join with Metadata

In [0]:
# Load vehicle registry
vehicle_registry = spark.read.option("header", True).csv("/Volumes/workspace/default/shared/vehicle_registry.csv")

# Join with traffic data
joined_df = calc_df.join(vehicle_registry, "VehicleID", "left")
display(joined_df)

# Group trips by registered city
trips_by_city = joined_df.groupBy("RegisteredCity") \
    .agg(count("*").alias("TotalTrips")) \
    .orderBy("TotalTrips", ascending=False)
display(trips_by_city)

VehicleID,LogID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed,OwnerName,Model,RegisteredCity
V001,L001,GateA,GateC,2024-05-01T08:01:00.000Z,2024-05-01T08:20:00.000Z,Car,60,50,19.0,False,Anil,Hyundai i20,Delhi
V002,L002,GateB,GateC,2024-05-01T08:10:00.000Z,2024-05-01T08:45:00.000Z,Truck,45,100,35.0,False,Rakesh,Tata Truck,Chennai
V003,L003,GateA,GateD,2024-05-01T09:00:00.000Z,2024-05-01T09:18:00.000Z,Bike,55,30,18.0,False,Sana,Yamaha R15,Mumbai
V004,L004,GateC,GateD,2024-05-01T09:15:00.000Z,2024-05-01T09:35:00.000Z,Car,80,50,20.0,True,Neha,Honda City,Bangalore
V005,L005,GateB,GateA,2024-05-01T10:05:00.000Z,2024-05-01T10:40:00.000Z,Bus,40,70,35.0,False,Zoya,Volvo Bus,Pune


RegisteredCity,TotalTrips
Delhi,1
Chennai,1
Mumbai,1
Bangalore,1
Pune,1


Delta Lake Features

In [0]:
# Save as Delta table
calc_df.write.format("delta").mode("overwrite").save("/Volumes/workspace/default/shared/delta_traffic_logs")

# Update toll rates for all Bikes
from delta.tables import DeltaTable
delta_table = DeltaTable.forPath(spark, "/Volumes/workspace/default/shared/delta_traffic_logs")
delta_table.update(condition = "VehicleType = 'Bike'",set = {"TollPaid": "TollPaid + 10"})

# Delete trips longer than 60 minutes
delta_table.delete("TripDurationMinutes > 60")

# Check history
delta_table.history().show()

# Time travel
df_v1 = spark.read.format("delta").option("versionAsOf", 0).load("/Volumes/workspace/default/shared/delta_traffic_logs")
display(df_v1)

+-------+--------------------+----------------+--------------------+---------+--------------------+----+--------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|          userId|            userName|operation| operationParameters| job|notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+----------------+--------------------+---------+--------------------+----+--------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|     12|2025-06-19 06:19:...|5873923491206719|ahmedashiq2k17@gm...|   DELETE|{predicate -> ["(...|NULL|    NULL|0619-053659-98ecx...|         11|WriteSerializable|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|     11| 2025-06-19 06:19:07|5873923491206719|ahmedashiq2k17@gm

LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed
L001,V001,GateA,GateC,2024-05-01T08:01:00.000Z,2024-05-01T08:20:00.000Z,Car,60,50,19.0,False
L002,V002,GateB,GateC,2024-05-01T08:10:00.000Z,2024-05-01T08:45:00.000Z,Truck,45,100,35.0,False
L003,V003,GateA,GateD,2024-05-01T09:00:00.000Z,2024-05-01T09:18:00.000Z,Bike,55,30,18.0,False
L004,V004,GateC,GateD,2024-05-01T09:15:00.000Z,2024-05-01T09:35:00.000Z,Car,80,50,20.0,True
L005,V005,GateB,GateA,2024-05-01T10:05:00.000Z,2024-05-01T10:40:00.000Z,Bus,40,70,35.0,False


Advanced Conditions

In [0]:
# Tag trip types
categorized_df = calc_df.withColumn("TripType",when(col("TripDurationMinutes") < 15, "Short").when((col("TripDurationMinutes") >= 15) & (col("TripDurationMinutes") <= 30), "Medium").otherwise("Long"))
display(categorized_df)

# Flag vehicles with more than 3 trips in a day
daily_trips = calc_df.withColumn("TripDate", to_date("EntryTime")) \
.groupBy("VehicleID", "TripDate") \
.agg(count("*").alias("DailyTrips")) \
.withColumn("FrequentDriver", col("DailyTrips") > 3)
display(daily_trips)

LogID,VehicleID,EntryPoint,ExitPoint,EntryTime,ExitTime,VehicleType,SpeedKMH,TollPaid,TripDurationMinutes,IsOverspeed,TripType
L001,V001,GateA,GateC,2024-05-01T08:01:00.000Z,2024-05-01T08:20:00.000Z,Car,60,50,19.0,False,Medium
L002,V002,GateB,GateC,2024-05-01T08:10:00.000Z,2024-05-01T08:45:00.000Z,Truck,45,100,35.0,False,Long
L003,V003,GateA,GateD,2024-05-01T09:00:00.000Z,2024-05-01T09:18:00.000Z,Bike,55,30,18.0,False,Medium
L004,V004,GateC,GateD,2024-05-01T09:15:00.000Z,2024-05-01T09:35:00.000Z,Car,80,50,20.0,True,Medium
L005,V005,GateB,GateA,2024-05-01T10:05:00.000Z,2024-05-01T10:40:00.000Z,Bus,40,70,35.0,False,Long


VehicleID,TripDate,DailyTrips,FrequentDriver
V002,2024-05-01,1,False
V003,2024-05-01,1,False
V005,2024-05-01,1,False
V004,2024-05-01,1,False
V001,2024-05-01,1,False


Export & Reporting

In [0]:
# Write to Parquet partitioned by VehicleType
calc_df.write.partitionBy("VehicleType") \
    .parquet("/Volumes/workspace/default/shared/traffic_parquet")

# Write to CSV for dashboards
calc_df.write.option("header", True) \
    .csv("/Volumes/workspace/default/shared/traffic_csv")

# Create SQL view and summary
calc_df.createOrReplaceTempView("traffic_logs_view")

summary_df = spark.sql("""select VehicleType,ExitPoint,sum(TollPaid) AS TotalToll,avg(SpeedKMH) AS AvgSpeed,count(*) AS TripCount
from traffic_logs_view
group by VehicleType, ExitPoint
order by TotalToll desc""")

display(summary_df)

VehicleType,ExitPoint,TotalToll,AvgSpeed,TripCount
Truck,GateC,100,45.0,1
Bus,GateA,70,40.0,1
Car,GateD,50,80.0,1
Car,GateC,50,60.0,1
Bike,GateD,30,55.0,1
