In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [2]:
# =============================================================================
# TIME-BASED ANALYTICS
# =============================================================================

In [3]:
# Load the silver-layer flight fact table that the delay-distribution and
# flight-efficiency analyses read from.
# The progress message previously said "bronze" although the table read here
# lives in the 02_silver schema.
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() is a Spark action: it runs a job over the table just to report its size.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

Reading fact_flights bronze data...


Silver fact_flights count: 5,328,614


In [4]:
# Load the silver-layer dimension tables (date, airline, airport).
print("Reading silver dimension tables...")

dim_date_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_date_silver")
dim_airline_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airport_silver")

# Each count() is its own Spark action; evaluate them up front, in the same
# order in which they are reported.
airline_rows = dim_airline_silver_df.count()
airport_rows = dim_airport_silver_df.count()
date_rows = dim_date_silver_df.count()

print(f"Dimensions loaded - Airlines: {airline_rows}, Airports: {airport_rows}, Dates: {date_rows}")

Reading silver dimension tables...
Dimensions loaded - Airlines: 14, Airports: 322, Dates: 5844


In [5]:
# Delay distribution analysis
# Bucket every flight by its departure delay (thresholds in minutes), then
# report per-bucket flight volume, mean departure/arrival delay, and the
# bucket's share of all flights.
dep_delay = F.col("departure_delay")

delay_bucket = (
    # A NULL delay makes every comparison below evaluate to NULL, so without
    # this guard such rows fall through to .otherwise() and are counted as
    # "Major Delay (3+ hours)", inflating that bucket's count and percentage.
    F.when(dep_delay.isNull(), "Unknown")
    .when(dep_delay <= 0, "Early/On-time")
    .when(dep_delay <= 15, "Slight Delay (1-15 min)")
    .when(dep_delay <= 60, "Moderate Delay (16-60 min)")
    .when(dep_delay <= 180, "Significant Delay (1-3 hours)")
    .otherwise("Major Delay (3+ hours)")
)

delay_distribution = (
    fact_flight_silver_df
    .withColumn("delay_category", delay_bucket)
    .groupBy("delay_category")
    .agg(
        F.count("*").alias("flight_count"),
        F.avg("departure_delay").alias("avg_delay_in_category"),
        F.avg("arrival_delay").alias("avg_arrival_delay_in_category"),
    )
    # An empty partitionBy() puts all (already aggregated) rows in one window,
    # so the windowed sum is the grand total of flights.
    .withColumn(
        "percentage",
        F.col("flight_count") / F.sum("flight_count").over(Window.partitionBy()) * 100,
    )
    # The "Unknown" bucket has a NULL average delay; keep it after the real
    # buckets instead of letting the default ascending sort put it first.
    .orderBy(F.col("avg_delay_in_category").asc_nulls_last())
)

In [6]:
# Preview the first five delay buckets (long strings are truncated by show()).
delay_distribution.show(n=5)



+--------------------+------------+---------------------+-----------------------------+------------------+
|      delay_category|flight_count|avg_delay_in_category|avg_arrival_delay_in_category|        percentage|
+--------------------+------------+---------------------+-----------------------------+------------------+
|       Early/On-time|     3266273|    -4.36440432260255|           -9.195461431094994| 61.29685880793767|
|Slight Delay (1-1...|     1018823|    6.245125993425747|           1.1655487153325934|19.119849927204335|
|Moderate Delay (1...|      652603|   31.643231796360116|           26.782795029716333|12.247143441052401|
|Significant Delay...|      261630|    99.53188090050836|             95.8505288504208| 4.909907154092978|
|Major Delay (3+ h...|      129285|    273.1717094148364|           269.51206673506715| 2.426240669712612|
+--------------------+------------+---------------------+-----------------------------+------------------+



In [7]:
# Flight time efficiency analysis
# Per airline: flight volume plus average scheduled, actual, airborne and taxi
# durations. Sorted so the airlines whose flights finish furthest ahead of
# schedule (most negative elapsed - scheduled difference) come first.
flights_with_airline = fact_flight_silver_df.join(dim_airline_silver_df, "airline_sk")

efficiency_metrics = [
    F.count("*").alias("total_flights"),
    F.avg("scheduled_time").alias("avg_scheduled_time"),
    F.avg("elapsed_time").alias("avg_actual_time"),
    F.avg("scheduled_vs_actual").alias("avg_time_difference"),
    F.avg("air_time").alias("avg_air_time"),
    # Ground time = taxi-out + taxi-in; the sum is NULL if either part is NULL.
    F.avg(F.col("taxi_out") + F.col("taxi_in")).alias("avg_ground_time"),
]

flight_efficiency = (
    flights_with_airline
    .withColumn("scheduled_vs_actual", F.col("elapsed_time") - F.col("scheduled_time"))
    .groupBy("airline")
    .agg(*efficiency_metrics)
    .orderBy("avg_time_difference")  # ascending is the default sort direction
)

In [8]:
# Preview the five airlines with the lowest average elapsed-vs-scheduled difference.
flight_efficiency.show(n=5)

+--------------------+-------------+------------------+------------------+-------------------+------------------+------------------+
|             airline|total_flights|avg_scheduled_time|   avg_actual_time|avg_time_difference|      avg_air_time|   avg_ground_time|
+--------------------+-------------+------------------+------------------+-------------------+------------------+------------------+
|United Air Lines ...|       469400|200.14680017043034|191.33184237259866| -8.813115833569793|165.37974661609675| 25.96204572320153|
|Delta Air Lines Inc.|       799791|147.78246066784948|140.72518233425617| -7.024457277821633|115.83288953685273| 24.90344070241392|
|Southwest Airline...|      1157134|127.38634851279109|121.39206572765828|-6.0901005226898715|103.27516999062514|18.127171273527427|
|American Airlines...|       647587|172.23672340550382| 167.0195695516911|  -5.31458573638385|140.38007827191228|26.656652030494293|
|     JetBlue Airways|       244555| 172.4577334341968| 167.725522258

In [9]:
# Persist the delay distribution as a gold-layer table, replacing any
# existing contents of that table.
delay_distribution.write.saveAsTable(
    "unikargo_dev.03_gold.delay_distribution",
    mode="overwrite",
)
