In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [2]:
# =============================================================================
# DAILY ANALYTICS
# =============================================================================

In [2]:
# Load the flight fact table from the silver layer.
# The progress message names the silver layer so it matches the table that is
# actually read and the "Silver fact_flights count" line printed afterwards.
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() is a Spark action: it scans the table solely to report the row total.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

Reading fact_flights bronze data...
Silver fact_flights count: 5,328,614


In [3]:
# Load the three silver-layer dimension tables used by the joins in this notebook.
print("Reading silver dimension tables...")

dim_date_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_date_silver")
dim_airline_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airport_silver")

# Each count() is a separate Spark action; they run in the order they are
# reported (airlines, airports, dates).
airline_row_count = dim_airline_silver_df.count()
airport_row_count = dim_airport_silver_df.count()
date_row_count = dim_date_silver_df.count()

print(f"Dimensions loaded - Airlines: {airline_row_count}, "
      f"Airports: {airport_row_count}, "
      f"Dates: {date_row_count}")

Reading silver dimension tables...
Dimensions loaded - Airlines: 14, Airports: 322, Dates: 5844


In [5]:
# Daily flight summary: one row per (full_date, airline).
# The fact table is joined to the date and airline dimensions by key column
# name (Spark's default inner join), then aggregated.

# Departure delays strictly above this value count as "delayed"
# (presumably minutes - confirm against the silver schema).
DELAY_THRESHOLD = 15

# 0/1 indicator columns. Rows where the condition is not true, including rows
# where the compared column is null, fall through to otherwise(0).
is_cancelled = F.when(F.col("cancelled") == 1, 1).otherwise(0)
is_diverted = F.when(F.col("diverted") == 1, 1).otherwise(0)
is_delayed = F.when(F.col("departure_delay") > DELAY_THRESHOLD, 1).otherwise(0)

daily_summary = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .join(dim_airline_silver_df, "airline_sk")
    .groupBy("full_date", "airline")
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
        F.sum(is_cancelled).alias("cancelled_flights"),
        F.sum(is_diverted).alias("diverted_flights"),
        F.sum(is_delayed).alias("delayed_flights"),
        # Share of all flights in the group (the denominator is every row,
        # not only rows with a recorded delay), expressed as a percentage.
        (F.sum(is_delayed) / F.count("*") * 100).alias("delay_rate_pct"),
    )
)


In [6]:
# Preview the first five rows; show() is an action, so this runs the joins and aggregation.
daily_summary.show(n=5)

+----------+--------------------+-------------+--------------------+------------------+-----------------+----------------+---------------+------------------+
| full_date|             airline|total_flights| avg_departure_delay| avg_arrival_delay|cancelled_flights|diverted_flights|delayed_flights|    delay_rate_pct|
+----------+--------------------+-------------+--------------------+------------------+-----------------+----------------+---------------+------------------+
|2015-03-14|Hawaiian Airlines...|          196|-0.03571428571428571|  5.11734693877551|                0|               0|              6| 3.061224489795918|
|2015-03-26|Delta Air Lines Inc.|         2492|  11.134430176565008| 5.879871434310968|                0|               3|            453| 18.17817014446228|
|2015-12-28|      Virgin America|          192|  16.389473684210525| 8.521052631578947|                2|               0|             46|23.958333333333336|
|2015-07-07|American Airlines...|         2674|  10.

In [7]:
# Weekly aggregation: one row per (year, week_of_year, airline).
# NOTE(review): F.weekofyear follows ISO-8601 week numbering, while `year` is
# taken as-is from the joined tables. If `year` is the calendar year, days
# around 1 January can pair a calendar year with a week number from the
# adjacent ISO year - confirm which definition dim_date uses.
week_of_year = F.weekofyear("full_date").alias("week_of_year")

# 0/1 indicator; anything other than cancelled == 1 contributes 0.
cancelled_indicator = F.when(F.col("cancelled") == 1, 1).otherwise(0)

weekly_summary = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .join(dim_airline_silver_df, "airline_sk")
    .groupBy("year", week_of_year, "airline")
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.sum(cancelled_indicator).alias("cancelled_flights"),
        F.max("departure_delay").alias("max_departure_delay"),
        F.min("departure_delay").alias("min_departure_delay"),
        F.stddev("departure_delay").alias("delay_std_dev"),
    )
)

In [8]:
# Preview the first five rows; show() is an action, so this runs the joins and aggregation.
weekly_summary.show(n=5)

+----+------------+--------------------+-------------+-------------------+-----------------+-------------------+-------------------+------------------+
|year|week_of_year|             airline|total_flights|avg_departure_delay|cancelled_flights|max_departure_delay|min_departure_delay|     delay_std_dev|
+----+------------+--------------------+-------------+-------------------+-----------------+-------------------+-------------------+------------------+
|2015|          27|United Air Lines ...|         9881| 15.764276160098776|              173|              676.0|              -21.0| 40.62202474634887|
|2015|          50|    Spirit Air Lines|         2344| 6.5708172871202395|                7|              723.0|              -35.0| 34.97189753477474|
|2015|          23|Hawaiian Airlines...|         1544|-2.3860103626943006|                0|              197.0|              -16.0|11.986056324919012|
|2015|          31|American Airlines...|        18465| 12.067821619849164|              

In [9]:
# Monthly trends: one row per (year, month, airline), built from the flight
# fact table joined to the date and airline dimensions by key column name.

# 0/1 indicator; anything other than cancelled == 1 contributes 0.
cancelled_marker = F.when(F.col("cancelled") == 1, 1).otherwise(0)

monthly_trends = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .join(dim_airline_silver_df, "airline_sk")
    .groupBy("year", "month", "airline")
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.sum(cancelled_marker).alias("cancelled_flights"),
        # Sum and mean of elapsed_time over the group's flights.
        F.sum("elapsed_time").alias("total_flight_time"),
        F.avg("elapsed_time").alias("avg_flight_time"),
    )
)

In [10]:
# Preview the first five rows; show() is an action, so this runs the joins and aggregation.
monthly_trends.show(n=5)

+----+-----+--------------------+-------------+-------------------+-----------------+-----------------+------------------+
|year|month|             airline|total_flights|avg_departure_delay|cancelled_flights|total_flight_time|   avg_flight_time|
+----+-----+--------------------+-------------+-------------------+-----------------+-----------------+------------------+
|2015|    1|Frontier Airlines...|         6828| 17.970195729537366|               89|        1033523.0|153.47831897831898|
|2015|    4|Atlantic Southeas...|        49295|  6.795554545642339|              929|        4738564.0|  98.4452569909004|
|2015|   12|Atlantic Southeas...|        44140|  9.979379027533685|             1517|        4046762.0| 95.37501767617252|
|2015|    9|American Eagle Ai...|        21202|  4.605790432365065|              368|        1835243.0|   88.237078705707|
|2015|    3|Skywest Airlines ...|        50078|  5.847798463058957|              544|        4894297.0| 99.00469303125315|
+----+-----+----

In [12]:
# =============================================================================
# SAVE TO GOLD LAYER TABLES
# =============================================================================

# Persist the daily summary as a Gold-layer table. "overwrite" mode replaces
# whatever the target table currently holds.
# NOTE(review): the table is Delta only if that is the catalog's default
# format - confirm. Only daily_summary is written in the visible part of this
# notebook.
(
    daily_summary.write
    .mode("overwrite")
    .saveAsTable("unikargo_dev.03_gold.daily_flight_summary")
)