In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [2]:
# =============================================================================
# TIME-BASED ANALYTICS
# =============================================================================

In [3]:
# Load the silver-layer flight fact table and report its row count.
# The progress message names the silver layer to match the table actually read.
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() is a Spark action, so this line runs a job over the table.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

Reading fact_flights bronze data...
Silver fact_flights count: 5,328,614


In [4]:
# Load the silver-layer date, airline and airport dimension tables,
# then report how many rows each one holds.
print("Reading silver dimension tables...")

dim_date_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_date_silver")
dim_airline_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airport_silver")

# Each count() is a separate Spark action; they run in the order reported.
airline_row_count = dim_airline_silver_df.count()
airport_row_count = dim_airport_silver_df.count()
date_row_count = dim_date_silver_df.count()

print(
    f"Dimensions loaded - Airlines: {airline_row_count}, "
    f"Airports: {airport_row_count}, "
    f"Dates: {date_row_count}"
)

Reading silver dimension tables...
Dimensions loaded - Airlines: 14, Airports: 322, Dates: 5844


In [5]:
# Seasonal analysis: flight volume, average delay, cancellations, distance and
# air time per (year, season).
#
# Note: the season is derived from the calendar month and grouped together with
# the calendar `year`, so a given year's "Winter" bucket combines January,
# February and December of that same year.

# Month -> season label; any month not matched by the first three branches
# is labelled "Fall".
season_label_expr = (
    F.when(F.col("month").isin([12, 1, 2]), "Winter")
    .when(F.col("month").isin([3, 4, 5]), "Spring")
    .when(F.col("month").isin([6, 7, 8]), "Summer")
    .otherwise("Fall")
)

# Sort key so seasons appear as Winter, Spring, Summer, Fall rather than
# alphabetically.
season_order_expr = (
    F.when(F.col("season") == "Winter", 1)
    .when(F.col("season") == "Spring", 2)
    .when(F.col("season") == "Summer", 3)
    .otherwise(4)
)

# 1 for a cancelled flight, 0 otherwise, so that summing it counts cancellations.
seasonal_cancelled_flag = F.when(F.col("cancelled") == 1, 1).otherwise(0)

seasonal_metrics = [
    F.count("*").alias("total_flights"),
    F.avg("departure_delay").alias("avg_departure_delay"),
    F.sum(seasonal_cancelled_flag).alias("cancelled_flights"),
    F.avg("distance").alias("avg_distance"),
    F.avg("air_time").alias("avg_air_time"),
]

seasonal_analysis = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .withColumn("season", season_label_expr)
    .groupBy("year", "season")
    .agg(*seasonal_metrics)
    .orderBy("year", season_order_expr)
)

In [6]:
# Preview up to five rows of the seasonal aggregates (this runs the Spark job).
seasonal_analysis.show(n=5)

+----+------+-------------+-------------------+-----------------+-----------------+------------------+
|year|season|total_flights|avg_departure_delay|cancelled_flights|     avg_distance|      avg_air_time|
+----+------+-------------+-------------------+-----------------+-----------------+------------------+
|2015|Winter|      1377481| 11.119387559122082|            40527|813.9649955244391|113.76376882236487|
|2015|Spring|      1485386|   8.95191314116608|            21191|818.2512451308953| 113.7264232812713|
|2015|Summer|      1533658| 11.748672457778913|            18940|836.5790000117366|114.06254541359908|
|2015|  Fall|       932089|  5.881324041510771|             6664|817.4366278327499|112.33122045854616|
+----+------+-------------+-------------------+-----------------+-----------------+------------------+



In [7]:
# Weekend vs Weekday analysis: per airline, compare flight volume, average
# departure/arrival delay and cancellations between weekend and weekday flights.

# 1 for a cancelled flight, 0 otherwise, so that summing it counts cancellations.
weekend_cancelled_flag = F.when(F.col("cancelled") == 1, 1).otherwise(0)

weekend_metrics = [
    F.count("*").alias("total_flights"),
    F.avg("departure_delay").alias("avg_departure_delay"),
    F.avg("arrival_delay").alias("avg_arrival_delay"),
    F.sum(weekend_cancelled_flag).alias("cancelled_flights"),
]

# Human-readable label derived from the `is_weekend` grouping column.
weekend_label_expr = F.when(F.col("is_weekend") == 1, "Weekend").otherwise("Weekday")

weekend_analysis = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .join(dim_airline_silver_df, "airline_sk")
    .groupBy("airline", "is_weekend")
    .agg(*weekend_metrics)
    .withColumn("weekend_flag", weekend_label_expr)
    .orderBy("airline", "is_weekend")
)

In [8]:
# Preview up to five rows of the weekend/weekday aggregates (this runs the Spark job).
weekend_analysis.show(n=5)

+--------------------+----------+-------------+-------------------+-------------------+-----------------+------------+
|             airline|is_weekend|total_flights|avg_departure_delay|  avg_arrival_delay|cancelled_flights|weekend_flag|
+--------------------+----------+-------------+-------------------+-------------------+-----------------+------------+
|Alaska Airlines Inc.|         0|       113824| 1.9334896967666275|-0.5900646382117056|              452|     Weekday|
|Alaska Airlines Inc.|         1|        44157|   2.01668636900973|  -1.31713823214245|              184|     Weekend|
|American Airlines...|         0|       472865|  9.220642682725517|   4.18132893210736|             6762|     Weekday|
|American Airlines...|         1|       174722|  9.622323178128207|  3.272022394265569|             3404|     Weekend|
|American Eagle Ai...|         0|       202064|  11.07501577714843|  7.959833671548501|            10812|     Weekday|
+--------------------+----------+-------------+-------------------+-------------------+-----------------+------------+
only showing top 5 rows

In [9]:
# Quarter analysis: per (year, quarter, airline), compute flight volume,
# average departure delay, cancellations and average distance.

# 1 for a cancelled flight, 0 otherwise, so that summing it counts cancellations.
quarterly_cancelled_flag = F.when(F.col("cancelled") == 1, 1).otherwise(0)

quarterly_metrics = [
    F.count("*").alias("total_flights"),
    F.avg("departure_delay").alias("avg_departure_delay"),
    F.sum(quarterly_cancelled_flag).alias("cancelled_flights"),
    F.avg("distance").alias("avg_distance"),
]

quarterly_analysis = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .join(dim_airline_silver_df, "airline_sk")
    .groupBy("year", "quarter", "airline")
    .agg(*quarterly_metrics)
    .orderBy("year", "quarter", "airline")
)

In [10]:
# Preview up to five rows of the quarterly aggregates (this runs the Spark job).
quarterly_analysis.show(n=5)

+----+-------+--------------------+-------------+-------------------+-----------------+------------------+
|year|quarter|             airline|total_flights|avg_departure_delay|cancelled_flights|      avg_distance|
+----+-------+--------------------+-------------+-------------------+-----------------+------------------+
|2015|      1|Alaska Airlines Inc.|        39727|  2.991827543455709|              216|1192.7543987716163|
|2015|      1|American Airlines...|       129648| 10.665006366264404|             4866|1078.6269591509317|
|2015|      1|American Eagle Ai...|        84986| 15.556966627710686|             8206| 433.1925964276469|
|2015|      1|Atlantic Southeas...|       149253|   9.87544007293639|             6839| 465.4183232497839|
|2015|      1|Delta Air Lines Inc.|       199349|   8.75566987751945|             2895| 855.6799081008684|
+----+-------+--------------------+-------------+-------------------+-----------------+------------------+
only showing top 5 rows


In [11]:
# Persist the seasonal aggregates to the gold layer, replacing any existing
# contents of the target table.
(
    seasonal_analysis.write
    .mode("overwrite")
    .saveAsTable("unikargo_dev.03_gold.seasonal_flight_trends")
)


In [12]:
# Persist the weekend/weekday aggregates to the gold layer, replacing any
# existing contents of the target table.
(
    weekend_analysis.write
    .mode("overwrite")
    .saveAsTable("unikargo_dev.03_gold.weekend_vs_weekday_performance")
)
