In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [2]:
# =============================================================================
# OPERATIONAL EFFICIENCY METRICS
# =============================================================================

In [3]:
print("Reading fact_flights bronze data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

Reading fact_flights bronze data...
Silver fact_flights count: 5,328,614


In [4]:
print("Reading silver dimension tables...")

dim_date_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_date_silver")
dim_airline_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airport_silver")

print(f"Dimensions loaded - Airlines: {dim_airline_silver_df.count()}, "
      f"Airports: {dim_airport_silver_df.count()}, "
      f"Dates: {dim_date_silver_df.count()}")

Reading silver dimension tables...
Dimensions loaded - Airlines: 14, Airports: 322, Dates: 5844


In [5]:
# Aircraft utilization (flights per aircraft per day)
aircraft_utilization = fact_flight_silver_df \
    .join(dim_date_silver_df, "date_sk") \
    .join(dim_airline_silver_df, "airline_sk") \
    .groupBy("full_date", "airline", "tail_number") \
    .agg(F.count("*").alias("flights_per_day")) \
    .groupBy("airline") \
    .agg(
        F.avg("flights_per_day").alias("avg_flights_per_aircraft_per_day"),
        F.max("flights_per_day").alias("max_flights_per_aircraft_per_day"),
        F.countDistinct("tail_number").alias("fleet_size")
    )

In [6]:
aircraft_utilization.show(5)

+--------------------+--------------------------------+--------------------------------+----------+
|             airline|avg_flights_per_aircraft_per_day|max_flights_per_aircraft_per_day|fleet_size|
+--------------------+--------------------------------+--------------------------------+----------+
|     US Airways Inc.|               3.810192647171694|                             399|       351|
|Frontier Airlines...|              4.6689873775966495|                              31|        62|
|United Air Lines ...|               2.980639180101979|                             434|       721|
|American Airlines...|               3.300495899780336|                             364|      1044|
|Alaska Airlines Inc.|              3.6855476495975736|                              11|       147|
+--------------------+--------------------------------+--------------------------------+----------+
only showing top 5 rows


In [7]:
# On-time performance trends with rolling averages
window_spec = Window.partitionBy("airline").orderBy("full_date").rowsBetween(-6, 0)

otp_trends = fact_flight_silver_df \
    .join(dim_date_silver_df, "date_sk") \
    .join(dim_airline_silver_df, "airline_sk") \
    .groupBy("full_date", "airline") \
    .agg(
        F.count("*").alias("daily_flights"),
        (F.sum(F.when(F.col("departure_delay") <= 15, 1).otherwise(0)) / F.count("*") * 100).alias("daily_otp")
    ) \
    .withColumn("rolling_7day_otp", F.avg("daily_otp").over(window_spec)) \
    .orderBy("airline", "full_date")

In [8]:
otp_trends.show(5)

+----------+--------------------+-------------+-----------------+-----------------+
| full_date|             airline|daily_flights|        daily_otp| rolling_7day_otp|
+----------+--------------------+-------------+-----------------+-----------------+
|2015-01-01|Alaska Airlines Inc.|          440|90.68181818181819|90.68181818181819|
|2015-01-02|Alaska Airlines Inc.|          477|83.01886792452831|86.85034305317325|
|2015-01-03|Alaska Airlines Inc.|          449|79.73273942093542|84.47780850909398|
|2015-01-04|Alaska Airlines Inc.|          458|77.29257641921397|82.68150048662397|
|2015-01-05|Alaska Airlines Inc.|          433|83.60277136258661|82.86575466181651|
+----------+--------------------+-------------+-----------------+-----------------+
only showing top 5 rows


In [9]:
# Flight number performance
flight_number_performance = fact_flight_silver_df \
    .join(dim_airline_silver_df, "airline_sk") \
    .groupBy("airline", "flight_number") \
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
        F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0)).alias("cancelled_flights"),
        F.avg("distance").alias("avg_distance"),
        F.countDistinct("tail_number").alias("aircraft_used")
    ) \
    .filter(F.col("total_flights") >= 50) \
    .orderBy("airline", F.col("avg_departure_delay").desc())

In [10]:
flight_number_performance.show(5)

+--------------------+-------------+-------------+-------------------+------------------+-----------------+-----------------+-------------+
|             airline|flight_number|total_flights|avg_departure_delay| avg_arrival_delay|cancelled_flights|     avg_distance|aircraft_used|
+--------------------+-------------+-------------+-------------------+------------------+-----------------+-----------------+-------------+
|Alaska Airlines Inc.|          380|           56| 21.660714285714285|12.854545454545455|                0|            550.0|           25|
|Alaska Airlines Inc.|          305|          273| 19.877777777777776|15.192592592592593|                3|            679.0|           79|
|Alaska Airlines Inc.|          341|          132| 19.015151515151516| 20.21969696969697|                0|            550.0|           29|
|Alaska Airlines Inc.|          223|          334| 17.221556886227546|15.859281437125748|                0|650.8053892215569|           72|
|Alaska Airlines Inc

In [11]:
# Distance vs delay correlation
distance_delay_analysis = fact_flight_silver_df \
    .withColumn("distance_category",
        F.when(F.col("distance") <= 500, "Short Haul (<500mi)")
         .when(F.col("distance") <= 1500, "Medium Haul (500-1500mi)")
         .otherwise("Long Haul (>1500mi)")) \
    .groupBy("distance_category") \
    .agg(
        F.count("*").alias("flight_count"),
        F.avg("distance").alias("avg_distance"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
        F.avg("air_time").alias("avg_air_time"),
        F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0)).alias("cancelled_flights")
    ) \
    .orderBy("avg_distance")

In [12]:
distance_delay_analysis.show(5)

+--------------------+------------+------------------+-------------------+------------------+------------------+-----------------+
|   distance_category|flight_count|      avg_distance|avg_departure_delay| avg_arrival_delay|      avg_air_time|cancelled_flights|
+--------------------+------------+------------------+-------------------+------------------+------------------+-----------------+
| Short Haul (<500mi)|     1952115|304.66427643863193|  8.503665444938841| 5.584784577252948| 51.79815138449535|            41566|
|Medium Haul (500-...|     2658222| 868.9030814582078| 10.429913547682359|5.0919199000858155|119.82947353500062|            39694|
| Long Haul (>1500mi)|      718277|2056.4671206233807| 10.750416122829916|2.2365826743466624| 256.6975444965746|             6062|
+--------------------+------------+------------------+-------------------+------------------+------------------+-----------------+



In [13]:
# =============================================================================
# SAVE TO GOLD LAYER TABLES
# =============================================================================

In [14]:
aircraft_utilization.write.mode("overwrite").saveAsTable("unikargo_dev.03_gold.aircraft_utilization")
