In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [2]:
# =============================================================================
# AIRPORT ANALYTICS
# =============================================================================

In [3]:
# Load the silver-layer flight fact table that the airport aggregations read from.
# The progress message names the silver layer so the log matches the table
# that is actually read.
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() runs a Spark job over the table; it is executed here only to log the row count.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

Reading fact_flights bronze data...
Silver fact_flights count: 5,328,614


In [4]:
print("Reading silver dimension tables...")

# Bind the reader method once; each call below resolves one silver dimension table.
read_silver_table = spark.read.table
dim_date_silver_df = read_silver_table("unikargo_dev.02_silver.unikargo_dim_date_silver")
dim_airline_silver_df = read_silver_table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = read_silver_table("unikargo_dev.02_silver.unikargo_dim_airport_silver")

# Each count() is a separate Spark action; evaluate them in the order they are reported.
dim_airline_row_count = dim_airline_silver_df.count()
dim_airport_row_count = dim_airport_silver_df.count()
dim_date_row_count = dim_date_silver_df.count()

print(
    f"Dimensions loaded - Airlines: {dim_airline_row_count}, "
    f"Airports: {dim_airport_row_count}, "
    f"Dates: {dim_date_row_count}"
)

Reading silver dimension tables...
Dimensions loaded - Airlines: 14, Airports: 322, Dates: 5844


In [5]:
# Departure-side metrics: one output row per origin_airport_sk.
# The aggregate order below defines the column order of the resulting frame.
origin_metrics = [
    F.count("*").alias("departing_flights"),
    F.avg("departure_delay").alias("avg_departure_delay"),
    # Sum a 1/0 flag so rows with cancelled == 1 are counted.
    F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0)).alias("cancelled_departures"),
    F.countDistinct("airline_sk").alias("airlines_operating"),
    F.countDistinct("destination_airport_sk").alias("destinations_served"),
    F.avg("taxi_out").alias("avg_taxi_out_time"),
]

origin_airport_stats = (
    fact_flight_silver_df
    .groupBy("origin_airport_sk")
    .agg(*origin_metrics)
    # Airports with the most departing flights come first.
    .orderBy(F.desc("departing_flights"))
)

In [6]:
# Preview the first five rows of the origin-airport aggregation.
origin_airport_stats.show(n=5)

+-----------------+-----------------+-------------------+--------------------+------------------+-------------------+------------------+
|origin_airport_sk|departing_flights|avg_departure_delay|cancelled_departures|airlines_operating|destinations_served| avg_taxi_out_time|
+-----------------+-----------------+-------------------+--------------------+------------------+-------------------+------------------+
|               21|           346756|  9.340850871027516|                2555|                11|                169|17.407978712587983|
|              229|           285659| 14.146377495685135|                8541|                12|                162|19.544395861292696|
|               87|           239473|  11.53237434711876|                6253|                11|                148|17.242819485223205|
|               86|           195785| 11.829765387572301|                2121|                11|                139|15.972182935012647|
|              177|           194435| 10.

In [7]:
# Arrival-side metrics: one output row per destination_airport_sk.
# The aggregate order below defines the column order of the resulting frame.
destination_metrics = [
    F.count("*").alias("arriving_flights"),
    F.avg("arrival_delay").alias("avg_arrival_delay"),
    # Sum a 1/0 flag so rows with diverted == 1 are counted.
    F.sum(F.when(F.col("diverted") == 1, 1).otherwise(0)).alias("diverted_arrivals"),
    F.countDistinct("airline_sk").alias("airlines_serving"),
    F.countDistinct("origin_airport_sk").alias("origins_served"),
    F.avg("taxi_in").alias("avg_taxi_in_time"),
]

dest_airport_stats = (
    fact_flight_silver_df
    .groupBy("destination_airport_sk")
    .agg(*destination_metrics)
    # Airports with the most arriving flights come first.
    .orderBy(F.desc("arriving_flights"))
)

In [8]:
# Preview the first five rows of the destination-airport aggregation.
dest_airport_stats.show(n=5)

+----------------------+----------------+------------------+-----------------+----------------+--------------+------------------+
|destination_airport_sk|arriving_flights| avg_arrival_delay|diverted_arrivals|airlines_serving|origins_served|  avg_taxi_in_time|
+----------------------+----------------+------------------+-----------------+----------------+--------------+------------------+
|                    21|          346790|2.2250067791569355|             1112|              11|           169| 8.795473768962239|
|                   229|          285206| 7.217953236560507|              768|              12|           162|13.207200284183589|
|                    87|          239535| 5.868136563135204|             1069|              11|           149|11.562569561125697|
|                    86|          195976| 5.083611398963731|              667|              11|           140| 8.423960539228345|
|                   177|          194198| 6.125287638209834|              299|            

In [13]:
# Persist both airport summaries as gold tables. Each write replaces the
# existing table contents and, via overwriteSchema, its schema as well.
gold_outputs = [
    (origin_airport_stats, "unikargo_dev.03_gold.origin_airport_performance"),
    (dest_airport_stats, "unikargo_dev.03_gold.destination_airport_performance"),
]

for stats_df, gold_table_name in gold_outputs:
    (
        stats_df.write
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(gold_table_name)
    )

