In [2]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [3]:
# =============================================================================
# DAILY ANALYTICS
# =============================================================================

In [4]:
# Load the silver-layer flight fact table and report its row count.
# The status line names the silver layer because that is the layer this cell
# actually reads (the table lives under `02_silver`).
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() is a Spark action, so this line triggers a job over the table;
# the `:,` format spec adds thousands separators to the printed figure.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

Reading fact_flights bronze data...
Silver fact_flights count: 5,328,614


In [5]:
# Load the three silver-layer dimension tables: date, airline and airport.
print("Reading silver dimension tables...")

dim_date_silver_df = spark.read.table(
    "unikargo_dev.02_silver.unikargo_dim_date_silver"
)
dim_airline_silver_df = spark.read.table(
    "unikargo_dev.02_silver.unikargo_dim_airline_silver"
)
dim_airport_silver_df = spark.read.table(
    "unikargo_dev.02_silver.unikargo_dim_airport_silver"
)

# Single summary line with the row count of each dimension.
# Each count() is a separate Spark action, evaluated in the order written.
print(
    "Dimensions loaded - "
    f"Airlines: {dim_airline_silver_df.count()}, "
    f"Airports: {dim_airport_silver_df.count()}, "
    f"Dates: {dim_date_silver_df.count()}"
)

Reading silver dimension tables...
Dimensions loaded - Airlines: 14, Airports: 322, Dates: 5844


In [6]:
# Airline performance ranking.
# Produces one row per (airline, iata_code) pair with volume, delay,
# cancellation, distance and taxi metrics, sorted so the lowest delay_rate
# comes first.
# NOTE(review): both rate columns divide by count("*"), i.e. every joined row
# in the group, and rows whose flag column is null fall into otherwise(0).
# Confirm that this is the intended definition of the rates.
airline_performance = (
    fact_flight_silver_df
    # Inner join on the shared surrogate key brings in airline name and IATA code.
    .join(dim_airline_silver_df, on="airline_sk", how="inner")
    .groupBy("airline", "iata_code")
    .agg(
        # Volume
        F.count("*").alias("total_flights"),
        # Mean delays (avg ignores null values)
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
        # Percentage of rows flagged cancelled
        (
            F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0))
            / F.count("*")
            * 100
        ).alias("cancellation_rate"),
        # Percentage of rows whose departure delay exceeds 15
        (
            F.sum(F.when(F.col("departure_delay") > 15, 1).otherwise(0))
            / F.count("*")
            * 100
        ).alias("delay_rate"),
        # Distance and time profile
        F.avg("distance").alias("avg_distance"),
        F.sum("distance").alias("total_distance"),
        F.avg("air_time").alias("avg_air_time"),
        F.avg("taxi_out").alias("avg_taxi_out_time"),
        F.avg("taxi_in").alias("avg_taxi_in_time"),
    )
    # orderBy sorts ascending by default: lowest delay rate first.
    .orderBy("delay_rate")
)

In [7]:
# Preview the first five rows of the airline ranking.
airline_performance.show(n=5)

+--------------------+---------+-------------+-------------------+-------------------+-------------------+------------------+------------------+--------------+------------------+------------------+------------------+
|             airline|iata_code|total_flights|avg_departure_delay|  avg_arrival_delay|  cancellation_rate|        delay_rate|      avg_distance|total_distance|      avg_air_time| avg_taxi_out_time|  avg_taxi_in_time|
+--------------------+---------+-------------+-------------------+-------------------+-------------------+------------------+------------------+--------------+------------------+------------------+------------------+
|Hawaiian Airlines...|       HA|        70018| 0.5145326788499792|  2.161955789865765| 0.2270844639949727| 6.962495358336428| 632.7308692050616|    4.430255E7| 84.93013194275318|10.964430385182428|6.8563943250633494|
|Alaska Airlines Inc.|       AS|       157981| 1.9567405129638689|-0.7932488913808043| 0.4025800570954735|10.568359486267335|1197.21

In [12]:
# Print the schema of the fact table and of the two dimensions joined to it,
# each preceded by a heading line.
for heading, schema_df in (
    ("Fact schema:", fact_flight_silver_df),
    ("Date dim schema:", dim_date_silver_df),
    ("Airline dim schema:", dim_airline_silver_df),
):
    print(heading)
    schema_df.printSchema()



Fact schema:
root
 |-- flight_sk: long (nullable = true)
 |-- date_sk: long (nullable = true)
 |-- airline_sk: integer (nullable = true)
 |-- origin_airport_sk: integer (nullable = true)
 |-- destination_airport_sk: integer (nullable = true)
 |-- flight_number: integer (nullable = true)
 |-- tail_number: string (nullable = true)
 |-- day_of_week: long (nullable = true)
 |-- day_name: string (nullable = true)
 |-- scheduled_departure: integer (nullable = true)
 |-- departure_time: integer (nullable = true)
 |-- departure_delay: double (nullable = true)
 |-- taxi_out: double (nullable = true)
 |-- wheels_off: integer (nullable = true)
 |-- scheduled_time: double (nullable = true)
 |-- elapsed_time: double (nullable = true)
 |-- air_time: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- wheels_on: integer (nullable = true)
 |-- taxi_in: double (nullable = true)
 |-- scheduled_arrival: integer (nullable = true)
 |-- arrival_time: integer (nullable = true)
 |-- arrival_

In [17]:
# Airline punctuality by day of week.
# One row per airline and weekday with flight volume, mean departure delay and
# the percentage of rows whose departure delay is at most 15.
# The fact table carries its own day_of_week / day_name columns, so the
# grouping keys are qualified with the "dd" / "da" aliases to pick the
# dimension columns unambiguously.
# NOTE(review): rows with a null departure_delay fall into otherwise(0) and so
# count against on_time_rate. Confirm that this is intended.
airline_dow_performance = (
    fact_flight_silver_df
    .join(dim_date_silver_df.alias("dd"), on="date_sk", how="inner")
    .join(dim_airline_silver_df.alias("da"), on="airline_sk", how="inner")
    .groupBy("da.airline", "dd.day_of_week", "dd.day_name")
    .agg(
        F.count("*").alias("flights"),
        F.avg("departure_delay").alias("avg_delay"),
        (
            F.sum(F.when(F.col("departure_delay") <= 15, 1).otherwise(0))
            / F.count("*")
            * 100
        ).alias("on_time_rate"),
    )
    # Sort by airline, then by weekday number within each airline.
    .orderBy("da.airline", "dd.day_of_week")
)

In [18]:
# Preview the first five rows of the airline-by-weekday breakdown.
airline_dow_performance.show(n=5)

+--------------------+-----------+---------+-------+------------------+-----------------+
|             airline|day_of_week| day_name|flights|         avg_delay|     on_time_rate|
+--------------------+-----------+---------+-------+------------------+-----------------+
|Alaska Airlines Inc.|          1|   Monday|  23155| 2.687475597587957|88.16670265601381|
|Alaska Airlines Inc.|          2|  Tuesday|  22335| 0.963160968596972|90.60219386612938|
|Alaska Airlines Inc.|          3|Wednesday|  22524| 0.655126379494482|90.72988811933938|
|Alaska Airlines Inc.|          4| Thursday|  23072|2.2582680591818973|88.74826629680999|
|Alaska Airlines Inc.|          5|   Friday|  22738| 3.058589783213387|87.63743513061834|
+--------------------+-----------+---------+-------+------------------+-----------------+
only showing top 5 rows


In [19]:
# =============================================================================
# SAVE TO GOLD LAYER TABLES
# =============================================================================

# Persist the airline ranking as a gold-layer table.
# mode="overwrite" replaces whatever the target table currently holds.
airline_performance.write.saveAsTable(
    "unikargo_dev.03_gold.airline_performance",
    mode="overwrite",
)