In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [None]:
# =============================================================================
# DAILY ANALYTICS
# =============================================================================

In [None]:
# Load the silver-layer flight fact table that the aggregations in this notebook read.
# The progress message says "silver" because that is the layer actually being read
# (the table is unikargo_dev.02_silver.unikargo_fact_flight_silver).
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() is a Spark action, so this line forces the table to be scanned.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

In [None]:
# Load the three silver-layer dimension tables (airline, airport, date).
print("Reading silver dimension tables...")

dim_airline_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airport_silver")
dim_date_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_date_silver")

# Each count() below is a separate Spark action; they run in airline, airport, date order.
print(
    f"Dimensions loaded - Airlines: {dim_airline_silver_df.count()}, "
    f"Airports: {dim_airport_silver_df.count()}, "
    f"Dates: {dim_date_silver_df.count()}"
)

In [None]:
# Airline performance ranking.
# One row per (airline, iata_code) with flight volume, delay/cancellation percentages
# and distance/time averages, sorted from the lowest to the highest delay_rate.
# NOTE(review): both rate denominators are count("*"), i.e. every joined flight row.
# Rows whose departure_delay is null take the otherwise(0) branch and therefore count
# as "not delayed" — confirm this is the intended treatment (e.g. for cancelled flights).
airline_performance = (
    fact_flight_silver_df
    .join(dim_airline_silver_df, on="airline_sk", how="inner")
    .groupBy("airline", "iata_code")
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
        # Share (in percent) of rows with cancelled == 1.
        (
            F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0))
            / F.count("*")
            * 100
        ).alias("cancellation_rate"),
        # Share (in percent) of rows with departure_delay above 15
        # (presumably minutes — verify against the silver schema).
        (
            F.sum(F.when(F.col("departure_delay") > 15, 1).otherwise(0))
            / F.count("*")
            * 100
        ).alias("delay_rate"),
        F.avg("distance").alias("avg_distance"),
        F.sum("distance").alias("total_distance"),
        F.avg("air_time").alias("avg_air_time"),
        F.avg("taxi_out").alias("avg_taxi_out_time"),
        F.avg("taxi_in").alias("avg_taxi_in_time"),
    )
    .orderBy(F.asc("delay_rate"))
)

In [None]:
# Preview the first five rows of the airline ranking.
airline_performance.show(n=5)

In [None]:
# Print the schema of the fact table and of the date and airline dimensions,
# each under its own heading.
for _heading, _frame in (
    ("Fact schema:", fact_flight_silver_df),
    ("Date dim schema:", dim_date_silver_df),
    ("Airline dim schema:", dim_airline_silver_df),
):
    print(_heading)
    _frame.printSchema()


In [None]:
# Airline punctuality by day of week.
# One row per (airline, day_of_week, day_name) with the flight count, the mean
# departure delay and the percentage of flights with departure_delay <= 15.
# NOTE(review): rows whose departure_delay is null fall into otherwise(0) and so are
# counted as "not on time" while still being part of the count("*") denominator —
# confirm this is intended (e.g. for cancelled flights).
airline_dow_performance = (
    fact_flight_silver_df
    .join(dim_date_silver_df.alias("dd"), on="date_sk", how="inner")
    .join(dim_airline_silver_df.alias("da"), on="airline_sk", how="inner")
    .groupBy("da.airline", "dd.day_of_week", "dd.day_name")
    .agg(
        F.count("*").alias("flights"),
        F.avg("departure_delay").alias("avg_delay"),
        (
            F.sum(F.when(F.col("departure_delay") <= 15, 1).otherwise(0))
            / F.count("*")
            * 100
        ).alias("on_time_rate"),
    )
    # The dimension aliases ("da", "dd") are reused to name the sort keys.
    .orderBy("da.airline", "dd.day_of_week")
)

In [None]:
# Preview the first five rows of the airline-by-weekday punctuality table.
airline_dow_performance.show(n=5)

In [None]:
# =============================================================================
# SAVE TO GOLD LAYER TABLES
# =============================================================================

# Persist the airline ranking as a gold-layer table, replacing any existing contents.
# No format is set here, so the session's default table format applies
# (presumably Delta on this workspace — verify).
airline_performance.write.saveAsTable(
    "unikargo_dev.03_gold.airline_performance",
    mode="overwrite",
)