In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [None]:
# =============================================================================
# DAILY ANALYTICS
# =============================================================================

In [None]:
# Load the silver-layer flight fact table that the aggregations below start from.
# (The log message previously said "bronze" although the table read is the silver one.)
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() is a Spark action: it scans the table just to report the row count.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

In [None]:
# Load the silver dimension tables (date, airline, airport) in one pass.
# The tables are read in the order the target names are listed on the left-hand side.
print("Reading silver dimension tables...")

dim_date_silver_df, dim_airline_silver_df, dim_airport_silver_df = (
    spark.read.table(table_name)
    for table_name in (
        "unikargo_dev.02_silver.unikargo_dim_date_silver",
        "unikargo_dev.02_silver.unikargo_dim_airline_silver",
        "unikargo_dev.02_silver.unikargo_dim_airport_silver",
    )
)

# Each count() below is a Spark action that scans the corresponding table.
print(f"Dimensions loaded - Airlines: {dim_airline_silver_df.count()}, "
      f"Airports: {dim_airport_silver_df.count()}, "
      f"Dates: {dim_date_silver_df.count()}")

In [None]:
# Daily flight summary: one row per (full_date, airline).
#
# The fact table is inner-joined to the date and airline dimensions on their surrogate
# keys, so fact rows without a matching dimension row do not appear in the result.

# 1 when the departure delay exceeds 15 minutes, otherwise 0 (a NULL delay also yields 0).
daily_delayed_flag = F.when(F.col("departure_delay") > 15, 1).otherwise(0)

daily_summary = (
    fact_flight_silver_df
    .join(dim_date_silver_df, on="date_sk")
    .join(dim_airline_silver_df, on="airline_sk")
    .groupBy("full_date", "airline")
    .agg(
        F.count("*").alias("total_flights"),
        # avg() skips NULL delays.
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
        F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0)).alias("cancelled_flights"),
        F.sum(F.when(F.col("diverted") == 1, 1).otherwise(0)).alias("diverted_flights"),
        F.sum(daily_delayed_flag).alias("delayed_flights"),
        # Share of delayed flights among ALL rows of the group (the denominator is
        # count(*), so rows with a NULL delay are included), expressed in percent.
        (F.sum(daily_delayed_flag) / F.count("*") * 100).alias("delay_rate_pct"),
    )
)



In [None]:
# Preview the first five rows; show() is an action, so this executes the daily aggregation.
daily_summary.show(n=5)

In [None]:
# Weekly aggregation: one row per (ISO week-numbering year, ISO week, airline).
#
# F.weekofyear() returns the ISO-8601 week number (weeks start on Monday, week 1 is the
# first week with more than 3 days in the new year). Days around New Year can therefore
# belong to a week of the neighbouring year. Pairing the ISO week with the *calendar*
# year puts e.g. 2016-01-01 (ISO week 53 of 2015) into a bogus (2016, 53) bucket, and
# merges 2013-01-01 and 2013-12-30 (both "week 1") into one (2013, 1) group.
#
# The year key is therefore taken from the Thursday of the date's ISO week, which always
# falls in the ISO week-numbering year. The output column is still called "year".
iso_week_year = F.year(F.date_add(F.trunc(F.col("full_date"), "week"), 3))

weekly_summary = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .join(dim_airline_silver_df, "airline_sk")
    .groupBy(
        iso_week_year.alias("year"),
        F.weekofyear("full_date").alias("week_of_year"),
        "airline",
    )
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0)).alias("cancelled_flights"),
        F.max("departure_delay").alias("max_departure_delay"),
        F.min("departure_delay").alias("min_departure_delay"),
        # stddev() is the sample standard deviation (alias of stddev_samp).
        F.stddev("departure_delay").alias("delay_std_dev"),
    )
)

In [None]:
# Preview the first five rows; show() is an action, so this executes the weekly aggregation.
weekly_summary.show(n=5)

In [None]:
# Monthly trends: one row per (year, month, airline).

# Metrics computed for every group; sum()/avg() skip NULL values.
monthly_metrics = [
    F.count("*").alias("total_flights"),
    F.avg("departure_delay").alias("avg_departure_delay"),
    F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0)).alias("cancelled_flights"),
    F.sum("elapsed_time").alias("total_flight_time"),
    F.avg("elapsed_time").alias("avg_flight_time"),
]

# Inner joins on the surrogate keys: fact rows without a matching date or airline
# dimension row are excluded from the result.
monthly_trends = (
    fact_flight_silver_df
    .join(dim_date_silver_df, on="date_sk")
    .join(dim_airline_silver_df, on="airline_sk")
    .groupBy("year", "month", "airline")
    .agg(*monthly_metrics)
)

In [None]:
# Preview the first five rows; show() is an action, so this executes the monthly aggregation.
monthly_trends.show(n=5)

In [None]:
# =============================================================================
# SAVE TO GOLD LAYER TABLES
# =============================================================================

# Persist the daily summary as a table in the gold schema. Mode "overwrite" replaces the
# table's previous contents on every run. No format is set explicitly here, so the
# table format is whatever the workspace default is (presumably Delta — confirm).
(
    daily_summary.write
    .mode("overwrite")
    .saveAsTable("unikargo_dev.03_gold.daily_flight_summary")
)