In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [None]:
# =============================================================================
# ROUTE ANALYTICS
# =============================================================================

In [None]:
# Load the silver-layer flight fact table that the route aggregations below read from.
# The progress message names the silver layer because that is the table actually read.
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() is a Spark action, so this line runs a job; it is only a load sanity check.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

In [None]:
# Load the silver dimension tables (date, airline, airport) used by the analyses below.
print("Reading silver dimension tables...")

dim_date_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_date_silver")
dim_airline_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airport_silver")

# Report the row count of each dimension as a quick load check; each count() runs a Spark job.
print(
    "Dimensions loaded - Airlines: {}, Airports: {}, Dates: {}".format(
        dim_airline_silver_df.count(),
        dim_airport_silver_df.count(),
        dim_date_silver_df.count(),
    )
)

In [None]:
# Route performance per (origin airport, destination airport, airline):
# flight volume, average delays/durations/distance, cancellations and distinct
# aircraft. Only groups with at least 100 flights are kept, busiest first.
route_analysis = (
    fact_flight_silver_df
    # Inner join on the airline surrogate key; presumably this supplies the
    # "airline" column used in the grouping below -- confirm against the dim schema.
    .join(dim_airline_silver_df, on="airline_sk")
    .groupBy("origin_airport_sk", "destination_airport_sk", "airline")
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
        F.avg("elapsed_time").alias("avg_flight_time"),
        F.avg("air_time").alias("avg_air_time"),
        F.avg("distance").alias("avg_distance"),
        # Rows whose cancelled flag equals 1 contribute 1, every other row 0.
        F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0)).alias("cancelled_flights"),
        F.countDistinct("tail_number").alias("aircraft_count"),
    )
    # Drop low-volume groups: fewer than 100 flights.
    .where(F.col("total_flights") >= 100)
    .sort(F.desc("total_flights"))
)

In [None]:
# Preview the first 5 rows of the route/airline metrics.
route_analysis.show(n=5)

In [None]:
# Top 50 origin/destination pairs by flight count, enriched with the IATA code,
# city and state of both endpoints from the airport dimension.
busiest_routes = (
    fact_flight_silver_df
    .groupBy("origin_airport_sk", "destination_airport_sk")
    .agg(
        F.count("*").alias("total_flights"),
        F.countDistinct("airline_sk").alias("airlines_serving"),
        F.avg("distance").alias("distance"),
        F.avg("departure_delay").alias("avg_delay"),
        F.countDistinct("tail_number").alias("unique_aircraft"),
    )
    # The airport dimension is joined twice under different aliases so the
    # origin and destination attributes can be told apart in the select.
    .join(
        dim_airport_silver_df.alias("origin_apt"),
        on=F.col("origin_airport_sk") == F.col("origin_apt.airport_sk"),
        how="inner",
    )
    .join(
        dim_airport_silver_df.alias("dest_apt"),
        on=F.col("destination_airport_sk") == F.col("dest_apt.airport_sk"),
        how="inner",
    )
    .select(
        "origin_airport_sk",
        "destination_airport_sk",
        F.col("origin_apt.iata_code").alias("origin_code"),
        F.col("origin_apt.city").alias("origin_city"),
        F.col("origin_apt.state").alias("origin_state"),
        F.col("dest_apt.iata_code").alias("dest_code"),
        F.col("dest_apt.city").alias("dest_city"),
        F.col("dest_apt.state").alias("dest_state"),
        "total_flights",
        "airlines_serving",
        "distance",
        "avg_delay",
        "unique_aircraft",
    )
    # Busiest first, then keep only the leading 50 routes.
    .sort(F.desc("total_flights"))
    .limit(50)
)

In [None]:
# Preview the first 10 rows of the busiest-routes result.
busiest_routes.show(n=10)

In [None]:
# Persist the route/airline metrics to the gold layer, replacing the table contents.
# overwriteSchema is enabled, as in the busiest_routes write, so that a change to the
# aggregate's columns does not make the overwrite fail on a schema mismatch.
# NOTE(review): assumes the gold table is a Delta table, where this option applies -- confirm.
(
    route_analysis.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("unikargo_dev.03_gold.route_analysis")
)

In [None]:
# Persist the busiest-routes result to the gold layer, replacing both the
# existing data and the table schema.
(
    busiest_routes.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("unikargo_dev.03_gold.busiest_routes")
)