In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [None]:
# =============================================================================
# TIME-BASED ANALYTICS
# =============================================================================

In [None]:
# Load the silver-layer flight fact table used by the analyses in this notebook.
# The progress message previously said "bronze" although the table read here is the silver one.
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() runs a Spark job over the table; printed as a quick sanity check on the load.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

In [None]:
# Load the three silver dimension tables (airline, airport, date) and report their row counts.
print("Reading silver dimension tables...")

dim_airline_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airport_silver")
dim_date_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_date_silver")

# Each count() below runs its own Spark job; they are evaluated in the order printed.
print(
    "Dimensions loaded - "
    f"Airlines: {dim_airline_silver_df.count()}, "
    f"Airports: {dim_airport_silver_df.count()}, "
    f"Dates: {dim_date_silver_df.count()}"
)

In [None]:
# Delay distribution analysis
#
# Buckets every flight by its departure delay (thresholds presumably in minutes,
# going by the bucket labels), then reports per bucket: the number of flights,
# the average departure and arrival delay, and the share of all flights in percent.
#
# NULL handling: when departure_delay is NULL every `<=` comparison evaluates to
# NULL, so without an explicit check such rows fall through to `.otherwise(...)`
# and are counted as "Major Delay (3+ hours)". They get their own bucket instead.
# NOTE(review): NULL delays are presumably cancelled/diverted flights - confirm
# against the silver table before relying on the label.
delay_distribution = (
    fact_flight_silver_df
    .withColumn(
        "delay_category",
        F.when(F.col("departure_delay").isNull(), "Unknown (no departure delay)")
         .when(F.col("departure_delay") <= 0, "Early/On-time")
         .when(F.col("departure_delay") <= 15, "Slight Delay (1-15 min)")
         .when(F.col("departure_delay") <= 60, "Moderate Delay (16-60 min)")
         .when(F.col("departure_delay") <= 180, "Significant Delay (1-3 hours)")
         .otherwise("Major Delay (3+ hours)"),
    )
    .groupBy("delay_category")
    .agg(
        F.count("*").alias("flight_count"),
        F.avg("departure_delay").alias("avg_delay_in_category"),
        F.avg("arrival_delay").alias("avg_arrival_delay_in_category"),
    )
    # A window with no partition columns spans the whole aggregated result,
    # so the windowed sum is the total flight count across all buckets.
    .withColumn(
        "percentage",
        F.col("flight_count") / F.sum("flight_count").over(Window.partitionBy()) * 100,
    )
    # The NULL bucket has a NULL average delay; keep it after the real buckets
    # so they stay ordered from least to most delayed.
    .orderBy(F.col("avg_delay_in_category").asc_nulls_last())
)

In [None]:
# show() cuts string cells to 20 characters by default, which clips most of the
# delay_category labels (e.g. "Significant Delay (1-3 hours)"); disable truncation.
# The result has one row per delay bucket, so the default limit of 20 rows shows all of them.
delay_distribution.show(truncate=False)

In [None]:
# Flight time efficiency analysis
#
# Joins flights to the airline dimension on airline_sk and, per airline, averages
# scheduled time, actual elapsed time, their difference (actual - scheduled),
# air time, and combined taxi-out + taxi-in time. The result is sorted ascending
# by the average difference.
flight_efficiency = (
    fact_flight_silver_df
    .join(dim_airline_silver_df, on="airline_sk", how="inner")
    .withColumn(
        "scheduled_vs_actual",
        F.col("elapsed_time") - F.col("scheduled_time"),
    )
    .groupBy("airline")
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("scheduled_time").alias("avg_scheduled_time"),
        F.avg("elapsed_time").alias("avg_actual_time"),
        F.avg("scheduled_vs_actual").alias("avg_time_difference"),
        F.avg("air_time").alias("avg_air_time"),
        F.avg(F.col("taxi_out") + F.col("taxi_in")).alias("avg_ground_time"),
    )
    .orderBy("avg_time_difference", ascending=True)
)

In [None]:
# Preview the first five rows of the per-airline efficiency table.
flight_efficiency.show(n=5)

In [None]:
# Persist the delay distribution as a gold-layer table, replacing any existing contents.
(
    delay_distribution
    .write
    .mode("overwrite")
    .saveAsTable("unikargo_dev.03_gold.delay_distribution")
)
