In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window


In [None]:
# =============================================================================
# TIME-BASED ANALYTICS
# =============================================================================

In [None]:
# Load the flight fact table from the silver layer.
# The progress message previously said "bronze", which contradicted the table
# actually being read (02_silver) and the variable name.
# NOTE(review): `spark` is not defined in this section — presumably the session
# object injected by the notebook runtime; confirm.
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() triggers a Spark action over the table; the ":," format spec adds
# thousands separators to the printed row count.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

In [None]:
# Load the date, airline and airport dimension tables from the silver layer.
print("Reading silver dimension tables...")

dim_date_silver_df = spark.read.table(
    "unikargo_dev.02_silver.unikargo_dim_date_silver"
)
dim_airline_silver_df = spark.read.table(
    "unikargo_dev.02_silver.unikargo_dim_airline_silver"
)
dim_airport_silver_df = spark.read.table(
    "unikargo_dev.02_silver.unikargo_dim_airport_silver"
)

# Each count() is a separate Spark action; they run in the order reported.
airline_rows = dim_airline_silver_df.count()
airport_rows = dim_airport_silver_df.count()
date_rows = dim_date_silver_df.count()

print(
    f"Dimensions loaded - Airlines: {airline_rows}, "
    f"Airports: {airport_rows}, "
    f"Dates: {date_rows}"
)

In [None]:
# Seasonal analysis: flight volume, average delay, cancellations, distance and
# air time aggregated per (year, season).
# NOTE(review): "year" and "month" are presumably columns of the date dimension
# brought in by the join on date_sk — confirm against the table schemas.

# Season label derived from the month number; any month value not matched by the
# three explicit groups falls through to "Fall".
# NOTE(review): because the grouping key is (year, season), "Winter" pools
# month 12 with months 1-2 of the same "year" value — confirm this is intended.
season_label = (
    F.when(F.col("month").isin([12, 1, 2]), "Winter")
    .when(F.col("month").isin([3, 4, 5]), "Spring")
    .when(F.col("month").isin([6, 7, 8]), "Summer")
    .otherwise("Fall")
)

# Numeric sort key so that, within a year, rows come out in the order
# Winter, Spring, Summer, Fall instead of alphabetically.
season_rank = (
    F.when(F.col("season") == "Winter", 1)
    .when(F.col("season") == "Spring", 2)
    .when(F.col("season") == "Summer", 3)
    .otherwise(4)
)

# Counts rows whose "cancelled" value equals 1 (every other row contributes 0).
seasonal_cancelled = F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0))

seasonal_analysis = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .withColumn("season", season_label)
    .groupBy("year", "season")
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        seasonal_cancelled.alias("cancelled_flights"),
        F.avg("distance").alias("avg_distance"),
        F.avg("air_time").alias("avg_air_time"),
    )
    .orderBy("year", season_rank)
)

In [None]:
# Preview the first five rows of the seasonal aggregate.
seasonal_analysis.show(n=5)

In [None]:
# Weekend vs weekday analysis: per-airline flight volume, average departure and
# arrival delay, and cancellations, split by the is_weekend flag.
# NOTE(review): "is_weekend" and "airline" are presumably supplied by the date
# and airline dimensions respectively — confirm against the table schemas.

# Counts rows whose "cancelled" value equals 1 (every other row contributes 0).
weekend_cancelled = F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0))

# Readable label for the flag: "Weekend" when is_weekend equals 1, otherwise "Weekday".
weekend_label = F.when(F.col("is_weekend") == 1, "Weekend").otherwise("Weekday")

weekend_analysis = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .join(dim_airline_silver_df, "airline_sk")
    .groupBy("airline", "is_weekend")
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
        weekend_cancelled.alias("cancelled_flights"),
    )
    .withColumn("weekend_flag", weekend_label)
    .orderBy("airline", "is_weekend")
)

In [None]:
# Preview the first five rows of the weekend/weekday aggregate.
weekend_analysis.show(n=5)

In [None]:
# Quarterly analysis: flight volume, average departure delay, cancellations and
# average distance per (year, quarter, airline).
# NOTE(review): no write of quarterly_analysis to a gold table is visible in
# this section — confirm it is persisted elsewhere or intentionally preview-only.

# Counts rows whose "cancelled" value equals 1 (every other row contributes 0).
quarterly_cancelled = F.sum(F.when(F.col("cancelled") == 1, 1).otherwise(0))

quarterly_keys = ["year", "quarter", "airline"]

quarterly_analysis = (
    fact_flight_silver_df
    .join(dim_date_silver_df, "date_sk")
    .join(dim_airline_silver_df, "airline_sk")
    .groupBy(*quarterly_keys)
    .agg(
        F.count("*").alias("total_flights"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        quarterly_cancelled.alias("cancelled_flights"),
        F.avg("distance").alias("avg_distance"),
    )
    # Sorted by the same columns used for grouping.
    .orderBy(*quarterly_keys)
)

In [None]:
# Preview the first five rows of the quarterly aggregate.
quarterly_analysis.show(n=5)

In [None]:
# Persist the seasonal aggregate as a gold table; "overwrite" mode replaces any
# existing contents of the target table.
(
    seasonal_analysis.write
    .mode("overwrite")
    .saveAsTable("unikargo_dev.03_gold.seasonal_flight_trends")
)


In [None]:
# Persist the weekend/weekday aggregate as a gold table; "overwrite" mode
# replaces any existing contents of the target table.
(
    weekend_analysis.write
    .mode("overwrite")
    .saveAsTable("unikargo_dev.03_gold.weekend_vs_weekday_performance")
)
