In [None]:
import os
import sys
from pathlib import Path
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [None]:

# sys.path.append('../../../src')

# Resolve the project's `src` directory relative to the notebook's working
# directory (three levels up, then into `src`) so the project-local imports
# that follow in this cell can be found.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..', 'src'))
# Guard the append so re-running this cell does not keep adding the same
# entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from config import get_table_config, get_cleaning_config, get_streaming_config, ENVIRONMENTS

from io_utils.write_to_table_utils import save_to_table, save_to_bronze, save_to_silver, save_to_gold

In [None]:
# # env = "dev"
# # layer = "bronze"

# # table_cfg = get_table_config(entity="airlines", layer=layer, environment=env)
# # # table_cfg.full_name -> "store1_prod.gold.dimestore_gold"

# # print(table_cfg)
# # print(table_cfg.full_name)

# bronze_airline = get_table_config(entity="airlines", layer="bronze", environment="dev")
# print(bronze_airline.full_name)

# # Silver example: Flights in staging
# silver_flights = get_table_config(entity="flights", layer="silver", environment="staging")
# print(silver_flights.full_name)
# # -> unikargo_staging.02_silver.unikargo_flights_silver

# # Gold example: Daily flight summary in prod
# gold_summary = get_table_config(
#     entity="flights",  
#     layer="gold",
#     environment="dev",
#     table_key="daily_summary"
# )
# print(gold_summary.full_name)

In [None]:
# Load the silver-layer flight fact table that the aggregates in the later
# cells are built from. (The log message previously said "bronze" although the
# table being read is the silver one.)
print("Reading fact_flights silver data...")

fact_flight_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_fact_flight_silver")

# count() is a Spark action, so this log line executes a job on its own.
print(f"Silver fact_flights count: {fact_flight_silver_df.count():,}")

In [None]:
# Load the three silver dimension tables used by the gold aggregates.
print("Reading silver dimension tables...")

dim_date_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_date_silver")
dim_airline_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airline_silver")
dim_airport_silver_df = spark.read.table("unikargo_dev.02_silver.unikargo_dim_airport_silver")

# Each count() is a separate Spark action; they run in the same order as the
# values appear in the message (airlines, airports, dates).
airline_row_count = dim_airline_silver_df.count()
airport_row_count = dim_airport_silver_df.count()
date_row_count = dim_date_silver_df.count()

print(
    f"Dimensions loaded - Airlines: {airline_row_count}, "
    f"Airports: {airport_row_count}, "
    f"Dates: {date_row_count}"
)

In [None]:
# Top 10 busiest routes across all airlines.
#
# A route is an (origin_airport_sk, destination_airport_sk) pair. Flights are
# aggregated per route first, then both airport keys are resolved against the
# airport dimension (aliased twice) to obtain readable codes, cities and states.
# Both joins use the default inner join, so a route whose airport key has no
# match in the dimension is dropped from the result.
top_10_busiest_routes = (
    fact_flight_silver_df
    .groupBy("origin_airport_sk", "destination_airport_sk")
    .agg(
        F.count("*").alias("total_flights"),
        F.countDistinct("airline_sk").alias("airlines_serving"),
        F.countDistinct("tail_number").alias("unique_aircraft"),
        F.avg("departure_delay").alias("avg_departure_delay"),
        F.avg("arrival_delay").alias("avg_arrival_delay"),
    )
    .join(
        dim_airport_silver_df.alias("origin_apt"),
        F.col("origin_airport_sk") == F.col("origin_apt.airport_sk"),
    )
    .join(
        dim_airport_silver_df.alias("dest_apt"),
        F.col("destination_airport_sk") == F.col("dest_apt.airport_sk"),
    )
    .select(
        F.col("origin_apt.iata_code").alias("origin_code"),
        F.col("origin_apt.city").alias("origin_city"),
        F.col("origin_apt.state").alias("origin_state"),
        F.col("dest_apt.iata_code").alias("dest_code"),
        F.col("dest_apt.city").alias("dest_city"),
        F.col("dest_apt.state").alias("dest_state"),
        "total_flights",
        "airlines_serving",
        "unique_aircraft",
        "avg_departure_delay",
        "avg_arrival_delay",
    )
    # Sorting on total_flights alone leaves the order of tied routes undefined,
    # so which routes survive limit(10) could change between runs. The airport
    # codes act as a stable tie-breaker.
    .orderBy(
        F.col("total_flights").desc(),
        F.col("origin_code").asc(),
        F.col("dest_code").asc(),
    )
    .limit(10)
)

# Save to Gold layer
# top_10_busiest_routes.write.mode("overwrite").saveAsTable("gold.top_10_busiest_routes")

In [None]:
# Persist the top-10 routes to the gold layer.
# The DataFrame is recomputed in full from the silver fact table on every run,
# so the write replaces the previous snapshot. With mode="append" each re-run
# added another 10 rows to the table.
# NOTE(review): assumes save_to_table forwards `mode` to the Spark writer, so
# "overwrite" is accepted — confirm against io_utils.write_to_table_utils.
save_to_table(
    top_10_busiest_routes,
    entity="flights",
    layer="gold",
    environment="dev",
    table_key="top_10_busiest_routes",
    mode="overwrite",
    overwrite_schema=False
)

In [None]:
# Cancellation rate per airline, highest rate first.

# 1 for a flight whose `cancelled` column equals 1, otherwise 0 (this includes
# rows where the comparison is null, which fall through to otherwise()).
cancelled_indicator = F.when(F.col("cancelled") == 1, 1).otherwise(0)

# Attach airline attributes to every flight via the shared surrogate key.
flights_with_airline = fact_flight_silver_df.join(dim_airline_silver_df, "airline_sk")

# One row per (airline, iata_code) with total and cancelled flight counts.
airline_flight_counts = flights_with_airline.groupBy("airline", "iata_code").agg(
    F.count("*").alias("total_flights"),
    F.sum(cancelled_indicator).alias("cancelled_flights"),
)

# Express cancellations as a percentage of all flights of the airline.
cancellation_rate_pct = F.col("cancelled_flights") / F.col("total_flights") * 100

cancellation_rates_airline = (
    airline_flight_counts
    .withColumn("cancellation_rate_pct", cancellation_rate_pct)
    .orderBy(F.col("cancellation_rate_pct").desc())
)

# Save to Gold layer
# cancellation_rates_airline.write.mode("overwrite").saveAsTable("gold.airline_cancellation_rates")


In [None]:
# Persist the per-airline cancellation rates to the gold layer.
# The rates are recomputed over the whole silver fact table on every run, so
# the write replaces the previous snapshot. With mode="append" each re-run
# added a second full copy of every airline's row.
# NOTE(review): assumes save_to_table forwards `mode` to the Spark writer, so
# "overwrite" is accepted — confirm against io_utils.write_to_table_utils.
save_to_table(
    cancellation_rates_airline,
    entity="airlines",
    layer="gold",
    environment="dev",
    table_key="cancellation_rates",
    mode="overwrite",
    overwrite_schema=False
)

In [None]:
# Write the per-airline cancellation rates through the gold-specific helper.
# NOTE(review): this uses the same DataFrame, entity and table_key as the
# save_to_table(..., layer="gold") call in the preceding cell, so it presumably
# targets the same table — confirm, and drop one of the two writes if so.
# In append mode this second write doubled every row; overwriting keeps the
# table at a single snapshot regardless of how many times the cell is run.
# NOTE(review): assumes save_to_gold accepts mode="overwrite" — confirm against
# io_utils.write_to_table_utils.
save_to_gold(
    cancellation_rates_airline, 
    entity="airlines",
    table_key="cancellation_rates",   # table_key
    environment="dev",
    mode="overwrite",
    overwrite_schema=False
)

In [None]:
# Average departure and arrival delay per airline per calendar month.

# Enrich each flight with its airline and date attributes; both joins are
# equi-joins on the shared surrogate key columns.
flights_with_airline_and_date = (
    fact_flight_silver_df
    .join(dim_airline_silver_df, "airline_sk")
    .join(dim_date_silver_df, "date_sk")
)

# Metrics computed for every (airline, year, month) group.
monthly_delay_metrics = [
    F.avg("departure_delay").alias("avg_departure_delay"),
    F.avg("arrival_delay").alias("avg_arrival_delay"),
    F.count("*").alias("total_flights"),
]

monthly_group_keys = ["airline", "year", "month"]

avg_delay_by_airline_month = (
    flights_with_airline_and_date
    .groupBy(*monthly_group_keys)
    .agg(*monthly_delay_metrics)
    .orderBy(*monthly_group_keys)
)

# Save to Gold layer
# avg_delay_by_airline_month.write.mode("overwrite").saveAsTable("gold.avg_delay_by_airline_month")


In [None]:
# Persist the monthly per-airline delay averages to the gold layer.
# The aggregate is recomputed over the whole silver fact table on every run,
# so the write replaces the previous snapshot. With mode="append" each re-run
# added a duplicate row for every (airline, year, month) group.
# NOTE(review): assumes save_to_table forwards `mode` to the Spark writer, so
# "overwrite" is accepted — confirm against io_utils.write_to_table_utils.
save_to_table(
    avg_delay_by_airline_month,
    entity="airlines",
    layer="gold",
    environment="dev",
    table_key="avg_delay_by_month",
    mode="overwrite",
    overwrite_schema=False
)