In [1]:
# ---------------------------------------------------------
# SCRIPT: 03_Gold_Layer
# DESCRIPTION: Generates business KPIs (aggregations)
# ready for visualization in Power BI.
# ---------------------------------------------------------

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, round, first, when

# --- 1. CONFIGURATION ---
import os

# S3A (MinIO) connection settings. Each one can be overridden through an
# environment variable so credentials do not have to live in the source;
# the defaults are the values this script has always used.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

print("üîå Iniciando sesi√≥n de Spark...")
# Build (or reuse) the Spark session with the S3A filesystem pointed at the
# configured endpoint, using path-style bucket addressing.
spark = (
    SparkSession.builder
    .appName("SkyTracker_Gold_ETL")
    .master("local[*]")
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

# --- 2. READ (Silver layer) ---
# Load the master flights table that every KPI below aggregates from.
print("üìÇ Cargando Tabla Maestra desde Silver...")
ruta_silver = "s3a://silver/master_flights"
df_silver = spark.read.parquet(ruta_silver)

# ==============================================================================
# KPI 1: RELIABILITY HEATMAP (When to fly?)
# ==============================================================================
print("\nüî• [1/3] Generando KPI: Heatmap de Retrasos...")

# Early departures (negative delay) are counted as 0 before averaging.
retraso_salida_kpi1 = when(col("DEPARTURE_DELAY") < 0, 0).otherwise(
    col("DEPARTURE_DELAY")
)

# One output row per (day name, departure hour) pair, sorted by the same keys.
claves_kpi1 = ["DAY_NAME", "DEPARTURE_HOUR"]
df_kpi1 = (
    df_silver.groupBy(*claves_kpi1)
    .agg(
        round(avg(retraso_salida_kpi1), 2).alias("RETRASO_MEDIO"),
        count("*").alias("NUM_VUELOS"),
    )
    .orderBy(*claves_kpi1)
)

# Save (replaces any previous output at this path)
ruta_kpi1 = "s3a://gold/kpi_reliability_heatmap"
df_kpi1.write.mode("overwrite").parquet(ruta_kpi1)
print(f"‚úÖ Guardado en: {ruta_kpi1}")

# ==============================================================================
# KPI 2: ROUTE MAP (Traffic and stress)
# ==============================================================================
print("\nüåç [2/3] Generando KPI: Mapa de Rutas...")

# Relevance filter: routes with fewer flights than this are dropped.
MIN_VUELOS_RUTA = 50

# One output row per (origin, destination) route.
df_kpi2 = df_silver.groupBy("ORIGIN_CODE", "DEST_CODE") \
    .agg(
        count("*").alias("TOTAL_VUELOS"),
        # Early departures (negative delay) are counted as 0 before averaging.
        round(
            avg(
                when(col("DEPARTURE_DELAY") < 0, 0)
                .otherwise(col("DEPARTURE_DELAY"))
            ), 2
        ).alias("RETRASO_MEDIO"),

        # Carry the geospatial attributes through the aggregation.
        # ignorenulls=True makes first() skip rows where the attribute is
        # missing; without it, a route whose first-seen row has a null city or
        # coordinate would come out null even when other rows hold the value.
        first("ORIGIN_CITY", ignorenulls=True).alias("ORIGEN_CIUDAD"),
        first("DEST_CITY", ignorenulls=True).alias("DESTINO_CIUDAD"),
        first("ORIGIN_LAT", ignorenulls=True).alias("LAT_ORIGEN"),
        first("ORIGIN_LONG", ignorenulls=True).alias("LONG_ORIGEN"),
        first("DEST_LAT", ignorenulls=True).alias("LAT_DESTINO"),
        first("DEST_LONG", ignorenulls=True).alias("LONG_DESTINO")
    ) \
    .filter(col("TOTAL_VUELOS") >= MIN_VUELOS_RUTA)

# Save (replaces any previous output at this path)
ruta_kpi2 = "s3a://gold/kpi_route_stress"
df_kpi2.write.mode("overwrite").parquet(ruta_kpi2)
print(f"‚úÖ Guardado en: {ruta_kpi2}")

# ==============================================================================
# KPI 3: PILOT EFFICIENCY (In-air recovery)
# ==============================================================================
print("\n‚úàÔ∏è [3/3] Generando KPI: Eficiencia de Pilotos...")

# Keep only flights that were not cancelled, left with a real delay (> 0)
# and have a recorded arrival delay.
no_cancelado = col("CANCELLED") == 0
salio_con_retraso = col("DEPARTURE_DELAY") > 0
tiene_retraso_llegada = col("ARRIVAL_DELAY").isNotNull()
df_retrasados = df_silver.where(
    no_cancelado & salio_con_retraso & tiene_retraso_llegada
)

# Key metric: how much of the departure delay was cut by arrival.
tiempo_recuperado = col("DEPARTURE_DELAY") - col("ARRIVAL_DELAY")

# One output row per airline, best recovery first.
df_kpi3 = (
    df_retrasados.groupBy("AIRLINE_NAME")
    .agg(
        count("*").alias("VUELOS_RETRASADOS_TOTAL"),
        round(avg("DEPARTURE_DELAY"), 2).alias("RETRASO_INICIAL_AVG"),
        round(avg("ARRIVAL_DELAY"), 2).alias("RETRASO_FINAL_AVG"),
        round(avg(tiempo_recuperado), 2).alias("TIEMPO_RECUPERADO_AVG"),
    )
    .orderBy(col("TIEMPO_RECUPERADO_AVG").desc())
)

# Save (replaces any previous output at this path)
ruta_kpi3 = "s3a://gold/kpi_pilot_efficiency"
df_kpi3.write.mode("overwrite").parquet(ruta_kpi3)
print(f"‚úÖ Guardado en: {ruta_kpi3}")

print("\nüèÜ --- CAPA GOLD FINALIZADA CON √âXITO ---")

üîå Iniciando sesi√≥n de Spark...
üìÇ Cargando Tabla Maestra desde Silver...

üî• [1/3] Generando KPI: Heatmap de Retrasos...
‚úÖ Guardado en: s3a://gold/kpi_reliability_heatmap

üåç [2/3] Generando KPI: Mapa de Rutas...
‚úÖ Guardado en: s3a://gold/kpi_route_stress

‚úàÔ∏è [3/3] Generando KPI: Eficiencia de Pilotos...
‚úÖ Guardado en: s3a://gold/kpi_pilot_efficiency

üèÜ --- CAPA GOLD FINALIZADA CON √âXITO ---
