In [1]:
# ---------------------------------------------------------
# SCRIPT: 01_Bronze_Layer.py
# DESCRIPCIÓN: Ingesta de CSVs crudos, selección de columnas clave
# y guardado en formato Parquet (Optimizado) en el Data Lake.
# ---------------------------------------------------------

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def create_spark_session():
    """Build (or reuse) the Spark session used by the Bronze ETL job.

    The session runs with master ``local[*]`` under the application name
    ``SkyTracker_Bronze_ETL`` and configures the Hadoop S3A connector
    (endpoint, credentials, path-style access, filesystem implementation).

    Connection settings may be overridden through environment variables so
    that credentials do not have to be edited in source. When a variable is
    unset, the previous hard-coded value is used:

    * ``MINIO_ENDPOINT``   -- default ``http://minio:9000``
    * ``MINIO_ACCESS_KEY`` -- default ``minioadmin``
    * ``MINIO_SECRET_KEY`` -- default ``minioadmin``

    Returns:
        The ``SparkSession`` produced by ``getOrCreate()``.
    """
    # Function-scope import keeps this block self-contained.
    import os

    print("üîå Iniciando sesi√≥n de Spark...")

    endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
    access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
    secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

    return (
        SparkSession.builder
        .appName("SkyTracker_Bronze_ETL")
        .master("local[*]")
        .config("spark.hadoop.fs.s3a.endpoint", endpoint)
        .config("spark.hadoop.fs.s3a.access.key", access_key)
        .config("spark.hadoop.fs.s3a.secret.key", secret_key)
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .getOrCreate()
    )

def process_flights(spark):
    """Ingest the raw flights CSV and store the selected columns as Parquet.

    Reads ``s3a://bronze/raw_data/flights.csv`` with a header row and schema
    inference, keeps only the columns listed below, and overwrites the
    Parquet dataset at ``s3a://bronze/formatted/flights``.

    Args:
        spark: Session object whose ``read`` attribute is used to load the CSV.
    """
    print("\n‚úàÔ∏è Procesando VUELOS (Flights)...")

    # Step 1: read. Adjust the file name here if the raw file was stored
    # under a different name (e.g. "raw_flights.csv").
    csv_reader = spark.read.format("csv").options(
        header="true",
        inferSchema="true",
    )
    flights_raw = csv_reader.load("s3a://bronze/raw_data/flights.csv")

    # Step 2: transform. Only these columns are kept.
    kept_columns = (
        "YEAR", "MONTH", "DAY", "DAY_OF_WEEK",
        "AIRLINE", "FLIGHT_NUMBER", "TAIL_NUMBER",
        "ORIGIN_AIRPORT", "DESTINATION_AIRPORT",
        "SCHEDULED_DEPARTURE", "DEPARTURE_TIME",
        "DEPARTURE_DELAY", "ARRIVAL_DELAY",
        "DISTANCE", "AIR_TIME",
        "CANCELLED", "CANCELLATION_REASON",
    )
    flights = flights_raw.select(*kept_columns)

    # Step 3: write Parquet under 'formatted'. Overwrite mode replaces the
    # output of any previous run of this script.
    ruta_destino = "s3a://bronze/formatted/flights"
    flights.write.parquet(ruta_destino, mode="overwrite")
    print(f"‚úÖ Vuelos guardados en: {ruta_destino}")

def process_airlines(spark):
    """Ingest the raw airlines CSV and store it as Parquet with renamed columns.

    Reads ``s3a://bronze/raw_data/airlines.csv`` with a header row and schema
    inference, selects ``IATA_CODE`` as ``AIRLINE_ID`` and ``AIRLINE`` as
    ``AIRLINE_NAME``, and overwrites the Parquet dataset at
    ``s3a://bronze/formatted/airlines``.

    Args:
        spark: Session object whose ``read`` attribute is used to load the CSV.
    """
    print("\nüé´ Procesando AEROL√çNEAS (Airlines)...")

    csv_reader = spark.read.format("csv").options(
        header="true",
        inferSchema="true",
    )
    airlines_raw = csv_reader.load("s3a://bronze/raw_data/airlines.csv")

    # Standardized names as (source column, output column) pairs, in output order.
    renames = (
        ("IATA_CODE", "AIRLINE_ID"),
        ("AIRLINE", "AIRLINE_NAME"),
    )
    airlines = airlines_raw.select(
        *[col(source).alias(target) for source, target in renames]
    )

    ruta_destino = "s3a://bronze/formatted/airlines"
    airlines.write.parquet(ruta_destino, mode="overwrite")
    print(f"‚úÖ Aerol√≠neas guardadas en: {ruta_destino}")

def process_airports(spark):
    """Ingest the raw airports CSV and store the selected columns as Parquet.

    Reads ``s3a://bronze/raw_data/airports.csv`` with a header row and schema
    inference, keeps the six columns listed below, and overwrites the Parquet
    dataset at ``s3a://bronze/formatted/airports``.

    Args:
        spark: Session object whose ``read`` attribute is used to load the CSV.
    """
    print("\nüìç Procesando AEROPUERTOS (Airports)...")

    csv_reader = spark.read.format("csv").options(
        header="true",
        inferSchema="true",
    )
    airports_raw = csv_reader.load("s3a://bronze/raw_data/airports.csv")

    # COUNTRY is intentionally not in this list, so it is dropped.
    kept_columns = (
        "IATA_CODE",
        "AIRPORT",
        "CITY",
        "STATE",
        "LATITUDE",
        "LONGITUDE",
    )
    airports = airports_raw.select(*kept_columns)

    ruta_destino = "s3a://bronze/formatted/airports"
    airports.write.parquet(ruta_destino, mode="overwrite")
    print(f"‚úÖ Aeropuertos guardados en: {ruta_destino}")

# --- EJECUCIÓN DEL PIPELINE ---
if __name__ == "__main__":
    # Entry point: run the three Bronze ingestion steps in order against one
    # shared Spark session, and always stop the session afterwards.
    spark = create_spark_session()

    try:
        process_flights(spark)
        process_airlines(spark)
        process_airports(spark)
    except Exception as e:
        print(f"\n‚ùå Error cr√≠tico en el pipeline: {e}")
        # Re-raise after reporting: swallowing the exception here would let
        # the script finish with a success exit status even though a step
        # failed, hiding the failure from whatever launched it.
        raise
    else:
        # Reached only when every step completed without raising.
        print("\nüèÜ --- CAPA BRONCE COMPLETADA CON √âXITO ---")
    finally:
        spark.stop()

üîå Iniciando sesi√≥n de Spark...

‚úàÔ∏è Procesando VUELOS (Flights)...
‚úÖ Vuelos guardados en: s3a://bronze/formatted/flights

üé´ Procesando AEROL√çNEAS (Airlines)...
‚úÖ Aerol√≠neas guardadas en: s3a://bronze/formatted/airlines

üìç Procesando AEROPUERTOS (Airports)...
‚úÖ Aeropuertos guardados en: s3a://bronze/formatted/airports

üèÜ --- CAPA BRONCE COMPLETADA CON √âXITO ---
