# **Machine Learning - Proyecto: Predicci칩n de Retrasos de Vuelos** 九걾잺

## Librer칤as

### Librer칤as Generales

In [1]:
import sys
print(sys.executable)


d:\OneDrive\DOCUMENTOS\Personales\2024\uniandes\8 S\seminario\g11-caso-estudio-flights\venv\Scripts\python.exe


In [2]:
import os, time, json, math
import numpy as np
import pandas as pd
from joblib import dump, load
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_recall_curve, auc
)
import warnings

In [5]:
import os, time, json, math
import numpy as np
import pandas as pd
from joblib import dump, load
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_recall_curve, auc
)
import warnings

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None

# --- Variables Globales ---
DATA_PATH = r"d:\OneDrive\DOCUMENTOS\Personales\2024\uniandes\8 S\seminario\g11-caso-estudio-flights\data\processed\flights_clean.csv"
TARGET_COL = "RETRASADO_LLEGADA"
RESULTS = []

# ==============================================================================
# PASO 1: FUNCIONES DE PREPARACI칍N DE DATOS (Helpers)
# ==============================================================================

def load_and_prep_data(data_path):
    """Carga y deriva todas las features necesarias del CSV."""
    print(f"Cargando datos desde {data_path}...")
    
    # Columnas m칤nimas necesarias del CSV original
    need_cols = [
        "MONTH", "DAY_OF_WEEK", "AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT",
        "SCHEDULED_DEPARTURE", "ORIGEN_LAT", "ORIGEN_LON", "DEST_LAT", "DEST_LON",
        "SALIDA_SIN", "SALIDA_COS", "RETRASADO_LLEGADA"
    ]
    
    header = pd.read_csv(data_path, nrows=0).columns.tolist()
    present = [c for c in need_cols if c in header]
    
    dtype_map = {
        "MONTH":"int8", "DAY_OF_WEEK":"int8", "AIRLINE":"category", 
        "ORIGIN_AIRPORT":"category", "DESTINATION_AIRPORT":"category",
        "SCHEDULED_DEPARTURE":"int32", "ORIGEN_LAT":"float32", "ORIGEN_LON":"float32",
        "DEST_LAT":"float32", "DEST_LON":"float32", "SALIDA_SIN":"float32", 
        "SALIDA_COS":"float32", "RETRASADO_LLEGADA":"int8"
    }
    dtype_eff = {k:v for k,v in dtype_map.items() if k in present}

    v = pd.read_csv(data_path, usecols=present, dtype=dtype_eff, low_memory=False)

    # --- Derivar features ---
    def haversine_km(lat1, lon1, lat2, lon2):
        R = 6371.0
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1; dlon = lon2 - lon1
        a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2
        return (2*R*np.arcsin(np.sqrt(a))).astype(np.float32)

    if "DISTANCIA_HAV" not in v.columns:
        v["DISTANCIA_HAV"] = haversine_km(v["ORIGEN_LAT"], v["ORIGEN_LON"], v["DEST_LAT"], v["DEST_LON"])
    
    if "MONTH_SIN" not in v.columns:
        v["MONTH_SIN"] = np.sin(2*np.pi * v["MONTH"]/12).astype("float32")
        v["MONTH_COS"] = np.cos(2*np.pi * v["MONTH"]/12).astype("float32")

    if "MINUTO_DIA_SALIDA" not in v.columns:
        hs = (v["SCHEDULED_DEPARTURE"] // 100).clip(0, 23).astype("int16")
        ms = (v["SCHEDULED_DEPARTURE"] % 100).clip(0, 59).astype("int16")
        v["MINUTO_DIA_SALIDA"] = (hs * 60 + ms).astype("int16")
        v["HORA_SALIDA"] = hs

    if "RUTA" not in v.columns:
        v["RUTA"] = v["ORIGIN_AIRPORT"].astype(str) + "_" + v["DESTINATION_AIRPORT"].astype(str)
    
    print(f"Datos preparados. Shape: {v.shape}")
    return v

def split_temporal(df, target_col):
    """Split temporal: Train 1-9, Valid 10-12"""
    print("Realizando split temporal (Train 1-9, Valid 10-12)...")
    train_mask = df["MONTH"].between(1, 9)
    valid_mask = df["MONTH"].between(10, 12)
    
    y = df[target_col].astype("int8")
    X = df.drop(columns=[target_col])
    
    X_train, y_train = X.loc[train_mask].copy(), y.loc[train_mask].copy()
    X_valid, y_valid = X.loc[valid_mask].copy(), y.loc[valid_mask].copy()
    
    print(f"X_train: {X_train.shape}, X_valid: {X_valid.shape}")
    return X_train, y_train, X_valid, y_valid

# ==============================================================================
# PASO 2: FUNCIONES DE FEATURE ENGINEERING (Codificadores)
# ==============================================================================

# --- VERSI칍N NUEVA (CORREGIDA v2) ---
def apply_label_encoder(X_train_subset, X_valid_subset):
    """Aplica LabelEncoder a las columnas categ칩ricas."""
    print("Aplicando LabelEncoder...")
    X_train_le = X_train_subset.copy()
    X_valid_le = X_valid_subset.copy()

    # Itera sobre las columnas del DataFrame que ya le pasamos
    cat_cols_in_subset = X_train_subset.columns 
    
    for col in cat_cols_in_subset: 
        le = LabelEncoder()
        X_train_le[col] = le.fit_transform(X_train_le[col].astype(str))
        
        # Manejar categor칤as no vistas en validaci칩n
        le_classes = le.classes_
        X_valid_le[col] = X_valid_le[col].astype(str).apply(lambda x: x if x in le_classes else '<unknown>')
        if '<unknown>' not in le_classes:
            le.classes_ = np.append(le.classes_, '<unknown>')
        X_valid_le[col] = le.transform(X_valid_le[col])
            
    return X_train_le, X_valid_le

def kfold_target_encode(s_train, y_train, s_valid, smoothing=50):
    """Aplica Target Encoding K-Fold (sin fuga) en train y lo mapea a valid."""
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    gmean = float(y_train.mean())
    enc_train = pd.Series(index=s_train.index, dtype="float32")

    for tr_idx, val_idx in skf.split(s_train, y_train):
        s_tr, y_tr = s_train.iloc[tr_idx], y_train.iloc[tr_idx]
        s_val = s_train.iloc[val_idx]

        stats = y_tr.groupby(s_tr.astype(str)).mean()
        cnts = y_tr.groupby(s_tr.astype(str)).size()
        smoothed = ((stats * cnts + gmean * smoothing) / (cnts + smoothing)).to_dict()
        enc_train.iloc[val_idx] = s_val.astype(str).map(smoothed).fillna(gmean)

    # Mapping final para validaci칩n
    full_stats = y_train.groupby(s_train.astype(str)).mean()
    full_cnts = y_train.groupby(s_train.astype(str)).size()
    mapping = ((full_stats * full_cnts + gmean * smoothing) / (full_cnts + smoothing)).to_dict()
    enc_valid = s_valid.astype(str).map(mapping).fillna(gmean).astype("float32")
    
    return enc_train.astype("float32"), enc_valid

def apply_target_encoding(X_train, y_train, X_valid, cat_cols):
    """Aplica TE K-Fold y DEVUELVE SOLO LAS NUEVAS COLUMNAS."""
    print("Aplicando Target Encoding K-Fold...")
    X_train_te = X_train.copy()
    X_valid_te = X_valid.copy()
    new_te_cols = []
    
    for col in cat_cols:
        new_col_name = f"{col}_TE"
        enc_tr, enc_val = kfold_target_encode(X_train[col], y_train, X_valid[col])
        X_train_te[new_col_name] = enc_tr
        X_valid_te[new_col_name] = enc_val
        new_te_cols.append(new_col_name)
        
    # Devolver SOLO las nuevas columnas TE, con el 칤ndice original
    return X_train_te[new_te_cols], X_valid_te[new_te_cols]

def apply_historical_aggs(X_train, y_train, X_valid, agg_specs):
    """Calcula agregados hist칩ricos y DEVUELVE SOLO LAS NUEVAS COLUMNAS."""
    print("Aplicando Agregados Hist칩ricos...")
    X_train_agg = X_train.copy()
    X_valid_agg = X_valid.copy()
    gmean = float(y_train.mean())
    new_agg_cols = []
    
    # DataFrame temporal de entrenamiento para calcular agregados
    df_train = X_train.copy()
    df_train[TARGET_COL] = y_train
    
    for keys, pref in agg_specs:
        rate_col, n_col = f"{pref}_rate", f"{pref}_n"
        new_agg_cols.extend([rate_col, n_col])
        
        agg = df_train.groupby(keys, observed=True)[TARGET_COL].agg(["mean", "size"]).reset_index()
        agg.columns = keys + [rate_col, n_col]
        
        # Merge sin fuga
        X_train_agg = X_train_agg.merge(agg, on=keys, how="left")
        X_valid_agg = X_valid_agg.merge(agg, on=keys, how="left")

        # Llenar NaNs (categor칤as no vistas en train) con la media global
        X_train_agg[rate_col] = X_train_agg[rate_col].fillna(gmean).astype("float32")
        X_valid_agg[rate_col] = X_valid_agg[rate_col].fillna(gmean).astype("float32")
        X_train_agg[n_col] = X_train_agg[n_col].fillna(0).astype("float32")
        X_valid_agg[n_col] = X_valid_agg[n_col].fillna(0).astype("float32")

    # Devolver SOLO las nuevas columnas de agregados, con el 칤ndice original
    return X_train_agg[new_agg_cols], X_valid_agg[new_agg_cols]


# ==============================================================================
# PASO 3: FUNCIONES DE ENTRENAMIENTO Y EVALUACI칍N
# ==============================================================================

def train_lgbm(X_train, y_train, X_valid, y_valid, exp_name):
    """Entrena un modelo LGBM y devuelve el modelo y las m칠tricas."""
    print(f"\n--- Entrenando Experimento: {exp_name} ---")
    
    # Usamos class_weight='balanced' porque funcion칩 en tu Rev 5
    # y scale_pos_weight pareci칩 fallar en las Rev 4, 6, 7.
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'n_estimators': 1000,  # Reducido para una comparaci칩n r치pida
        'learning_rate': 0.05,
        'num_leaves': 127,
        'class_weight': 'balanced',
        'n_jobs': -1,
        'random_state': 42,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'min_child_samples': 200
    }
    
    model = lgb.LGBMClassifier(**params)
    
    t0 = time.time()
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)]
    )
    t1 = time.time()
    
    print(f"Entrenamiento completado en {t1-t0:.1f}s")
    
    # Calcular m칠tricas
    y_proba = model.predict_proba(X_valid)[:, 1]
    auc_roc = roc_auc_score(y_valid, y_proba)
    
    # Encontrar mejor F1
    prec, rec, thr = precision_recall_curve(y_valid, y_proba)
    f1s = (2 * prec * rec) / (prec + rec)
    best_f1_idx = np.nanargmax(f1s)
    best_f1 = f1s[best_f1_idx]
    best_thr = thr[best_f1_idx]
    
    # AUC-PR
    auc_pr = auc(rec, prec)
    
    metrics = {
        "Experimento": exp_name,
        "ROC-AUC": round(auc_roc, 4),
        "PR-AUC": round(auc_pr, 4),
        "Best_F1": round(best_f1, 4),
        "Best_F1_Threshold": round(best_thr, 3),
        "Tiempo_Entrenamiento (s)": round(t1 - t0, 1)
    }
    
    RESULTS.append(metrics)
    return model, metrics
# ==============================================================================
# PASO 4: EJECUCI칍N DE LOS EXPERIMENTOS (CORREGIDO v4)
# ==============================================================================

# Cargar y preparar datos (UNA SOLA VEZ)
v_full = load_and_prep_data(DATA_PATH)
X_train_base, y_train, X_valid_base, y_valid = split_temporal(v_full, TARGET_COL)

# Columnas para ingenier칤a de features
cat_cols = ["AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "RUTA"]
num_cols = [c for c in X_train_base.columns if c not in cat_cols]
agg_specs = [
    (["RUTA", "HORA_SALIDA"], "RUTA_HORA"),
    (["AIRLINE"], "AIR"),
    (["ORIGIN_AIRPORT"], "ORI")
]

# --- Targets reseteados (se usan en todos los experimentos) ---
y_train_reset = y_train.reset_index(drop=True)
y_valid_reset = y_valid.reset_index(drop=True)


# --- Experimento 1: LabelEncoder (Revisi칩n 4 corregida) ---
X_train_le, X_valid_le = apply_label_encoder(X_train_base[cat_cols], X_valid_base[cat_cols])
# Unir num칠ricas y resetear 칤ndice
X_train_1 = pd.concat([X_train_base[num_cols].reset_index(drop=True), X_train_le.reset_index(drop=True)], axis=1)
X_valid_1 = pd.concat([X_valid_base[num_cols].reset_index(drop=True), X_valid_le.reset_index(drop=True)], axis=1)
train_lgbm(X_train_1, y_train_reset, X_valid_1, y_valid_reset, "LabelEncoder")


# --- Experimento 2: Target Encoding K-Fold (Revisi칩n 5 corregida) ---
X_train_te_cols, X_valid_te_cols = apply_target_encoding(X_train_base[cat_cols], y_train, X_valid_base[cat_cols], cat_cols)
# Unir num칠ricas y resetear 칤ndice
X_train_2 = pd.concat([X_train_base[num_cols].reset_index(drop=True), X_train_te_cols.reset_index(drop=True)], axis=1)
X_valid_2 = pd.concat([X_valid_base[num_cols].reset_index(drop=True), X_valid_te_cols.reset_index(drop=True)], axis=1)
train_lgbm(X_train_2, y_train_reset, X_valid_2, y_valid_reset, "TargetEncoding (TE)")


# --- Experimento 3: TE + Agregados (Revisi칩n 6/7 corregida) ---

# *** FIX: Resetear los inputs ANTES de pasarlos a las funciones de FE ***
X_train_base_r = X_train_base.reset_index(drop=True)
y_train_r = y_train.reset_index(drop=True)
X_valid_base_r = X_valid_base.reset_index(drop=True)
y_valid_r = y_valid.reset_index(drop=True)

# 1. Agregados Hist칩ricos (ahora usa inputs reseteados)
# X_train_agg_cols tendr치 칤ndice 0..M
X_train_agg_cols, X_valid_agg_cols = apply_historical_aggs(X_train_base_r, y_train_r, X_valid_base_r, agg_specs)

# 2. Target Encoding (ahora usa inputs reseteados)
# X_train_te_cols tendr치 칤ndice 0..M
X_train_te_cols, X_valid_te_cols = apply_target_encoding(X_train_base_r[cat_cols], y_train_r, X_valid_base_r[cat_cols], cat_cols)

# 3. Concatenar (TODAS las piezas tienen 칤ndice 0..M y se alinean)
X_train_3 = pd.concat([X_train_base_r[num_cols], X_train_te_cols, X_train_agg_cols], axis=1)
X_valid_3 = pd.concat([X_valid_base_r[num_cols], X_valid_te_cols, X_valid_agg_cols], axis=1)

# 4. Entrenar (X_train_3 e y_train_r est치n ambos reseteados 0..M)
train_lgbm(X_train_3, y_train_r, X_valid_3, y_valid_r, "TE + Agregados Hist칩ricos")


# ==============================================================================
# PASO 5: REPORTE FINAL
# (Esta celda no necesita cambios)
# ==============================================================================

print("\n\n--- Comparaci칩n Final de Alternativas (Validadas en Meses 10-12) ---")
df_results = pd.DataFrame(RESULTS).set_index("Experimento")
print(df_results.to_markdown(floatfmt=".4f"))

# Determinar el ganador
if not df_results.empty:
    winner = df_results['ROC-AUC'].idxmax()
    print(f"\n游끥 Ganador (por ROC-AUC): {winner}")
else:
    print("\nNo se completaron experimentos para determinar un ganador.")
print("---")
print("Nota: Un ROC-AUC m치s alto indica un mejor modelo para distinguir entre clases.")
print("Un PR-AUC m치s alto es mejor para problemas desbalanceados (ignora 'Aciertos a Tiempo').")

Cargando datos desde d:\OneDrive\DOCUMENTOS\Personales\2024\uniandes\8 S\seminario\g11-caso-estudio-flights\data\processed\flights_clean.csv...
Datos preparados. Shape: (5231130, 19)
Realizando split temporal (Train 1-9, Valid 10-12)...
X_train: (4299046, 18), X_valid: (932084, 18)
Aplicando LabelEncoder...

--- Entrenando Experimento: LabelEncoder ---
[LightGBM] [Info] Number of positive: 805372, number of negative: 3493674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2802
[LightGBM] [Info] Number of data points in the train set: 4299046, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc:

ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.