0. Encabezado y utilidades

In [4]:
# Paso 0 · Imports, ruta y helpers
import os, time, json, math
import numpy as np
import pandas as pd
from joblib import dump, load

import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_auc_score, roc_curve, auc, precision_recall_curve,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)


1. Carga del CSV (eficiente: usecols + dtypes compactos)

In [5]:
# Paso 1 · Carga del CSV (solo columnas útiles + dtypes compactos)
# DATA_PATH = os.path.join("data", "processed", "flights_clean.csv")  # ajusta si lo tienes en otra ruta
DATA_PATH = r"d:\OneDrive\DOCUMENTOS\Personales\2024\uniandes\8 S\seminario\g11-caso-estudio-flights\data\processed\flights_clean.csv"

need_cols = [
    "MONTH","DAY","DAY_OF_WEEK",
    "AIRLINE","ORIGIN_AIRPORT","DESTINATION_AIRPORT",
    "SCHEDULED_DEPARTURE",
    "ORIGEN_LAT","ORIGEN_LON","DEST_LAT","DEST_LON",
    "SALIDA_SIN","SALIDA_COS",
    "RETRASADO_LLEGADA"
]

# leer solo el header para ver qué hay
header = pd.read_csv(DATA_PATH, nrows=0).columns.tolist()
present = [c for c in need_cols if c in header]
missing = [c for c in need_cols if c not in header]
print("Columnas cargadas:", present, "\nFaltantes (se derivan si aplica):", missing)

dtype_map = {
    "MONTH":"int8","DAY":"int8","DAY_OF_WEEK":"int8",
    "AIRLINE":"category","ORIGIN_AIRPORT":"category","DESTINATION_AIRPORT":"category",
    "SCHEDULED_DEPARTURE":"int32",
    "ORIGEN_LAT":"float32","ORIGEN_LON":"float32","DEST_LAT":"float32","DEST_LON":"float32",
    "SALIDA_SIN":"float32","SALIDA_COS":"float32",
    "RETRASADO_LLEGADA":"int8"
}
dtype_eff = {k:v for k,v in dtype_map.items() if k in present}

t0 = time.time()
v = pd.read_csv(DATA_PATH, usecols=present, dtype=dtype_eff, low_memory=False)
t1 = time.time()
print(f"✅ Cargado {v.shape} en {t1-t0:.1f}s | Rate retraso={float(v['RETRASADO_LLEGADA'].mean()):.4f}")


Columnas cargadas: ['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'ORIGEN_LAT', 'ORIGEN_LON', 'DEST_LAT', 'DEST_LON', 'SALIDA_SIN', 'SALIDA_COS', 'RETRASADO_LLEGADA'] 
Faltantes (se derivan si aplica): []
✅ Cargado (5231130, 14) en 25.7s | Rate retraso=0.1847


2. Derivar columnas que falten (DISTANCIA_HAV, MONTH_SIN/COS, MINUTO_DIA_SALIDA, RUTA)

In [6]:
# Paso 2 · Derivación de features faltantes
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0
    lat1 = np.radians(lat1); lon1 = np.radians(lon1)
    lat2 = np.radians(lat2); lon2 = np.radians(lon2)
    dlat = lat2 - lat1; dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2
    return (2*R*np.arcsin(np.sqrt(a))).astype(np.float32)

# Distancia
if {"ORIGEN_LAT","ORIGEN_LON","DEST_LAT","DEST_LON"}.issubset(v.columns) and "DISTANCIA_HAV" not in v.columns:
    v["DISTANCIA_HAV"] = haversine_km(v["ORIGEN_LAT"], v["ORIGEN_LON"], v["DEST_LAT"], v["DEST_LON"])

# Estacionalidad mes
if "MONTH" in v.columns and "MONTH_SIN" not in v.columns:
    v["MONTH_SIN"] = np.sin(2*np.pi * v["MONTH"]/12).astype(np.float32)
    v["MONTH_COS"] = np.cos(2*np.pi * v["MONTH"]/12).astype(np.float32)

# Minuto del día (si no vino ya)
if "MINUTO_DIA_SALIDA" not in v.columns and "SCHEDULED_DEPARTURE" in v.columns:
    hs = (v["SCHEDULED_DEPARTURE"]//100).clip(0,23)
    ms = (v["SCHEDULED_DEPARTURE"]%100).clip(0,59)
    v["MINUTO_DIA_SALIDA"] = (hs*60 + ms).astype(np.int16)

# Ruta (texto)
if {"ORIGIN_AIRPORT","DESTINATION_AIRPORT"}.issubset(v.columns) and "RUTA" not in v.columns:
    v["RUTA"] = v["ORIGIN_AIRPORT"].astype(str) + "_" + v["DESTINATION_AIRPORT"].astype(str)

print("✅ Derivadas OK | columnas:", len(v.columns))


✅ Derivadas OK | columnas: 19


3. Definir features y target

In [7]:
# Paso 3 · Selección de variables
target = "RETRASADO_LLEGADA"

cat_cols = ["AIRLINE","ORIGIN_AIRPORT","DESTINATION_AIRPORT","RUTA"]
num_cols = ["MONTH","DAY_OF_WEEK","SALIDA_SIN","SALIDA_COS","MONTH_SIN","MONTH_COS","DISTANCIA_HAV","MINUTO_DIA_SALIDA"]

features = [c for c in cat_cols + num_cols if c in v.columns]
X = v[features].copy()
y = v[target].astype("int8").copy()

print("X:", X.shape, "| y rate:", float(y.mean()))


X: (5231130, 12) | y rate: 0.18471362783949166


4. Split temporal (train: meses 1–9, valid: 10–12)

In [8]:
# Paso 4 · Split temporal (evita fuga)
train_mask = v["MONTH"].between(1,9)
valid_mask = v["MONTH"].between(10,12)

X_train = X.loc[train_mask].copy()
y_train = y.loc[train_mask].copy()
X_valid = X.loc[valid_mask].copy()
y_valid = y.loc[valid_mask].copy()

print("Train:", X_train.shape, "Valid:", X_valid.shape,
      "| rate train:", float(y_train.mean()), "| rate valid:", float(y_valid.mean()))


Train: (4299046, 12) Valid: (932084, 12) | rate train: 0.18733737671101913 | rate valid: 0.17261212508743848


5. Target Encoding KFold (sin fuga) para categorías

In [12]:
# ============================================================
# Target Encoding KFold ROBUSTO (sin fuga) + armado de matrices
# ============================================================
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd

# --- 0) PRERREQUISITOS / ALINEACIONES ---
# Deben existir de pasos previos:
#   v, X, y, X_train, X_valid, y_train, y_valid
for obj_name in ["X_train", "X_valid", "y_train", "y_valid"]:
    assert obj_name in globals(), f"Falta {obj_name}. Ejecuta los pasos previos."

# Asegurar que y_train/y_valid están alineados con X_train/X_valid
y_train = y_train.loc[X_train.index]
y_valid = y_valid.loc[X_valid.index]

# Crear RUTA si no existe
if "RUTA" not in X_train.columns and {"ORIGIN_AIRPORT","DESTINATION_AIRPORT"}.issubset(X_train.columns):
    X_train = X_train.copy()
    X_valid = X_valid.copy()
    X_train["RUTA"] = (X_train["ORIGIN_AIRPORT"].astype(str) + "_" + X_train["DESTINATION_AIRPORT"].astype(str))
    X_valid["RUTA"] = (X_valid["ORIGIN_AIRPORT"].astype(str) + "_" + X_valid["DESTINATION_AIRPORT"].astype(str))

# Columnas a codificar (solo si existen)
cols_te = [c for c in ["AIRLINE","ORIGIN_AIRPORT","DESTINATION_AIRPORT","RUTA"] if c in X_train.columns]
print("TE sobre:", cols_te)

# --- 1) Funciones TE robustas (operan con Series) ---
def kfold_target_encode_series(s: pd.Series,
                               y: pd.Series,
                               n_splits=5,
                               smoothing=50,
                               seed=42) -> tuple[pd.Series, dict, float]:
    """
    s: Serie categórica (mismo índice que y)
    y: Serie binaria 0/1 (mismo índice que s)
    Devuelve:
      enc     -> Serie con el encoding KFold para s (alineada a s.index)
      mapping -> dict valor_categoria -> encoding_final (con TODO el train)
      gmean   -> media global (fallback)
    """
    # Alineación defensiva por índice
    idx = s.index.intersection(y.index)
    s = s.loc[idx]
    y = y.loc[idx].astype(float)

    # Normalización de tipos
    s = s.astype("string")  # evita NaNs tipo objeto raros
    gmean = float(y.mean())
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    enc = pd.Series(index=s.index, dtype=np.float32)

    for tr_idx, val_idx in skf.split(np.zeros(len(s)), y):
        s_tr, y_tr = s.iloc[tr_idx], y.iloc[tr_idx]
        s_val      = s.iloc[val_idx]

        stats = y_tr.groupby(s_tr).mean()
        cnts  = y_tr.groupby(s_tr).size()

        smoothed = (stats*cnts + gmean*smoothing) / (cnts + smoothing)
        enc.iloc[val_idx] = s_val.map(smoothed).fillna(gmean).astype(np.float32)

    # Mapping final con TODO el train (para producción/valid)
    full_stats = y.groupby(s).mean()
    full_cnts  = y.groupby(s).size()
    mapping = ((full_stats*full_cnts + gmean*smoothing) / (full_cnts + smoothing)).to_dict()

    return enc, mapping, gmean

def apply_te(series: pd.Series, mapping: dict, default: float) -> pd.Series:
    return series.astype("string").map(mapping).fillna(default).astype(np.float32)

# --- 2) Ejecutar TE ---
mappings, defaults = {}, {}
X_train = X_train.copy()
X_valid = X_valid.copy()

for c in cols_te:
    enc_tr, mapping, default = kfold_target_encode_series(X_train[c], y_train, n_splits=5, smoothing=50, seed=42)
    X_train[f"{c}_TE"] = enc_tr
    X_valid[f"{c}_TE"] = apply_te(X_valid[c], mapping, default)
    mappings[c] = mapping
    defaults[c] = default

print("✅ TE aplicado sin fuga.")
print("Ejemplo TE:", {k: list(v)[:2] if hasattr(v, "__iter__") else v for k,v in list(mappings.items())[:1]})

# --- 3) Construir matrices finales: quitamos las categorías crudas ---
X_train_model = X_train.drop(columns=[c for c in cols_te if c in X_train.columns]).copy()
X_valid_model = X_valid.drop(columns=[c for c in cols_te if c in X_valid.columns]).copy()

print("Listo para entrenar:")
print("X_train_model:", X_train_model.shape, "| X_valid_model:", X_valid_model.shape)
print("Columnas (primeras 12):", list(X_train_model.columns)[:12])


TE sobre: ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'RUTA']
✅ TE aplicado sin fuga.
Ejemplo TE: {'AIRLINE': ['AA', 'AS']}
Listo para entrenar:
X_train_model: (4299046, 12) | X_valid_model: (932084, 12)
Columnas (primeras 12): ['MONTH', 'DAY_OF_WEEK', 'SALIDA_SIN', 'SALIDA_COS', 'MONTH_SIN', 'MONTH_COS', 'DISTANCIA_HAV', 'MINUTO_DIA_SALIDA', 'AIRLINE_TE', 'ORIGIN_AIRPORT_TE', 'DESTINATION_AIRPORT_TE', 'RUTA_TE']


6. Entrenamiento LightGBM (early stopping, balanceo)

In [13]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import time
import numpy as np

# Chequeos rápidos
for n in ["X_train_model","X_valid_model","y_train","y_valid"]:
    assert n in globals(), f"Falta {n}"
print("Shapes:", X_train_model.shape, X_valid_model.shape)

# Balanceo por proporción de clases (pos/neg)
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale_pos_weight = neg / max(pos, 1)
print(f"scale_pos_weight ~ {scale_pos_weight:.2f} (neg={neg}, pos={pos})")

params = dict(
    n_estimators=10000,
    learning_rate=0.03,
    num_leaves=127,
    max_depth=-1,
    min_child_samples=200,
    subsample=0.8,
    colsample_bytree=0.85,
    reg_alpha=0.1,
    reg_lambda=0.5,
    # usa uno u otro balanceo (recomiendo este):
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
    random_state=42
)

model = lgb.LGBMClassifier(**params)

t0 = time.time()
model.fit(
    X_train_model, y_train,
    eval_set=[(X_valid_model, y_valid)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(stopping_rounds=300), lgb.log_evaluation(300)]
)
t1 = time.time()

valid_proba = model.predict_proba(X_valid_model)[:, 1]
auc_val = roc_auc_score(y_valid, valid_proba)
print(f"✅ Entrenado en {(t1-t0):.1f}s | best_iter={model.best_iteration_} | ROC-AUC valid={auc_val:.4f}")


Shapes: (4299046, 12) (932084, 12)
scale_pos_weight ~ 4.34 (neg=3493674, pos=805372)
[LightGBM] [Info] Number of positive: 805372, number of negative: 3493674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1883
[LightGBM] [Info] Number of data points in the train set: 4299046, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187337 -> initscore=-1.467405
[LightGBM] [Info] Start training from score -1.467405
Training until validation scores don't improve for 300 rounds
[300]	valid_0's auc: 0.612396	valid_0's binary_logloss: 0.574345
Early stopping, best iteration is:
[2]	valid_0's auc: 0.610956	valid_0's binary_logloss: 0.460388
✅ Entrenado en 117.2s | best_iter=2 | ROC-AUC valid=0.6110


7) Métricas base (0.5) + búsqueda de mejor umbral (por F1) y matriz de confusión

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import numpy as np

def report_metrics(y_true, y_prob, thr=0.5, title=""):
    y_hat = (y_prob >= thr).astype(int)
    acc = accuracy_score(y_true, y_hat)
    pre = precision_score(y_true, y_hat, zero_division=0)
    rec = recall_score(y_true, y_hat, zero_division=0)
    f1  = f1_score(y_true, y_hat, zero_division=0)
    auc = roc_auc_score(y_true, y_prob)
    cm  = confusion_matrix(y_true, y_hat)
    print(f"\n== {title} (thr={thr:.3f}) ==")
    print(f"Accuracy: {acc:.4f} | Precision: {pre:.4f} | Recall: {rec:.4f} | F1={f1:.4f} | ROC-AUC={auc:.4f}")
    print("CM [[TN, FP],[FN, TP]]=\n", cm)
    return dict(acc=acc, pre=pre, rec=rec, f1=f1, auc=auc, thr=thr)

# Base 0.5
base = report_metrics(y_valid, valid_proba, 0.5, "Base 0.5")

# Mejor F1 (búsqueda simple)
best = {"thr":0.5, "f1":-1}
for thr in np.linspace(0.1, 0.9, 33):
    y_hat = (valid_proba >= thr).astype(int)
    f1 = f1_score(y_valid, y_hat, zero_division=0)
    if f1 > best["f1"]:
        best = {"thr":float(thr), "f1":float(f1)}
best_f1_res = report_metrics(y_valid, valid_proba, best["thr"], "Mejor F1")
best_f1_res



== Base 0.5 (thr=0.500) ==
Accuracy: 0.8274 | Precision: 0.0000 | Recall: 0.0000 | F1=0.0000 | ROC-AUC=0.6110
CM [[TN, FP],[FN, TP]]=
 [[771195      0]
 [160889      0]]

== Mejor F1 (thr=0.200) ==
Accuracy: 0.4794 | Precision: 0.2098 | Recall: 0.7285 | F1=0.3258 | ROC-AUC=0.6110
CM [[TN, FP],[FN, TP]]=
 [[329672 441523]
 [ 43679 117210]]


{'acc': 0.47944391278039317,
 'pre': 0.20977819459383998,
 'rec': 0.7285146902522858,
 'f1': 0.3257543543693773,
 'auc': 0.6109562804872528,
 'thr': 0.2}