In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# === Cargar dataset ===
df = pd.read_csv("data/modelado/ds_modelado.csv")

# === Filtrar invierno, viajes válidos y usuarios registrados ===
df = df[(df["estacion_del_anio"] == 3) & 
        (df["duracion_recorrido"] > 1) & 
        (df["usuario_registrado"] == 1)].copy()

# === Generar columna de fecha para ordenar ===
df["fecha_intervalo"] = pd.to_datetime(dict(
    year=df["año_intervalo"],
    month=df["mes_intervalo"],
    day=df["dia_intervalo"],
    hour=df["hora_intervalo"],
    minute=df["minuto_intervalo"]
))

# === Columnas base a excluir ===
target = "N_arribos_intervalo"
cols_a_excluir = [
    "N_arribos_intervalo", "N_salidas_intervalo", "id_recorrido",
    "id_estacion_destino", "barrio_destino", "zona_destino_cluster",
    "cantidad_estaciones_cercanas_destino", "año_destino", "mes_destino", "dia_destino",
    "hora_destino", "minuto_destino", "segundo_destino", "duracion_recorrido",
    "timestamp_intervalo", "salidas_acumuladas_origen", "salidas_intervalo_acumuladas",
    "recurrencia_usuario", "fecha_intervalo"
]

# === Columnas a las que les vamos a hacer LAG ===
cols_base_lags = ["N_arribos_intervalo", "N_salidas_intervalo"] + [
    col for col in df.columns if col.endswith("destino")
]

# === Iterar agregando más LAGs ===
resultados = []

for max_lag in range(4, 16):
    df_lags = df.copy()

    # Agregar nuevos LAGs (desde el 4 hasta max_lag)
    for lag in range(4, max_lag + 1):
        for col in cols_base_lags:
            if col in df_lags.columns:
                group_col = "id_estacion_origen" if "intervalo" in col else "id_estacion_destino"
                df_lags[f"{col}_LAG{lag}"] = (
                    df_lags.sort_values(["fecha_intervalo"])
                    .groupby(group_col)[col]
                    .shift(lag)
                )

    # === Armar features ===
    lag_cols = [col for col in df_lags.columns if "_LAG" in col]
    features = [col for col in df_lags.columns if col not in cols_a_excluir and col != target]

    # === Separar train / val ===
    train_df, val_df = train_test_split(
        df_lags,
        test_size=0.2,
        random_state=42,
        stratify=df_lags["año_intervalo"]
    )

    X_train = train_df[features].fillna(-1)
    y_train = train_df[target]
    X_val = val_df[features].fillna(-1)
    y_val = val_df[target]

    # === Entrenar modelo ===
    model = XGBRegressor(
        n_estimators=800,
        max_depth=15,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="count:poisson",
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    mae = mean_absolute_error(y_val, y_pred)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    r2 = r2_score(y_val, y_pred)

    resultados.append({
        "LAG máx": max_lag,
        "MAE": round(mae, 2),
        "RMSE": round(rmse, 2),
        "R2": round(r2, 4)
    })

# === Mostrar resultados ===
resultados_df = pd.DataFrame(resultados)
print(resultados_df) 

  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[f"{col}_LAG{lag}"] = (
  df_lags[

    LAG máx   MAE  RMSE      R2
0         4  0.98  1.39  0.6301
1         5  0.97  1.39  0.6336
2         6  0.97  1.38  0.6354
3         7  0.97  1.38  0.6349
4         8  0.98  1.39  0.6339
5         9  0.98  1.39  0.6336
6        10  0.98  1.39  0.6329
7        11  0.98  1.39  0.6329
8        12  0.98  1.39  0.6331
9        13  0.98  1.39  0.6318
10       14  0.98  1.39  0.6324
11       15  0.98  1.39  0.6320


