In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from xgboost import XGBRegressor
import xgboost as xgb
import numpy as np


In [3]:
import gc 
gc.collect()

0

In [5]:
df = pd.read_csv("data/modelado/ds_modelado.csv")

In [5]:
def entrenar_y_evaluar(X_train, X_val, y_train, y_val, nombre):
    model = XGBRegressor(
        n_estimators=400,
        max_depth=10,
        learning_rate=0.05,
        objective="count:poisson",
        tree_method="gpu_hist",  # Usar GPU
        predictor="gpu_predictor",
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    print(f"\n📊 Resultados - {nombre}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.3f}")
    return model

In [6]:
df_filtrado = df[
    (df["duracion_recorrido"] > 1) &
    (df["usuario_registrado"] == 1)
].copy()

In [7]:
columnas_sin_leakage = [
    'id_estacion_origen', 'id_usuario', 'modelo_bicicleta', 'barrio_origen', 'dia_semana',
    'es_finde', 'estacion_del_anio', 'edad_usuario', 'año_alta', 'mes_alta',
    'genero_FEMALE', 'genero_MALE', 'genero_OTHER', 'usuario_registrado',
    'zona_origen_cluster', 'cantidad_estaciones_cercanas_origen',
    'año_origen', 'mes_origen', 'dia_origen', 'hora_origen', 'minuto_origen', 'segundo_origen',
    'año_intervalo', 'mes_intervalo', 'dia_intervalo', 'hora_intervalo', 'minuto_intervalo',
    'N_SALIDAS_PROM_2INT', 'N_ARRIBOS_PROM_2INT', 'N_SALIDAS_LAG1', 'N_ARRIBOS_LAG1',
    'N_SALIDAS_LAG2', 'N_ARRIBOS_LAG2', 'N_SALIDAS_LAG3', 'N_ARRIBOS_LAG3',
    'id_estacion_destino_LAG1', 'id_estacion_destino_LAG2', 'id_estacion_destino_LAG3',
    'barrio_destino_LAG1', 'barrio_destino_LAG2', 'barrio_destino_LAG3',
    'cantidad_estaciones_cercanas_destino_LAG1', 'cantidad_estaciones_cercanas_destino_LAG2',
    'cantidad_estaciones_cercanas_destino_LAG3', 'año_destino_LAG1', 'año_destino_LAG2',
    'año_destino_LAG3', 'mes_destino_LAG1', 'mes_destino_LAG2', 'mes_destino_LAG3',
    'dia_destino_LAG1', 'dia_destino_LAG2', 'dia_destino_LAG3', 'hora_destino_LAG1',
    'hora_destino_LAG2', 'hora_destino_LAG3', 'minuto_destino_LAG1', 'minuto_destino_LAG2',
    'minuto_destino_LAG3', 'segundo_destino_LAG1', 'segundo_destino_LAG2', 'segundo_destino_LAG3'
]
target = "N_arribos_intervalo"

Pruebo el de invierno pero con Dmatrix que es la estructura de datos interna optimizada que usa XGBoost para almacenar las matrices de características y etiquetas.

In [None]:
df_invierno = df[df["estacion_del_anio"] == 3].copy()
train_df, val_df = train_test_split(df_invierno,test_size=0.2,random_state=42,stratify=df_invierno["año_intervalo"])
X_train = train_df[columnas_sin_leakage]
y_train = train_df[target]

X_val = val_df[columnas_sin_leakage]
y_val = val_df[target]

In [20]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    "max_depth": 15,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "learning_rate": 0.05,
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "device": "cuda",
    "eval_metric": "rmse"
}

model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=300,
    evals=[(dval, "validation")],
    early_stopping_rounds=30,
    verbose_eval=50
)

y_pred = model.predict(dval)

mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))
r2 = r2_score(y_val, y_pred)

print("📈 XGBoost con GPU")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")

[0]	validation-rmse:2.23278
[50]	validation-rmse:1.53323
[100]	validation-rmse:1.45340
[150]	validation-rmse:1.43180
[200]	validation-rmse:1.42534
[250]	validation-rmse:1.42160
[299]	validation-rmse:1.41916
📈 XGBoost con GPU
MAE: 1.02
RMSE: 1.42
R²: 0.615


In [8]:
splits = {
    "invierno": df_filtrado[df_filtrado["estacion_del_anio"] == 3],
    "hora_pico": df_filtrado[df_filtrado["hora_origen"].between(7, 10) | df_filtrado["hora_origen"].between(17, 20)],
    "fin_de_semana": df_filtrado[df_filtrado["es_finde"] == 1],
    "dia_laborable": df_filtrado[df_filtrado["es_finde"] == 0],
    "usuarios_jovenes": df_filtrado[df_filtrado["edad_usuario"] < 30],
    "usuarios_mayores": df_filtrado[df_filtrado["edad_usuario"] >= 60],
    "usuarios_registrados": df_filtrado[df_filtrado["usuario_registrado"] == 1],
    "verano": df_filtrado[df_filtrado["estacion_del_anio"] == 1],
    "otoño": df_filtrado[df_filtrado["estacion_del_anio"] == 2],
    "primavera": df_filtrado[df_filtrado["estacion_del_anio"] == 4],
    "mujeres": df_filtrado[df_filtrado["genero_FEMALE"] == 1],
    "hombres": df_filtrado[df_filtrado["genero_MALE"] == 1],
    "otros_generos": df_filtrado[df_filtrado["genero_OTHER"] == 1],
    "estaciones_cercanas": df_filtrado[df_filtrado["cantidad_estaciones_cercanas_origen"] > 3],
    "zonas_cluster": df_filtrado[df_filtrado["zona_origen_cluster"] > 10],
    "todos": df_filtrado

}

In [8]:
valores = {}
def modelado(nombre, df):
    train_df, val_df = train_test_split(df,test_size=0.2,random_state=42,stratify=df["año_intervalo"])
    X_train = train_df[columnas_sin_leakage]
    y_train = train_df[target]

    X_val = val_df[columnas_sin_leakage]
    y_val = val_df[target]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    params = {
        "max_depth": 15,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "learning_rate": 0.05,
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "device": "cuda",
        "eval_metric": "rmse"
    }

    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=300,
        evals=[(dval, "validation")],
        early_stopping_rounds=30,
        verbose_eval=50
    )

    y_pred = model.predict(dval)

    mae = mean_absolute_error(y_val, y_pred)
    rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))
    r2 = r2_score(y_val, y_pred)

    print(f"📈 XGBoost con {nombre}")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²: {r2:.3f}")
    valores[nombre] = {
        "mae": mae,
        "rmse": rmse,
        "r2": r2
    }
        
                                    

Invierno

In [9]:
modelado("invierno", df_filtrado[df_filtrado["estacion_del_anio"] == 3])

[0]	validation-rmse:2.21515
[50]	validation-rmse:1.53219
[100]	validation-rmse:1.45423
[150]	validation-rmse:1.43374
[200]	validation-rmse:1.42862
[250]	validation-rmse:1.42404
[299]	validation-rmse:1.42241
📈 XGBoost con invierno
MAE: 1.02
RMSE: 1.42
R²: 0.607


Hora pico

In [11]:
modelado("hora_pico", df_filtrado[df_filtrado["hora_origen"].between(7, 10) | df_filtrado["hora_origen"].between(17, 20)])

[0]	validation-rmse:2.19343
[50]	validation-rmse:1.62763
[100]	validation-rmse:1.55049
[150]	validation-rmse:1.53251
[200]	validation-rmse:1.52439
[250]	validation-rmse:1.51868
[299]	validation-rmse:1.51538
📈 XGBoost con hora_pico
MAE: 1.11
RMSE: 1.52
R²: 0.542


Fin de semana


In [12]:
modelado("fin_de_semana", df_filtrado[df_filtrado["es_finde"] == 1])

[0]	validation-rmse:2.52815
[50]	validation-rmse:1.58945
[100]	validation-rmse:1.50971
[150]	validation-rmse:1.49077
[200]	validation-rmse:1.48545
[250]	validation-rmse:1.48161
[299]	validation-rmse:1.47928
📈 XGBoost con fin_de_semana
MAE: 0.99
RMSE: 1.48
R²: 0.676


Dia laborable

In [13]:
modelado("dia_laborable", df_filtrado[df_filtrado["es_finde"] == 0])

[0]	validation-rmse:2.00952
[50]	validation-rmse:1.51967
[100]	validation-rmse:1.45700
[150]	validation-rmse:1.43988
[200]	validation-rmse:1.43226
[250]	validation-rmse:1.42803
[299]	validation-rmse:1.42487
📈 XGBoost con dia_laborable
MAE: 1.05
RMSE: 1.42
R²: 0.516


Jovenes

In [8]:
modelado("usuarios_jovenes", df_filtrado[df_filtrado["edad_usuario"] < 30])

[0]	validation-rmse:2.10120
[50]	validation-rmse:1.57036
[100]	validation-rmse:1.50715
[150]	validation-rmse:1.49033
[200]	validation-rmse:1.48350
[250]	validation-rmse:1.47974
[299]	validation-rmse:1.47742
📈 XGBoost con usuarios_jovenes
MAE: 1.06
RMSE: 1.48
R²: 0.524


Mayores

In [9]:
modelado("usuarios_mayores", df_filtrado[df_filtrado["edad_usuario"] >= 60])

[0]	validation-rmse:2.53875
[50]	validation-rmse:1.69081
[100]	validation-rmse:1.63323
[150]	validation-rmse:1.62125
[200]	validation-rmse:1.61887
[250]	validation-rmse:1.61838
[280]	validation-rmse:1.61856
📈 XGBoost con usuarios_mayores
MAE: 1.15
RMSE: 1.62
R²: 0.614



    "estaciones_cercanas": df_filtrado[df_filtrado["cantidad_estaciones_cercanas_origen"] > 3],
    "zonas_cluster": df_filtrado[df_filtrado["zona_origen_cluster"] > 10],
    "todos": df_filtrado


Usuarios Registrados

In [10]:
modelado("usuarios_registrados", df_filtrado[df_filtrado["usuario_registrado"] == 1])

[0]	validation-rmse:2.10007
[50]	validation-rmse:1.55385
[100]	validation-rmse:1.48230
[150]	validation-rmse:1.46209
[200]	validation-rmse:1.45298
[250]	validation-rmse:1.44709
[299]	validation-rmse:1.44309
📈 XGBoost con usuarios_registrados
MAE: 1.04
RMSE: 1.44
R²: 0.546


Verano

In [11]:
modelado("verano", df_filtrado[df_filtrado["estacion_del_anio"] == 1])

[0]	validation-rmse:1.99423
[50]	validation-rmse:1.49983
[100]	validation-rmse:1.43190
[150]	validation-rmse:1.41311
[200]	validation-rmse:1.40704
[250]	validation-rmse:1.40340
[299]	validation-rmse:1.40134
📈 XGBoost con verano
MAE: 1.02
RMSE: 1.40
R²: 0.524


Otoño

In [13]:
modelado("otoño", df_filtrado[df_filtrado["estacion_del_anio"] == 2])

[0]	validation-rmse:1.96341
[50]	validation-rmse:1.47507
[100]	validation-rmse:1.40792
[150]	validation-rmse:1.39299
[200]	validation-rmse:1.38856
[250]	validation-rmse:1.38571
[299]	validation-rmse:1.38415
📈 XGBoost con otoño
MAE: 1.02
RMSE: 1.38
R²: 0.523


Primavera

In [14]:
modelado("primavera", df_filtrado[df_filtrado["estacion_del_anio"] == 4])

[0]	validation-rmse:2.19882
[50]	validation-rmse:1.61264
[100]	validation-rmse:1.53713
[150]	validation-rmse:1.51800
[200]	validation-rmse:1.51229
[250]	validation-rmse:1.50758
[299]	validation-rmse:1.50536
📈 XGBoost con primavera
MAE: 1.10
RMSE: 1.51
R²: 0.551


Mujeres

In [15]:
modelado("mujeres", df_filtrado[df_filtrado["genero_FEMALE"] == 1])

[0]	validation-rmse:2.11288
[50]	validation-rmse:1.58381
[100]	validation-rmse:1.51762
[150]	validation-rmse:1.50087
[200]	validation-rmse:1.49487
[250]	validation-rmse:1.49111
[299]	validation-rmse:1.48900
📈 XGBoost con mujeres
MAE: 1.07
RMSE: 1.49
R²: 0.522


Hombres

In [16]:
modelado("hombres", df_filtrado[df_filtrado["genero_MALE"] == 1])

[0]	validation-rmse:2.03065
[50]	validation-rmse:1.52605
[100]	validation-rmse:1.45867
[150]	validation-rmse:1.44322
[200]	validation-rmse:1.43630
[250]	validation-rmse:1.43231
[299]	validation-rmse:1.43002
📈 XGBoost con hombres
MAE: 1.03
RMSE: 1.43
R²: 0.522


Otros

In [17]:
modelado("otros_generos", df_filtrado[df_filtrado["genero_OTHER"] == 1])

[0]	validation-rmse:2.37814
[50]	validation-rmse:1.68373
[100]	validation-rmse:1.62245
[150]	validation-rmse:1.60700
[200]	validation-rmse:1.60331
[250]	validation-rmse:1.60237
[299]	validation-rmse:1.60178
📈 XGBoost con otros_generos
MAE: 1.15
RMSE: 1.60
R²: 0.567


No mujer


In [18]:
modelado("mujeres", df_filtrado[df_filtrado["genero_FEMALE"] == 0])

[0]	validation-rmse:2.09495
[50]	validation-rmse:1.55017
[100]	validation-rmse:1.47982
[150]	validation-rmse:1.46255
[200]	validation-rmse:1.45504
[250]	validation-rmse:1.45049
[299]	validation-rmse:1.44768
📈 XGBoost con mujeres
MAE: 1.04
RMSE: 1.45
R²: 0.541


No hombre

In [19]:
modelado("hombres", df_filtrado[df_filtrado["genero_MALE"] == 0])

[0]	validation-rmse:2.17311
[50]	validation-rmse:1.59481
[100]	validation-rmse:1.52528
[150]	validation-rmse:1.50753
[200]	validation-rmse:1.50020
[250]	validation-rmse:1.49608
[299]	validation-rmse:1.49369
📈 XGBoost con hombres
MAE: 1.08
RMSE: 1.49
R²: 0.547


No otro

In [20]:
modelado("otros_generos", df_filtrado[df_filtrado["genero_OTHER"] == 0])

[0]	validation-rmse:2.06491
[50]	validation-rmse:1.54211
[100]	validation-rmse:1.47331
[150]	validation-rmse:1.45479
[200]	validation-rmse:1.44627
[250]	validation-rmse:1.44064
[299]	validation-rmse:1.43710
📈 XGBoost con otros_generos
MAE: 1.04
RMSE: 1.44
R²: 0.534


Estaciones cercanas

In [21]:
modelado("estaciones_cercanas", df_filtrado[df_filtrado["cantidad_estaciones_cercanas_origen"] > 3])

[0]	validation-rmse:2.06228
[50]	validation-rmse:1.60634
[100]	validation-rmse:1.53668
[150]	validation-rmse:1.52249
[200]	validation-rmse:1.51755
[250]	validation-rmse:1.51479
[299]	validation-rmse:1.51288
📈 XGBoost con estaciones_cercanas
MAE: 1.11
RMSE: 1.51
R²: 0.479


Año 2020

In [23]:
modelado("2020", df_filtrado[df_filtrado["año_intervalo"] == 2020])

[0]	validation-rmse:2.63155
[50]	validation-rmse:1.69908
[100]	validation-rmse:1.60750
[150]	validation-rmse:1.58592
[200]	validation-rmse:1.57948
[250]	validation-rmse:1.57716
[299]	validation-rmse:1.57546
📈 XGBoost con 2020
MAE: 1.13
RMSE: 1.58
R²: 0.661


Año 2021

In [24]:
modelado("2021", df_filtrado[df_filtrado["año_intervalo"] == 2021])

[0]	validation-rmse:2.00655
[50]	validation-rmse:1.52854
[100]	validation-rmse:1.46012
[150]	validation-rmse:1.44676
[200]	validation-rmse:1.44243
[250]	validation-rmse:1.44042
[299]	validation-rmse:1.43926
📈 XGBoost con 2021
MAE: 1.07
RMSE: 1.44
R²: 0.505


Año 2022

In [25]:
modelado("2022", df_filtrado[df_filtrado["año_intervalo"] == 2023])

[0]	validation-rmse:1.75196
[50]	validation-rmse:1.34034
[100]	validation-rmse:1.29768
[150]	validation-rmse:1.28317
[200]	validation-rmse:1.27977
[250]	validation-rmse:1.27769
[299]	validation-rmse:1.27550
📈 XGBoost con 2022
MAE: 0.93
RMSE: 1.28
R²: 0.490


Año 2023

In [26]:
modelado("2023", df_filtrado[df_filtrado["año_intervalo"] == 2023])

[0]	validation-rmse:1.75196
[50]	validation-rmse:1.34034
[100]	validation-rmse:1.29768
[150]	validation-rmse:1.28317
[200]	validation-rmse:1.27977
[250]	validation-rmse:1.27769
[299]	validation-rmse:1.27550
📈 XGBoost con 2023
MAE: 0.93
RMSE: 1.28
R²: 0.490


Año 2024

In [27]:
modelado("2024", df_filtrado[df_filtrado["año_intervalo"] == 2024])

[0]	validation-rmse:2.19544
[50]	validation-rmse:1.53274
[100]	validation-rmse:1.46998
[150]	validation-rmse:1.45280
[200]	validation-rmse:1.44637
[250]	validation-rmse:1.44279
[299]	validation-rmse:1.44028
📈 XGBoost con 2024
MAE: 1.03
RMSE: 1.44
R²: 0.590


Previo 2021 (incluido)

In [28]:
modelado("Previo 2021 (incluido)", df_filtrado[df_filtrado["año_intervalo"] <= 2021])

[0]	validation-rmse:2.29880
[50]	validation-rmse:1.63522
[100]	validation-rmse:1.54342
[150]	validation-rmse:1.52755
[200]	validation-rmse:1.52158
[250]	validation-rmse:1.51741
[299]	validation-rmse:1.51475
📈 XGBoost con Previo 2021 (incluido)
MAE: 1.10
RMSE: 1.51
R²: 0.585


Previo 2022 (incluido)

In [29]:
modelado("Previo 2022 (incluido)", df_filtrado[df_filtrado["año_intervalo"] <= 2022])

[0]	validation-rmse:2.16302
[50]	validation-rmse:1.58842
[100]	validation-rmse:1.51019
[150]	validation-rmse:1.49094
[200]	validation-rmse:1.48310
[250]	validation-rmse:1.47779
[299]	validation-rmse:1.47515
📈 XGBoost con Previo 2022 (incluido)
MAE: 1.07
RMSE: 1.48
R²: 0.553


Previo 2023 (incluido)

In [9]:
modelado("Previo 2023 (incluido)", df_filtrado[df_filtrado["año_intervalo"] <= 2023])

[0]	validation-rmse:2.07320
[50]	validation-rmse:1.54150
[100]	validation-rmse:1.46797
[150]	validation-rmse:1.44913
[200]	validation-rmse:1.44038
[250]	validation-rmse:1.43546
[299]	validation-rmse:1.43187
📈 XGBoost con Previo 2023 (incluido)
MAE: 1.04
RMSE: 1.43
R²: 0.541


Todo

In [10]:
modelado("todos", df_filtrado)

[0]	validation-rmse:2.10007
[50]	validation-rmse:1.55316
[100]	validation-rmse:1.48384
[150]	validation-rmse:1.46193
[200]	validation-rmse:1.45301
[250]	validation-rmse:1.44670
[299]	validation-rmse:1.44322
📈 XGBoost con todos
MAE: 1.04
RMSE: 1.44
R²: 0.546


In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

model_xgb_poisson = XGBRegressor(
    n_estimators=800,
    max_depth=15,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="count:poisson",
    random_state=42,
    n_jobs=-1
)

model_xgb_poisson.fit(X_train, y_train)
y_pred = model_xgb_poisson.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
rmse = root_mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("📈 XGBoost con Poisson")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")