In [59]:
import pandas as pd
import numpy as np
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, root_mean_squared_error

In [72]:
#Hiperparámetros de modelos previamente encontrados utilizando OPTUNA

catboost_params = {'learning_rate': 0.19589334361676058,
 'iterations': 357,
 'depth': 5,
 'l2_leaf_reg': 2.4383355947131355,
 'bagging_temperature': 0.4880588845509045,
 'random_strength': 1.4984300903776795,
 'rsm': 0.7657298876115406}

xgboost_params = {'learning_rate': 0.19897749523532485,
 'n_estimators': 399,
 'max_depth': 5,
 'min_child_weight': 2,
 'gamma': 0.162854039714699,
 'subsample': 0.6430057260063712,
 'colsample_bytree': 0.976575208389519,
 'reg_alpha': 5.932062886045555,
 'reg_lambda': 1.571273639240366}

lightgbm_params = {'n_estimators': 366,
 'learning_rate': 0.19289032026873984,
 'max_depth': 5,
 'subsample': 0.8448599797005143,
 'colsample_bytree': 0.888292577274356,
 'reg_alpha': 0.7880255790817811,
 'reg_lambda': 0.05129584535565534}

In [74]:
df = pd.read_parquet("trips_weather_merged.parquet") # Cargamos Dataset pre-procesado para el entrenamiento

df= df[df["tpep_pickup_hour"]>"2021-12-31"] # Filtro el dataset para entrenar con datos de 2022 a 2025 (Enero)

df["Location*Hour"] = df.apply(lambda x: str(x["LocationID"])+" "+str(x["pickup_hour"]),axis=1) # Creo columna referencia para estratificar el train/test

df2 = pd.read_csv("df_grouped_location.csv") # Cargo dataset con LocationID y clusters

df = df.merge(df2, on="LocationID", how="left").drop(columns=["Unnamed: 0", "sum","max","mean","median"]) # Uno datasets para agregar columna "cluster"

df = pd.get_dummies(df,columns=["weekday","is_weekend", "rush_hour"],drop_first=True,dtype=float) # Convierto en Dummies las variables categóricas

df_cluster_0 = df[df["cluster"]==0] # Filtro el dataset para entrenar SOLO con el cluster 0

# Dividimos dataset en train y test, shuffle True y estratificación con la columna "Location*Hour"
y = df_cluster_0["Total_Trips"] 
X = df_cluster_0.drop(columns=["Total_Trips","tpep_pickup_hour", "Location*Hour","cluster"])
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42, shuffle=True, test_size=0.2, stratify=df_cluster_0["Location*Hour"])

In [76]:
#Instanciamos modelos
LightGBM = LGBMRegressor(**lightgbm_params, random_state=42)
XGBoost = xgb.XGBRegressor(**xgboost_params, random_state=42)
CatBoost = CatBoostRegressor(**catboost_params, random_seed=42)

Models = [LightGBM,XGBoost,CatBoost]

Modelos = []

y_train_log = np.log1p(y_train) # Use np.log1p to handle potential zeros

for model in Models:
    resultados = {}
    model.fit(X_train, y_train_log)
    y_pred = model.predict(X_test)
    y_pred_exp = np.exp(y_pred)
    resultados["modelo"] = model
    resultados["r2"]=r2_score(y_test, y_pred_exp)
    resultados["rmse"]=root_mean_squared_error(y_test, y_pred_exp)
    Modelos.append(resultados)
    

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012972 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 914
[LightGBM] [Info] Number of data points in the train set: 346356, number of used features: 18
[LightGBM] [Info] Start training from score 4.473038
0:	learn: 1.0657746	total: 39.6ms	remaining: 14.1s
1:	learn: 0.9440066	total: 78.4ms	remaining: 13.9s
2:	learn: 0.8511011	total: 123ms	remaining: 14.6s
3:	learn: 0.7847599	total: 163ms	remaining: 14.4s
4:	learn: 0.7362537	total: 201ms	remaining: 14.1s
5:	learn: 0.6964593	total: 239ms	remaining: 14s
6:	learn: 0.6677496	total: 278ms	remaining: 13.9s
7:	learn: 0.6434159	total: 316ms	remaining: 13.8s
8:	learn: 0.6262632	total: 364ms	remaining: 14.1s
9:	learn: 0.6137118	total: 403ms	remaining: 14s
10:	learn: 0.6002147	total: 445ms	remaining: 14s
11:	learn: 0.5917012	total: 484ms	remaining: 13

In [78]:
pd.DataFrame(Modelos)

Unnamed: 0,modelo,r2,rmse
0,LGBMRegressor(colsample_bytree=0.8882925772743...,0.90016,35.685307
1,"XGBRegressor(base_score=None, booster=None, ca...",0.903185,35.140446
2,<catboost.core.CatBoostRegressor object at 0x0...,0.873136,40.225825


Teniendo en cuenta lo anterior, XGBoost será el modelo final seleccionado ya que tiene las mejores métricas en comparación con los otros dos.

In [65]:
df = pd.read_parquet("trips_weather_merged.parquet")