In [114]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, root_mean_squared_error
from Funciones import feature_engineering_shuffle
import pickle

In [91]:
# Load the merged trips + weather table from a path relative to the notebook's
# working directory; the kernel must be started from the project root for this
# to resolve.
df = pd.read_parquet("data/train_data/trips_weather_merged.parquet")

In [93]:
# Replace `df` with the output of the project-local helper from `Funciones`.
# This cell is NOT idempotent: running it twice feeds the already-engineered
# frame back into the helper, so re-run the load cell first when re-executing.
# NOTE(review): the later cells read a "cluster" column and apply np.exp to
# "Total_Trips"; presumably this helper creates/transforms them - confirm in
# Funciones.feature_engineering_shuffle.
df = feature_engineering_shuffle(df)

### Cluster 0 Pipeline (High Demand)

In [116]:
# Keep only the rows labelled as cluster 0.
df_cluster_0 = df.loc[df["cluster"] == 0]

# Target is "Total_Trips"; the feature matrix is everything else except the
# target itself, the cluster label, and the two columns listed below.
y = df_cluster_0["Total_Trips"]
X = df_cluster_0.drop(
    columns=["Total_Trips", "tpep_pickup_hour", "Location*Hour", "cluster"]
)

# shuffle=False makes this an ordered hold-out: the first 80% of rows (in the
# frame's current order) are used for training and the last 20% for testing.
# random_state has no effect when shuffle is disabled; it is kept only so the
# call stays identical to the other cluster cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

In [118]:
# Columns that are one-hot encoded; every other column is passed to the model
# unchanged (remainder="passthrough"). `col_cat` is reused by the cluster 1
# and cluster 2 transformers further down.
col_cat = [
    "weekday",
    "is_weekend",
    "rush_hour",
    "is_holiday",
    "LocationID",
]

# NOTE(review): with drop="first" together with handle_unknown="ignore", a
# category that was not seen during fit is encoded as all zeros, which is the
# same encoding as the dropped first category - confirm this is acceptable
# for unseen LocationID values.
Encoder = OneHotEncoder(
    handle_unknown="ignore",
    drop="first",
    dtype=float,
)
Transformer = ColumnTransformer(
    transformers=[("One Hot Encoder", Encoder, col_cat)],
    remainder="passthrough",
)

In [120]:
# XGBoost hyperparameters for the cluster 0 (high demand) regressor.
# Presumably the result of a hyperparameter search that is not part of this
# notebook - TODO confirm and record the provenance.
# `params` is rebound by the cluster 1 and cluster 2 cells, so the model cell
# that follows must run before those.
params = dict(
    learning_rate=0.16147509326207363,
    n_estimators=360,
    max_depth=5,
    min_child_weight=3,
    gamma=0.23255551949345582,
    subsample=0.6919010893031594,
    colsample_bytree=0.828723545372629,
    reg_alpha=0.0353238029105275,
    reg_lambda=9.005881861726515,
)

In [122]:
# Unfitted XGBoost regressor for cluster 0, configured from whatever `params`
# is currently bound to. `params` is reassigned for the other clusters later
# in the notebook, so run this cell right after the cluster 0 params cell.
model_high_demand = xgb.XGBRegressor(**params)

In [124]:
# Chain the one-hot ColumnTransformer and the regressor into one estimator so
# the encoding is learned during fit and re-applied automatically at predict
# time (and travels with the model when the pipeline is pickled).
Pipeline_high_demand = make_pipeline(Transformer,model_high_demand)

In [126]:
# Fit encoder + regressor on the training split. `X_train` / `y_train` are
# shared names that later cluster cells overwrite, so this relies on the
# cluster 0 split cell having been the most recent split executed.
Pipeline_high_demand.fit(X_train, y_train)

In [128]:
# Predictions for the held-out split, on the same scale as `y_test`.
# `y_pred` is reused (overwritten) by the other cluster sections.
y_pred = Pipeline_high_demand.predict(X_test)



In [130]:
# R^2 on the held-out split, computed on the target exactly as stored in
# "Total_Trips" (no back-transform here, unlike the RMSE cell that follows).
r2_score(y_test, y_pred)

0.950548513716511

In [134]:
# RMSE after exponentiating both the true and predicted values.
# NOTE(review): this assumes "Total_Trips" was natural-log transformed
# upstream (np.log); if np.log1p was used instead, np.expm1 is the matching
# inverse - confirm against feature_engineering_shuffle.
root_mean_squared_error(np.exp(y_test), np.exp(y_pred))

28.4939883336136

In [71]:
# Persist the cluster 0 pipeline (encoder + regressor) to the working
# directory. The context manager guarantees the file is flushed and closed
# even if pickling raises; the previous bare open() left the handle open.
# Run this cell only after the fit cell so the file holds the fitted model.
filename = 'Pipeline_high_demand.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(Pipeline_high_demand, model_file)

### Cluster 1 Pipeline (Low Demand)

In [136]:
# Keep only the rows labelled as cluster 1.
df_cluster_1 = df.loc[df["cluster"] == 1]

# Target is "Total_Trips"; the feature matrix is everything else except the
# target itself, the cluster label, and the two columns listed below.
# These assignments overwrite the X / y names used by the cluster 0 section.
y = df_cluster_1["Total_Trips"]
X = df_cluster_1.drop(
    columns=["Total_Trips", "tpep_pickup_hour", "Location*Hour", "cluster"]
)

# shuffle=False makes this an ordered hold-out: the first 80% of rows (in the
# frame's current order) are used for training and the last 20% for testing.
# random_state has no effect when shuffle is disabled.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

In [138]:
# XGBoost hyperparameters for the cluster 1 (low demand) regressor.
# Presumably the result of a hyperparameter search that is not part of this
# notebook - TODO confirm and record the provenance.
# This rebinds `params`, replacing the cluster 0 values.
params = dict(
    learning_rate=0.18972331006881477,
    n_estimators=360,
    max_depth=5,
    min_child_weight=2,
    gamma=0.1945738960737595,
    subsample=0.6711648020855602,
    colsample_bytree=0.7082182371108593,
    reg_alpha=0.06337472926663015,
    reg_lambda=1.8489691169609355,
)

In [140]:
# Fresh encoder / transformer for cluster 1 so nothing fitted is shared with
# the cluster 0 pipeline. Uses the same categorical column list (`col_cat`).
Encoder_2 = OneHotEncoder(
    handle_unknown="ignore",
    drop="first",
    dtype=float,
)
Transformer_2 = ColumnTransformer(
    transformers=[("One Hot Encoder", Encoder_2, col_cat)],
    remainder="passthrough",
)

# Regressor configured from the cluster 1 `params`, chained after the encoder.
model_low_demand = xgb.XGBRegressor(**params)
Pipeline_low_demand = make_pipeline(Transformer_2, model_low_demand)

# Fit on the cluster 1 training split; left as the final expression so the
# notebook displays the fitted pipeline.
Pipeline_low_demand.fit(X_train, y_train)

In [25]:
# Persist the cluster 1 pipeline (encoder + regressor) to the working
# directory. The context manager guarantees the file is flushed and closed
# even if pickling raises; the previous bare open() left the handle open.
# Run this cell only after the fit cell so the file holds the fitted model.
filename = 'Pipeline_low_demand.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(Pipeline_low_demand, model_file)

In [142]:
# Predictions for the cluster 1 held-out split, on the same scale as `y_test`.
# Overwrites the `y_pred` produced in the cluster 0 section.
y_pred = Pipeline_low_demand.predict(X_test)



In [144]:
# R^2 on the cluster 1 held-out split, computed on the target exactly as
# stored in "Total_Trips" (no back-transform applied).
r2_score(y_test, y_pred)

0.8564332511096493

In [146]:
# RMSE after exponentiating both the true and predicted values.
# NOTE(review): this assumes "Total_Trips" was natural-log transformed
# upstream (np.log); if np.log1p was used instead, np.expm1 is the matching
# inverse - confirm against feature_engineering_shuffle.
root_mean_squared_error(np.exp(y_test), np.exp(y_pred))

6.486397031687141

### Cluster 2 Pipeline (Mid Demand)

In [148]:
# Keep only the rows labelled as cluster 2.
df_cluster_2 = df.loc[df["cluster"] == 2]

# Target is "Total_Trips"; the feature matrix is everything else except the
# target itself, the cluster label, and the two columns listed below.
# These assignments overwrite the X / y names used by the earlier sections.
y = df_cluster_2["Total_Trips"]
X = df_cluster_2.drop(
    columns=["Total_Trips", "tpep_pickup_hour", "Location*Hour", "cluster"]
)

# shuffle=False makes this an ordered hold-out: the first 80% of rows (in the
# frame's current order) are used for training and the last 20% for testing.
# random_state has no effect when shuffle is disabled.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

In [150]:
# XGBoost hyperparameters for the cluster 2 (mid demand) regressor.
# Presumably the result of a hyperparameter search that is not part of this
# notebook - TODO confirm and record the provenance.
# This rebinds `params`, replacing the cluster 1 values.
params = dict(
    learning_rate=0.1498124962476959,
    n_estimators=352,
    max_depth=5,
    min_child_weight=2,
    gamma=0.02378074852415671,
    subsample=0.6051839424327529,
    colsample_bytree=0.9939555418901009,
    reg_alpha=4.881402844706467,
    reg_lambda=0.4904264751738619,
)

In [152]:
# Fresh encoder / transformer for cluster 2 so nothing fitted is shared with
# the other pipelines. Uses the same categorical column list (`col_cat`).
Encoder_3 = OneHotEncoder(
    handle_unknown="ignore",
    drop="first",
    dtype=float,
)
Transformer_3 = ColumnTransformer(
    transformers=[("One Hot Encoder", Encoder_3, col_cat)],
    remainder="passthrough",
)

# Regressor configured from the cluster 2 `params`, chained after the encoder.
model_mid_demand = xgb.XGBRegressor(**params)
Pipeline_mid_demand = make_pipeline(Transformer_3, model_mid_demand)

# Fit on the cluster 2 training split; left as the final expression so the
# notebook displays the fitted pipeline.
Pipeline_mid_demand.fit(X_train, y_train)

In [105]:
# Persist the cluster 2 pipeline (encoder + regressor) to the working
# directory. The context manager guarantees the file is flushed and closed
# even if pickling raises; the previous bare open() left the handle open.
# Run this cell only after the fit cell so the file holds the fitted model.
filename = 'Pipeline_mid_demand.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(Pipeline_mid_demand, model_file)

In [154]:
# Predictions for the cluster 2 held-out split, on the same scale as `y_test`.
# Overwrites the `y_pred` produced in the earlier sections.
y_pred = Pipeline_mid_demand.predict(X_test)



In [156]:
# R^2 on the cluster 2 held-out split, computed on the target exactly as
# stored in "Total_Trips" (no back-transform applied).
r2_score(y_test, y_pred)

0.9168064571089257

In [158]:
# RMSE after exponentiating both the true and predicted values.
# NOTE(review): this assumes "Total_Trips" was natural-log transformed
# upstream (np.log); if np.log1p was used instead, np.expm1 is the matching
# inverse - confirm against feature_engineering_shuffle.
root_mean_squared_error(np.exp(y_test), np.exp(y_pred))

17.336607906478296