In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from numpy.linalg import inv
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [2]:
data = pd.read_csv("C:/Users/Astron/Desktop/L_Oreal/S23-CUDSI-Loreal-SalesForecasting/traffic_cleaned_forPython_k.csv", parse_dates=['day'])
data.set_index('day', inplace=True)
data.sort_index(inplace=True)

### One hot encoding
data = pd.get_dummies(data, dtype="bool")
data = data.astype('int')

### To aviod "Do not support special JSON characters in feature name" error in LightGBM
import re
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

data_trans = data.reset_index()

data_trans['y'] = data_trans.day.dt.year
data_trans['m'] = data_trans.day.dt.month
data_trans['d'] = data_trans.day.dt.day

data_sincos = encode(data_trans, 'm', 12)
data_sincos = encode(data_trans, 'd', 31)
data_sincos.set_index('day', inplace=True)

In [3]:
X = data_sincos.drop(columns = ['CORE_VL_NbEntry'])
y = data_sincos['CORE_VL_NbEntry']

In [4]:
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=0.25, random_state=0)

In [12]:
#hyperparameter tuning
#XGboost
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#construct grids for hyperparameter tunning
param_grid_XGB = {'n_estimators':[100], 
                  'max_depth': [12], 
                  'learning_rate': [0.1]}

grid_XGB = GridSearchCV(XGBRegressor(), param_grid_XGB, refit = True, cv = 5, verbose = 3, n_jobs=-1) 
grid_XGB.fit(X_dev, y_dev)

#accuracy check

xgbc = XGBRegressor(learning_rate=grid_XGB.best_params_['learning_rate'], 
                                      max_depth = grid_XGB.best_params_['max_depth'], 
                                      n_estimators = grid_XGB.best_params_['n_estimators']).fit(X_train, y_train)

xgbc_y_test = xgbc.predict(X_test)

print('MSE:', np.sqrt(mean_squared_error(y_test, xgbc_y_test)))
print('xgb_r^2:', xgbc.score(X_test, y_test))
print(np.sqrt(mean_squared_error(y_test, xgbc_y_test)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
MSE: 11.331326308611292
xgb_r^2: 0.8757338123941485
11.331326308611292


In [13]:
# Random Forest
# ! Don't Run!!!!
from sklearn.ensemble import RandomForestRegressor
param_grid_rf = {'n_estimators':[100,200,300,400,500], 
                  'max_depth': [10,12,14,16,18] }


grid_rf = GridSearchCV(RandomForestRegressor(), param_grid_rf, refit = True, cv = 5, verbose = 3, n_jobs=-1) 
grid_rf.fit(X_dev, y_dev)


rfr = RandomForestRegressor(max_depth = grid_rf.best_params_['max_depth'], 
                            n_estimators = grid_rf.best_params_['n_estimators']).fit(X_train, y_train)

rfr_y_test = rfr.predict(X_test)
print('rfr_r^2:',rfr.score(X_test, y_test))

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [6]:
#catboost
from catboost import CatBoostRegressor

cat = CatBoostRegressor(logging_level = 'Silent').fit(X_dev, y_dev)
cat_y_pred = cat.predict(X_test) 

print('cat_r^2:',cat.score(X_test, y_test))
print('MSE:', mean_squared_error(y_test, cat_y_pred))
print(np.sqrt(mean_squared_error(y_test, cat_y_pred)))


cat_r^2: 0.892922479268309
MSE: 110.63863894518883
10.518490335841395


In [9]:
#HistGradientBoosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

params={'learning_rate' : [0.05,0.1,0.15,0.2],
        'max_depth' : [10,12,14,16,18]}

hbcv=GridSearchCV(HistGradientBoostingRegressor(),param_grid=params,cv=5)

hbcv.fit(X_dev,y_dev)
hist_y_pred = hbcv.predict(X_test)

print('hist_r^2:',hbcv.score(X_test, y_test))
print('MSE:', mean_squared_error(y_test, hist_y_pred))
print(np.sqrt(mean_squared_error(y_test, hist_y_pred)))

hist_r^2: 0.8820652392628447
MSE: 121.85696234955489
11.038884107986409


In [11]:
from lightgbm import LGBMRegressor
import re
X_dev = X_dev.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

LGB = LGBMRegressor().fit(X_dev, y_dev)
lgb_y_pred = LGB.predict(X_test)

print('lgb_r^2:',LGB.score(X_test, y_test))
print('MSE:', mean_squared_error(y_test, lgb_y_pred))
print(np.sqrt(mean_squared_error(y_test, lgb_y_pred)))

lgb_r^2: 0.8735662466747094
MSE: 130.63860919691456
11.429724808450752


In [16]:
# ! don't run
rf_y_pred_dev = pd.DataFrame(rfr.predict(X_dev))
hist_y_pred_dev = pd.DataFrame(hbcv.predict(X_dev))
cat_y_pred_dev = pd.DataFrame(cat.predict(X_dev))
xgb_y_pred_dev = pd.DataFrame(xgbc.predict(X_dev))
lgb_y_pred_dev = pd.DataFrame(LGB.predict(X_dev))

#concat stacked dev 
df2_dev = pd.concat([rf_y_pred_dev, hist_y_pred_dev, cat_y_pred_dev, xgb_y_pred_dev, lgb_y_pred_dev], axis=1)
df2_dev.columns = ['rf','hist','cat','xgb', 'lgb']

rf_y_pred_test = pd.DataFrame(rfr.predict(X_test))
hist_y_pred_test = pd.DataFrame(hbcv.predict(X_test))
cat_y_pred_test = pd.DataFrame(cat.predict(X_test))
xgb_y_pred_test = pd.DataFrame(xgbc.predict(X_test))
lgb_y_pred_test = pd.DataFrame(LGB.predict(X_test))

#concat stacked test
df2_test=pd.concat([rf_y_pred_test, hist_y_pred_test, cat_y_pred_test, xgb_y_pred_test, lgb_y_pred_test], axis=1)
df2_test.columns = ['rf','hist','cat','xgb', 'lgb']

#construct "mega-model" and fit new trainging data

stack_model = CatBoostRegressor(logging_level = 'Silent', random_state=42)
stack_model.fit(df2_dev, y_dev)

#predict with mega-model
stack_y_pred = stack_model.predict(df2_test)

print('stack_r^2:',stack_model.score(df2_test, y_test))
print('MSE:', mean_squared_error(y_test, stack_y_pred))
print(np.sqrt(mean_squared_error(y_test, stack_y_pred)))

stack_r^2: 0.8805685181960389


In [13]:
# delete random forest (time consuming + poor performance)

hist_y_pred_dev = pd.DataFrame(hbcv.predict(X_dev))
cat_y_pred_dev = pd.DataFrame(cat.predict(X_dev))
xgb_y_pred_dev = pd.DataFrame(xgbc.predict(X_dev))
lgb_y_pred_dev = pd.DataFrame(LGB.predict(X_dev))

#concat stacked dev 
df2_dev = pd.concat([hist_y_pred_dev, cat_y_pred_dev, xgb_y_pred_dev, lgb_y_pred_dev], axis=1)
df2_dev.columns = ['hist','cat','xgb', 'lgb']

hist_y_pred_test = pd.DataFrame(hbcv.predict(X_test))
cat_y_pred_test = pd.DataFrame(cat.predict(X_test))
xgb_y_pred_test = pd.DataFrame(xgbc.predict(X_test))
lgb_y_pred_test = pd.DataFrame(LGB.predict(X_test))

#concat stacked test
df2_test=pd.concat([hist_y_pred_test, cat_y_pred_test, xgb_y_pred_test, lgb_y_pred_test], axis=1)
df2_test.columns = ['hist','cat','xgb', 'lgb']

#construct "mega-model" and fit new trainging data

stack_model = CatBoostRegressor(logging_level = 'Silent', random_state=42)
stack_model.fit(df2_dev, y_dev)

#predict with mega-model
stack_y_pred = stack_model.predict(df2_test)


# stack_absolute_error = mean_absolute_error(y_test, stack_y_pred)
# print('stack_absolute_error: ', stack_absolute_error)
print('stack_r^2:',stack_model.score(df2_test, y_test))
print('MSE:', mean_squared_error(y_test, stack_y_pred))
print(np.sqrt(mean_squared_error(y_test, stack_y_pred)))

stack_r^2: 0.8821581194070621
MSE: 121.76099325472501
11.034536386034757
