In [1]:
import pandas as pd
import numpy as np
import os
import sys
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import json

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
import os
os.getcwd()
os.chdir('../')



In [3]:

# root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# sys.path.append(root_dir)

from utils.utils     import *
from utils.constants import *
from utils.metrics   import *
from src.model_tuning import *

In [4]:
path_ = os.getcwd()

In [5]:

y_train = pd.read_csv(path_ + '\\data' + '\\y_train.csv')

y_test = pd.read_csv(path_ + '\\data' + '\\y_test.csv')


stack_train = pd.read_csv(path_ + '\\data' + '\\stacked_X_tr.csv')
stack_test  = pd.read_csv(path_ + '\\data' + '\\stacked_X_te.csv')


# y_train = pd.read_csv(get_absolute_path('y_train.csv', 'data'))
# y_test = pd.read_csv(get_absolute_path('y_test.csv', 'data'))


# stack_train = pd.read_csv(get_absolute_path('stacked_X_tr.csv', 'data'))
# stack_test  = pd.read_csv(get_absolute_path('stacked_X_te.csv', 'data'))




In [6]:

stack_train = stack_train.astype(column_data_extended_types)
stack_test = stack_test.astype(column_data_extended_types)

In [7]:
######## Feature Engineering ##########

# Select numeric and categorical columns
numeric_columns = stack_train.select_dtypes(include=['float64']).columns
categorical_columns = [#'Date', 
                       'Location_ID',
                    #    'Year',
                       'Month',
                       'Week',
                       'Weekday',
                       'Season'
                       ]  # Add any categorical columns here

# Create preprocessing transformers
numeric_transformer = StandardScaler()  # we can use other scalers as well
categorical_transformer = OneHotEncoder(drop=None)  # Use one-hot encoding for categorical columns

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Fit the preprocessor on training data and transform both train and test data
X_train_preprocessed = preprocessor.fit_transform(stack_train)
X_test_preprocessed  = preprocessor.transform(stack_test)


# Get the column names after one-hot encoding
categorical_encoded_columns = preprocessor.named_transformers_['cat']\
                                    .get_feature_names_out(input_features=categorical_columns)

# Convert X_train_preprocessed and X_test_preprocessed to DataFrames

X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed.toarray(), columns=np.concatenate([numeric_columns, categorical_encoded_columns]))
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed.toarray(), columns=np.concatenate([numeric_columns, categorical_encoded_columns]))

scoring=make_scorer(lambda y_true, y_pred: -mean_squared_error(y_true, y_pred, squared=False))

In [19]:
X_train_preprocessed_df.to_csv('aaa.csv')

# Models

In [9]:
######### XGBoost #########

# best_xgb_file = get_absolute_path(
#     file_name = 'best_xgb_model.joblib'
#     , rel_path = 'results'
# )

best_xgb_file =  path_ + '\\results' + '\\best_xgb_model.joblib'


xgb_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.01],
            'n_estimators': [100, 200, 500, 800],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'gamma': [0, 0.1, 0.5],
            'min_child_weight': [1, 5, 10, 20],
            'random_state': [RANDOM_SEED]
        },
        model=xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED),
        scoring=scoring,
        eval_func=compute_metrics,
        file_path=best_xgb_file,
        cv=5
) 


print("Success of xgboost!")


Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
best_params: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 20, 'n_estimators': 500, 'random_state': 827, 'subsample': 0.8}
1
eval_metrics: {'rmse': 0.011139175966361239, 'mape': 0.009287988656448153, 'wmape': 0.009435331851106944, 'wbias': 0.0011219685420847771, 'wuforec': 0.00527865019659586, 'woforec': 0.004156681654511083}
Success of xgboost!


In [10]:
with open((path_ + '\\results' + "\\xgb_result.json"), "w") as outfile:
    json.dump(str(xgb_result), outfile)


In [11]:

######### Random Forest #########

# best_rf_file = get_absolute_path(
#     file_name = 'best_rf_model.joblib'
#     , rel_path = 'results'
# )

best_rf_file =  path_ + '\\results' + '\\best_rf_model.joblib'


rf_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
           'n_estimators': [50, 100, 200, 500],
           'max_depth': [None, 3, 5, 10, 20],
           'min_samples_split': [1, 2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [RANDOM_SEED]
        },
        model=RandomForestRegressor(random_state=RANDOM_SEED),
        scoring=scoring,
        eval_func=compute_metrics,
        file_path=best_rf_file,
        cv=5
) 

print("Success of rf!")



Fitting 5 folds for each of 240 candidates, totalling 1200 fits


300 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "D:\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "D:\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Inval

best_params: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 500, 'random_state': 827}


  return fit_method(estimator, *args, **kwargs)


1
eval_metrics: {'rmse': 0.011711301749508506, 'mape': 0.009704659203291449, 'wmape': 0.009874166809129547, 'wbias': 0.0006525708211954504, 'wuforec': 0.005263368815162498, 'woforec': 0.004610797993967048}
Success of rf!


In [12]:
with open((path_ + '\\results' + "\\rf_result.json"), "w") as outfile:
    json.dump(str(rf_result), outfile)

In [13]:


######### MLP #########

# best_mlp_file = get_absolute_path(
#     file_name = 'best_mlp_model.joblib'
#     , rel_path = 'results'
# )

best_mlp_file =  path_ + '\\results' + '\\best_mlp_model.joblib'



mlp_result = hyperparameter_tuning(
        X_train=X_test_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid= {
            'hidden_layer_sizes': list(zip([1000,2000, 3000, 5000], [300, 500, 800, 1000])),
            #'max_iter': [50, 100],
            'activation': ['tanh', 'relu'],
            #'solver': ['sgd', 'adam'],
            #'alpha': [0.0001, 0.05],
            #'learning_rate': ['constant','adaptive'],
            'random_state': [RANDOM_SEED]
        },
        model=MLPRegressor(random_state=RANDOM_SEED),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_mlp_file,
        cv=5
) 

print("Success of MLP!")



Fitting 5 folds for each of 8 candidates, totalling 40 fits


  y = column_or_1d(y, warn=True)


best_params: {'activation': 'tanh', 'hidden_layer_sizes': (2000, 500), 'random_state': 827}


  y = column_or_1d(y, warn=True)


1
eval_metrics: {'rmse': 0.014420387318514635, 'mape': 0.014541122244407076, 'wmape': 0.014607689864116317, 'wbias': 0.003969996490200376, 'wuforec': 0.009288843177158347, 'woforec': 0.005318846686957972}
Success of MLP!


In [14]:
with open((path_ + '\\results' + "\\mlp_result.json"), "w") as outfile:
    json.dump(str(mlp_result), outfile)

In [15]:

######### ElasticNet #########

# best_lin_file = get_absolute_path(
#     file_name = 'best_lin_model.joblib'
#     , rel_path = 'results'
# )

best_lin_file =  path_ + '\\results' + '\\best_lin_model.joblib'


lin_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
            "alpha":[0.01, 0.001, 0.0001, 0.00001, 0.000001],
            "l1_ratio": [0, 0.1, 0.3, 0.5, 0.8, 1],
            'random_state': [RANDOM_SEED]
        },
        model=ElasticNet(random_state=RANDOM_SEED).fit(X_train_preprocessed_df,y_train),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_lin_file,
        cv=5
) 

print("Success of ElasticNet!")


Fitting 5 folds for each of 30 candidates, totalling 150 fits


  model = cd_fast.enet_coordinate_descent(


best_params: {'alpha': 1e-05, 'l1_ratio': 1, 'random_state': 827}
1
eval_metrics: {'rmse': 0.011913692392905538, 'mape': 0.010792014828810443, 'wmape': 0.010957849504397259, 'wbias': 0.0013146181043679776, 'wuforec': 0.006136233804382618, 'woforec': 0.0048216157000146405}
Success of ElasticNet!


  model = cd_fast.enet_coordinate_descent(


In [16]:
with open((path_ + '\\results' + "\\lin_result.json"), "w") as outfile:
    json.dump(str(lin_result), outfile)

In [17]:

######### lightGBM #########

## rename the features for an feature name error (only for lightGBM).
import re
X_train_preprocessed_df = X_train_preprocessed_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
X_test_preprocessed_df = X_test_preprocessed_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


# best_best_lightGBM_file = get_absolute_path(
#     file_name = 'best_lightGBM_model.joblib'
#     , rel_path = 'results'
# )

best_lightGBM_file =  path_ + '\\results' + '\\best_lightGBM_model.joblib'


lightGBM_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
                'max_depth': [3, 5, 7, 15],
                'learning_rate': [0.5, 0.1, 0.01, 0.001],
                'num_leaves': [20, 31, 50, 100,200],
                'n_estimators': [50, 100, 200, 500, 800],
                'random_state': [RANDOM_SEED]
        },
        model=lgb.LGBMRegressor(objective='regression', metric='rmse', random_state=RANDOM_SEED).fit(X_train_preprocessed_df,y_train),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_lightGBM_file,
        cv=5
) 

print("Success of LightGBM!")


Fitting 5 folds for each of 400 candidates, totalling 2000 fits
best_params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 800, 'num_leaves': 20, 'random_state': 827}
1
eval_metrics: {'rmse': 0.011040168862164742, 'mape': 0.009217976090447567, 'wmape': 0.009357888399115846, 'wbias': 0.0009469266898897603, 'wuforec': 0.005152407544502803, 'woforec': 0.004205480854613044}
Success of LightGBM!


In [18]:
with open((path_ + '\\results' + "\\lightGBM_model.json"), "w") as outfile:
    json.dump(str(lightGBM_result), outfile)