In [1]:
import pandas as pd
import numpy as np
import os
import sys
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import json

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
import os
os.getcwd()
os.chdir('../')



In [3]:

# root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# sys.path.append(root_dir)

from utils.utils     import *
from utils.constants import *
from utils.metrics   import *
from src.model_tuning import *

In [4]:
path_ = os.getcwd()

In [5]:

y_train = pd.read_csv(path_ + '\\data' + '\\y_train.csv')

y_test = pd.read_csv(path_ + '\\data' + '\\y_test.csv')


stack_train = pd.read_csv(path_ + '\\data' + '\\stacked_X_tr.csv')
stack_test  = pd.read_csv(path_ + '\\data' + '\\stacked_X_te.csv')


# y_train = pd.read_csv(get_absolute_path('y_train.csv', 'data'))
# y_test = pd.read_csv(get_absolute_path('y_test.csv', 'data'))


# stack_train = pd.read_csv(get_absolute_path('stacked_X_tr.csv', 'data'))
# stack_test  = pd.read_csv(get_absolute_path('stacked_X_te.csv', 'data'))




In [6]:

stack_train = stack_train.astype(column_data_extended_types)
stack_test = stack_test.astype(column_data_extended_types)

In [7]:
######## Feature Engineering ##########

# Select numeric and categorical columns
numeric_columns = stack_train.select_dtypes(include=['float64']).columns
categorical_columns = [#'Date', 
                       'Location_ID',
                    #    'Year',
                       'Month',
                       'Week',
                       'Weekday',
                       'Season'
                       ]  # Add any categorical columns here

# Create preprocessing transformers
numeric_transformer = StandardScaler()  # we can use other scalers as well
categorical_transformer = OneHotEncoder(drop=None)  # Use one-hot encoding for categorical columns

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Fit the preprocessor on training data and transform both train and test data
X_train_preprocessed = preprocessor.fit_transform(stack_train)
X_test_preprocessed  = preprocessor.transform(stack_test)


# Get the column names after one-hot encoding
categorical_encoded_columns = preprocessor.named_transformers_['cat']\
                                    .get_feature_names_out(input_features=categorical_columns)

# Convert X_train_preprocessed and X_test_preprocessed to DataFrames

X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed.toarray(), columns=np.concatenate([numeric_columns, categorical_encoded_columns]))
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed.toarray(), columns=np.concatenate([numeric_columns, categorical_encoded_columns]))

scoring=make_scorer(lambda y_true, y_pred: -mean_squared_error(y_true, y_pred, squared=False))




X_train_preprocessed_df = X_train_preprocessed_df[column_names_raw]
X_test_preprocessed_df = X_test_preprocessed_df[column_names_raw]

# Models

In [8]:
######### XGBoost #########

# best_xgb_file = get_absolute_path(
#     file_name = 'best_xgb_model.joblib'
#     , rel_path = 'results'
# )

best_xgb_file =  path_ + '\\results' + '\\best_xgb_model.joblib'


xgb_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.01],
            'n_estimators': [100, 200, 500, 800],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'gamma': [0, 0.1, 0.5],
            'min_child_weight': [1, 5, 10, 20],
            'random_state': [RANDOM_SEED]
        },
        model=xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED),
        scoring=scoring,
        eval_func=compute_metrics,
        file_path=best_xgb_file,
        cv=5
) 


print("Success of xgboost!")


Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
best_params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 20, 'n_estimators': 800, 'random_state': 827, 'subsample': 0.8}
eval_metrics: {'rmse': 0.010957830248512337, 'mape': 0.009221277715411659, 'wmape': 0.009363284939395644, 'wbias': 0.0009845337036232251, 'wuforec': 0.005173909321509434, 'woforec': 0.00418937561788621}
Success of xgboost!


In [9]:
with open((path_ + '\\results' + "\\xgb_result.json"), "w") as outfile:
    json.dump(str(xgb_result), outfile)


In [10]:

######### Random Forest #########

# best_rf_file = get_absolute_path(
#     file_name = 'best_rf_model.joblib'
#     , rel_path = 'results'
# )

best_rf_file =  path_ + '\\results' + '\\best_rf_model.joblib'


rf_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
           'n_estimators': [50, 100, 200, 500],
           'max_depth': [None, 3, 5, 10, 20],
           'min_samples_split': [1, 2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [RANDOM_SEED]
        },
        model=RandomForestRegressor(random_state=RANDOM_SEED),
        scoring=scoring,
        eval_func=compute_metrics,
        file_path=best_rf_file,
        cv=5
) 

print("Success of rf!")



Fitting 5 folds for each of 240 candidates, totalling 1200 fits


300 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "D:\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "D:\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Inval

best_params: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 500, 'random_state': 827}


  return fit_method(estimator, *args, **kwargs)


eval_metrics: {'rmse': 0.011471895705663855, 'mape': 0.009710125117257443, 'wmape': 0.009865323382071042, 'wbias': 0.0008373675309925635, 'wuforec': 0.005351345456531802, 'woforec': 0.004513977925539239}
Success of rf!


In [11]:
with open((path_ + '\\results' + "\\rf_result.json"), "w") as outfile:
    json.dump(str(rf_result), outfile)

In [12]:


######### MLP #########

# best_mlp_file = get_absolute_path(
#     file_name = 'best_mlp_model.joblib'
#     , rel_path = 'results'
# )

best_mlp_file =  path_ + '\\results' + '\\best_mlp_model.joblib'



mlp_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid= {
            'hidden_layer_sizes': list(zip([1000,2000, 3000, 5000], [300, 500, 800, 1000])),
            #'max_iter': [50, 100],
            'activation': ['tanh', 'relu'],
            #'solver': ['sgd', 'adam'],
            #'alpha': [0.0001, 0.05],
            #'learning_rate': ['constant','adaptive'],
            'random_state': [RANDOM_SEED]
        },
        model=MLPRegressor(random_state=RANDOM_SEED),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_mlp_file,
        cv=5
) 

print("Success of MLP!")



Fitting 5 folds for each of 8 candidates, totalling 40 fits


  y = column_or_1d(y, warn=True)


best_params: {'activation': 'relu', 'hidden_layer_sizes': (3000, 800), 'random_state': 827}


  y = column_or_1d(y, warn=True)


eval_metrics: {'rmse': 0.01419767849167211, 'mape': 0.012918802192416545, 'wmape': 0.013160718432543647, 'wbias': -0.005498877971762316, 'wuforec': 0.003830920230390667, 'woforec': 0.009329798202152982}
Success of MLP!


In [13]:
with open((path_ + '\\results' + "\\mlp_result.json"), "w") as outfile:
    json.dump(str(mlp_result), outfile)

In [14]:

######### ElasticNet #########

# best_lin_file = get_absolute_path(
#     file_name = 'best_lin_model.joblib'
#     , rel_path = 'results'
# )

best_lin_file =  path_ + '\\results' + '\\best_lin_model.joblib'


lin_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
            "alpha":[0.01, 0.001, 0.0001, 0.00001, 0.000001],
            "l1_ratio": [0, 0.1, 0.3, 0.5, 0.8, 1],
            'random_state': [RANDOM_SEED]
        },
        model=ElasticNet(random_state=RANDOM_SEED).fit(X_train_preprocessed_df,y_train),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_lin_file,
        cv=5
) 

print("Success of ElasticNet!")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
best_params: {'alpha': 1e-06, 'l1_ratio': 0, 'random_state': 827}
eval_metrics: {'rmse': 0.012036930325712495, 'mape': 0.010742294902159272, 'wmape': 0.010915650002954672, 'wbias': 0.00040647847810628844, 'wuforec': 0.0056610642405304805, 'woforec': 0.0052545857624241915}
Success of ElasticNet!


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [15]:
with open((path_ + '\\results' + "\\lin_result.json"), "w") as outfile:
    json.dump(str(lin_result), outfile)

In [16]:

######### lightGBM #########

## rename the features for an feature name error (only for lightGBM).
import re
X_train_preprocessed_df = X_train_preprocessed_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
X_test_preprocessed_df = X_test_preprocessed_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


# best_best_lightGBM_file = get_absolute_path(
#     file_name = 'best_lightGBM_model.joblib'
#     , rel_path = 'results'
# )

best_lightGBM_file =  path_ + '\\results' + '\\best_lightGBM_model.joblib'


lightGBM_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
                'max_depth': [3, 5, 7, 15],
                'learning_rate': [0.5, 0.1, 0.01, 0.001],
                'num_leaves': [20, 31, 50, 100,200],
                'n_estimators': [50, 100, 200, 500, 800],
                'random_state': [RANDOM_SEED]
        },
        model=lgb.LGBMRegressor(objective='regression', metric='rmse', random_state=RANDOM_SEED).fit(X_train_preprocessed_df,y_train),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_lightGBM_file,
        cv=5
) 

print("Success of LightGBM!")


Fitting 5 folds for each of 400 candidates, totalling 2000 fits
best_params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'num_leaves': 20, 'random_state': 827}
eval_metrics: {'rmse': 0.010843662111841292, 'mape': 0.009327065730018333, 'wmape': 0.009465059255443473, 'wbias': 0.0008620482732707277, 'wuforec': 0.005163553764357101, 'woforec': 0.004301505491086373}
Success of LightGBM!


In [17]:
with open((path_ + '\\results' + "\\lightGBM_model.json"), "w") as outfile:
    json.dump(str(lightGBM_result), outfile)