In [1]:
import pandas as pd
import numpy as np
import os
import sys
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import json

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
import os
os.getcwd()
os.chdir('../')



In [3]:

# root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# sys.path.append(root_dir)

from utils.utils     import *
from utils.constants import *
from utils.metrics   import *
from src.model_tuning import *

In [4]:
path_ = os.getcwd()

In [5]:

y_train = pd.read_csv(path_ + '\\data' + '\\y_train.csv')

y_test = pd.read_csv(path_ + '\\data' + '\\y_test.csv')


stack_train = pd.read_csv(path_ + '\\data' + '\\stacked_X_tr.csv')
stack_test  = pd.read_csv(path_ + '\\data' + '\\stacked_X_te.csv')


# y_train = pd.read_csv(get_absolute_path('y_train.csv', 'data'))
# y_test = pd.read_csv(get_absolute_path('y_test.csv', 'data'))


# stack_train = pd.read_csv(get_absolute_path('stacked_X_tr.csv', 'data'))
# stack_test  = pd.read_csv(get_absolute_path('stacked_X_te.csv', 'data'))




In [6]:

stack_train = stack_train.astype(column_data_extended_types)
stack_test = stack_test.astype(column_data_extended_types)

In [7]:
######## Feature Engineering ##########

# Select numeric and categorical columns
numeric_columns = stack_train.select_dtypes(include=['float64']).columns
categorical_columns = [#'Date', 
                       'Location_ID',
                    #    'Year',
                       'Month',
                       'Week',
                       'Weekday',
                       'Season'
                       ]  # Add any categorical columns here

# Create preprocessing transformers
numeric_transformer = StandardScaler(with_mean = False, with_std = False)  # we can use other scalers as well
categorical_transformer = OneHotEncoder(drop=None)  # Use one-hot encoding for categorical columns

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Fit the preprocessor on training data and transform both train and test data
X_train_preprocessed = preprocessor.fit_transform(stack_train)
X_test_preprocessed  = preprocessor.transform(stack_test)


# Get the column names after one-hot encoding
categorical_encoded_columns = preprocessor.named_transformers_['cat']\
                                    .get_feature_names_out(input_features=categorical_columns)

# Convert X_train_preprocessed and X_test_preprocessed to DataFrames

X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed.toarray(), columns=np.concatenate([numeric_columns, categorical_encoded_columns]))
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed.toarray(), columns=np.concatenate([numeric_columns, categorical_encoded_columns]))

scoring=make_scorer(lambda y_true, y_pred: -mean_squared_error(y_true, y_pred, squared=False))




X_train_preprocessed_df = X_train_preprocessed_df[column_names_raw]
X_test_preprocessed_df = X_test_preprocessed_df[column_names_raw]

In [8]:
column_names_raw

['Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)',
 'pH, water, unfiltered, field, standard units (Maximum)',
 'pH, water, unfiltered, field, standard units (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)',
 'Temperature, water, degrees Celsius (Mean)',
 'Temperature, water, degrees Celsius (Minimum)',
 'Temperature, water, degrees Celsius (Maximum)']

In [9]:
X_train_preprocessed_df

Unnamed: 0,"Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)","pH, water, unfiltered, field, standard units (Maximum)","pH, water, unfiltered, field, standard units (Minimum)","Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)","Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)","Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)","Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)","Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)","Temperature, water, degrees Celsius (Mean)","Temperature, water, degrees Celsius (Minimum)","Temperature, water, degrees Celsius (Maximum)"
0,0.001131,0.884615,0.001120,0.001113,0.677632,0.841463,0.765152,0.787402,0.293750,0.298077,0.276163
1,0.001170,0.871795,0.001159,0.001152,0.703947,0.829268,0.772727,0.795276,0.293750,0.301282,0.276163
2,0.001326,0.884615,0.001198,0.001250,0.677632,0.853659,0.750000,0.755906,0.300000,0.298077,0.287791
3,0.014094,0.858974,0.001238,0.003926,0.697368,0.829268,0.772727,0.771654,0.296875,0.294872,0.279070
4,0.088109,0.858974,0.010766,0.029297,0.684211,0.853659,0.765152,0.755906,0.296875,0.291667,0.281977
...,...,...,...,...,...,...,...,...,...,...,...
15646,0.005049,0.910256,0.004676,0.004883,0.888158,0.939024,0.719697,0.535433,0.553125,0.490385,0.584302
15647,0.002456,0.884615,0.002417,0.002422,0.644737,0.865854,0.651515,0.622047,0.556250,0.522436,0.584302
15648,0.003821,0.935897,0.003615,0.003750,0.559211,0.890244,0.636364,0.645669,0.571875,0.554487,0.561047
15649,0.001287,0.897436,0.001277,0.001289,0.631579,0.853659,0.712121,0.724409,0.468750,0.464744,0.462209


# Models

In [10]:
######### XGBoost #########

# best_xgb_file = get_absolute_path(
#     file_name = 'best_xgb_model.joblib'
#     , rel_path = 'results'
# )

best_xgb_file =  path_ + '\\results' + '\\best_xgb_model.joblib'


xgb_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
            'max_depth': [3, 5, 7],
            'learning_rate': [0.1, 0.01],
            'n_estimators': [100, 200, 500, 800],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'gamma': [0, 0.1, 0.5],
            'min_child_weight': [1, 5, 10, 20],
            'random_state': [RANDOM_SEED]
        },
        model=xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_SEED),
        scoring=scoring,
        eval_func=compute_metrics,
        file_path=best_xgb_file,
        cv=5
) 


print("Success of xgboost!")


Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
best_params: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 20, 'n_estimators': 800, 'random_state': 827, 'subsample': 0.8}
eval_metrics: {'rmse': 0.010956402573296385, 'mape': 0.009226869765460825, 'wmape': 0.009368407710488712, 'wbias': 0.0009815277242380294, 'wuforec': 0.005174967717363371, 'woforec': 0.004193439993125341}
Success of xgboost!


In [11]:
with open((path_ + '\\results' + "\\xgb_result.json"), "w") as outfile:
    json.dump(str(xgb_result), outfile)


In [12]:

######### Random Forest #########

# best_rf_file = get_absolute_path(
#     file_name = 'best_rf_model.joblib'
#     , rel_path = 'results'
# )

best_rf_file =  path_ + '\\results' + '\\best_rf_model.joblib'


rf_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
           'n_estimators': [50, 100, 200, 500],
           'max_depth': [None, 3, 5, 10, 20],
           'min_samples_split': [1, 2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'random_state': [RANDOM_SEED]
        },
        model=RandomForestRegressor(random_state=RANDOM_SEED),
        scoring=scoring,
        eval_func=compute_metrics,
        file_path=best_rf_file,
        cv=5
) 

print("Success of rf!")



Fitting 5 folds for each of 240 candidates, totalling 1200 fits


300 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "D:\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "D:\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Inval

best_params: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 500, 'random_state': 827}


  return fit_method(estimator, *args, **kwargs)


eval_metrics: {'rmse': 0.011472635648870366, 'mape': 0.009711017192763476, 'wmape': 0.009865953354294495, 'wbias': 0.0008241857087287503, 'wuforec': 0.005345069531511623, 'woforec': 0.004520883822782873}
Success of rf!


In [13]:
with open((path_ + '\\results' + "\\rf_result.json"), "w") as outfile:
    json.dump(str(rf_result), outfile)

In [None]:


######### MLP #########

# best_mlp_file = get_absolute_path(
#     file_name = 'best_mlp_model.joblib'
#     , rel_path = 'results'
# )

best_mlp_file =  path_ + '\\results' + '\\best_mlp_model.joblib'



mlp_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid= {
            'hidden_layer_sizes': list(zip([1000,2000, 3000, 5000], [300, 500, 800, 1000])),
            #'max_iter': [50, 100],
            'activation': ['tanh', 'relu'],
            #'solver': ['sgd', 'adam'],
            #'alpha': [0.0001, 0.05],
            #'learning_rate': ['constant','adaptive'],
            'random_state': [RANDOM_SEED]
        },
        model=MLPRegressor(random_state=RANDOM_SEED),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_mlp_file,
        cv=5
) 

print("Success of MLP!")



Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [15]:
with open((path_ + '\\results' + "\\mlp_result.json"), "w") as outfile:
    json.dump(str(mlp_result), outfile)

In [16]:

######### ElasticNet #########

# best_lin_file = get_absolute_path(
#     file_name = 'best_lin_model.joblib'
#     , rel_path = 'results'
# )

best_lin_file =  path_ + '\\results' + '\\best_lin_model.joblib'


lin_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
            "alpha":[0.01, 0.001, 0.0001, 0.00001, 0.000001],
            "l1_ratio": [0, 0.1, 0.3, 0.5, 0.8, 1],
            'random_state': [RANDOM_SEED]
        },
        model=ElasticNet(random_state=RANDOM_SEED).fit(X_train_preprocessed_df,y_train),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_lin_file,
        cv=5
) 

print("Success of ElasticNet!")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
best_params: {'alpha': 1e-06, 'l1_ratio': 0, 'random_state': 827}
eval_metrics: {'rmse': 0.012034610236826226, 'mape': 0.010723798243071372, 'wmape': 0.010897963796018055, 'wbias': 0.0004093781332156933, 'wuforec': 0.005653670964616873, 'woforec': 0.005244292831401181}
Success of ElasticNet!


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [17]:
with open((path_ + '\\results' + "\\lin_result.json"), "w") as outfile:
    json.dump(str(lin_result), outfile)

In [18]:

######### lightGBM #########

## rename the features for an feature name error (only for lightGBM).
import re
X_train_preprocessed_df = X_train_preprocessed_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
X_test_preprocessed_df = X_test_preprocessed_df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))


# best_best_lightGBM_file = get_absolute_path(
#     file_name = 'best_lightGBM_model.joblib'
#     , rel_path = 'results'
# )

best_lightGBM_file =  path_ + '\\results' + '\\best_lightGBM_model.joblib'


lightGBM_result = hyperparameter_tuning(
        X_train=X_train_preprocessed_df,
        y_train=y_train,
        X_test=X_test_preprocessed_df,
        y_test=y_test,
        param_grid={
                'max_depth': [3, 5, 7, 15],
                'learning_rate': [0.5, 0.1, 0.01, 0.001],
                'num_leaves': [20, 31, 50, 100,200],
                'n_estimators': [50, 100, 200, 500, 800],
                'random_state': [RANDOM_SEED]
        },
        model=lgb.LGBMRegressor(objective='regression', metric='rmse', random_state=RANDOM_SEED).fit(X_train_preprocessed_df,y_train),
        scoring="neg_root_mean_squared_error",
        eval_func=compute_metrics,
        file_path=best_lightGBM_file,
        cv=5
) 

print("Success of LightGBM!")


Fitting 5 folds for each of 400 candidates, totalling 2000 fits
best_params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'num_leaves': 20, 'random_state': 827}
eval_metrics: {'rmse': 0.010882228939728117, 'mape': 0.009335734029470626, 'wmape': 0.00947571145161268, 'wbias': 0.0009194205660468334, 'wuforec': 0.005197566008829758, 'woforec': 0.004278145442782924}
Success of LightGBM!


In [19]:
with open((path_ + '\\results' + "\\lightGBM_model.json"), "w") as outfile:
    json.dump(str(lightGBM_result), outfile)