In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from sklearn.neural_network import MLPRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error,mean_absolute_error

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


import warnings
# Suppress DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)




In [2]:
# Make the parent of the current working directory importable so the project
# packages (`utils`, `src`) resolve. This relies on the kernel's working
# directory being one level below the project root.
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guarded so that re-running this cell does not keep appending duplicates.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# NOTE(review): the star imports hide where helper names used in later cells
# (e.g. get_absolute_path, column_data_extended_types, EVAL_METRIC_LIST) are
# defined -- consider importing them explicitly once their origin is confirmed.
from utils.utils import *
from utils.constants import *

In [3]:
# Load the target tables and the stacked feature tables from the 'data' directory.
y_train, y_test = (
    pd.read_csv(get_absolute_path(csv_name, 'data'))
    for csv_name in ('y_train.csv', 'y_test.csv')
)

# The stacked feature frames are cast to the project-defined column dtypes
# immediately after being read.
stack_train, stack_test = (
    pd.read_csv(get_absolute_path(csv_name, 'data')).astype(column_data_extended_types)
    for csv_name in ('stacked_X_tr.csv', 'stacked_X_te.csv')
)

# Select numeric and categorical columns.
# Only float64 columns are treated as numeric features.
numeric_columns = stack_train.select_dtypes(include=['float64']).columns
categorical_columns = [
    'Location_ID',
    'Month',
    'Week',
    'Weekday',
    'Season',
]  # 'Date' and 'Year' are currently not encoded; add further categorical columns here


def _to_dense(matrix):
    """Return `matrix` as a dense ndarray.

    ColumnTransformer only returns a scipy sparse matrix when the stacked
    output is sparse enough (density below its `sparse_threshold`); otherwise
    it already returns a dense array, on which `.toarray()` would raise
    AttributeError. This helper accepts both cases.
    """
    return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)


# Create preprocessing transformers
numeric_transformer = StandardScaler()  # other scalers can be swapped in here
# One-hot encode the categorical columns, keeping every category (drop=None).
# NOTE(review): with the default handle_unknown setting, a category that occurs
# only in the test data makes `transform` raise -- confirm this is intended.
categorical_transformer = OneHotEncoder(drop=None)

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
    ]
)

# Fit the preprocessor on training data only, then apply it to train and test
X_train_preprocessed = preprocessor.fit_transform(stack_train)
X_test_preprocessed = preprocessor.transform(stack_test)

# Column names produced by the one-hot encoder
categorical_encoded_columns = (
    preprocessor.named_transformers_['cat']
    .get_feature_names_out(input_features=categorical_columns)
)

# Output column order follows the transformer order: numeric first, then encoded
preprocessed_columns = np.concatenate([numeric_columns, categorical_encoded_columns])

# Convert X_train_preprocessed and X_test_preprocessed to DataFrames
X_train_preprocessed_df = pd.DataFrame(_to_dense(X_train_preprocessed), columns=preprocessed_columns)
X_test_preprocessed_df = pd.DataFrame(_to_dense(X_test_preprocessed), columns=preprocessed_columns)


In [4]:
from src.model_tuning import *

In [8]:
# Path of the 'best_xgb_model.joblib' artifact under the 'results' directory.
best_xgb_file = get_absolute_path(
    file_name='best_xgb_model.joblib',
    rel_path='results',
)

# Negated RMSE: make_scorer treats larger values as better by default, so the
# error is negated for GridSearchCV to maximise. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6. For a single target
# column this is the same value `squared=False` produced.
scoring = make_scorer(
    lambda y_true, y_pred: -np.sqrt(mean_squared_error(y_true, y_pred))
)

# Feature matrices used for tuning and evaluation. y_train / y_test are used
# as loaded, so the former no-op self-assignments were removed.
X_train = X_train_preprocessed_df
X_test = X_test_preprocessed_df

# Hyperparameter grid. Only max_depth=3 is active; the remaining options are
# kept commented out as the intended wider search space.
param_grid = {
    'max_depth': [3],  # , 5, 7],
    # 'learning_rate': [0.1, 0.01],
    # 'n_estimators': [100, 200, 500],
    # 'subsample': [0.8, 1.0],
    # 'colsample_bytree': [0.8, 1.0],
    # 'gamma': [0, 0.1, 0.5],
    # 'min_child_weight': [1, 5, 10, 20]
}
model = xgb.XGBRegressor(objective='reg:squarederror')
eval_func = compute_metrics
file_path = best_xgb_file  # not referenced by the cells shown in this section
cv = 5  # number of cross-validation folds

In [9]:
# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=6, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters, the refitted best model and its mean CV score
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

# Test-set predictions are computed in the evaluation cell that follows, so the
# duplicate `predict` call that used to sit here was removed.

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [10]:
# Predict on the test features (X_test) using the best model
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics from forecast/actual pairs
_df = pd.DataFrame(
    {
        'forecast': y_pred,
        'actual': y_test['measurement']
    }
)
eval_metrics = eval_func(_df, EVAL_METRIC_LIST)

# Collect feature importance scores when the estimator exposes them.
# Checking for the attribute (rather than special-casing MLPRegressor) gives
# the same result for MLP and tree-based models, and also keeps this cell from
# crashing on any other estimator without `feature_importances_`.
feature_importance = getattr(best_model, 'feature_importances_', None)
if feature_importance is None:
    feature_importance_dict = {}
else:
    feature_names = X_train.columns
    feature_importance_dict = dict(zip(feature_names, feature_importance))

# Assemble the tuning summary (nothing is written to disk in this cell)
best_model_info = {
    'best_params': best_params,
    'best_score': best_score,
    'feature_importance': feature_importance_dict,
    'eval_metrics': eval_metrics
}


In [11]:
# Display the summary dict: best params, best CV score, feature importances, eval metrics.
best_model_info

{'best_params': {'max_depth': 3},
 'best_score': -0.010876995817059615,
 'feature_importance': {'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)': 0.01882037,
  'pH, water, unfiltered, field, standard units (Maximum)': 0.0058287554,
  'pH, water, unfiltered, field, standard units (Minimum)': 0.0051923725,
  'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)': 0.0035735324,
  'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)': 0.018743103,
  'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)': 0.46717146,
  'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)': 0.002534969,
  'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)': 0.012307011,
  'Temperature, water, degrees Celsius (Mean)': 0.0040799812,
  'Temperature, water, degrees Celsius (Minimum)': 0.010898472,
  'Temperature, water, d