In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error


# Ignore every DeprecationWarning raised after this point so library
# deprecation notices do not clutter the cell outputs.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)




In [2]:
# Make the parent of the current working directory importable so the local
# `utils` package can be resolved from this notebook.
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guard the append: without it, every re-run of this cell adds another
# duplicate entry to sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# NOTE(review): star imports hide the origin of helpers used below
# (get_absolute_path, save_model, ...); prefer explicit names once the
# contents of these modules are confirmed.
from utils.utils import *
from utils.constants import *

# Data

To make a valid comparison across different methods, we split the original `df_train` into new training and validation data sets.

In [3]:
# Read the train/test features and targets plus the stacked feature sets.
# Every file is located with get_absolute_path(<name>, 'data') and the
# frames are unpacked in the same order as the file names listed below.
(
    df_train,
    y_train,
    df_test,
    y_test,
    stack_train,
    stack_test,
) = (
    pd.read_csv(get_absolute_path(csv_name, 'data'))
    for csv_name in (
        'X_train.csv',
        'y_train.csv',
        'X_test.csv',
        'y_test.csv',
        'stacked_X_tr.csv',
        'stacked_X_te.csv',
    )
)



# XGBoost

Tune the XGBoost hyperparameters with a 5-fold cross-validated grid search, then evaluate the best model on the held-out data.

In [4]:

# Native-API DMatrix of the training data. The scikit-learn-style grid search
# below is fitted on the DataFrames directly and does not consume `dtrain`.
dtrain = xgb.DMatrix(df_train, label=y_train)

# Hyperparameter grid explored by the search (3 x 3 x 3 = 27 combinations).
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200]
}

# Base estimator; the grid search clones it for every parameter combination.
model_xgb = xgb.XGBRegressor(objective='reg:squarederror')

# GridSearchCV *maximizes* the score, so the error metric has to be negated.
# scikit-learn ships this as a built-in scorer, which replaces the hand-written
# lambda and its use of the deprecated `squared=False` argument.
scoring = 'neg_root_mean_squared_error'

# 5-fold cross-validated search over the grid.
grid_search = GridSearchCV(model_xgb, param_grid, cv=5, scoring=scoring)
grid_search.fit(df_train, y_train)

# Best configuration found; best_score_ is the negated RMSE, so flip the sign
# back to report a positive cross-validated RMSE.
best_xgb_params = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_
best_xgb_score = -grid_search.best_score_

print("Best Hyperparameters:", best_xgb_params)

# Predict on the held-out data with the refitted best model.
y_pred_xgb = best_xgb_model.predict(df_test)

# RMSE = sqrt(MSE). Computed explicitly because `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6. Assumes a single target column, where
# both forms give the same value -- TODO confirm y_test has one column.
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print("XGBoost RMSE on Validation Data with Best Model:", rmse_xgb)


NameError: name 'best_params' is not defined

In [None]:
# Metadata stored next to the tuned XGBoost model. Uses `best_xgb_score`, the
# name actually assigned by the grid-search cell, and carries the same keys as
# the Random Forest metadata (best_params / best_score / rmse).
best_xgb_model_info = {
    'best_params': best_xgb_params,
    'best_score': best_xgb_score,
    'rmse': rmse_xgb,
}

# Destination path, resolved by get_absolute_path under 'results'.
best_xgb_file = get_absolute_path(
    file_name='best_xgb_model.joblib',
    rel_path='results'
)

# save_model is a project helper; presumably it writes the model together with
# its metadata to `best_xgb_file` -- verify against utils.
save_model(best_xgb_file, best_xgb_model, best_xgb_model_info)


In [None]:
# # Load the model and its info
# loaded_model, loaded_model_info = load_model(best_xgb_file)

# Random Forest

In [27]:
# Hyperparameter grid explored by the search (3^4 = 81 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Fixed seed so the bootstrap/feature sampling, and therefore the selected
# hyperparameters and reported scores, are reproducible across runs.
model_rf = RandomForestRegressor(random_state=42)

# GridSearchCV *maximizes* the score, so the error metric has to be negated.
# scikit-learn ships this as a built-in scorer, which replaces the hand-written
# lambda and its use of the deprecated `squared=False` argument.
scoring = 'neg_root_mean_squared_error'

# 5-fold cross-validated search: 81 combinations x 5 folds = 405 fits, so the
# candidate fits are spread over all available cores with n_jobs=-1.
grid_search = GridSearchCV(model_rf, param_grid, cv=5, scoring=scoring, n_jobs=-1)

# Fit on the 1-D `measurement` column rather than the whole y_train frame
# (scikit-learn warns when a regressor's y is passed as a column vector).
grid_search.fit(df_train, y_train.measurement)

# Best configuration found; best_score_ is the negated RMSE, so flip the sign
# back to report a positive cross-validated RMSE.
best_rf_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_
best_rf_score = -grid_search.best_score_

print("Best Hyperparameters:", best_rf_params)

# Predict on the held-out data with the refitted best model.
y_pred_rf = best_rf_model.predict(df_test)

# RMSE = sqrt(MSE). Computed explicitly because `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6. Assumes a single target column, where
# both forms give the same value -- TODO confirm y_test has one column.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("RandomForest RMSE on Validation Data with Best Model:", rmse_rf)


KeyboardInterrupt: 

In [None]:
# Persist the tuned Random Forest along with its search results.
# The metadata holds the winning hyperparameters, the cross-validated score,
# and the RMSE measured on the held-out data.
best_rf_model_info = dict(
    best_params=best_rf_params,
    best_score=best_rf_score,
    rmse=rmse_rf,
)

# Destination path, resolved by get_absolute_path under 'results'.
best_rf_file = get_absolute_path(file_name='best_rf_model.joblib', rel_path='results')

save_model(best_rf_file, best_rf_model, best_rf_model_info)