# Week 8 Notebook: Model training, hyperparameter tuning, and model evaluation
The goal of this week's assignment is to train a third modeling method and evaluate it under three different hyperparameter settings.

### Import packages

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor


### Read data as dataframe

In [2]:
# Resolve the project's data directories relative to the notebook's working
# directory: <parent of cwd>/data/{raw,interim,processed}.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_folder = os.path.join(parent_dir, "data")

raw_data_folder, interim_data_folder, processed_data_folder = (
    os.path.join(data_folder, stage) for stage in ("raw", "interim", "processed")
)

In [3]:
# Every file below lives in the processed-data folder; each group is built in
# train / val / test order.

# X data paths: scaled feature matrices
X_train_scaled_path, X_val_scaled_path, X_test_scaled_path = (
    os.path.join(processed_data_folder, filename)
    for filename in (
        'X_train_scaled.parquet',
        'X_val_scaled.parquet',
        'X_test_scaled.parquet',
    )
)

# X data paths: PCA-transformed feature matrices
train_pca_path, val_pca_path, test_pca_path = (
    os.path.join(processed_data_folder, filename)
    for filename in (
        'X_train_pca.parquet',
        'X_val_pca.parquet',
        'X_test_pca.parquet',
    )
)

# Y data paths: targets
y_train_path, y_val_path, y_test_path = (
    os.path.join(processed_data_folder, filename)
    for filename in (
        'y_train.parquet',
        'y_val.parquet',
        'y_test.parquet',
    )
)

In [4]:
# Load each processed parquet file into its own DataFrame.

# Scaled feature matrices
X_train_scaled, X_val_scaled, X_test_scaled = map(
    pd.read_parquet,
    (X_train_scaled_path, X_val_scaled_path, X_test_scaled_path),
)

# Targets
y_train, y_val, y_test = map(
    pd.read_parquet,
    (y_train_path, y_val_path, y_test_path),
)

# PCA-transformed feature matrices
X_train_pca, X_val_pca, X_test_pca = map(
    pd.read_parquet,
    (train_pca_path, val_pca_path, test_pca_path),
)

### Modeling

In [5]:
# Helpers to fit a model and score it on the training and validation splits
def evaluate_model(y_true, y_pred):
    """Compute regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned row-for-row with ``y_true``.

    Returns
    -------
    tuple
        ``(mse, rmse, r2)`` -- the order in which ``fit_and_evaluate``
        unpacks the result.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, r2


def _as_1d_target(y):
    """Return a single-column DataFrame as a Series; pass anything else through."""
    if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
        return y.iloc[:, 0]
    return y


def fit_and_evaluate(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and score it on train and validation.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. The object is
        fitted in place, so the caller can inspect it afterwards.
    X_train, y_train : array-like
        Training features and targets.
    X_val, y_val : array-like
        Validation features and targets.

    Returns
    -------
    dict
        Keys ``train_mse``, ``train_rmse``, ``train_r2``, ``val_mse``,
        ``val_rmse`` and ``val_r2``.
    """
    # Targets loaded with pd.read_parquet are DataFrames; a one-column frame is
    # a column vector, which makes scikit-learn estimators that expect a 1-D
    # target emit a DataConversionWarning on every fit. Flattening it up front
    # keeps the values unchanged and avoids the repeated warning.
    y_train = _as_1d_target(y_train)
    y_val = _as_1d_target(y_val)

    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    train_mse, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    val_mse, val_rmse, val_r2 = evaluate_model(y_val, y_val_pred)

    return {
        "train_mse": train_mse,
        "train_rmse": train_rmse,
        "train_r2": train_r2,
        "val_mse": val_mse,
        "val_rmse": val_rmse,
        "val_r2": val_r2
    }

### Random Forest

In [6]:
# Random Forest search space: three candidate values per hyperparameter
rf_param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
)

In [7]:
# Base Random Forest estimator; the fixed seed makes repeated runs comparable
rf_seed = 42
rf_model = RandomForestRegressor(random_state=rf_seed)

In [8]:
# Tune the Random Forest with an exhaustive 5-fold grid search scored on
# negative MSE (n_jobs=-1 runs the candidate fits on all available cores),
# then collect train/validation metrics for the refitted search object.
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
rf_results = fit_and_evaluate(
    model=rf_grid_search,
    X_train=X_train_scaled,
    y_train=y_train,
    X_val=X_val_scaled,
    y_val=y_val,
)

KeyboardInterrupt: 

In [None]:
# Report the tuned Random Forest: best hyperparameters first, then the
# training and validation metrics (header and metric line per split).
print("Random Forest Regression Model - Best Params:", rf_grid_search.best_params_)
print(
    "Random Forest Regression Model - Training Metrics:",
    f"MSE: {rf_results['train_mse']:.4f}, RMSE: {rf_results['train_rmse']:.4f}, R²: {rf_results['train_r2']:.4f}",
    sep="\n",
)
print(
    "\nRandom Forest Regression Model - Validation Metrics:",
    f"MSE: {rf_results['val_mse']:.4f}, RMSE: {rf_results['val_rmse']:.4f}, R²: {rf_results['val_r2']:.4f}",
    sep="\n",
)

### XGBoost

In [None]:
# XGBoost search space: three candidate values per hyperparameter
xgb_param_grid = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.1, 0.3],
    max_depth=[3, 5, 10],
)

In [None]:
# Base XGBoost estimator; the fixed seed makes repeated runs comparable
xgb_seed = 42
xgb_model = XGBRegressor(random_state=xgb_seed)

In [None]:
# Tune XGBoost with an exhaustive 5-fold grid search scored on negative MSE
# (n_jobs=-1 runs the candidate fits on all available cores), then collect
# train/validation metrics for the refitted search object.
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
xgb_results = fit_and_evaluate(
    model=xgb_grid_search,
    X_train=X_train_scaled,
    y_train=y_train,
    X_val=X_val_scaled,
    y_val=y_val,
)

In [None]:
# Report the tuned XGBoost model: best hyperparameters first, then the
# training and validation metrics (header and metric line per split).
print("XGBoost Regression Model - Best Params:", xgb_grid_search.best_params_)
print(
    "XGBoost Regression Model - Training Metrics:",
    f"MSE: {xgb_results['train_mse']:.4f}, RMSE: {xgb_results['train_rmse']:.4f}, R²: {xgb_results['train_r2']:.4f}",
    sep="\n",
)
print(
    "\nXGBoost Regression Model - Validation Metrics:",
    f"MSE: {xgb_results['val_mse']:.4f}, RMSE: {xgb_results['val_rmse']:.4f}, R²: {xgb_results['val_r2']:.4f}",
    sep="\n",
)