# Week 8 Notebook: Model training, hyperparameter tuning, and model evaluation
The goal of this week's assignment is to use a third modeling method with 3 different hyperparameter settings of the method. 

### Import packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

### Read data as dataframe

In [3]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

data_folder = os.path.join(parent_dir,"data")
raw_data_folder = os.path.join(data_folder,"raw")
interim_data_folder = os.path.join(data_folder,"interim")
processed_data_folder = os.path.join(data_folder, "processed")

In [4]:
# X data path
X_train_scaled_path = os.path.join(processed_data_folder, 'X_train_scaled.parquet')
X_val_scaled_path = os.path.join(processed_data_folder, 'X_val_scaled.parquet')
X_test_scaled_path = os.path.join(processed_data_folder, 'X_test_scaled.parquet')

train_pca_path = os.path.join(processed_data_folder, 'X_train_pca.parquet')
val_pca_path = os.path.join(processed_data_folder, 'X_val_pca.parquet')
test_pca_path = os.path.join(processed_data_folder, 'X_test_pca.parquet')
# Y data path
y_train_path = os.path.join(processed_data_folder, 'y_train.parquet')
y_val_path = os.path.join(processed_data_folder, 'y_val.parquet')
y_test_path = os.path.join(processed_data_folder, 'y_test.parquet')

In [5]:
# Reading the parquet files as dataframes
X_train_scaled = pd.read_parquet(X_train_scaled_path)
X_val_scaled = pd.read_parquet(X_val_scaled_path)
X_test_scaled = pd.read_parquet(X_test_scaled_path)

y_train = pd.read_parquet(y_train_path)
y_val = pd.read_parquet(y_val_path)
y_test = pd.read_parquet(y_test_path)

X_train_pca = pd.read_parquet(train_pca_path)
X_val_pca = pd.read_parquet(val_pca_path)
X_test_pca = pd.read_parquet(test_pca_path)

### Preparation for Modeling
#### Ravel y into numpy array
Sklearn random forest requires y to be an 1D array, so we need to ravel the panda dataframe to a 1D numpy array.

In [6]:
y_train = y_train.values.ravel()
y_val = y_val.values.ravel()

In [7]:
def evaluate_model(y_true, y_pred):
    rmse = root_mean_squared_error(y_true, y_pred)
    mse = rmse**2
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, r2

### Modeling
#### Random Forest
Random Forest can be computationally intensive, making cross-validation difficult when tuning hyperparameters. To streamline the process of experimenting with multiple hyperparameters, we created a function that automates the training and evaluation steps on the validation set, helping us identify the best model.

In [8]:
def best_rf(X_train, X_val, y_train, y_val, param_combinations, lowest_rmse = float('inf')):
    for params in param_combinations:
    # Initialize the model with the current parameters
        rf_model = RandomForestRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        max_features=params['max_features'],
        random_state=42
    )
    
    # Train the model
        rf_model.fit(X_train, y_train)
    
    # Predict on validation data
        y_val_pred = rf_model.predict(X_val)
    
    # Calculate Mean Squared Error
        rmse = root_mean_squared_error(y_val, y_val_pred)
    
    # Check if this is the best model so far
        if rmse < lowest_rmse:
            lowest_rmse = rmse
            best_model = rf_model
            best_params = params
            print("Best Parameters:", best_params)
            print("Lowest RMSE on validation set:", lowest_rmse)
    return best_model, best_params, lowest_rmse
    
    

##### Tuning `n_estimators`
`n_estimators` refers number of decision trees in the random forest. We first want to experiment how that affect the model performance.


In [9]:
param_combinations = [
    {'n_estimators': 50, 'max_depth': 10, 'max_features':'log2'},
    {'n_estimators': 100, 'max_depth': 10, 'max_features': 'log2'},
    {'n_estimators': 150, 'max_depth': 10, 'max_features': 'log2'}
]

best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations,lowest_rmse = float('inf'))

Best Parameters: {'n_estimators': 50, 'max_depth': 10, 'max_features': 'log2'}
Lowest RMSE on validation set: 63.2406685389967
Best Parameters: {'n_estimators': 100, 'max_depth': 10, 'max_features': 'log2'}
Lowest RMSE on validation set: 60.647192316432616


##### Tuning `max_depth`
`max_depth` controls the maximum depth of each decision tree in the random forest. We will experiment with this parameter to understand its impact on model performance and find an optimal value that balances complexity and accuracy.


In [10]:
param_combinations = [
    {'n_estimators': 100, 'max_depth': 5, 'max_features': 'log2'},
    {'n_estimators': 100, 'max_depth': 10, 'max_features': 'log2'},
    {'n_estimators': 100, 'max_depth': 15, 'max_features': 'log2'}
]
best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations, lowest_rmse)

Best Parameters: {'n_estimators': 100, 'max_depth': 15, 'max_features': 'log2'}
Lowest RMSE on validation set: 41.15554705419018


##### Tuning `max_features`
`max_features` determines the maximum number of features considered when splitting a node in each decision tree. We will experiment with different values to see how this parameter affects model performance, aiming to find a balance between diversity in the trees and overall accuracy.

In [11]:
param_combinations = [
    {'n_estimators': 100, 'max_depth': 15, 'max_features': None},
    {'n_estimators': 100, 'max_depth': 15, 'max_features': 'log2'},
    {'n_estimators': 100, 'max_depth': 15, 'max_features': 'sqrt'}
]
best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations, lowest_rmse)

Best Parameters: {'n_estimators': 100, 'max_depth': 15, 'max_features': None}
Lowest RMSE on validation set: 1.739659226065485


#### Evaluate the Best Model
While the model does perform a lot better, the performance metrics shows that the model is overfitting to the training data.

In [12]:
y_train_pred_tree = best_model.predict(X_train_scaled)
y_test_pred_tree = best_model.predict(X_test_scaled)
y_val_pred_tree = best_model.predict(X_val_scaled)

# Evaluate performance
train_mse_tree, train_rmse_tree, train_r2_tree = evaluate_model(y_train, y_train_pred_tree)
val_mse_tree, val_rmse_tree, val_r2_tree = evaluate_model(y_val, y_val_pred_tree)
test_mse_tree, test_rmse_tree, test_r2_tree = evaluate_model(y_test, y_test_pred_tree)

# Print training metrics
print("Decision Tree Regression Model - Training Metrics:")
print(f"MSE: {train_mse_tree:.4f}, RMSE: {train_rmse_tree:.4f}, R²: {train_r2_tree:.4f}")

# Print validation metrics
print("\nDecision Tree Regression Model - Validation Metrics:") 
print(f"MSE: {val_mse_tree:.4f}, RMSE: {val_rmse_tree:.4f}, R²: {val_r2_tree:.4f}")

# Print testing metrics
print("\nDecision Tree Regression Model - Validation Metrics:") 
print(f"MSE: {val_mse_tree:.4f}, RMSE: {val_rmse_tree:.4f}, R²: {val_r2_tree:.4f}")

Decision Tree Regression Model - Training Metrics:
MSE: 1.7703, RMSE: 1.3305, R²: 0.9998

Decision Tree Regression Model - Validation Metrics:
MSE: 3.0264, RMSE: 1.7397, R²: 0.9996

Decision Tree Regression Model - Validation Metrics:
MSE: 3.0264, RMSE: 1.7397, R²: 0.9996


Experiment if more estimators and `max_features` = n_features will lead to better model performance.

In [13]:
param_combinations = [
    {'n_estimators': 150, 'max_depth': 15, 'max_features': None},
    {'n_estimators': 200, 'max_depth': 15, 'max_features': None},
    {'n_estimators': 250, 'max_depth': 15, 'max_features': None}
]
best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations, lowest_rmse)

Best Parameters: {'n_estimators': 150, 'max_depth': 15, 'max_features': None}
Lowest RMSE on validation set: 1.7372547398224727
Best Parameters: {'n_estimators': 200, 'max_depth': 15, 'max_features': None}
Lowest RMSE on validation set: 1.73583835395773


Similarly, the model shows slightly improved performance but still overfits the training dataset.

In [14]:
y_train_pred_tree = best_model.predict(X_train_scaled)
y_test_pred_tree = best_model.predict(X_test_scaled)
y_val_pred_tree = best_model.predict(X_val_scaled)

# Evaluate performance
train_mse_tree, train_rmse_tree, train_r2_tree = evaluate_model(y_train, y_train_pred_tree)
val_mse_tree, val_rmse_tree, val_r2_tree = evaluate_model(y_val, y_val_pred_tree)
test_mse_tree, test_rmse_tree, test_r2_tree = evaluate_model(y_test, y_test_pred_tree)

# Print training metrics
print("Decision Tree Regression Model - Training Metrics:")
print(f"MSE: {train_mse_tree:.4f}, RMSE: {train_rmse_tree:.4f}, R²: {train_r2_tree:.4f}")

# Print validation metrics
print("\nDecision Tree Regression Model - Validation Metrics:") 
print(f"MSE: {val_mse_tree:.4f}, RMSE: {val_rmse_tree:.4f}, R²: {val_r2_tree:.4f}")

# Print testing metrics
print("\nDecision Tree Regression Model - Validation Metrics:") 
print(f"MSE: {val_mse_tree:.4f}, RMSE: {val_rmse_tree:.4f}, R²: {val_r2_tree:.4f}")

Decision Tree Regression Model - Training Metrics:
MSE: 1.7664, RMSE: 1.3290, R²: 0.9998

Decision Tree Regression Model - Validation Metrics:
MSE: 3.0131, RMSE: 1.7358, R²: 0.9996

Decision Tree Regression Model - Validation Metrics:
MSE: 3.0131, RMSE: 1.7358, R²: 0.9996


#### Addressing Overfitting

In [15]:
param_combinations = [
    {'n_estimators': 100, 'max_depth': 10, 'max_features': None},
    {'n_estimators': 150, 'max_depth': 10, 'max_features': None},
    {'n_estimators': 200, 'max_depth': 10, 'max_features': None},
]
best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations, lowest_rmse= float('inf'))

Best Parameters: {'n_estimators': 100, 'max_depth': 10, 'max_features': None}
Lowest RMSE on validation set: 1.9596762830031542
Best Parameters: {'n_estimators': 150, 'max_depth': 10, 'max_features': None}
Lowest RMSE on validation set: 1.9571073467668085
Best Parameters: {'n_estimators': 200, 'max_depth': 10, 'max_features': None}
Lowest RMSE on validation set: 1.9556063496243887


In [16]:
y_train_pred_tree = best_model.predict(X_train_scaled)
y_test_pred_tree = best_model.predict(X_test_scaled)
y_val_pred_tree = best_model.predict(X_val_scaled)

# Evaluate performance
train_mse_tree, train_rmse_tree, train_r2_tree = evaluate_model(y_train, y_train_pred_tree)
val_mse_tree, val_rmse_tree, val_r2_tree = evaluate_model(y_val, y_val_pred_tree)
test_mse_tree, test_rmse_tree, test_r2_tree = evaluate_model(y_test, y_test_pred_tree)

# Print training metrics
print("Decision Tree Regression Model - Training Metrics:")
print(f"MSE: {train_mse_tree:.4f}, RMSE: {train_rmse_tree:.4f}, R²: {train_r2_tree:.4f}")

# Print validation metrics
print("\nDecision Tree Regression Model - Validation Metrics:") 
print(f"MSE: {val_mse_tree:.4f}, RMSE: {val_rmse_tree:.4f}, R²: {val_r2_tree:.4f}")

# Print testing metrics
print("\nDecision Tree Regression Model - Validation Metrics:") 
print(f"MSE: {val_mse_tree:.4f}, RMSE: {val_rmse_tree:.4f}, R²: {val_r2_tree:.4f}")

Decision Tree Regression Model - Training Metrics:
MSE: 3.3915, RMSE: 1.8416, R²: 0.9996

Decision Tree Regression Model - Validation Metrics:
MSE: 3.8244, RMSE: 1.9556, R²: 0.9995

Decision Tree Regression Model - Validation Metrics:
MSE: 3.8244, RMSE: 1.9556, R²: 0.9995
