# Week 8 Notebook: Model training, hyperparameter tuning, and model evaluation
The goal of this week's assignment is to use a third modeling method with 3 different hyperparameter settings of the method. 

### Import packages

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

### Read data as dataframe

In [13]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

data_folder = os.path.join(parent_dir,"data")
raw_data_folder = os.path.join(data_folder,"raw")
interim_data_folder = os.path.join(data_folder,"interim")
processed_data_folder = os.path.join(data_folder, "processed")

In [14]:
# X data path
X_train_scaled_path = os.path.join(processed_data_folder, 'X_train_scaled.parquet')
X_val_scaled_path = os.path.join(processed_data_folder, 'X_val_scaled.parquet')
X_test_scaled_path = os.path.join(processed_data_folder, 'X_test_scaled.parquet')

train_pca_path = os.path.join(processed_data_folder, 'X_train_pca.parquet')
val_pca_path = os.path.join(processed_data_folder, 'X_val_pca.parquet')
test_pca_path = os.path.join(processed_data_folder, 'X_test_pca.parquet')
# Y data path
y_train_path = os.path.join(processed_data_folder, 'y_train.parquet')
y_val_path = os.path.join(processed_data_folder, 'y_val.parquet')
y_test_path = os.path.join(processed_data_folder, 'y_test.parquet')

In [15]:
# Reading the parquet files as dataframes
X_train_scaled = pd.read_parquet(X_train_scaled_path)
X_val_scaled = pd.read_parquet(X_val_scaled_path)
X_test_scaled = pd.read_parquet(X_test_scaled_path)

y_train = pd.read_parquet(y_train_path)
y_val = pd.read_parquet(y_val_path)
y_test = pd.read_parquet(y_test_path)

X_train_pca = pd.read_parquet(train_pca_path)
X_val_pca = pd.read_parquet(val_pca_path)
X_test_pca = pd.read_parquet(test_pca_path)

In [16]:
X_train_scaled.shape

(70000, 84)

In [17]:
# def mse(y_true, y_pred):
#     squared_errors = (y_true-y_pred) **2
#     return np.mean(squared_errors)

In [18]:
y_train = y_train.values.ravel()
y_val = y_val.values.ravel()

In [31]:
def evaluate_model(y_true, y_pred):
    rmse = root_mean_squared_error(y_true, y_pred)
    mse = rmse**2
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, r2

### Modeling

### Random Forest

In [25]:
def best_rf(X_train, X_val, y_train, y_val, param_combinations, lowest_rmse = float('inf')):
    for params in param_combinations:
    # Initialize the model with the current parameters
        rf_model = RandomForestRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        max_features=params['max_features'],
        random_state=42
    )
    
    # Train the model
        rf_model.fit(X_train, y_train)
    
    # Predict on validation data
        y_val_pred = rf_model.predict(X_val)
    
    # Calculate Mean Squared Error
        rmse = root_mean_squared_error(y_val, y_val_pred)
    
    # Check if this is the best model so far
        if rmse < lowest_rmse:
            lowest_rmse = rmse
            best_model = rf_model
            best_params = params
            print("Best Parameters:", best_params)
            print("Lowest RMSE on validation set:", lowest_rmse)
    return best_model, best_params, lowest_rmse
    
    

In [26]:
param_combinations = [
    {'n_estimators': 50, 'max_depth': 10, 'max_features':'log2'},
    {'n_estimators': 100, 'max_depth': 10, 'max_features': 'log2'},
    {'n_estimators': 150, 'max_depth': 10, 'max_features': 'log2'}
]

best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations, lowest_rmse = float('inf'))

Best Parameters: {'n_estimators': 50, 'max_depth': 10, 'max_features': 'log2'}
Lowest RMSE on validation set: 63.340226273779564
Best Parameters: {'n_estimators': 100, 'max_depth': 10, 'max_features': 'log2'}
Lowest RMSE on validation set: 61.211225629502344


In [27]:
param_combinations = [
    {'n_estimators': 50, 'max_depth': 5, 'max_features': 'log2'},
    {'n_estimators': 100, 'max_depth': 10, 'max_features': 'log2'},
    {'n_estimators': 150, 'max_depth': 15, 'max_features': 'log2'}
]
best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations, lowest_rmse)

Best Parameters: {'n_estimators': 150, 'max_depth': 15, 'max_features': 'log2'}
Lowest RMSE on validation set: 41.19303632062572


In [29]:
param_combinations = [
    {'n_estimators': 150, 'max_depth': 15, 'max_features': None},
    {'n_estimators': 150, 'max_depth': 15, 'max_features': 'log2'},
    {'n_estimators': 150, 'max_depth': 15, 'max_features': 'sqrt'}
]
best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations, lowest_rmse)

Best Parameters: {'n_estimators': 150, 'max_depth': 15, 'max_features': None}
Lowest RMSE on validation set: 1.737282652009884


In [32]:
y_train_pred_tree = best_model.predict(X_train_scaled)
y_test_pred_tree = best_model.predict(X_test_scaled)
y_val_pred_tree = best_model.predict(X_val_scaled)

# Evaluate performance
train_mse_tree, train_rmse_tree, train_r2_tree = evaluate_model(y_train, y_train_pred_tree)
val_mse_tree, val_rmse_tree, val_r2_tree = evaluate_model(y_val, y_val_pred_tree)
test_mse_tree, test_rmse_tree, test_r2_tree = evaluate_model(y_test, y_test_pred_tree)

# Print training metrics
print("Decision Tree Regression Model - Training Metrics:")
print(f"MSE: {train_mse_tree:.4f}, RMSE: {train_rmse_tree:.4f}, R²: {train_r2_tree:.4f}")

# Print validation metrics
print("\nDecision Tree Regression Model - Validation Metrics:") 
print(f"MSE: {val_mse_tree:.4f}, RMSE: {val_rmse_tree:.4f}, R²: {val_r2_tree:.4f}")

# Print testing metrics
print("\nDecision Tree Regression Model - Validation Metrics:") 
print(f"MSE: {val_mse_tree:.4f}, RMSE: {val_rmse_tree:.4f}, R²: {val_r2_tree:.4f}")

Decision Tree Regression Model - Training Metrics:
MSE: 1.7658, RMSE: 1.3288, R²: 0.9998

Decision Tree Regression Model - Validation Metrics:
MSE: 3.0182, RMSE: 1.7373, R²: 0.9996

Decision Tree Regression Model - Validation Metrics:
MSE: 3.0182, RMSE: 1.7373, R²: 0.9996


In [35]:
param_combinations = [
    {'n_estimators': 100, 'max_depth': 10, 'max_features': None},
    {'n_estimators': 150, 'max_depth': 10, 'max_features': None}
]
best_model, best_params, lowest_rmse = best_rf(X_train_scaled, X_val_scaled, y_train, y_val, param_combinations, lowest_rmse= float('inf'))

Best Parameters: {'n_estimators': 100, 'max_depth': 10, 'max_features': None}
Lowest RMSE on validation set: 1.9596717214816415
Best Parameters: {'n_estimators': 150, 'max_depth': 10, 'max_features': None}
Lowest RMSE on validation set: 1.9570982820782625


### XGBoost

In [11]:
# Define parameter combinations to try
param_combinations = [
    {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05},
    {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05},
    {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.05}
]

# Initialize variables to store best model and score
best_model = None
best_params = None
lowest_rmse = float('inf')


# Loop through each parameter combination
for params in param_combinations:
    # Initialize the model with the current parameters
    xgb_model = XGBRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        random_state=42,
        verbosity=0  # Suppresses warning messages for cleaner output
    )
    
    # Train the model
    xgb_model.fit(X_train_scaled, y_train)
    
    # Predict on validation data
    y_val_pred = xgb_model.predict(X_val_scaled)
    
    # Calculate Mean Squared Error
    rmse = root_mean_squared_error(y_val, y_val_pred)
    
    # Check if this is the best model so far
    if rmse < lowest_rmse:
        lowest_rmse = rmse
        best_model = xgb_model
        best_params = params

# Print the best hyperparameters and score
print("Best Parameters for XGBoost:", best_params)
print("Lowest RMSE on validation set for XGBoost:", lowest_rmse)

Best Parameters for XGBoost: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.05}
Lowest RMSE on validation set for XGBoost: 1.6728946006674148
