# Week 8 Notebook: Model training, hyperparameter tuning, and model evaluation
The goal of this week's assignment is to train a third modeling method and evaluate it under 3 different hyperparameter settings.

### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

### Read data as dataframe

In [2]:
# Locate the project's data directories. The "data" folder is looked up in the
# parent of the current working directory, with one sub-folder per data stage.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_folder = os.path.join(parent_dir, "data")

raw_data_folder, interim_data_folder, processed_data_folder = (
    os.path.join(data_folder, stage) for stage in ("raw", "interim", "processed")
)

In [3]:
# Paths of the processed parquet files, one per split (train / val / test).
SPLITS = ('train', 'val', 'test')

# Scaled feature matrices
X_train_scaled_path, X_val_scaled_path, X_test_scaled_path = (
    os.path.join(processed_data_folder, f'X_{split}_scaled.parquet') for split in SPLITS
)

# PCA-transformed feature matrices
train_pca_path, val_pca_path, test_pca_path = (
    os.path.join(processed_data_folder, f'X_{split}_pca.parquet') for split in SPLITS
)

# Targets
y_train_path, y_val_path, y_test_path = (
    os.path.join(processed_data_folder, f'y_{split}.parquet') for split in SPLITS
)

In [4]:
# Load each processed parquet file into a DataFrame (same read order as before:
# scaled features, then targets, then PCA features).
X_train_scaled, X_val_scaled, X_test_scaled = map(
    pd.read_parquet,
    (X_train_scaled_path, X_val_scaled_path, X_test_scaled_path),
)

y_train, y_val, y_test = map(
    pd.read_parquet,
    (y_train_path, y_val_path, y_test_path),
)

X_train_pca, X_val_pca, X_test_pca = map(
    pd.read_parquet,
    (train_pca_path, val_pca_path, test_pca_path),
)

### Modeling

### Random Forest

In [5]:
# Random Forest: fit one model per hyperparameter setting on the scaled training
# data and keep the one with the lowest Mean Squared Error on the validation set.
# NOTE(review): all three hyperparameters change together between settings, so
# this compares three configurations rather than isolating any single parameter.
param_combinations = [
    {'n_estimators': 50, 'max_depth': None, 'max_features': None},
    {'n_estimators': 100, 'max_depth': 10, 'max_features': 'sqrt'},
    {'n_estimators': 150, 'max_depth': 20, 'max_features': 'log2'}
]

# The targets were loaded with pd.read_parquet and are therefore DataFrames.
# Passing a single-column DataFrame as y makes scikit-learn emit a
# DataConversionWarning on every fit ("A column-vector y was passed when a 1d
# array was expected"). squeeze("columns") turns a one-column frame into a
# Series and leaves a multi-column frame untouched.
# NOTE(review): presumably the targets have exactly one column - confirm.
y_train_1d = y_train.squeeze("columns")
y_val_1d = y_val.squeeze("columns")

# Track the best model seen so far (lower validation MSE is better)
best_model = None
best_params = None
lowest_mse = float('inf')

for params in param_combinations:
    # The dict keys match the estimator's keyword arguments, so unpack directly;
    # random_state is fixed so repeated runs build the same forest.
    rf_model = RandomForestRegressor(**params, random_state=42)

    # Train on the training split only
    rf_model.fit(X_train_scaled, y_train_1d)

    # Score on the held-out validation split
    y_val_pred = rf_model.predict(X_val_scaled)
    mse = mean_squared_error(y_val_1d, y_val_pred)

    # Strict '<' keeps the earliest setting when two settings tie
    if mse < lowest_mse:
        lowest_mse = mse
        best_model = rf_model
        best_params = params

# Keep Random Forest-specific handles to the winner: the generic names
# (best_model, best_params, lowest_mse) are reassigned by the XGBoost cell,
# which would otherwise make this result unreachable afterwards.
best_rf_model = best_model
best_rf_params = best_params
lowest_rf_mse = lowest_mse

# Print the best hyperparameters and score
print("Best Parameters:", best_params)
print("Lowest MSE on validation set:", lowest_mse)


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Best Parameters: {'n_estimators': 50, 'max_depth': None, 'max_features': None}
Lowest MSE on validation set: 3.40445860460778


### XGBoost

In [6]:
# XGBoost: fit one model per hyperparameter setting on the scaled training data
# and keep the one with the lowest Mean Squared Error on the validation set.
# NOTE(review): n_estimators, max_depth and learning_rate all change together
# between settings, so their individual effects cannot be separated here.
param_combinations = [
    {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1},
    {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05},
    {'n_estimators': 150, 'max_depth': 7, 'learning_rate': 0.01}
]

# Track the best model seen so far (lower validation MSE is better).
# These generic names are the same ones the Random Forest cell assigns, so from
# this point on they refer to the XGBoost search.
best_model = None
best_params = None
lowest_mse = float('inf')

for params in param_combinations:
    # The dict keys match the estimator's keyword arguments, so unpack directly.
    xgb_model = XGBRegressor(
        **params,
        random_state=42,
        verbosity=0  # Suppresses warning messages for cleaner output
    )

    # Train on the training split only
    xgb_model.fit(X_train_scaled, y_train)

    # Score on the held-out validation split
    y_val_pred = xgb_model.predict(X_val_scaled)
    mse = mean_squared_error(y_val, y_val_pred)

    # Strict '<' keeps the earliest setting when two settings tie
    if mse < lowest_mse:
        lowest_mse = mse
        best_model = xgb_model
        best_params = params

# XGBoost-specific handles to the winner, so the result stays addressable even
# if another cell reuses the generic best_model / best_params / lowest_mse names.
best_xgb_model = best_model
best_xgb_params = best_params
lowest_xgb_mse = lowest_mse

# Print the best hyperparameters and score
print("Best Parameters for XGBoost:", best_params)
print("Lowest MSE on validation set for XGBoost:", lowest_mse)

Best Parameters for XGBoost: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05}
Lowest MSE on validation set for XGBoost: 58.14836837530017
