# Week 6 Notebook: Model training, hyperparameter tuning, and model evaluation
The goal of this week's assignment is to use one modeling method with 3 different hyperparameter settings of the method. 

### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import cross_val_score


### Read data as dataframe

In [2]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

data_folder = os.path.join(parent_dir,"data")
raw_data_folder = os.path.join(data_folder,"raw")
interim_data_folder = os.path.join(data_folder,"interim")
processed_data_folder = os.path.join(data_folder, "processed")

In [3]:
# Save PCA DataFrames as Parquet files
X_train_scaled_path = os.path.join(processed_data_folder, 'X_train_scaled.parquet')
X_val_scaled_path = os.path.join(processed_data_folder, 'X_val_scaled.parquet')
X_test_scaled_path = os.path.join(processed_data_folder, 'X_test_scaled.parquet')

# Paths for the target variables
y_train_path = os.path.join(processed_data_folder, 'y_train.parquet')
y_val_path = os.path.join(processed_data_folder, 'y_val.parquet')
y_test_path = os.path.join(processed_data_folder, 'y_test.parquet')

train_pca_path = os.path.join(processed_data_folder, 'X_train_pca.parquet')
val_pca_path = os.path.join(processed_data_folder, 'X_val_pca.parquet')
test_pca_path = os.path.join(processed_data_folder, 'X_test_pca.parquet')

In [4]:
X_train_scaled = pd.read_parquet(X_train_scaled_path)
X_val_scaled = pd.read_parquet(X_val_scaled_path)
X_test_scaled = pd.read_parquet(X_test_scaled_path)

# Reading the target variables
y_train = pd.read_parquet(y_train_path)
y_val = pd.read_parquet(y_val_path)
y_test = pd.read_parquet(y_test_path)

X_train_pca = pd.read_parquet(train_pca_path)
X_val_pca = pd.read_parquet(val_pca_path)
X_test_pca = pd.read_parquet(test_pca_path)

### Linear Regression

In [5]:
def evaluate_model(y_true, y_pred):

    rmse = root_mean_squared_error(y_true, y_pred)
    mse = rmse**2
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, r2

In [6]:
ols = LinearRegression()

ols.fit(X_train_scaled, y_train)
ols_predictions = ols.predict(X_test_scaled)


ols_train_predictions = ols.predict(X_train_scaled)
ols_predictions = ols.predict(X_test_scaled)

ols_train_mse, ols_train_rmse, ols_train_r2 = evaluate_model(y_train, ols_train_predictions)
ols_mse, ols_rmse, ols_r2 = evaluate_model(y_test, ols_predictions)

# Print the metrics for training and validation sets
print("OLS Regression Model - Training Metrics:")
print(f"MSE: {ols_train_mse:.4f}, RMSE: {ols_train_rmse:.4f}, R²: {ols_train_r2:.4f}")

print("\nOLS Regression Model - Validation Metrics:")
print(f"MSE: {ols_mse:.4f}, RMSE: {ols_rmse:.4f}, R²: {ols_r2:.4f}")


OLS Regression Model - Training Metrics:
MSE: 6865.9794, RMSE: 82.8612, R²: 0.2149

OLS Regression Model - Validation Metrics:
MSE: 8016.5324, RMSE: 89.5351, R²: 0.2443


In [7]:
ols_pca = LinearRegression()

ols_pca.fit(X_train_pca, y_train)
ols_pca_train_predictions = ols_pca.predict(X_train_pca)
ols_pca_predictions = ols_pca.predict(X_test_pca)

ols_pca_train_mse, ols_pca_train_rmse, ols_pca_train_r2 = evaluate_model(y_train, ols_pca_train_predictions)
ols_pca_mse, ols_pca_rmse, ols_pca_r2 = evaluate_model(y_test, ols_pca_predictions)


print("OLS PCA Regression Model - Training Metrics:")
print(f"MSE: {ols_pca_train_mse:.4f}, RMSE: {ols_pca_train_rmse:.4f}, R²: {ols_pca_train_r2:.4f}")

print("\nOLS PCA Regression Model - Validation Metrics:")
print(f"MSE: {ols_pca_mse:.4f}, RMSE: {ols_pca_rmse:.4f}, R²: {ols_pca_r2:.4f}")


OLS PCA Regression Model - Training Metrics:
MSE: 6872.0466, RMSE: 82.8978, R²: 0.2142

OLS PCA Regression Model - Validation Metrics:
MSE: 8009.1325, RMSE: 89.4938, R²: 0.2450


### Regularization
#### L1
Lasso is harder to converge, and it takes very long for lasso to cross validation and grid search. Therefore, the following cell takes very long to run and a lot of values of alpha fails to converge with limited iterations.

In [8]:
lasso_alphas = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}  

lasso = Lasso()
lasso_cv = GridSearchCV(lasso, param_grid=lasso_alphas, cv=5, scoring='neg_mean_squared_error')
lasso_cv.fit(X_train_scaled, y_train)

best_lasso_alpha = lasso_cv.best_params_['alpha']
print(f"Best alpha for Lasso: {best_lasso_alpha}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best alpha for Lasso: 0.1


In [9]:
lasso_best = Lasso(alpha=0.1)
lasso_best.fit(X_train_scaled, y_train)

y_train_pred_lasso = lasso_best.predict(X_train_scaled)
y_val_pred_lasso = lasso_best.predict(X_val_scaled)

train_mse_lasso, train_rmse_lasso, train_r2_lasso = evaluate_model(y_train, y_train_pred_lasso)
val_mse_lasso, val_rmse_lasso, val_r2_lasso = evaluate_model(y_val, y_val_pred_lasso) 

print("Lasso Regression Model - Training Metrics:")
print(f"MSE: {train_mse_lasso:.4f}, RMSE: {train_rmse_lasso:.4f}, R²: {train_r2_lasso:.4f}")

# Print the results for the validation dataset
print("\nLasso Regression Model - Validation Metrics:")
print(f"MSE: {val_mse_lasso:.4f}, RMSE: {val_rmse_lasso:.4f}, R²: {val_r2_lasso:.4f}")

Lasso Regression Model - Training Metrics:
MSE: 6871.0827, RMSE: 82.8920, R²: 0.2143

Lasso Regression Model - Validation Metrics:
MSE: 6472.3002, RMSE: 80.4506, R²: 0.2058


In [10]:
ridge = Ridge()
ridge_alphas = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}  # Define alpha range
ridge_cv = GridSearchCV(ridge, param_grid=ridge_alphas, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(X_train_scaled, y_train)
best_ridge_alpha = ridge_cv.best_params_['alpha']
print(f"Best alpha for Ridge: {best_ridge_alpha}")

Best alpha for Ridge: 0.1


In [11]:
ridge_best = Ridge(alpha=best_ridge_alpha)
ridge_best.fit(X_train_scaled, y_train)

y_train_pred_ridge = ridge_best.predict(X_train_scaled)
y_val_pred_ridge = ridge_best.predict(X_val_scaled) 

train_mse_ridge, train_rmse_ridge, train_r2_ridge = evaluate_model(y_train, y_train_pred_ridge)
val_mse_ridge, val_rmse_ridge, val_r2_ridge = evaluate_model(y_val, y_val_pred_ridge) 

print("Ridge Regression Model - Training Metrics:")
print(f"MSE: {train_mse_ridge:.4f}, RMSE: {train_rmse_ridge:.4f}, R²: {train_r2_ridge:.4f}")

# Print the results for the validation dataset
print("\nRidge Regression Model - Validation Metrics:")
print(f"MSE: {val_mse_ridge:.4f}, RMSE: {val_rmse_ridge:.4f}, R²: {val_r2_ridge:.4f}")

Ridge Regression Model - Training Metrics:
MSE: 6866.5889, RMSE: 82.8649, R²: 0.2148

Ridge Regression Model - Validation Metrics:
MSE: 6482.3448, RMSE: 80.5130, R²: 0.2046


### Experiment with polynomial terms
We commented out this part to save memories when running on JupyterHub.

In [12]:
# poly = PolynomialFeatures(degree=2) 
# X_train_poly = poly.fit_transform(X_train_scaled)
# X_val_poly = poly.transform(X_val_scaled)

# poly_model = LinearRegression()
# poly_model.fit(X_train_poly, y_train)

# y_train_pred_poly = poly_model.predict(X_train_poly)
# y_val_pred_poly = poly_model.predict(X_val_poly)

# train_mse_poly, train_rmse_poly, train_r2_poly = evaluate_model(y_train, y_train_pred_poly)
# val_mse_poly, val_rmse_poly, val_r2_poly = evaluate_model(y_val, y_val_pred_poly) 

# # Print results
# print("Polynomial Regression Model - Training Metrics:")
# print("MSE: {:.4f}, RMSE: {:.4f}, R²: {:.4f}".format(train_mse_poly, train_rmse_poly, train_r2_poly))

# print("\nPolynomial Regression Model - Validation Metrics:")
# print("MSE: {:.4f}, RMSE: {:.4f}, R²: {:.4f}".format(val_mse_poly, val_rmse_poly, val_r2_poly))

In [13]:
# ridge_poly_alphas = {'alpha': [0.01, 0.1, 1]} 

# ridge = Ridge()

# ridge_cv = GridSearchCV(ridge, param_grid = ridge_poly_alphas, cv = 5, scoring='neg_mean_squared_error')
# ridge_cv.fit(X_train_poly, y_train)

# best_ridge_alpha = ridge_cv.best_params_['alpha']
# print(f"Best alpha for Ridge: {best_ridge_alpha}")

# ridge_best = Ridge(alpha=best_ridge_alpha)
# ridge_best.fit(X_train_poly, y_train)

# y_train_pred_ridge = ridge_best.predict(X_train_poly)
# y_val_pred_ridge = ridge_best.predict(X_val_poly)

# train_mse_ridge, train_rmse_ridge, train_r2_ridge = evaluate_model(y_train, y_train_pred_ridge)
# val_mse_ridge, val_rmse_ridge, val_r2_ridge = evaluate_model(y_val, y_val_pred_ridge)

# print("Ridge Regression Model (Best alpha={}) - Training Metrics:".format(best_ridge_alpha))
# print("MSE: {:.4f}, RMSE: {:.4f}, R²: {:.4f}".format(train_mse_ridge, train_rmse_ridge, train_r2_ridge))
# print("\nValidation Metrics:")
# print("MSE: {:.4f}, RMSE: {:.4f}, R²: {:.4f}".format(val_mse_ridge, val_rmse_ridge, val_r2_ridge))