# Week 6 Notebook: Model training, hyperparameter tuning, and model evaluation
The goal of this week's assignment is to use one modeling method with 3 different hyperparameter settings of the method. 

### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score


### Read data as dataframe

In [2]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

data_folder = os.path.join(parent_dir,"data")
raw_data_folder = os.path.join(data_folder,"raw")
interim_data_folder = os.path.join(data_folder,"interim")
processed_data_folder = os.path.join(data_folder, "processed")

In [33]:
# Save PCA DataFrames as Parquet files
X_train_scaled_path = os.path.join(processed_data_folder, 'X_train_scaled.parquet')
X_val_scaled_path = os.path.join(processed_data_folder, 'X_val_scaled.parquet')
X_test_scaled_path = os.path.join(processed_data_folder, 'X_test_scaled.parquet')

# Paths for the target variables
y_train_path = os.path.join(processed_data_folder, 'y_train.parquet')
y_val_path = os.path.join(processed_data_folder, 'y_val.parquet')
y_test_path = os.path.join(processed_data_folder, 'y_test.parquet')

In [None]:
X_train_scaled = pd.read_parquet(X_train_scaled_path)
X_val_scaled = pd.read_parquet(X_val_scaled_path)
X_test_scaled = pd.read_parquet(X_test_scaled_path)

# Reading the target variables
y_train = pd.read_parquet(y_train_path)
y_val = pd.read_parquet(y_val_path)
y_test = pd.read_parquet(y_test_path)

### Linear Regression

#### The first modeling method we will use is linear regression. We will use ridge regression (L2) to vary the regularization strength. 

In [35]:
# Define three models with varying regularization strengths
ridge_model_1 = Ridge(alpha=0.001)  # low regularization
ridge_model_2 = Ridge(alpha=0.01)  # medium regularization
ridge_model_3 = Ridge(alpha=1) # high regularization

In [36]:
# Train each Ridge model
ridge_model_1.fit(X_train_scaled, y_train)
ridge_model_2.fit(X_train_scaled, y_train)
ridge_model_3.fit(X_train_scaled, y_train)

In [37]:
# Predict on the training and validation datasets
y_train_pred_1 = ridge_model_1.predict(X_train_scaled)
y_val_pred_1 = ridge_model_1.predict(X_val_scaled)

y_train_pred_2 = ridge_model_2.predict(X_train_scaled)
y_val_pred_2 = ridge_model_2.predict(X_val_scaled)

y_train_pred_3 = ridge_model_3.predict(X_train_scaled)
y_val_pred_3 = ridge_model_3.predict(X_val_scaled)

In [38]:
# Calculate evaluation metrics (MSE and R-squared) for training and validation datasets
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, r2

In [39]:
# Check for NaN values in the training and validation sets
print("NaN values in X_train_scaled:", np.isnan(X_train_scaled).sum())
print("NaN values in X_val_scaled:", np.isnan(X_val_scaled).sum())
print("NaN values in y_train:", np.isnan(y_train).sum())
print("NaN values in y_val:", np.isnan(y_val).sum())


NaN values in X_train_scaled: 0
NaN values in X_val_scaled: 0
NaN values in y_train: price    0
dtype: int64
NaN values in y_val: price    0
dtype: int64


In [40]:
# Evaluate Ridge model 1
train_mse_1, train_r2_1 = evaluate_model(y_train, y_train_pred_1)
val_mse_1, val_r2_1 = evaluate_model(y_val, y_val_pred_1)

# Evaluate Ridge model 2
train_mse_2, train_r2_2 = evaluate_model(y_train, y_train_pred_2)
val_mse_2, val_r2_2 = evaluate_model(y_val, y_val_pred_2)

# Evaluate Ridge model 3
train_mse_3, train_r2_3 = evaluate_model(y_train, y_train_pred_3)
val_mse_3, val_r2_3 = evaluate_model(y_val, y_val_pred_3)

In [41]:
# Print results
print("Model 1 (alpha=0.1) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_1, train_r2_1))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_1, val_r2_1))

print("Model 2 (alpha=1.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_2, train_r2_2))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_2, val_r2_2))

print("Model 3 (alpha=10.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_3, train_r2_3))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_3, val_r2_3))

Model 1 (alpha=0.1) - Training MSE: 6868.4819, R2: 0.2146
Validation MSE: 6475.0053, R2: 0.2055
Model 2 (alpha=1.0) - Training MSE: 6868.4965, R2: 0.2146
Validation MSE: 6475.1198, R2: 0.2055
Model 3 (alpha=10.0) - Training MSE: 6868.8433, R2: 0.2145
Validation MSE: 6475.6536, R2: 0.2054


In [42]:
#Investigating the similar outputs
print(ridge_model_1.coef_)
print(ridge_model_2.coef_)
print(ridge_model_3.coef_)
#Based on the similar coefficients for the Ridge models with different alpha values it seems like Ridge regularization is not playing a major role in altering the model's predictions. This might be because the features are already well scaled. 


[[-8.88054871e-01 -6.48325275e+00 -8.96070865e+00  1.59479198e+00
   1.30806395e+00  1.40753263e-01  6.04357959e-02  1.88058422e+01
  -4.63258217e+00 -2.22116946e-01 -1.15274108e-01  1.03061618e+01
   8.25073696e-02 -1.22082874e+00  2.21943328e+01  6.55676712e-01
  -1.18436423e+02  8.42498081e+01  3.31387514e+00 -1.20633913e+02
   6.25660407e+01 -4.11222992e+02 -2.48571953e+00  8.69638276e+01
  -2.01279114e+01 -2.19894965e+00 -8.92567905e-01  3.18794515e-01
  -1.06505890e-01 -1.65180168e+00 -2.74156315e+02  3.29602016e+02
  -4.87967910e-01 -5.03045270e-01 -1.77606906e+02  4.30379457e+00
   5.76577664e+01  1.07437961e+02 -7.60960688e-02 -5.03901127e+00
  -3.05633853e+01 -5.37219128e+01  4.36925197e+02 -5.30218049e-01
  -8.01163085e-02  3.68394959e-01 -2.88799115e+00 -3.23628554e+00
   1.96754770e-01  3.85922586e-01 -2.63877668e+00 -6.07111159e-01
  -4.26911098e-02 -2.84148153e+00  5.52724588e-01 -1.65070858e-01
   1.45600442e+01 -3.14517784e+00 -3.03899909e+00 -2.91868051e+00
   2.00246

In [43]:
#Continuation of ridge regression exploration
#Testing with higher alpha values (alpha=100)
ridge_model_4 = Ridge(alpha=100) 
ridge_model_4.fit(X_train_scaled, y_train)
y_train_pred_4 = ridge_model_4.predict(X_train_scaled)
y_val_pred_4 = ridge_model_4.predict(X_val_scaled)
# Evaluate Ridge model 3
train_mse_4, train_r2_4 = evaluate_model(y_train, y_train_pred_4)
print("Model 4 (alpha=100.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_4, train_r2_4))


#Testing with higher alpha values (alpha=1000)
ridge_model_5 = Ridge(alpha=1000) 
ridge_model_5.fit(X_train_scaled, y_train)
y_train_pred_5 = ridge_model_5.predict(X_train_scaled)
y_val_pred_5 = ridge_model_5.predict(X_val_scaled)
# Evaluate Ridge model 3
train_mse_5, train_r2_5 = evaluate_model(y_train, y_train_pred_5)
print("Model 5 (alpha=1000.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_5, train_r2_5))


Model 4 (alpha=100.0) - Training MSE: 6869.4575, R2: 0.2145
Model 5 (alpha=1000.0) - Training MSE: 6870.2419, R2: 0.2144


In [44]:
#3 different Lasso regularization terms
from sklearn.linear_model import Lasso

# Define three models with varying regularization strengths for Lasso
lasso_model_1 = Lasso(alpha=0.001)  # low regularization
lasso_model_2 = Lasso(alpha=0.01)   # medium regularization
lasso_model_3 = Lasso(alpha=1)      # high regularization

# Train each Lasso model
lasso_model_1.fit(X_train_scaled, y_train)
lasso_model_2.fit(X_train_scaled, y_train)
lasso_model_3.fit(X_train_scaled, y_train)

# Predict on the training and validation datasets
y_train_pred_lasso_1 = lasso_model_1.predict(X_train_scaled)
y_val_pred_lasso_1 = lasso_model_1.predict(X_val_scaled)

y_train_pred_lasso_2 = lasso_model_2.predict(X_train_scaled)
y_val_pred_lasso_2 = lasso_model_2.predict(X_val_scaled)

y_train_pred_lasso_3 = lasso_model_3.predict(X_train_scaled)
y_val_pred_lasso_3 = lasso_model_3.predict(X_val_scaled)

# Calculate evaluation metrics (MSE and R-squared) for training and validation datasets
train_mse_lasso_1, train_r2_lasso_1 = evaluate_model(y_train, y_train_pred_lasso_1)
val_mse_lasso_1, val_r2_lasso_1 = evaluate_model(y_val, y_val_pred_lasso_1)

train_mse_lasso_2, train_r2_lasso_2 = evaluate_model(y_train, y_train_pred_lasso_2)
val_mse_lasso_2, val_r2_lasso_2 = evaluate_model(y_val, y_val_pred_lasso_2)

train_mse_lasso_3, train_r2_lasso_3 = evaluate_model(y_train, y_train_pred_lasso_3)
val_mse_lasso_3, val_r2_lasso_3 = evaluate_model(y_val, y_val_pred_lasso_3)

# Print results
print("Lasso Model 1 (alpha=0.001) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_lasso_1, train_r2_lasso_1))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_lasso_1, val_r2_lasso_1))

print("Lasso Model 2 (alpha=0.01) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_lasso_2, train_r2_lasso_2))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_lasso_2, val_r2_lasso_2))

print("Lasso Model 3 (alpha=1.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_lasso_3, train_r2_lasso_3))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_lasso_3, val_r2_lasso_3))

  model = cd_fast.enet_coordinate_descent(


Lasso Model 1 (alpha=0.001) - Training MSE: 6869.2573, R2: 0.2145
Validation MSE: 6475.0757, R2: 0.2055
Lasso Model 2 (alpha=0.01) - Training MSE: 6869.7284, R2: 0.2144
Validation MSE: 6474.2080, R2: 0.2056
Lasso Model 3 (alpha=1.0) - Training MSE: 6887.8521, R2: 0.2124
Validation MSE: 6476.0189, R2: 0.2054


In [45]:


# Step 1: Generate polynomial features
poly = PolynomialFeatures(degree=2)  # Adjust the degree as needed
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)

# Step 2: Fit the Linear Regression model
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Step 3: Make predictions
y_train_pred_poly = poly_model.predict(X_train_poly)
y_val_pred_poly = poly_model.predict(X_val_poly)

# Step 4: Calculate evaluation metrics (MSE and R-squared)
train_mse_poly, train_r2_poly = evaluate_model(y_train, y_train_pred_poly)
val_mse_poly, val_r2_poly = evaluate_model(y_val, y_val_pred_poly)

# Print results
print("Polynomial Regression Model - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_poly, train_r2_poly))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_poly, val_r2_poly))

Polynomial Regression Model - Training MSE: 2.8405, R2: 0.9997
Validation MSE: 59460830145836358828032.0000, R2: -7295900558419439616.0000


In [None]:
#Polynomial feature with lasso regression
from sklearn.linear_model import Lasso

X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)

# Step 2: Fit the Lasso regression model
lasso_model = Lasso(alpha=1.0)  # Adjust alpha for different regularization strengths
lasso_model.fit(X_train_poly, y_train)

# Step 3: Make predictions
y_train_pred_lasso = lasso_model.predict(X_train_poly)
y_val_pred_lasso = lasso_model.predict(X_val_poly)

# Step 4: Calculate evaluation metrics (MSE and R-squared)
train_mse_lasso, train_r2_lasso = evaluate_model(y_train, y_train_pred_lasso)
val_mse_lasso, val_r2_lasso = evaluate_model(y_val, y_val_pred_lasso)

# Print results
print("Lasso Regression Model - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_lasso, train_r2_lasso))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_lasso, val_r2_lasso))

In [47]:
#Polynomial feature with 3 different ridge regressions 
from sklearn.linear_model import Ridge

# Step 1: Generate polynomial features
poly = PolynomialFeatures(degree=2)  # Adjust the degree as needed
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)

# Define the alpha values you want to test
alpha_values = [1.0, 10.0, 100.0]  # You can adjust these values as needed

# Loop through the different alpha values
for alpha in alpha_values:
    # Step 2: Fit the Ridge regression model with the current alpha
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train_poly, y_train)

    # Step 3: Make predictions
    y_train_pred_ridge = ridge_model.predict(X_train_poly)
    y_val_pred_ridge = ridge_model.predict(X_val_poly)

    # Step 4: Calculate evaluation metrics (MSE and R-squared)
    train_mse_ridge, train_r2_ridge = evaluate_model(y_train, y_train_pred_ridge)
    val_mse_ridge, val_r2_ridge = evaluate_model(y_val, y_val_pred_ridge)

    # Print results for the current alpha
    print("Ridge Regression Model (alpha={}) - Training MSE: {:.4f}, R2: {:.4f}".format(alpha, train_mse_ridge, train_r2_ridge))
    print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_ridge, val_r2_ridge))

Ridge Regression Model (alpha=1.0) - Training MSE: 2.8320, R2: 0.9997
Validation MSE: 9.7312, R2: 0.9988
Ridge Regression Model (alpha=10.0) - Training MSE: 2.8392, R2: 0.9997
Validation MSE: 9.6947, R2: 0.9988
Ridge Regression Model (alpha=100.0) - Training MSE: 2.8588, R2: 0.9997
Validation MSE: 9.4152, R2: 0.9988
