In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from mord import OrdinalRidge
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from tabulate import tabulate
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
import requests
from io import StringIO

In [2]:
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/processed/train_data.csv?token=GHSAT0AAAAAACC4ZCNLK5WDAXMHGAA2JI24ZGGSK4A"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'train' variable
    train = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'train' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")
    
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/processed/val_data.csv?token=GHSAT0AAAAAACC4ZCNK6U7OS5YW72PK3GBSZGGSH2A"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'valid' variable
    valid = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'valid' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")
    
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/processed/test_data.csv?token=GHSAT0AAAAAACC4ZCNL45CUOECJPHCIM43GZGGSIGQ"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'train' variable
    test = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'test' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")

Dataset downloaded and loaded into 'train' successfully.
Dataset downloaded and loaded into 'valid' successfully.
Dataset downloaded and loaded into 'test' successfully.


###Preprocess the data with variables, and function for converting predicted values into magnitude of 25 basis points

In [3]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

possible_values = [-1.00, -0.75, -0.50, -0.25, 0.00, 0.25, 0.50, 0.75, 1.00]

def round_to_nearest(value, possible_values):
    return min(possible_values, key=lambda x: abs(x - value))

def calculate_accuracy(y_true, y_pred):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    for true_val, pred_val in zip(y_true, y_pred):
        if true_val == pred_val:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

###Ordinal Ridge Regression

In [4]:
# Center and scale the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Initialize and train the ordinal logistic regression model
ordinal_model = OrdinalRidge()
ordinal_model.fit(X_train_scaled, y_train)

# Make predictions on the validation set
y_valid_pred = ordinal_model.predict(X_valid_scaled)

# Make predictions on the training set
y_train_pred = ordinal_model.predict(X_train_scaled)

In [5]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |      0.0373037 |        0.0729167 |
+--------------+----------------+------------------+
| RMSE         |      0.193142  |        0.270031  |
+--------------+----------------+------------------+
| R^2          |      0.0731313 |       -0.12187   |
+--------------+----------------+------------------+
| Adjusted R^2 |     -0.834428  |        1.36342   |
+--------------+----------------+------------------+
| Accuracy     |      0.706806  |        0.583333  |
+--------------+----------------+------------------+


In [6]:
# Center and scale the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Initialize and train the ordinal logistic regression model
ordinal_model = OrdinalRidge()
ordinal_model.fit(X_train_scaled, y_train)

# Make predictions on the validation set
y_valid_pred = ordinal_model.predict(X_valid_scaled)

# Make predictions on the training set
y_train_pred = ordinal_model.predict(X_train_scaled)

# Get the initial model performance (MSE) on the validation set
best_mse = mean_squared_error(y_valid, y_valid_pred)

# Initialize the list of selected features with all features
selected_features = list(range(X_train_scaled.shape[1]))

# Perform reverse variable selection
while len(selected_features) > 1:
    # Store the current MSE and feature to be removed
    current_best_mse = best_mse
    feature_to_remove = None
    
    # Iterate over each feature and remove one at a time
    for feature in selected_features:
        # Create a copy of the selected features with the current feature removed
        reduced_X_train_scaled = np.delete(X_train_scaled, feature, axis=1)
        reduced_X_valid_scaled = np.delete(X_valid_scaled, feature, axis=1)
        
        # Initialize and train the ordinal logistic regression model with the reduced features
        ordinal_model = OrdinalRidge()
        ordinal_model.fit(reduced_X_train_scaled, y_train)
        
        # Make predictions on the validation set with the reduced features
        y_valid_pred = ordinal_model.predict(reduced_X_valid_scaled)
        
        # Calculate the MSE with the reduced features
        mse = mean_squared_error(y_valid, y_valid_pred)
        
        # Check if the model performance has improved
        if mse < current_best_mse:
            current_best_mse = mse
            feature_to_remove = feature
            
    # If removing a feature improves the model performance, update the selected features and best MSE
    if feature_to_remove is not None:
        selected_features.remove(feature_to_remove)
        best_mse = current_best_mse
    else:
        # No feature was removed, exit the loop
        break

In [7]:
# Assuming 'selected_features' contains the list of best selected features
selected_X_train = X_train_scaled[:, selected_features]
selected_X_valid = X_valid_scaled[:, selected_features]

# Initialize and train the XGBoost model with the best selected features
xgb_model = xgb.XGBRegressor()
xgb_model.fit(selected_X_train, y_train)

# Make predictions on the training set with the best selected features
y_train_pred = xgb_model.predict(selected_X_train)

# Make predictions on the validation set with the best selected features
y_valid_pred = xgb_model.predict(selected_X_valid)

# Example usage for y_train_pred
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Example usage for y_valid_pred
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [8]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = selected_X_train.shape[0]
p_train = selected_X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |              0 |        0.0208333 |
+--------------+----------------+------------------+
| RMSE         |              0 |        0.144338  |
+--------------+----------------+------------------+
| R^2          |              1 |        0.679466  |
+--------------+----------------+------------------+
| Adjusted R^2 |              1 |        1.10384   |
+--------------+----------------+------------------+
| Accuracy     |              1 |        0.666667  |
+--------------+----------------+------------------+


###XGBoost Model

###All variables

In [9]:
# Initialize and train the XGBoost model
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = xgb_model.predict(X_train)

# Make predictions on the validation set
y_valid_pred = xgb_model.predict(X_valid)

# Example usage for y_train_pred
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Example usage for y_valid_pred
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [10]:
num_variables = len(X_train)
print("Number of variables in selected_features:", num_variables)

Number of variables in selected_features: 191


In [11]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |              0 |        0.0208333 |
+--------------+----------------+------------------+
| RMSE         |              0 |        0.144338  |
+--------------+----------------+------------------+
| R^2          |              1 |        0.679466  |
+--------------+----------------+------------------+
| Adjusted R^2 |              1 |        1.10384   |
+--------------+----------------+------------------+
| Accuracy     |              1 |        0.666667  |
+--------------+----------------+------------------+


###Reverse Variable Selection

In [12]:
# Step 1: Evaluate initial model performance
initial_model_score = xgb_model.score(X_valid, y_valid)
best_model_score = initial_model_score
best_model = xgb_model

# Step 2 to 6: Reverse variable selection loop
selected_features = list(X_train.columns)  # Assuming X_train is a DataFrame
for feature in X_train.columns:
    # Temporarily remove the feature
    X_train_subset = X_train.drop(feature, axis=1)
    X_valid_subset = X_valid.drop(feature, axis=1)
    
    # Retrain the model without the removed feature
    xgb_model_subset = xgb.XGBRegressor()
    xgb_model_subset.fit(X_train_subset, y_train)
    
    # Evaluate the model performance on the validation set
    model_score = xgb_model_subset.score(X_valid_subset, y_valid)
    
    # Check if the model performance improved
    if model_score > best_model_score:
        best_model_score = model_score
        best_model = xgb_model_subset
        selected_features.remove(feature)
print("Best Model Score:", best_model_score)

Best Model Score: 0.6918644536010811


In [13]:
num_variables = len(selected_features)
print("Number of variables in selected_features:", num_variables)

Number of variables in selected_features: 93


In [14]:
# Assuming 'selected_features' contains the list of best selected features
selected_X_train = X_train[selected_features]
selected_X_valid = X_valid[selected_features]

# Initialize and train the XGBoost model with the best selected features
xgb_model = xgb.XGBRegressor()
xgb_model.fit(selected_X_train, y_train)

# Make predictions on the training set with the best selected features
y_train_pred = xgb_model.predict(selected_X_train)

# Make predictions on the validation set with the best selected features
y_valid_pred = xgb_model.predict(selected_X_valid)

# Example usage for y_train_pred
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Example usage for y_valid_pred
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [15]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |              0 |        0.0208333 |
+--------------+----------------+------------------+
| RMSE         |              0 |        0.144338  |
+--------------+----------------+------------------+
| R^2          |              1 |        0.679466  |
+--------------+----------------+------------------+
| Adjusted R^2 |              1 |        1.10384   |
+--------------+----------------+------------------+
| Accuracy     |              1 |        0.666667  |
+--------------+----------------+------------------+


###Variable selection, greater than 0.01 feature importance

In [16]:
# Extract feature importance
feature_importance = xgb_model.feature_importances_

# Create a dictionary to map feature names to their importance scores
feature_importance_dict = {feature_name: importance_score for feature_name, importance_score in zip(X_train.columns, feature_importance)}

# Sort the feature importance dictionary in descending order based on importance scores
sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

# Print the feature importance scores in descending order
print("Feature Importance:")
for feature_name, importance_score in sorted_feature_importance.items():
    print(f"{feature_name}: {importance_score:.4f}")

Feature Importance:
Standardized Sentiment Score: 0.3552
uncertain: 0.1069
LEI: 0.0602
assumed: 0.0505
Negative Frequency: 0.0367
might: 0.0316
Durable Goods Orders: 0.0293
anticipated: 0.0287
could: 0.0270
easier: 0.0244
LAG: 0.0226
believed: 0.0210
LEI_RollingMean: 0.0209
Short-Term Treasury Bond Rate: 0.0174
depend: 0.0125
regain: 0.0121
Level: 0.0115
Long-Term Treasury Bond Rate: 0.0107
prosperity: 0.0094
CPI: 0.0089
bolstering: 0.0082
gaining: 0.0073
apparent: 0.0068
appeared: 0.0064
attain: 0.0062
Bank Reserves: 0.0058
Net Sentiment Score: 0.0055
cautiously: 0.0055
encouragement: 0.0050
unknown: 0.0042
attractiveness: 0.0041
opportunity: 0.0039
positive: 0.0037
Retail Sales: 0.0035
risk: 0.0029
Retail Sales_RollingMean: 0.0027
suggested: 0.0026
stabilization: 0.0023
strengthening: 0.0018
apparently: 0.0017
Short-Term Treasury Diff: 0.0016
Unemployment Rate: 0.0015
volatile: 0.0014
optimistic: 0.0013
Average Hourly Earnings: 0.0009
unclear: 0.0009
better: 0.0008
uncertainty: 0.000

In [17]:
# Filter features with importance greater than 0.01
selected_features = [feature_name for feature_name, importance_score in feature_importance_dict.items() if importance_score > 0.01]

# Create new datasets with selected features
selected_X_train = X_train[selected_features]
selected_X_valid = X_valid[selected_features]

# Initialize and train the XGBoost model with the selected features
xgb_model = xgb.XGBRegressor()
xgb_model.fit(selected_X_train, y_train)

# Make predictions on the training set with the selected features
y_train_pred = xgb_model.predict(selected_X_train)

# Make predictions on the validation set with the selected features
y_valid_pred = xgb_model.predict(selected_X_valid)

# Example usage for y_train_pred
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Example usage for y_valid_pred
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [18]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |              0 |         0.046875 |
+--------------+----------------+------------------+
| RMSE         |              0 |         0.216506 |
+--------------+----------------+------------------+
| R^2          |              1 |         0.278798 |
+--------------+----------------+------------------+
| Adjusted R^2 |              1 |         1.23363  |
+--------------+----------------+------------------+
| Accuracy     |              1 |         0.375    |
+--------------+----------------+------------------+


###Elastic Net Regression Model

In [19]:
# Create the Elastic Net Regression model
elastic_net_model = ElasticNet(random_state=42)

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'alpha': np.logspace(-4, 4, 9),   # Range of alpha values (regularization strength)
    'l1_ratio': np.linspace(0, 1, 11)  # Range of l1_ratio values (mixing parameter between L1 and L2 penalties)
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=elastic_net_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']
best_l1_ratio = grid_search.best_params_['l1_ratio']

# Initialize the Elastic Net Regression model with the best hyperparameters
best_elastic_net_model = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio, random_state=42)

# Fit the model to the training data with the best hyperparameters
best_elastic_net_model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = best_elastic_net_model.predict(X_train)

# Make predictions on the validation set
y_valid_pred = best_elastic_net_model.predict(X_valid)

# Example usage for y_train_pred
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Example usage for y_valid_pred
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [20]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |    0.0402487   |       0.0651042  |
+--------------+----------------+------------------+
| RMSE         |    0.200621    |       0.255155   |
+--------------+----------------+------------------+
| R^2          |   -4.25677e-05 |      -0.00166945 |
+--------------+----------------+------------------+
| Adjusted R^2 |   -0.979251    |       1.32448    |
+--------------+----------------+------------------+
| Accuracy     |    0.701571    |       0.583333   |
+--------------+----------------+------------------+
