In [309]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [310]:
# Define the start and end dates for the data range
start_date = '1985-01-01'
end_date = '2018-12-31'

# Specify the proportion of test data (e.g., 10%)
test_data_size = 0.1

# Set the number of splits for K-Fold Cross Validation
KFold_split = 5

# Specify a range of alpha values for Ridge regression
alphas = [10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3, 10**4]

In [311]:
# Upload X and y data
X_mef = pd.read_csv("/content/mef_m_clean.csv")
X_mai = pd.read_csv("/content/mai_m_clean.csv")
y_mkt = pd.read_csv("/content/mkt_m_clean.csv")

X_mef['date'] = pd.to_datetime(X_mef['date'])
X_mai['date'] = pd.to_datetime(X_mai['date'])
y_mkt['date'] = pd.to_datetime(y_mkt['date'])

X_mef = X_mef[(X_mef['date'] >= start_date) & (X_mef['date'] <= end_date)]
X_mai = X_mai[(X_mai['date'] >= start_date) & (X_mai['date'] <= end_date)]
y_mkt = y_mkt[(y_mkt['date'] >= start_date) & (y_mkt['date'] <= end_date)]

# Drop the 'date' column from each dataset
X_mef = X_mef.drop('date', axis=1)
X_mai = X_mai.drop('date', axis=1)
y_mkt = y_mkt.drop('date', axis=1)

# Drop the 'GSPCprem' colum from y_mkt
y_mkt = y_mkt.drop('GSPCprem', axis=1)

X_mef = X_mef.values
X_mai = X_mai.values
y_mkt = y_mkt.values

In [312]:
def split_data(X, y, train_size, indices):
    # Split indices into train and test indices
    train_indices, test_indices = indices[:train_size], indices[train_size:]

    # Split data based on the indices
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, y_train, X_test, y_test

In [313]:
def find_optimal_ridge_hyperparameters(X_train, y_train, alphas, KFold_split):
    optimal_degree = None
    optimal_alpha = None
    minimal_mse = float('inf')

    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)

    for alpha in alphas:
        kf = KFold(n_splits=KFold_split)
        mse_arr = []

        for train_index, test_index in kf.split(X_train_std):
            X_train_n, y_train_n = X_train_std[train_index], y_train[train_index]
            X_train_v, y_train_v = X_train_std[test_index], y_train[test_index]

            # Train Ridge regression
            ridge = Ridge(alpha=alpha)
            ridge.fit(X_train_n, y_train_n)

            # Predict and calculate MSE on the validation set
            y_pred = ridge.predict(X_train_v)
            mse = mean_squared_error(y_train_v, y_pred)
            mse_arr.append(mse)

        # Calculate the average MSE across KFold splits
        avg_mse = np.mean(mse_arr)

        # Update optimal hyperparameters if the current setup is better
        if avg_mse < minimal_mse:
            optimal_alpha = alpha
            minimal_mse = avg_mse

    return optimal_alpha, minimal_mse

In [314]:
def train_and_evaluate_ridge(X_train, y_train, X_test, y_test, alpha):
    # Standardize both training and test data
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)

    # Train Ridge regression on the entire training set
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_std, y_train)

    # Predict on the training and test sets
    y_pred_train = ridge.predict(X_train_std)
    y_pred_test = ridge.predict(X_test_std)

    # Calculate and print MSE for the training and test sets
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)

    return mse_train, mse_test

In [315]:
def predict_with_ridge(model, scaler, new_X):
    # Standardize the new_X using the same scaler used for training
    new_X_std = scaler.transform(new_X.reshape(1, -1))

    # Predict the target value using the trained Ridge model
    predicted_y = model.predict(new_X_std)

    return predicted_y[0]

In [316]:
# Get same random split of MEF and MAI data
N = X_mef.shape[0]
train_size = int((1-test_data_size) * N)
indices = np.random.permutation(N)

In [317]:
# Training and evaluating Ridge model: MEF
print('MEF Linear Predictor')

# Split data into training and test sets
X_train, y_train, X_test, y_test = split_data(X_mef, y_mkt, train_size, indices)

# Standardize both training and test data using a scaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Find optimal alpha using cross-validation
optimal_alpha, minimal_mse = find_optimal_ridge_hyperparameters(X_train_std, y_train, alphas, KFold_split)
print(f"Optimal alpha: {optimal_alpha}, Minimal MSE: {minimal_mse}")

# Train and evaluate Ridge model with optimal alpha
ridge_model = Ridge(alpha=optimal_alpha)
ridge_model.fit(X_train_std, y_train)

# Evaluate the model on the test set
mse_train, mse_test = train_and_evaluate_ridge(X_train_std, y_train, X_test_std, y_test, optimal_alpha)
print(f"Train MSE: {mse_train}")
print(f"Test MSE: {mse_test}")
print("")

# Save the trained model and scaler for later use
ridge_scaler = scaler

# Example of using the trained model for prediction
new_X = np.array([-3.842472, -3.938737, -2.941030, -0.901442, 0.006793, 0.255578, -0.019217, 0.0237, 0.0284, 0.0481, 0.0047, 0.0111, -0.0111, -0.003194])
predicted_y = predict_with_ridge(ridge_model, ridge_scaler, new_X)

print("new_X:" + str(new_X))
print(f"predicted_y: {predicted_y}")

MEF Linear Predictor
Optimal alpha: 100, Minimal MSE: 1259.894677443825
Train MSE: 1163.8565285750497
Test MSE: 1241.6081749530194

new_X:[-3.842472e+00 -3.938737e+00 -2.941030e+00 -9.014420e-01  6.793000e-03
  2.555780e-01 -1.921700e-02  2.370000e-02  2.840000e-02  4.810000e-02
  4.700000e-03  1.110000e-02 -1.110000e-02 -3.194000e-03]
predicted_y: [6.14049578]


In [318]:
# Training and evaluating Ridge model: MAI
print('MAI Linear Predictor')

# Split data into training and test sets
X_train, y_train, X_test, y_test = split_data(X_mai, y_mkt, train_size, indices)

# Standardize both training and test data using a scaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Find optimal alpha using cross-validation
optimal_alpha, minimal_mse = find_optimal_ridge_hyperparameters(X_train_std, y_train, alphas, KFold_split)
print(f"Optimal alpha: {optimal_alpha}, Minimal MSE: {minimal_mse}")

# Train and evaluate Ridge model with optimal alpha
ridge_model = Ridge(alpha=optimal_alpha)
ridge_model.fit(X_train_std, y_train)

# Evaluate the model on the test set
mse_train, mse_test = train_and_evaluate_ridge(X_train_std, y_train, X_test_std, y_test, optimal_alpha)
print(f"Train MSE: {mse_train}")
print(f"Test MSE: {mse_test}")
print("")

# Save the trained model and scaler for later use
ridge_scaler = scaler

# Example of using the trained model for prediction
new_X = np.array([0.000000, 0.000000, 0.209820, 0.209820, 0.000000, 0.481232, 1.924928, 0.000000])
predicted_y = predict_with_ridge(ridge_model, ridge_scaler, new_X)

print("new_X:" + str(new_X))
print(f"predicted_y: {predicted_y}")

MAI Linear Predictor
Optimal alpha: 1000, Minimal MSE: 1258.218615997463
Train MSE: 1241.333337949458
Test MSE: 1197.106022998405

new_X:[0.       0.       0.20982  0.20982  0.       0.481232 1.924928 0.      ]
predicted_y: [5.8569198]
