In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import pandas_datareader.data as web
from datetime import datetime, timedelta
import scipy.stats as stats
from sklearn.metrics import brier_score_loss, roc_curve, auc, log_loss
from sklearn.preprocessing import StandardScaler



| market_category | feature_name | id |
|-----------------|--------------|----|
| Bank            | bac          |  1 |
| Bank            | citi         |  2 |
| Commodity       | corn         |  3 |
| Currency        | euro         |  4 |
| Commodity       | gold         |  5 |
| Inflation       | infl5y       |  6 |
| Commodity       | iyr          |  7 |
| Currency        | pound        |  8 |
| Commodity       | silver       |  9 |
| Commodity       | soybns       | 10 |
| Equity          | sp12m        | 11 |
| Equity          | sp6m         | 12 |
| Commodity       | wheat        | 13 |
| Currency        | yen          | 14 |


Return Model (Log Price)

In [2]:
df = pd.read_csv("Output_Data/mpd_sp500.csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')
# forwards filling
df = df.fillna(method='ffill')
# create a new df that extract the columns of SP_adj_close	SP_lg_pr	SP_lg_ret(%)	VIX
data = df[['SP_adj_close', 'SP_lg_pr', 'SP_lg_ret(%)', 'VIX']]

# keep columns that have names containing f11 and f12 only
df = df.filter(regex='f11|f12')

# merge data to df merge on index
df = pd.merge(df, data, left_index=True, right_index=True, how='left')

In [3]:
# drop columns that has "maturity_target" , "lg_change_decr", and "lg_change_incr" in the column name; those are irrelevant for feature selection
df = df[df.columns.drop(list(df.filter(regex='maturity_target')))]
df = df[df.columns.drop(list(df.filter(regex='lg_change_decr')))]
df = df[df.columns.drop(list(df.filter(regex='lg_change_incr')))]
df = df[df.columns.drop(list(df.filter(regex='SP_adj_close')))]

# drop SP_lg_ret(%)	
df = df.drop(['SP_lg_ret(%)'], axis=1)
df = df.drop(['SP_lg_pr'], axis=1)
# df = df.drop(['VIX'], axis=1)

In [4]:
# Generate lagged variables from f1_mu to SP_lg_pr
lags = 6
for lag in range(1, lags+1):
    for col in df.columns[df.columns.get_loc('f11_mu'):df.columns.get_loc('VIX')+1]: 
        df[f'{col}_lag{lag}'] = df[col].shift(lag)

df_lagged = df.copy()
# drop NA rows
df_lagged = df_lagged.dropna()

df_lagged['Next_Week_VIX'] = df_lagged['VIX'].shift(-1) # align y with X for regression
df_lagged = df_lagged.dropna()

  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)


In [5]:
# Define the target variable
start_colunm = df_lagged.columns.get_loc('VIX')
end_column = df_lagged.columns.get_loc('VIX_lag6')

column_index = list(range(start_colunm, end_column+1))

X = df_lagged.iloc[:, column_index]
y = df_lagged['Next_Week_VIX']

split_index = int(len(X)*0.75)
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((427, 115), (143, 115), (427,), (143,))

In [7]:
# run a lasso regression to select features
from sklearn.linear_model import LassoCV

lassoCV = LassoCV(cv=10, random_state=12345, max_iter=10000, tol=0.0001, selection='random')
lassoCV.fit(X_train, y_train)

In [8]:
print("In Sample R^2: ", f'{lassoCV.score(X_train, y_train):.5f}')
print()
print("Out of Sample R^2: ", f'{lassoCV.score(X_test, y_test):.5f}')
print()
# lasso coefficients with corresponding feature names
lasso_coef = pd.DataFrame(lassoCV.coef_, index=X.columns, columns=['coef'])
lasso_coef = lasso_coef[lasso_coef.coef != 0]

print("Number of features selected: ", len(lasso_coef))
print(lasso_coef)

print()
# show the predicted value
lass_y_pred = lassoCV.predict(X_test)
# calculate the MSE, RMSE, and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error
lass_mse = mean_squared_error(y_test, lass_y_pred)
lass_rmse = np.sqrt(lass_mse)
lass_mae = mean_absolute_error(y_test, lass_y_pred)
lass_mape = np.mean(np.abs((y_test - lass_y_pred) / y_test)) * 100

print('Out of Sample Test set evaluation:')
print(f'MSE: {lass_mse:.5f}, RMSE: {lass_rmse:.5f}, MAE: {lass_mae:.5f}, MAPE: {lass_mape:.5f}')


In Sample R^2:  0.79446

Out of Sample R^2:  0.63972

Number of features selected:  18
                   coef
VIX            0.832914
f11_kurt_lag1 -0.010343
f12_skew_lag1 -0.015342
VIX_lag1       0.018199
f11_kurt_lag2 -0.010014
f12_kurt_lag2  0.004460
f11_kurt_lag3  0.004299
f12_kurt_lag3  0.006300
f11_kurt_lag4 -0.017029
f12_kurt_lag4  0.012979
f11_skew_lag5 -0.016893
f11_kurt_lag5  0.001705
f12_skew_lag5  0.045283
f12_kurt_lag5  0.013108
f11_skew_lag6 -0.051147
f11_kurt_lag6 -0.005967
f12_kurt_lag6 -0.011371
VIX_lag6       0.006022

Out of Sample Test set evaluation:
MSE: 0.00101, RMSE: 0.03182, MAE: 0.02228, MAPE: 10.53032


Applied StandardScaler

In [9]:
# run a lasso regression to select features
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
lassoCV2 = LassoCV(cv=10, random_state=12345, max_iter=10000, tol=0.0001, selection='random')
lassoCV2.fit(X_train_scaled, y_train)

In [11]:
print("In Sample R^2: ", f'{lassoCV2.score(X_train_scaled, y_train):.5f}')
print()
print("Out of Sample R^2: ", f'{lassoCV2.score(X_test_scaled, y_test):.5f}')
print()

# lasso coefficients with corresponding feature names
lasso_coef = pd.DataFrame(lassoCV2.coef_, index=X.columns, columns=['coef'])
lasso_coef = lasso_coef[lasso_coef.coef != 0]

print("Number of features selected: ", len(lasso_coef))
print(lasso_coef)

print()
# show the predicted value
lassCV2_y_pred = lassoCV2.predict(X_test_scaled)
# calculate the MSE, RMSE, and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error
lass_mse = mean_squared_error(y_test, lassCV2_y_pred)
lass_rmse = np.sqrt(lass_mse)
lass_mae = mean_absolute_error(y_test, lassCV2_y_pred)
lass_mape = np.mean(np.abs((y_test - lassCV2_y_pred) / y_test)) * 100

print('Test set evaluation:')
print(f'MSE: {lass_mse:.5f}, RMSE: {lass_rmse:.5f}, MAE: {lass_mae:.5f}, MAPE: {lass_mape:.5f}')


In Sample R^2:  0.78464

Out of Sample R^2:  0.67030

Number of features selected:  13
                   coef
VIX            0.060877
f11_mu_lag1   -0.000391
f11_kurt_lag1 -0.001007
VIX_lag1       0.001355
f11_kurt_lag2 -0.000132
f11_p50_lag2  -0.001087
f12_kurt_lag3  0.001746
f12_p50_lag4   0.001376
f12_mu_lag5    0.001801
f12_p50_lag5  -0.001203
f11_p10_lag6  -0.000038
f12_p50_lag6   0.001207
f12_p90_lag6   0.001624

Test set evaluation:
MSE: 0.00093, RMSE: 0.03044, MAE: 0.02164, MAPE: 10.57509


Using Log Price Lasso Regression, applied scalared has better result (LassoCV) for VIX

Rolling Lasso Model

In [12]:
# run a lasso regression to select features
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [13]:
from sklearn.linear_model import LassoCV

# Initialize the LassoCV model
lassoCV_rolling = LassoCV(cv=10, random_state=12345, max_iter=10000, tol=0.0001, selection='random')

# Initialize an empty array to store predictions
predictions = []

# initialize X_train_rolling and y_train_rolling
X_train_rolling = X_train_scaled.copy()
y_train_rolling = y_train.copy()

# Iterate through the dataset
for i in range(len(X_test_scaled)):
    # Convert X_train back to a DataFrame
    
    
    # Add the new observation to X_train_df and y_train
    X_train_rolling = pd.concat([X_train_rolling, X_test_scaled.iloc[[i]]])
    y_train_rolling = np.append(y_train_rolling, y_test[i])
    
    # Fit the LassoCV model with the updated training data
    lassoCV_rolling.fit(X_train_rolling, y_train_rolling)
    
    # Predict the next day's y
    next_day_prediction = lassoCV_rolling.predict(X_test_scaled.iloc[[i]])
    
    # Store the prediction in the array
    predictions.append(next_day_prediction)

In [14]:
# calculate the MSE, RMSE, and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error
# make a copy of the predictions
Rolling_y_pred= np.array(predictions).flatten()
lass_mse = mean_squared_error(y_test, Rolling_y_pred)
lass_rmse = np.sqrt(lass_mse)
lass_mae = mean_absolute_error(y_test, Rolling_y_pred)
lass_mape = np.mean(np.abs((y_test - Rolling_y_pred) / y_test)) * 100

print('Out of Sample Test set evaluation:')
print(f'MSE: {lass_mse:.5f}, RMSE: {lass_rmse:.5f}, MAE: {lass_mae:.5f}, MAPE: {lass_mape:.5f}')

Out of Sample Test set evaluation:
MSE: 0.00084, RMSE: 0.02897, MAE: 0.02105, MAPE: 10.22652
