In [46]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras

In [48]:
keras.utils.set_random_seed(1)

In [49]:
tf.config.experimental.enable_op_determinism()

Import data

In [50]:
data = pd.read_csv('daily_rv_daily_data2.csv', index_col=0)

## MLP

In [51]:
def df_to_X_Y(df, window_size=21):
  df_as_np = df.to_numpy()
  X = []
  y = []
  for i in range(len(df_as_np)-window_size):
    row = [r for r in df_as_np[i:i+window_size]]
    X.append(row)
    label = [df_as_np[i+window_size]]
    y.append(label)
  return np.array(X), np.array(y)

In [52]:
X, Y = df_to_X_Y(data)

In [53]:
q_80 = int(len(data.index) * .8)
q_90 = int(len(data.index) * .9)

X_train, Y_train =  X[:q_80], Y[:q_80]

X_val, Y_val =  X[q_80:q_90], Y[q_80:q_90]
X_test, Y_test =  X[q_90:], Y[q_90:]

In [54]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from keras import activations

MLP = Sequential([layers.Input((21, 97)),
                    layers.Dense(100),
                    layers.Dropout(0.2),
                    layers.Dense(100),
                    layers.Dropout(0.2),
                    layers.Dense(97, activation=activations.relu)])

MLP.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 21, 100)           9800      
                                                                 
 dropout_6 (Dropout)         (None, 21, 100)           0         
                                                                 
 dense_8 (Dense)             (None, 21, 100)           10100     
                                                                 
 dropout_7 (Dropout)         (None, 21, 100)           0         
                                                                 
 dense_9 (Dense)             (None, 21, 97)            9797      
                                                                 
Total params: 29697 (116.00 KB)
Trainable params: 29697 (116.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [65]:
MLP.compile(loss='mse', 
              optimizer=Adam(learning_rate=0.001),
              metrics=['mean_absolute_error'])

In [95]:
MLP.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x27b022b2b10>

In [96]:
MLP_pred = MLP.predict(X_test)



In [194]:
MLP_pred.shape

(256, 21, 97)

In [97]:
mse = tf.keras.losses.MeanSquaredError()

In [98]:
MSE_MLP = mse(Y_test, MLP_pred).numpy()

## LSTM

In [72]:
LSTM = Sequential([layers.Input((21, 97)),
                    layers.LSTM(100, return_sequences=True),
                    layers.Dropout(0.2),
                    layers.LSTM(100, return_sequences=True),
                    layers.Dropout(0.2),
                    layers.Dense(97, activation=activations.relu)])

LSTM.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 21, 100)           79200     
                                                                 
 dropout_10 (Dropout)        (None, 21, 100)           0         
                                                                 
 lstm_5 (LSTM)               (None, 21, 100)           80400     
                                                                 
 dropout_11 (Dropout)        (None, 21, 100)           0         
                                                                 
 dense_11 (Dense)            (None, 21, 97)            9797      
                                                                 
Total params: 169397 (661.71 KB)
Trainable params: 169397 (661.71 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [73]:
LSTM.compile(loss='mse', 
              optimizer=Adam(learning_rate=0.001),
              metrics=['mean_absolute_error'])

In [74]:
LSTM.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x27b6a6f4ed0>

In [75]:
LSTM_pred = LSTM.predict(X_test)



In [76]:
MSE_LSTM = mse(Y_test, LSTM_pred).numpy()

## Random Walk

In [77]:
Y_test.shape

(256, 1, 97)

In [78]:
train=data.iloc[:-256, ]

In [79]:
test=data.iloc[-256:, ]

In [129]:
rw_pred = pd.DataFrame(index=test.index, columns = test.columns)

In [130]:
for j in range(len(test.columns)):
    prev_val = train.iloc[-1, j]
    st_dev = train.std().iloc[j]
    for i in range(len(test)):
        new_val = max(1e-16, prev_val + np.random.normal(0, st_dev, 1))
        rw_pred.iloc[i, j] = new_val
        prev_val = new_val       

In [149]:
rw_pred

Unnamed: 0_level_0,AAPL.O,MSFT.O,GOOGL.O,AMZN.O,NVDA.O,META.O,BRKb,TSLA.O,LLY,V,...,MDLZ.O,LRCX.O,REGN.O,AMT,PGR,ADP.O,ETN,MMC,ADI.O,CB
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12/22/2022,0.3596978615341253,0.40721217638257423,0.22414987322905616,0.25180160332983503,0.3516497899131491,0.6373805443775048,0.1807834668597527,0.7724302560148639,0.2783565003979402,0.08218742305092158,...,0.22324078878444337,0.9112117816397765,0.22467091994047164,0.22911399340818858,0.0,0.2659701601015139,0.1970186852723305,0.24586656277971988,0.4143203164432392,0.05705791978550323
12/23/2022,0.5047077054702845,0.2555105134114005,0.10239298641509292,0.2712569778037012,0.4147092641800432,0.7825319323092894,0.05551119178665381,0.9119564700178806,0.5034239236084705,0.17426407175876968,...,0.0,0.9628471566280784,0.2723971667995879,0.45341374943957635,0.0,0.1792559362102821,0.2460887597431093,0.48942905259908154,0.5059598520309163,0.17947667276977897
12/27/2022,0.5330868382634594,0.17581423641842342,0.10769040018752521,0.2499884898337587,0.30724380866879264,0.9276709439758826,0.10699529701363969,1.0621971674937245,0.4190859877513477,0.21241606578114655,...,0.06611834976548546,0.8257298987275898,0.38108087876065994,0.5956294331151347,0.0,0.19342556544723746,0.20928144593228293,0.38814570729822295,0.6734032913299469,0.20377703815077491
12/28/2022,0.6216762522469449,0.19673909031095638,0.0,0.21139509185122402,0.30418921496222956,0.9320431966710729,0.05827712268428653,1.3914117106322565,0.1499574947947494,0.2201425028732636,...,0.052474020052158496,0.9520389332371655,0.5258438787249273,0.8783992995040109,0.10586091678166232,0.21318540363602995,0.28942794853369846,0.4746552733467998,0.4571654651390782,0.15920796494497425
12/29/2022,0.633824174797444,0.18557604079247125,0.0,0.20821098456411746,0.5301097048424472,0.8314849065689208,0.006936855074602712,1.685456506642552,0.18366112594290285,0.37435471264176057,...,0.1010432041942771,0.9958523928097559,0.7559641431738413,0.917820540512137,0.014742915153796388,0.10000897944490701,0.30366524279918233,0.4109609659934432,0.30746992014966235,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12/22/2023,1.4990026967242838,3.2401459259976813,0.7414193099414598,0.6009075771703185,2.5460564786040196,0.01135499596756195,1.7413369858893524,9.514757940424353,2.3715672665429492,0.8543125239697424,...,1.3360840017366213,2.0481780747922236,0.17480712672920365,3.558225305690294,1.0187716000119107,1.4776953860199193,0.5414067073357065,0.16056634179955562,0.11967175932367335,2.566590262648768
12/26/2023,1.686851986110891,3.3793085905019464,0.7948185780486524,0.7051192884665655,2.211761126626536,0.0,1.6993971249898112,9.803992028917644,2.064824856259823,0.9347258229110478,...,1.3249366751148233,2.3013085028654516,0.27790394655287876,3.527400301675957,0.8109193657273481,1.5152058037941434,0.4461575431145655,0.08390991022396377,0.13286005617129087,2.6607050384773108
12/27/2023,1.744957837831905,3.097218233266659,0.6753683068685802,0.7885254062016848,2.914086307569919,0.028695511685780457,1.6468981653416397,9.419098544337771,1.7343897836243543,0.8761851498454007,...,1.2861829390093644,2.1792039204752247,0.06671071520238966,3.527529298041054,0.7891558938096599,1.6041854823635882,0.37339749547366874,0.027719767670564367,0.08834600922862032,2.7355854044809305
12/28/2023,1.7183897078317956,3.3650799667847524,0.6699418553597415,0.6588619348813373,2.9897427640754217,0.3114981992853912,1.711962643368723,9.212326146741244,1.765189639711142,0.9244779493256436,...,1.3729052131441728,2.435340923325042,0.1306652014215108,3.4430788404887824,0.6303565888418876,1.407636044733688,0.4156815037033561,0.02995670039482805,0.1270129750934561,2.5085519250989203


array([[0.35969786, 0.40721218, 0.22414987, ..., 0.24586656, 0.41432032,
        0.05705792],
       [0.50470771, 0.25551051, 0.10239299, ..., 0.48942905, 0.50595985,
        0.17947667],
       [0.53308684, 0.17581424, 0.1076904 , ..., 0.38814571, 0.67340329,
        0.20377704],
       ...,
       [1.74495784, 3.09721823, 0.67536831, ..., 0.02771977, 0.08834601,
        2.7355854 ],
       [1.71838971, 3.36507997, 0.66994186, ..., 0.0299567 , 0.12701298,
        2.50855193],
       [1.77465009, 3.23052893, 0.79569475, ..., 0.11381677, 0.21590035,
        2.57099133]])

In [132]:
from sklearn.metrics import mean_squared_error
MSE_RW = mean_squared_error(np.squeeze(Y_test, axis=1), rw_pred)

In [175]:
def qlike(y_test, y_pred):
    return np.mean(np.log(y_pred+1e-16) + y_test/y_pred) 

In [176]:
QLIKE_LSTM = qlike(Y_test, LSTM_pred)   

In [178]:
QLIKE_MLP = qlike(Y_test, MLP_pred)   

In [180]:
QLIKE_RW = qlike(Y_test, rw_pred.to_numpy(dtype='float'))

In [192]:
np.log(rw_pred.to_numpy(dtype='float')+1e-16) + Y_test/rw_pred.to_numpy(dtype='float')

array([[[-1.06008911e-01, -7.89361330e-02, -1.15252084e-02, ...,
         -6.05390604e-01, -1.64439986e-01, -1.90653538e-01],
        [-3.06124161e-02, -5.84625659e-02,  9.69521724e-01, ...,
         -3.13850545e-01, -9.44265131e-02, -8.67918217e-01],
        [-1.06790084e-02,  1.59722506e-01,  8.60168554e-01, ...,
         -4.41159048e-01,  4.55334834e-02, -8.42274495e-01],
        ...,
        [ 7.45649916e-01,  1.23824756e+00,  1.00003651e-01, ...,
          3.48866335e+00,  9.34533500e-01,  1.06209872e+00],
        [ 7.33228055e-01,  1.31261852e+00,  9.59255936e-02, ...,
          3.03801906e+00,  2.74353301e-01,  9.80504808e-01],
        [ 7.59361918e-01,  1.27594295e+00,  1.89484220e-01, ...,
         -4.50245319e-01, -1.57612206e-01,  1.00361412e+00]],

       [[-1.13278103e-01, -8.48446172e-02, -4.05548891e-03, ...,
         -6.05350792e-01, -1.66082378e-01, -1.41878480e-01],
        [-3.57930641e-02, -6.78790344e-02,  9.85873788e-01, ...,
         -3.13830546e-01, -9.57714351e

In [169]:
MSE_LSTM

0.0078836195

In [170]:
MSE_MLP

0.009487135

In [171]:
MSE_RW

1.7680518863778827

In [172]:
QLIKE_LSTM

-0.4291024322426299

In [179]:
QLIKE_MLP

-0.39044982735227746

In [181]:
QLIKE_RW

136296388237057.47

Train, test, and validation split

In [213]:
q_80 = int(len(data.index) * .8)
q_90 = int(len(data.index) * .9)

train, val, test =  data[:q_80], data[q_80:q_90], data[q_90:]

## LSTM

Data preprocessing

In [214]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train)

In [215]:
scaled_train = scaler.transform(train)
scaled_val = scaler.transform(val)
scaled_test = scaler.transform(test)

In [216]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

length=21
batch_size = 1024
generator = TimeseriesGenerator(scaled_train, scaled_train, length = length, batch_size=batch_size)

Model buliding

In [217]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from keras import activations

model = Sequential()
model.add(LSTM(100, input_shape=(length, scaled_train.shape[1])))
model.add(Dense(scaled_train.shape[1]))
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_28 (LSTM)              (None, 100)               79200     
                                                                 
 dense_21 (Dense)            (None, 97)                9797      
                                                                 
Total params: 88997 (347.64 KB)
Trainable params: 88997 (347.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [218]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=1)
validation_generator = TimeseriesGenerator(scaled_val, scaled_val, 
                                           length=length, batch_size=batch_size)

Learning

In [233]:
model.fit_generator(generator, epochs=100,
                   validation_data=validation_generator,
                   callbacks=[early_stop])

Epoch 1/100

  model.fit_generator(generator, epochs=100,


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.src.callbacks.History at 0x1e136aadc10>

Recursive prediction

In [220]:
n_features = scaled_train.shape[1]
LSTM_pred = []

first_eval_batch = scaled_train[-length:]
current_batch = first_eval_batch.reshape((1, length, n_features))

for i in range(len(test)):
    current_pred = model.predict(current_batch)[0]
    LSTM_pred.append(current_pred)
    current_batch = np.append(current_batch[:,1:,:], [[current_pred]], axis=1)
    





In [221]:
LSTM_pred = scaler.inverse_transform(LSTM_pred)

In [222]:
LSTM_pred = pd.DataFrame(data=LSTM_pred, columns=test.columns)

In [224]:
mse = tf.keras.losses.MeanSquaredError()

In [278]:
MSE_LSTM = mse(test, LSTM_pred).numpy()