In [46]:
import numpy as np
import pandas as pd

import tensorflow as tf
import keras

In [48]:
keras.utils.set_random_seed(1)

In [49]:
tf.config.experimental.enable_op_determinism()

Import data

In [50]:
data = pd.read_csv('daily_rv_daily_data2.csv', index_col=0)

## MLP

In [51]:
def df_to_X_Y(df, window_size=21):
  df_as_np = df.to_numpy()
  X = []
  y = []
  for i in range(len(df_as_np)-window_size):
    row = [r for r in df_as_np[i:i+window_size]]
    X.append(row)
    label = [df_as_np[i+window_size]]
    y.append(label)
  return np.array(X), np.array(y)

In [52]:
X, Y = df_to_X_Y(data)

In [53]:
q_80 = int(len(data.index) * .8)
q_90 = int(len(data.index) * .9)

X_train, Y_train =  X[:q_80], Y[:q_80]

X_val, Y_val =  X[q_80:q_90], Y[q_80:q_90]
X_test, Y_test =  X[q_90:], Y[q_90:]

In [54]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from keras import activations

MLP = Sequential([layers.Input((21, 97)),
                    layers.Dense(100),
                    layers.Dropout(0.2),
                    layers.Dense(100),
                    layers.Dropout(0.2),
                    layers.Dense(97, activation=activations.relu)])

MLP.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 21, 100)           9800      
                                                                 
 dropout_6 (Dropout)         (None, 21, 100)           0         
                                                                 
 dense_8 (Dense)             (None, 21, 100)           10100     
                                                                 
 dropout_7 (Dropout)         (None, 21, 100)           0         
                                                                 
 dense_9 (Dense)             (None, 21, 97)            9797      
                                                                 
Total params: 29697 (116.00 KB)
Trainable params: 29697 (116.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [65]:
MLP.compile(loss='mse', 
              optimizer=Adam(learning_rate=0.001),
              metrics=['mean_absolute_error'])

In [95]:
MLP.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x27b022b2b10>

In [96]:
MLP_pred = MLP.predict(X_test)



In [97]:
mse = tf.keras.losses.MeanSquaredError()

In [98]:
MSE_MLP = mse(Y_test, MLP_pred).numpy()

## LSTM

In [72]:
LSTM = Sequential([layers.Input((21, 97)),
                    layers.LSTM(100, return_sequences=True),
                    layers.Dropout(0.2),
                    layers.LSTM(100, return_sequences=True),
                    layers.Dropout(0.2),
                    layers.Dense(97, activation=activations.relu)])

LSTM.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 21, 100)           79200     
                                                                 
 dropout_10 (Dropout)        (None, 21, 100)           0         
                                                                 
 lstm_5 (LSTM)               (None, 21, 100)           80400     
                                                                 
 dropout_11 (Dropout)        (None, 21, 100)           0         
                                                                 
 dense_11 (Dense)            (None, 21, 97)            9797      
                                                                 
Total params: 169397 (661.71 KB)
Trainable params: 169397 (661.71 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [73]:
LSTM.compile(loss='mse', 
              optimizer=Adam(learning_rate=0.001),
              metrics=['mean_absolute_error'])

In [74]:
LSTM.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x27b6a6f4ed0>

In [75]:
LSTM_pred = LSTM.predict(X_test)



In [76]:
MSE_LSTM = mse(Y_test, LSTM_pred).numpy()

## Random Walk

In [77]:
Y_test.shape

(256, 1, 97)

In [78]:
train=data.iloc[:-256, ]

In [79]:
test=data.iloc[-256:, ]

In [80]:
random_walk = pd.DataFrame(index=test.index, columns = test.columns)

In [81]:
for j in range(len(test.columns)):
    prev_val = train.iloc[-1, j]
    st_dev = train.std().iloc[j]
    for i in range(len(test)):
        new_val = max(1e-16, prev_val + np.random.normal(0, st_dev, 1))
        random_walk.iloc[i, j] = new_val
        prev_val = new_val       

In [82]:
random_walk

Unnamed: 0_level_0,AAPL.O,MSFT.O,GOOGL.O,AMZN.O,NVDA.O,META.O,BRKb,TSLA.O,LLY,V,...,MDLZ.O,LRCX.O,REGN.O,AMT,PGR,ADP.O,ETN,MMC,ADI.O,CB
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12/22/2022,0.5308013084284158,0.581213312408674,0.35390493560553926,0.04014282681420189,0.6601561716598614,0.5875696041917041,0.22285082294354436,0.8325111706989194,0.001088080106447592,0.24212943259588465,...,0.15206978783627045,0.5906377897331915,0.13118269868085922,0.33893661537578884,0.12021905232069852,0.044161289685478666,0.0930576501217634,0.18202398047927715,0.44250808979463935,0.11132905296271052
12/23/2022,0.4552631323979675,0.6545369947777329,0.2191772373385627,0.0,0.9390176996340265,0.5706513508184726,0.22994066169922903,1.0412580562837628,0.027964681183265984,0.22924077663203687,...,0.0315831761825424,0.6518168354809951,0.14404493453132827,0.2332797172506227,0.07445096727557562,0.0,8.59476123452052e-05,0.17809360612136274,0.37690438383562047,0.08731733006560924
12/27/2022,0.39004578459978256,0.7083127873866241,0.2994306702648804,0.0,0.9460886262783498,0.6602174991509789,0.3734016206570201,0.9929092163650164,0.01412603408863552,0.2984562295111538,...,0.028740993922020022,0.5829745299432727,0.046536986242659256,0.21325492102109125,0.12924027710717587,0.03438315472673831,0.0,0.1347954353957727,0.10444034189048951,0.16553845459443295
12/28/2022,0.25755826040368807,0.6323178317039138,0.21168843065986243,0.10030564162725504,1.3426347438018467,0.31113213352976815,0.5204462241547355,1.1793836241166373,0.014459449745997794,0.15667020412092186,...,0.03685820415593621,0.6186298719987813,0.20797778238838635,0.20201601218436044,0.07331504179438633,0.0,0.0,0.12079968688066511,0.2087612464212308,0.3039473585619098
12/29/2022,0.3644166641464119,0.6456166990971235,0.11806869288291627,0.23120661342023147,1.2187081249932616,0.10715862759601294,0.6285995749776281,1.6626269258821997,0.0,0.07881098065545443,...,0.0,0.5109442764487127,0.10442893746071073,0.23216924677821466,0.14159794774875972,0.0,0.026399237598549848,0.10466673805242818,0.4441454078761582,0.5793414765485094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12/22/2023,3.053033006410532,1.1319151199836233,0.8195999073841094,1.3525169723714257,3.3395338160126338,5.185797210259547,0.625178789791912,0.8120171083764014,1.5178481362798617,0.8698551716905614,...,2.9407508777143154,1.6915484530837075,0.9386059499691981,2.2284040408971664,0.679496927947781,2.458136221722683,3.6610726418880204,3.147677774491358,1.7498229238111396,0.620633912893956
12/26/2023,2.7081497461759167,1.0566515203445124,0.8900633605041214,1.303531737705169,3.656666028239712,4.699624702445211,0.7773413930139922,0.5017373436597117,1.437208600678444,0.7704808134872916,...,2.8325941256878227,1.6140678537904607,0.9180632377119009,2.1620643694825237,0.6200776595629763,2.638726661006295,3.5842360993539777,2.9437498979708296,1.9016292794531777,0.5594459350568284
12/27/2023,2.9473910345732843,1.1262707843820643,0.6723001132517057,1.2320293957755613,3.882569372991375,4.362531607165873,0.7934677666167729,0.48308422337501716,1.331219763452801,0.6268828632579379,...,2.8651153200905846,1.2279394931699774,0.7429772965017831,2.044247396377132,0.5991892731110078,2.5084978881632396,3.437032080522848,2.951472205290049,1.9948149680855625,0.737109756784891
12/28/2023,2.992624809949351,1.3383322146609569,0.7298563523322956,1.2004712068966632,4.056237127462498,4.396290758447743,0.8802694313170485,0.48571365476736394,1.2623821277091727,0.6275709700434918,...,2.9138759543799644,1.3399646132094072,0.6497146952714914,2.108066046816477,0.6386681737657403,2.5616442712986007,3.5061512997100848,2.862797456833753,1.9288527417521437,0.6068075160543933


In [83]:
from sklearn.metrics import mean_squared_error
MSE_RW = mean_squared_error(np.squeeze(Y_test, axis=1), random_walk)

In [105]:
MSE_LSTM

0.0078836195

In [99]:
MSE_MLP

0.009487135

In [87]:
MSE_RW

2.0346740112659316

Train, test, and validation split

In [213]:
q_80 = int(len(data.index) * .8)
q_90 = int(len(data.index) * .9)

train, val, test =  data[:q_80], data[q_80:q_90], data[q_90:]

## LSTM

Data preprocessing

In [214]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train)

In [215]:
scaled_train = scaler.transform(train)
scaled_val = scaler.transform(val)
scaled_test = scaler.transform(test)

In [216]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

length=21
batch_size = 1024
generator = TimeseriesGenerator(scaled_train, scaled_train, length = length, batch_size=batch_size)

Model buliding

In [217]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from keras import activations

model = Sequential()
model.add(LSTM(100, input_shape=(length, scaled_train.shape[1])))
model.add(Dense(scaled_train.shape[1]))
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_28 (LSTM)              (None, 100)               79200     
                                                                 
 dense_21 (Dense)            (None, 97)                9797      
                                                                 
Total params: 88997 (347.64 KB)
Trainable params: 88997 (347.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [218]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=1)
validation_generator = TimeseriesGenerator(scaled_val, scaled_val, 
                                           length=length, batch_size=batch_size)

Learning

In [233]:
model.fit_generator(generator, epochs=100,
                   validation_data=validation_generator,
                   callbacks=[early_stop])

Epoch 1/100

  model.fit_generator(generator, epochs=100,


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.src.callbacks.History at 0x1e136aadc10>

Recursive prediction

In [220]:
n_features = scaled_train.shape[1]
LSTM_pred = []

first_eval_batch = scaled_train[-length:]
current_batch = first_eval_batch.reshape((1, length, n_features))

for i in range(len(test)):
    current_pred = model.predict(current_batch)[0]
    LSTM_pred.append(current_pred)
    current_batch = np.append(current_batch[:,1:,:], [[current_pred]], axis=1)
    





In [221]:
LSTM_pred = scaler.inverse_transform(LSTM_pred)

In [222]:
LSTM_pred = pd.DataFrame(data=LSTM_pred, columns=test.columns)

In [224]:
mse = tf.keras.losses.MeanSquaredError()

In [278]:
MSE_LSTM = mse(test, LSTM_pred).numpy()