In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer

In [2]:
pwd

'/Users/abefarkas/code/abefarkas/Thalassa_Regime_Classifier'

In [3]:
# Small sample for initial testing.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the CSV exists at exactly this location.
csv_path = '/Users/abefarkas/Project OB Data/data_set.csv'
data = pd.read_csv(csv_path)
data.head(2)

Unnamed: 0.1,Unnamed: 0,primary_key,bp1,bs1,bp2,bs2,bp3,bs3,bp4,bs4,...,ap16,as16,ap17,as17,ap18,as18,ap19,as19,ap20,as20
0,0,2022-05-19 00:00:15,28720.075362,3.77208,28719.533333,1.405377,28719.011594,0.626551,28718.371014,0.505087,...,28724.006522,0.418891,28724.167391,0.487558,28724.361594,0.706101,28724.511594,0.707326,28724.657971,1.104145
1,1,2022-05-19 00:00:45,28755.960345,1.878828,28755.57069,0.335776,28755.05,0.573517,28754.684483,0.737259,...,28760.52069,7.383086,28760.674138,4.193586,28760.872414,0.620983,28761.067241,0.773897,28761.25,0.854017


In [4]:
# Size-weighted average price over the two best bid and two best ask levels:
# sum(price * size) / sum(size), accumulated in the order bp1, bp2, ap1, ap2.
bid1_notional = data['bp1'] * data['bs1']
bid2_notional = data['bp2'] * data['bs2']
ask1_notional = data['ap1'] * data['as1']
ask2_notional = data['ap2'] * data['as2']
top2_notional = bid1_notional + bid2_notional + ask1_notional + ask2_notional
top2_size = data['bs1'] + data['bs2'] + data['as1'] + data['as2']
data['WAP'] = top2_notional / top2_size

# Relative spread: best ask divided by best bid, minus one.
data['spread'] = data['ap1'].div(data['bp1']).sub(1)

def log_price(list_stock_prices):
    """Return the element-wise natural logarithm of the given prices.

    The input is passed straight to ``np.log``, so scalars, lists, arrays and
    pandas Series are all accepted.
    """
    logged = np.log(list_stock_prices)
    return logged

# Put the log of WAP in the first column. DataFrame.insert raises ValueError
# when the column already exists, so when this cell is re-run the existing
# column is refreshed in place instead of being inserted a second time.
if 'log_price' in data.columns:
    data['log_price'] = log_price(data['WAP'])
else:
    data.insert(0, 'log_price', log_price(data['WAP']))

# One-step log return: difference between consecutive log prices.
# The first row has no predecessor, so its value is NaN.
data['log_returns'] = data['log_price'].diff()

def realized_volatility(log_returns=None):
    """Expanding standard deviation of the log returns *before* each row.

    Entry ``k`` of the result is the population standard deviation
    (``ddof=0``, NaNs skipped) of the first ``k`` log returns, i.e. the
    current row itself is excluded. Entries with no valid prior observation
    are NaN.

    The computation is purely positional, so it does not depend on the index
    being the default 0..n-1 range.

    Parameters
    ----------
    log_returns : pandas.Series, optional
        Series of log returns. Defaults to ``data['log_returns']`` from the
        notebook's global frame.

    Returns
    -------
    list of float
        One value per row of ``log_returns``.
    """
    if log_returns is None:
        log_returns = data['log_returns']

    # expanding() includes the current row; shift(1) moves each value down one
    # row so that every entry only uses strictly earlier observations.
    prior_std = log_returns.expanding().std(ddof=0).shift(1)
    return prior_std.tolist()

# Store the per-row values returned by realized_volatility() as a new column.
data['realized_volatility'] = realized_volatility()

In [5]:
# Preview the first two rows, now including the derived columns.
data.head(2)

Unnamed: 0.1,log_price,Unnamed: 0,primary_key,bp1,bs1,bp2,bs2,bp3,bs3,bp4,...,ap18,as18,ap19,as19,ap20,as20,WAP,spread,log_returns,realized_volatility
0,10.265354,0,2022-05-19 00:00:15,28720.075362,3.77208,28719.533333,1.405377,28719.011594,0.626551,28718.371014,...,28724.361594,0.706101,28724.511594,0.707326,28724.657971,1.104145,28720.131688,1.5e-05,,
1,10.266608,1,2022-05-19 00:00:45,28755.960345,1.878828,28755.57069,0.335776,28755.05,0.573517,28754.684483,...,28760.872414,0.620983,28761.067241,0.773897,28761.25,0.854017,28756.179152,1e-05,0.001254,


In [6]:
# Replace missing realized-volatility values with the constant 0.
imputer = SimpleImputer(strategy="constant", fill_value=0)

# Fit and apply the imputer in one step on the single-column frame.
data['realized_volatility'] = imputer.fit_transform(data[['realized_volatility']])

# With strategy="constant", statistics_ holds the fill value, not a mean.
imputer.statistics_

array([0.])

In [7]:
# Replace missing log returns with a fixed constant.
# NOTE(review): 0.001254 equals the second row's log return as displayed
# (rounded) in the preview above; presumably a manual back-fill of the first
# row. Confirm, since the value is specific to this data sample.
imputer2 = SimpleImputer(strategy="constant", fill_value=0.001254)

# Fit and apply the imputer in one step on the single-column frame.
data['log_returns'] = imputer2.fit_transform(data[['log_returns']])

# With strategy="constant", statistics_ holds the fill value, not a mean.
imputer2.statistics_

array([0.001254])

In [8]:
# Preview the first two rows after imputation.
data.head(2)

Unnamed: 0.1,log_price,Unnamed: 0,primary_key,bp1,bs1,bp2,bs2,bp3,bs3,bp4,...,ap18,as18,ap19,as19,ap20,as20,WAP,spread,log_returns,realized_volatility
0,10.265354,0,2022-05-19 00:00:15,28720.075362,3.77208,28719.533333,1.405377,28719.011594,0.626551,28718.371014,...,28724.361594,0.706101,28724.511594,0.707326,28724.657971,1.104145,28720.131688,1.5e-05,0.001254,0.0
1,10.266608,1,2022-05-19 00:00:45,28755.960345,1.878828,28755.57069,0.335776,28755.05,0.573517,28754.684483,...,28760.872414,0.620983,28761.067241,0.773897,28761.25,0.854017,28756.179152,1e-05,0.001254,0.0


In [9]:
# Sum of bid quantities over the first 2 levels of depth, to try to mitigate
# costless spoofing in crypto.
data['first2_bid_depth'] = data[['bs1', 'bs2']].sum(axis=1)
# NOTE(review): only the bid side is built for the first two levels; an
# ask-side counterpart from 'as1'/'as2' may have been intended. Confirm before
# adding it, since a new column changes the feature matrix used downstream.

# Size columns for book levels 1..20, in level order.
bid_size_cols = [f'bs{level}' for level in range(1, 21)]
ask_size_cols = [f'as{level}' for level in range(1, 21)]

# Sum of all bid quantities.
data['full_bid_depth'] = data[bid_size_cols].sum(axis=1)
# Sum of all ask quantities.
data['full_ask_depth'] = data[ask_size_cols].sum(axis=1)
# Order Flow Imbalance (OFI): relative quantities of bids vs asks,
# computed as (bid size - ask size) / (bid size + ask size).

# BBA depth (best bid/ask, first level) OFI.
best_bid_size, best_ask_size = data['bs1'], data['as1']
data['BBAOFI'] = (best_bid_size - best_ask_size) / (best_bid_size + best_ask_size)

# First 2 levels of depth, to try to mitigate costless spoofing.
bid_size_top2 = data['bs1'] + data['bs2']
ask_size_top2 = data['as1'] + data['as2']
data['First2OFI'] = (bid_size_top2 - ask_size_top2) / (bid_size_top2 + ask_size_top2)

# Full depth (approx 20 levels) OFI.
full_bid, full_ask = data['full_bid_depth'], data['full_ask_depth']
data['FDOFI'] = (full_bid - full_ask) / (full_bid + full_ask)

# Exponential moving averages of WAP, First2OFI and FDOFI.
# Each pair is (column-name suffix, ewm span).
# NOTE(review): the suffix and the span differ in every pair (e.g. 'trend5'
# uses span=2, 'trend1000' uses span=200). Confirm which of the two is intended.
trend_spans = [(5, 2), (10, 5), (20, 10), (50, 20), (100, 50), (200, 100), (1000, 200)]

# Outer loop over the source column keeps the new columns grouped per source,
# in the order WAP, First2OFI, FDOFI.
for source_col in ('WAP', 'First2OFI', 'FDOFI'):
    for suffix, span in trend_spans:
        data[f'{source_col}_trend{suffix}'] = data[source_col].ewm(span=span).mean()


# Parse 'primary_key' as datetimes and use it as the index.
# After the first run 'primary_key' is the index rather than a column, so the
# guard lets this cell be re-run without a KeyError. A frame that has the key
# neither as a column nor as its index still fails loudly.
if 'primary_key' in data.columns:
    data['primary_key'] = pd.to_datetime(data['primary_key'])
    data = data.set_index('primary_key')
elif data.index.name != 'primary_key':
    raise KeyError("'primary_key' is neither a column nor the index of data")

In [10]:
# # SCALE DATA
# data_scaled = data.copy()

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# data_scaled[data.columns] = scaler.fit_transform(data[data.columns])


In [11]:
# Preview the first two rows of the feature frame, now indexed by primary_key.
data.head(2)

Unnamed: 0_level_0,log_price,Unnamed: 0,bp1,bs1,bp2,bs2,bp3,bs3,bp4,bs4,...,First2OFI_trend100,First2OFI_trend200,First2OFI_trend1000,FDOFI_trend5,FDOFI_trend10,FDOFI_trend20,FDOFI_trend50,FDOFI_trend100,FDOFI_trend200,FDOFI_trend1000
primary_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-05-19 00:00:15,10.265354,0,28720.075362,3.77208,28719.533333,1.405377,28719.011594,0.626551,28718.371014,0.505087,...,0.394011,0.394011,0.394011,0.35634,0.35634,0.35634,0.35634,0.35634,0.35634,0.35634
2022-05-19 00:00:45,10.266608,1,28755.960345,1.878828,28755.57069,0.335776,28755.05,0.573517,28754.684483,0.737259,...,0.223241,0.224916,0.225753,-0.130584,-0.033199,-0.000737,0.015493,0.025232,0.028478,0.030101


In [12]:
# Print all column names as a plain Python list.
print(data.columns.tolist())

['log_price', 'Unnamed: 0', 'bp1', 'bs1', 'bp2', 'bs2', 'bp3', 'bs3', 'bp4', 'bs4', 'bp5', 'bs5', 'bp6', 'bs6', 'bp7', 'bs7', 'bp8', 'bs8', 'bp9', 'bs9', 'bp10', 'bs10', 'bp11', 'bs11', 'bp12', 'bs12', 'bp13', 'bs13', 'bp14', 'bs14', 'bp15', 'bs15', 'bp16', 'bs16', 'bp17', 'bs17', 'bp18', 'bs18', 'bp19', 'bs19', 'bp20', 'bs20', 'ap1', 'as1', 'ap2', 'as2', 'ap3', 'as3', 'ap4', 'as4', 'ap5', 'as5', 'ap6', 'as6', 'ap7', 'as7', 'ap8', 'as8', 'ap9', 'as9', 'ap10', 'as10', 'ap11', 'as11', 'ap12', 'as12', 'ap13', 'as13', 'ap14', 'as14', 'ap15', 'as15', 'ap16', 'as16', 'ap17', 'as17', 'ap18', 'as18', 'ap19', 'as19', 'ap20', 'as20', 'WAP', 'spread', 'log_returns', 'realized_volatility', 'first2_bid_depth', 'full_bid_depth', 'full_ask_depth', 'BBAOFI', 'First2OFI', 'FDOFI', 'WAP_trend5', 'WAP_trend10', 'WAP_trend20', 'WAP_trend50', 'WAP_trend100', 'WAP_trend200', 'WAP_trend1000', 'First2OFI_trend5', 'First2OFI_trend10', 'First2OFI_trend20', 'First2OFI_trend50', 'First2OFI_trend100', 'First2OFI_t

In [13]:
# tss = TimeSeriesSplit(n_splits=5,test_size=2)
# tss.get_n_splits()

In [14]:
# train_size = 0.65
# index = round(train_size*data.shape[0])
# df_train = data.iloc[:index]
# df_test = data.iloc[index:]

In [15]:
# Positional 80/20 split: the first 80% of rows go to training, and the test
# part starts `gap` rows after the end of the training part.
horizon = 5
gap = horizon - 1

len_ = int(0.8 * len(data))

df_train = data.iloc[:len_]
df_test = data.iloc[len_ + gap:]

In [16]:
def subsample_sequence(df, length):
    """Draw one random window of consecutive rows plus a target value.

    The window start is drawn with ``np.random.randint`` from
    ``[0, len(df) - length - 5)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'realized_volatility' column.
    length : int
        Number of consecutive rows in the window.

    Returns
    -------
    X : numpy.ndarray
        Values of the rows at positions ``[start, start + length)``.
    y : scalar
        'realized_volatility' of the row at position ``start + length + 1``.
    """
    n_rows = df.shape[0]
    start_upper_bound = n_rows - length - 5  # exclusive bound for the start

    window_start = np.random.randint(0, start_upper_bound)
    window_stop = window_start + length

    X = df[window_start:window_stop].values
    # NOTE(review): the target is read at window_stop + 1, i.e. one row past
    # the row that directly follows the window. Confirm this offset (and the
    # 5-row margin above) against the intended forecast horizon.
    target_row = df.iloc[window_stop + 1]
    y = target_row['realized_volatility']
    return X, y

# Smoke test: draw one 10-row window and its target from the full frame.
subsample_sequence(data, 10)

(array([[ 1.02723120e+01,  8.60000000e+01,  2.89206143e+04, ...,
         -3.83717165e-03,  1.44054267e-02,  2.88018126e-02],
        [ 1.02721039e+01,  8.70000000e+01,  2.89146155e+04, ...,
          4.87015470e-03,  1.91223499e-02,  3.19103722e-02],
        [ 1.02713546e+01,  8.80000000e+01,  2.88929569e+04, ...,
          1.38613552e-02,  2.40886418e-02,  3.52147810e-02],
        ...,
        [ 1.02714463e+01,  9.30000000e+01,  2.88955783e+04, ...,
          1.53448369e-02,  2.40431261e-02,  3.43658050e-02],
        [ 1.02718314e+01,  9.40000000e+01,  2.89067217e+04, ...,
          1.46960142e-02,  2.34639662e-02,  3.37947456e-02],
        [ 1.02720624e+01,  9.50000000e+01,  2.89133966e+04, ...,
          1.32991535e-02,  2.24517642e-02,  3.29248029e-02]]),
 0.0005963729855529179)

In [17]:
# !pip install tensorflow

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad the training sequences with the value -1, stored as float32.
# NOTE(review): X_train is not defined in any cell visible above; it has to be
# built (presumably from repeated subsample_sequence draws) before this cell
# can run on a fresh kernel.
X_train_pad = pad_sequences(X_train, dtype='float32', value=-1)


In [None]:
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers.experimental.preprocessing import Normalization

In [None]:
# Normalisation layer, adapted on the padded training array.
normalizer = Normalization()
normalizer.adapt(X_train_pad)

# NOTE(review): the normaliser runs before Masking(mask_value=-1); presumably
# the -1 padding has been rescaled by then and no longer equals the mask value.
# Confirm that padded steps are really being masked.
model = Sequential([
    normalizer,
    layers.Masking(mask_value=-1),
    layers.LSTM(10, activation='tanh'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='linear'),  # single linear output unit
])

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.metrics import MAPE
# Mean-squared-error loss with an RMSprop optimiser (learning rate 0.01);
# MAPE is tracked as an additional metric.
# `metrics` is passed as a list, which is the form Model.compile documents;
# a bare callable is not a documented value for this argument.
model.compile(loss='mse', optimizer=RMSprop(learning_rate=0.01), metrics=[MAPE])

In [None]:
# Train for 100 epochs in batches of 64, with validation_split=0.3.
# NOTE(review): y_train is not defined in any cell visible above; it has to be
# built alongside X_train before this cell can run on a fresh kernel.
model.fit(X_train_pad, np.array(y_train), epochs=100, batch_size=64, validation_split=0.3)

In [None]:
model.e