# Importing Libraries

In [1]:
import hdf5storage
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import TimeSeriesSplit
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from keras.layers import Dense, LSTM, Dropout, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D

import warnings
warnings.filterwarnings("ignore")

# Getting the data

In [2]:
LOB = hdf5storage.loadmat('S092215-v50-AMZN_OCT2_states.mat')
df = pd.DataFrame.from_dict(LOB['LOB'])

In [3]:
df.columns = [
                "Time", "Mid_Price", "Spread", 
                "AskPrice1", "AskVolume1","BidPrice1", "BidVolume1", # Level 1
                "AskPrice2", "AskVolume2","BidPrice2", "BidVolume2", # Level 2
                "AskPrice3", "AskVolume3","BidPrice3", "BidVolume3", # Level 3
                "AskPrice4", "AskVolume4","BidPrice4", "BidVolume4", # Level 4
                "AskPrice5", "AskVolume5","BidPrice5", "BidVolume5", # Level 5
                "AskPrice6", "AskVolume6","BidPrice6", "BidVolume6", # Level 6
                "AskPrice7", "AskVolume7","BidPrice7", "BidVolume7", # Level 7
                "AskPrice8", "AskVolume8","BidPrice8", "BidVolume8", # Level 8
                "AskPrice9", "AskVolume9","BidPrice9", "BidVolume9", # Level 9
                "AskPrice10", "AskVolume10","BidPrice10", "BidVolume10", # Level 10
            ]

In [4]:
df.head()

Unnamed: 0,Time,Mid_Price,Spread,AskPrice1,AskVolume1,BidPrice1,BidVolume1,AskPrice2,AskVolume2,BidPrice2,...,BidPrice8,BidVolume8,AskPrice9,AskVolume9,BidPrice9,BidVolume9,AskPrice10,AskVolume10,BidPrice10,BidVolume10
0,1442894410000,0,0,0,0,5427400,100,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1442894410000,5452100,49400,5476800,100,5427400,100,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1442894410073,5451900,49000,5476400,100,5427400,100,5476800,100,0,...,0,0,0,0,0,0,0,0,0,0
3,1442894410073,5452000,48800,5476400,100,5427600,100,5476800,100,5427400,...,0,0,0,0,0,0,0,0,0,0
4,1442894410073,5452000,48800,5476400,100,5427600,100,5476800,100,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562650 entries, 0 to 562649
Data columns (total 43 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   Time         562650 non-null  int64
 1   Mid_Price    562650 non-null  int64
 2   Spread       562650 non-null  int64
 3   AskPrice1    562650 non-null  int64
 4   AskVolume1   562650 non-null  int64
 5   BidPrice1    562650 non-null  int64
 6   BidVolume1   562650 non-null  int64
 7   AskPrice2    562650 non-null  int64
 8   AskVolume2   562650 non-null  int64
 9   BidPrice2    562650 non-null  int64
 10  BidVolume2   562650 non-null  int64
 11  AskPrice3    562650 non-null  int64
 12  AskVolume3   562650 non-null  int64
 13  BidPrice3    562650 non-null  int64
 14  BidVolume3   562650 non-null  int64
 15  AskPrice4    562650 non-null  int64
 16  AskVolume4   562650 non-null  int64
 17  BidPrice4    562650 non-null  int64
 18  BidVolume4   562650 non-null  int64
 19  AskPrice5    562650 non

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,562650.0,1442922000000.0,11791680.0,1442894000000.0,1442917000000.0,1442924000000.0,1442932000000.0,1442952000000.0
Mid_Price,562650.0,5344375.0,388087.9,0.0,5349950.0,5364100.0,5388650.0,5471300.0
Spread,562650.0,10285.24,38773.14,0.0,3100.0,4100.0,5700.0,511000.0
AskPrice1,562650.0,5366329.0,250796.6,0.0,5352400.0,5366700.0,5399200.0,5507700.0
AskVolume1,562650.0,147.7321,204.5865,0.0,99.0,100.0,132.0,5050.0
BidPrice1,562650.0,5350351.0,303861.2,0.0,5348000.0,5362300.0,5388300.0,5439100.0
BidVolume1,562650.0,112.2059,132.1014,0.0,47.0,100.0,112.0,5100.0
AskPrice2,562650.0,6956918.0,58486160.0,0.0,5352300.0,5365900.0,5386800.0,2000000000.0
AskVolume2,562650.0,138.5495,345.3173,0.0,40.0,100.0,100.0,5050.0
BidPrice2,562650.0,5219930.0,857213.0,0.0,5346300.0,5359900.0,5379900.0,5432400.0


# Data formatting and cleaning

In [7]:
#converting the timestamps to a readable format
new_time = []
for data in df["Time"]:
    current = datetime.datetime.fromtimestamp(data/1000.0) 
    new_time.append(current)


readable_time = []
for i in range(0,len(new_time)):
    current_time = new_time[i].strftime("%Y:%m:%d %H:%M:%S")
    readable_time.append(new_time)   

# Replace Unix time with the Readable Time Format
df['Time'] = readable_time 

In [8]:
df.head(10)

Unnamed: 0,Time,Mid_Price,Spread,AskPrice1,AskVolume1,BidPrice1,BidVolume1,AskPrice2,AskVolume2,BidPrice2,...,BidPrice8,BidVolume8,AskPrice9,AskVolume9,BidPrice9,BidVolume9,AskPrice10,AskVolume10,BidPrice10,BidVolume10
0,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",0,0,0,0,5427400,100,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5452100,49400,5476800,100,5427400,100,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5451900,49000,5476400,100,5427400,100,5476800,100,0,...,0,0,0,0,0,0,0,0,0,0
3,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5452000,48800,5476400,100,5427600,100,5476800,100,5427400,...,0,0,0,0,0,0,0,0,0,0
4,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5452000,48800,5476400,100,5427600,100,5476800,100,0,...,0,0,0,0,0,0,0,0,0,0
5,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5452000,48800,5476400,100,5427600,100,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5452000,48800,5476400,100,5427600,100,5484000,100,0,...,0,0,0,0,0,0,0,0,0,0
7,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5451900,48600,5476200,100,5427600,100,5476400,100,0,...,0,0,0,0,0,0,0,0,0,0
8,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5451900,48600,5476200,100,5427600,100,5484000,100,0,...,0,0,0,0,0,0,0,0,0,0
9,"[2015-09-22 09:30:10, 2015-09-22 09:30:10, 201...",5455800,56400,5484000,100,5427600,100,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print(df.columns)

Index(['Time', 'Mid_Price', 'Spread', 'AskPrice1', 'AskVolume1', 'BidPrice1',
       'BidVolume1', 'AskPrice2', 'AskVolume2', 'BidPrice2', 'BidVolume2',
       'AskPrice3', 'AskVolume3', 'BidPrice3', 'BidVolume3', 'AskPrice4',
       'AskVolume4', 'BidPrice4', 'BidVolume4', 'AskPrice5', 'AskVolume5',
       'BidPrice5', 'BidVolume5', 'AskPrice6', 'AskVolume6', 'BidPrice6',
       'BidVolume6', 'AskPrice7', 'AskVolume7', 'BidPrice7', 'BidVolume7',
       'AskPrice8', 'AskVolume8', 'BidPrice8', 'BidVolume8', 'AskPrice9',
       'AskVolume9', 'BidPrice9', 'BidVolume9', 'AskPrice10', 'AskVolume10',
       'BidPrice10', 'BidVolume10'],
      dtype='object')


# Feature Selection

In [10]:
#Defining X and y
Features = df.iloc[1:,3:]
Forecast_Variable = df.iloc[1:,2]

In [11]:
# Create thresholder
thresholder = VarianceThreshold(threshold=0.95)
# Create high variance feature matrix
Features = thresholder.fit_transform(Features)

In [12]:
Features.shape

(562649, 40)

In [13]:
# Create correlation matrix
correlation_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape)
                                         , k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print('to_drop ->', to_drop)

to_drop -> ['BidPrice7', 'BidPrice8', 'BidPrice9', 'BidPrice10']


In [14]:
Features = np.delete(Features, [26,30,34,39], axis=1)

In [15]:
Features.shape

(562649, 36)

In [16]:
Forecast_Variable.shape

(562649,)

# LSTM and CNN with Cross Validation

In [17]:
#Defining the look-back window and forecast horizon
LookBack = 10

#creating an object of TimeSeriesSplit
rows, cols = Features.shape

#defining the training and testing window
train_window = 100000
test_window = 50000

#creating a timeseries split object
#for creating squential and non-overlapping
#train and test splits
n_windows = int( (rows-train_window)/test_window )
tscv = TimeSeriesSplit(n_splits=n_windows, 
    max_train_size=train_window, 
    test_size=test_window)


print('Number of folds :{}'.format(n_windows))

Number of folds :9


In [18]:
#defining the structure of LSTM
LSTM_model = Sequential()
LSTM_model.add(LSTM(64, input_shape=(LookBack, 36)
                    , return_sequences=True)) 
LSTM_model.add(Dropout(0.2))
LSTM_model.add(LSTM(32))
LSTM_model.add(Dropout(0.2))
LSTM_model.add(Dense(1))

# compiling the model
LSTM_model.compile(optimizer=Adam(learning_rate=0.001), 
                loss='mse')

#displaying the model summary
LSTM_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 10, 64)            25856     
                                                                 
 dropout (Dropout)           (None, 10, 64)            0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 38,305
Trainable params: 38,305
Non-trainable params: 0
_________________________________________________________________


In [19]:
#defining the structure of CNN 
CNN_model = Sequential()
CNN_model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(LookBack, 36)))
CNN_model.add(Dropout(0.2))
CNN_model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
CNN_model.add(Dropout(0.2))
CNN_model.add(MaxPooling1D(pool_size=2))
CNN_model.add(Flatten())
CNN_model.add(Dense(100, activation='relu'))

#compiling the model
CNN_model.add(Dense(1, activation='linear')) 
CNN_model.compile(optimizer=Adam(learning_rate=0.001), 
                loss='mse')

#displaying the model summary
CNN_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 8, 64)             6976      
                                                                 
 dropout_2 (Dropout)         (None, 8, 64)             0         
                                                                 
 conv1d_1 (Conv1D)           (None, 6, 32)             6176      
                                                                 
 dropout_3 (Dropout)         (None, 6, 32)             0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 3, 32)            0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 96)                0         
                                                      

In [20]:
#empty lists for storing results for both models
LSTMTestScores = []
CNNTestScores = []

for i, (train_index, test_index) in enumerate(tscv.split(Features)):
    print('Fold: {}'.format(i+1))
    print("TRAIN INDEX:", train_index, "\nTEST INDEX:", test_index)
    
    #Splitting the data into train and test splits
    X_train, X_test = Features[train_index,:], Features[test_index,:]
    Y_train, Y_test = Forecast_Variable[train_index], Forecast_Variable[test_index]

    #converting the training data into a format compatible with LSTM and CNN
    X_train_Numpy = pd.DataFrame(X_train).to_numpy()
    Y_train_Numpy = pd.DataFrame(Y_train).to_numpy()

    x_train_append_matrix = []
    y_train_append_matrix = []

    for i in range(len(X_train)-LookBack):         
        feat_current_train = X_train_Numpy[i:i+LookBack, :]
        label_current_train = Y_train_Numpy[i+LookBack]
        x_train_append_matrix.append(feat_current_train)
        y_train_append_matrix.append(label_current_train)
    
    X_train_final = np.array(x_train_append_matrix)
    Y_train_final = np.array(y_train_append_matrix)

    #converting the testing data into a format compatible with LSTM and CNN
    X_test_Numpy = pd.DataFrame(X_test).to_numpy()
    Y_test_Numpy = pd.DataFrame(Y_test).to_numpy()

    x_test_append_matrix = []
    y_test_append_matrix = []

    for i in range(len(X_test)-LookBack):         
        feat_current_test = X_test_Numpy[i:i+LookBack, :]
        label_current_test = Y_test_Numpy[i+LookBack]
        x_test_append_matrix.append(feat_current_test)
        y_test_append_matrix.append(label_current_test)
    
    X_test_final = np.array(x_test_append_matrix)
    Y_test_final = np.array(y_test_append_matrix)
    
    print(62*'-')
    print('LSTM')
    print(62*'-')       
    
    #fitting the models with the training data
    LSTM_model_fit = LSTM_model.fit(X_train_final, Y_train_final, epochs=1)
    LSTM_model_fit_results = pd.DataFrame(LSTM_model_fit.history)

    #calculating predicted values for testing set
    LSTM_y_predicted = LSTM_model.predict(X_test_final)
    
    #calculating MSE for the validation set
    LSTM_Test_Score = mean_squared_error(Y_test_final, LSTM_y_predicted)
    LSTMTestScores.append(LSTM_Test_Score)
    print('Mean Squared Error: {:.4f}'.format(LSTM_Test_Score))

    print()

    print(62*'-')
    print('CNN')
    print(62*'-')

    #fitting the models with the training data
    CNN_model_fit = CNN_model.fit(X_train_final, Y_train_final, epochs=1)
    CNN_model_fit_results = pd.DataFrame(CNN_model_fit.history)

    #calculating predicted values for testing set
    CNN_y_predicted = CNN_model.predict(X_test_final)

    #calculating MSE for the validation set
    CNN_Test_Score = mean_squared_error(Y_test_final, CNN_y_predicted)
    CNNTestScores.append(CNN_Test_Score)
    print('Mean Squared Error: {:.4f}'.format(CNN_Test_Score))

    print()
    print(62*'#')
    print()

Fold: 1
TRAIN INDEX: [ 12649  12650  12651 ... 112646 112647 112648] 
TEST INDEX: [112649 112650 112651 ... 162646 162647 162648]
--------------------------------------------------------------
LSTM
--------------------------------------------------------------
Mean Squared Error: 27805333.4898

--------------------------------------------------------------
CNN
--------------------------------------------------------------
Mean Squared Error: 28799879.8126

##############################################################

Fold: 2
TRAIN INDEX: [ 62649  62650  62651 ... 162646 162647 162648] 
TEST INDEX: [162649 162650 162651 ... 212646 212647 212648]
--------------------------------------------------------------
LSTM
--------------------------------------------------------------
Mean Squared Error: 12809127.9423

--------------------------------------------------------------
CNN
--------------------------------------------------------------
Mean Squared Error: 13807455.7919

##############

In [21]:
print(62*'-')
print('LSTM')
print(62*'-')
print('Average Loss: {:.4f}'.format(np.sum(LSTM_model_fit_results['loss'].values)
                                /len(LSTM_model_fit_results)))
print('Average MSE: {:.4f}'.format(np.sum(LSTMTestScores)/len(LSTMTestScores)))

print()

print(62*'-')
print('CNN')
print(62*'-')
print('Average Loss: {:.4f}'.format(np.sum(CNN_model_fit_results['loss'].values)
                                /len(CNN_model_fit_results)))
print('Average MSE: {:.4f}'.format(np.sum(CNNTestScores)/len(CNNTestScores)))

--------------------------------------------------------------
LSTM
--------------------------------------------------------------
Average Loss: 10099561.0000
Average MSE: 14399545.2161

--------------------------------------------------------------
CNN
--------------------------------------------------------------
Average Loss: 15383899.0000
Average MSE: 17090282.0627
