In [43]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model

In [44]:
# This notebook is meant to run on a GPU: fail early when none is visible.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# Let TensorFlow grow its GPU allocation on demand instead of reserving all
# memory up front. The setting is applied to every visible GPU (not only the
# first one) because TensorFlow expects it to be consistent across devices.
# The call's result is not kept: nothing in this notebook reads it.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [45]:
# Order-book column names, side by side: all 100 bid levels first, then all
# 100 ask levels, each level contributing a price and a size column.
lob_list = [
    f"{side}{field}{level}"
    for side in ('bid', 'ask')
    for level in range(1, 101)
    for field in ('price', 'size')
]
# Full column list handed to read_csv: timestamp, last price, then the book.
header_list = ['timestamp', 'last'] + lob_list

In [46]:
# Column order used for modelling: 'last' first, then the book interleaved
# level by level (bid price/size, ask price/size for level 1, then level 2, ...).
new_lob_order = ['last'] + [
    f"{side}{field}{level}"
    for level in range(1, 101)
    for side in ('bid', 'ask')
    for field in ('price', 'size')
]

In [47]:
def get_model_data(data, sample_size=600, feature_num=200):
    """Cut a 2-D table into overlapping fixed-length windows plus labels.

    Window ``i`` holds rows ``i .. i + sample_size - 1`` of the first
    ``feature_num`` columns. Its label is the value of the last column at the
    window's final row (row ``i + sample_size - 1``).

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        One observation per row; the last column carries the label.
    sample_size : int
        Number of consecutive rows in each window.
    feature_num : int
        Number of leading columns copied into each window.

    Returns
    -------
    X : numpy.ndarray of float64
        Shape ``(n_rows - sample_size, sample_size, feature_num, 1)``; the
        trailing axis is a single channel.
    Y : numpy.ndarray of float64
        Shape ``(n_rows - sample_size, 1)``.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D, has fewer than ``sample_size`` rows, or has
        fewer than ``feature_num`` columns.

    Notes
    -----
    ``n_rows - sample_size`` windows are produced, so the window that would
    end on the very last row is not emitted. This is kept as-is so the output
    shapes stay unchanged for existing callers.
    """
    # Accept both DataFrames (via .values) and plain arrays.
    values = np.asarray(getattr(data, 'values', data))
    if values.ndim != 2:
        raise ValueError("data must be 2-D, got %d dimension(s)" % values.ndim)

    n_rows, n_cols = values.shape
    if sample_size < 1 or n_rows < sample_size:
        raise ValueError(
            "need at least sample_size=%d rows (sample_size >= 1), got %d rows"
            % (sample_size, n_rows)
        )
    if not 1 <= feature_num <= n_cols:
        raise ValueError(
            "feature_num=%d must be between 1 and the column count (%d)"
            % (feature_num, n_cols)
        )

    n_windows = n_rows - sample_size

    # Cast the feature columns to float once, outside the loop. A mixed-dtype
    # frame can come back from .values as an object array, and converting it
    # window by window would repeat the same work for every overlapping row.
    features = values[:, :feature_num].astype(np.float64)

    X = np.zeros((n_windows, sample_size, feature_num, 1))
    for start in range(n_windows):
        X[start, :, :, 0] = features[start:start + sample_size]

    # Label of window i sits at row i + sample_size - 1 of the last column.
    Y = values[sample_size - 1:n_rows - 1, -1:].astype(np.float64)

    return X, Y

In [48]:
def define_y_labels(y, prediction_period, band_size = 0.001):
    """Bucket the relative change of ``y`` into three classes.

    The fractional change over ``prediction_period`` rows is binned as:
    0 for changes at or below ``-band_size``, 1 for changes inside
    ``(-band_size, band_size]``, and 2 for changes above ``band_size``.
    Rows with no computable change (the first ``prediction_period`` rows)
    come back as missing.

    NOTE(review): ``pct_change(periods=k)`` compares each row with the row
    ``k`` positions earlier, so the label at a row describes the move that
    ended there. Confirm this matches how the model was trained.

    Returns a categorical Series aligned with ``y``.
    """
    relative_move = y.pct_change(periods=prediction_period)
    edges = [-np.inf, -band_size, band_size, np.inf]
    return pd.cut(relative_move, bins=edges, labels=[0, 1, 2])

In [49]:
# Rows per input window; passed to get_model_data as `sample_size`.
datasample_period = 600
# Leading feature columns kept per row; passed to get_model_data as `feature_num`.
# With the interleaved column order this is the first 10 book levels
# (bid price/size + ask price/size per level).
feature_columns = 40
# Row offset given to pct_change when deriving the class label.
prediction_period = 60

In [50]:
# Load the saved Keras model (HDF5 file, path relative to the working directory).
predict_model = load_model('my_model_ethbtc_v1.1_2021-01-17_2200923866.h5')

In [51]:
# Read the order-book snapshots with explicit column names, index them by
# timestamp, and reorder the columns into the interleaved per-level layout.
test_path = 'BinanceLOB/binance_dataset_2021-01-17_2199048273.csv'
test_data = (
    pd.read_csv(test_path, names=header_list, index_col='timestamp')
    .loc[:, new_lob_order]
)

In [52]:
# Derive the class label (0 / 1 / 2) for every row from the 'last' price.
test_data['delta_cat'] = define_y_labels(test_data['last'], prediction_period)
# Class balance: non-null row count per label, reported for each column.
# (A mid-cell `.describe()` call used to sit here; its result was never
# displayed, so it has been removed.)
test_data.groupby('delta_cat').count()

Unnamed: 0_level_0,last,bidprice1,bidsize1,askprice1,asksize1,bidprice2,bidsize2,askprice2,asksize2,bidprice3,...,askprice98,asksize98,bidprice99,bidsize99,askprice99,asksize99,bidprice100,bidsize100,askprice100,asksize100
delta_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1626,1626,1626,1626,1626,1626,1626,1626,1626,1626,...,1626,1626,1626,1626,1626,1626,1626,1626,1626,1626
1,15267,15267,15267,15267,15267,15267,15267,15267,15267,15267,...,15267,15267,15267,15267,15267,15267,15267,15267,15267,15267
2,1889,1889,1889,1889,1889,1889,1889,1889,1889,1889,...,1889,1889,1889,1889,1889,1889,1889,1889,1889,1889


In [53]:
# 'last' is dropped before windowing; the remaining frame ends with the
# 'delta_cat' label column, which get_model_data reads as the target.
test_X, test_Y = get_model_data(
    test_data.drop(columns=['last']),
    sample_size=datasample_period,
    feature_num=feature_columns,
)

In [54]:
# One-hot encode the labels. The class count is pinned to 3 (labels 0, 1, 2)
# rather than inferred from the largest label present, so the target width
# stays correct even if a data slice happens to contain no class-2 rows.
test_y = to_categorical(test_Y, num_classes=3)

In [55]:
# Score the model on the windowed test set. The displayed list is the loss
# followed by the metric(s) the model was compiled with
# (presumably accuracy — confirm against the training notebook).
predict_model.evaluate(test_X, test_y)



[1.3478938470832598, 0.6140226]

In [56]:
# One row per test window; integer-named columns hold the model's per-class
# outputs (presumably softmax probabilities — confirm from the model definition).
df = pd.DataFrame(predict_model.predict(test_X))

In [57]:
# Attach the true class label of each window next to the model outputs.
df['Y'] = test_Y

In [58]:
# True class 0 where the model's class-0 output is at least 0.5.
actual_is_0 = df['Y'] == 0
score_0_high = df[0] >= 0.5
df[actual_is_0 & score_0_high].count()

0    540
1    540
2    540
Y    540
dtype: int64

In [59]:
# True class 0 where the model's class-0 output is below 0.5.
actual_is_0 = df['Y'] == 0
score_0_low = df[0] < 0.5
df[actual_is_0 & score_0_low].count()

0    1045
1    1045
2    1045
Y    1045
dtype: int64

In [60]:
# True class 1 where the model's class-1 output is at least 0.5.
actual_is_1 = df['Y'] == 1
score_1_high = df[1] >= 0.5
df[actual_is_1 & score_1_high].count()

0    9999
1    9999
2    9999
Y    9999
dtype: int64

In [61]:
# True class 1 where the model's class-1 output is below 0.5.
actual_is_1 = df['Y'] == 1
score_1_low = df[1] < 0.5
df[actual_is_1 & score_1_low].count()

0    4881
1    4881
2    4881
Y    4881
dtype: int64

In [62]:
# True class 2 where the model's class-2 output is at least 0.5.
actual_is_2 = df['Y'] == 2
score_2_high = df[2] >= 0.5
df[actual_is_2 & score_2_high].count()

0    586
1    586
2    586
Y    586
dtype: int64

In [63]:
# True class 2 where the model's class-2 output is below 0.5.
actual_is_2 = df['Y'] == 2
score_2_low = df[2] < 0.5
df[actual_is_2 & score_2_low].count()

0    1191
1    1191
2    1191
Y    1191
dtype: int64