# Preprocessing

In [1]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [2]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import gc
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report

import torch
import torch.nn.functional as F
from torch.utils import data
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


# Data Loading and Visualization

In [4]:
from google.colab import drive
drive.mount('/content/download')

Mounted at /content/download


In [6]:
df = pd.read_csv("/content/download/MyDrive/1-09-1-20.csv")
print(df.shape)
df.tail()

(3730870, 43)


Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,32,33,34,35,36,37,38,39,40,41
3730865,3730865,1674238247671,2023-01-20 18:10:47,21337.6,0.167,21337.5,0.1,21337.4,0.014,21336.9,...,21338.2,0.022,21338.3,0.028,21338.4,0.235,21338.5,1.657,21338.6,0.234
3730866,3730866,1674238247921,2023-01-20 18:10:47,21337.5,0.108,21337.4,0.014,21336.9,0.257,21336.8,...,21338.2,0.022,21338.3,0.036,21338.4,0.236,21338.5,1.657,21338.6,0.235
3730867,3730867,1674238248172,2023-01-20 18:10:48,21337.4,0.891,21337.2,0.552,21337.1,1.095,21336.9,...,21338.1,0.002,21338.2,0.048,21338.3,0.036,21338.4,0.001,21338.5,1.658
3730868,3730868,1674238248422,2023-01-20 18:10:48,21337.4,2.456,21337.3,1.149,21337.2,0.002,21336.9,...,21338.1,0.002,21338.2,0.048,21338.3,0.036,21338.4,0.001,21338.5,1.658
3730869,3730869,1674238248672,2023-01-20 18:10:48,21337.4,2.513,21337.3,1.149,21337.2,0.002,21336.9,...,21338.1,0.002,21338.2,0.048,21338.3,0.036,21338.4,0.001,21338.5,1.047


In [7]:
new_columns = [
    'index', 'timestamp', 'time', 'bid1', 'bid1vol', 'bid2', 'bid2vol',
    'bid3', 'bid3vol', 'bid4', 'bid4vol', 'bid5', 'bid5vol',
    'bid6', 'bid6vol', 'bid7', 'bid7vol', 'bid8', 'bid8vol',
    'bid9', 'bid9vol', 'bid10', 'bid10vol', 'ask1', 'ask1vol',
    'ask2', 'ask2vol', 'ask3', 'ask3vol', 'ask4', 'ask4vol',
    'ask5', 'ask5vol', 'ask6', 'ask6vol', 'ask7', 'ask7vol',
    'ask8', 'ask8vol', 'ask9', 'ask9vol', 'ask10', 'ask10vol'
]
df.columns = new_columns
df['mid'] = 0.5*(df['bid1'] + df['ask1'])

print(df.head())

   index      timestamp                 time     bid1  bid1vol     bid2  \
0      0  1673302660926  2023-01-09 22:17:40  17181.6   23.371  17181.5   
1      1  1673302661177  2023-01-09 22:17:41  17181.6   24.232  17181.5   
2      2  1673302661427  2023-01-09 22:17:41  17181.6   24.403  17181.5   
3      3  1673302661678  2023-01-09 22:17:41  17181.6   24.874  17181.5   
4      4  1673302661928  2023-01-09 22:17:41  17181.6   24.403  17181.5   

   bid2vol     bid3  bid3vol     bid4  ...  ask6vol     ask7  ask7vol  \
0    0.746  17181.4    5.428  17181.2  ...    5.168  17182.3     0.02   
1    0.694  17181.4    5.428  17181.2  ...    6.043  17182.3     0.02   
2    0.694  17181.4    5.428  17181.2  ...    6.043  17182.3     0.02   
3    0.694  17181.4    5.428  17181.2  ...    6.043  17182.3     0.02   
4    0.694  17181.4    5.428  17181.2  ...    6.043  17182.3     0.02   

      ask8  ask8vol     ask9  ask9vol    ask10  ask10vol       mid  
0  17182.4    6.692  17182.5    1.904  17

In [8]:
interval_set = 5 * 1000 #predicted timeslot
df['timestamp_predict'] = df['timestamp'] + interval_set
df['mom'] = df['mid']
# Use numpy to find the closest indices efficiently
timestamps = df['timestamp'].values
predicted_timestamps = df['timestamp_predict'].values
closest_indices = np.searchsorted(timestamps, predicted_timestamps)

for ind, row in df.iterrows():
    if ind == len(df) - 1:
        df.at[ind, 'mom'] = 0
        break

    # Find the closest timestamp index
    closest_index = closest_indices[ind]
    if closest_index >= len(df):
        closest_index = len(df) - 1

    if row['mid'] < df.loc[closest_index, 'mid']:
        df.at[ind, 'mom'] = 2
    if row['mid'] > df.loc[closest_index, 'mid']:
        df.at[ind, 'mom'] = 0
    if row['mid'] == df.loc[closest_index, 'mid']:
        df.at[ind, 'mom'] = 1


In [9]:
display(df['mom'].describe())

count    3.730870e+06
mean     9.959235e-01
std      8.370131e-01
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      2.000000e+00
Name: mom, dtype: float64

In [10]:
end_time = df['timestamp'].tolist()[-1] - interval_set
df = df[df['timestamp'] <= end_time]

In [11]:
df = df.iloc[:len(df)//3]
leng = len(df)
print(leng)

1243616


In [12]:
columns_order = [
    'bid1', 'bid1vol', 'ask1', 'ask1vol',
    'bid2', 'bid2vol', 'ask2', 'ask2vol',
    'bid3', 'bid3vol', 'ask3', 'ask3vol',
    'bid4', 'bid4vol', 'ask4', 'ask4vol',
    'bid5', 'bid5vol', 'ask5', 'ask5vol',
    'bid6', 'bid6vol', 'ask6', 'ask6vol',
    'bid7', 'bid7vol', 'ask7', 'ask7vol',
    'bid8', 'bid8vol', 'ask8', 'ask8vol',
    'bid9', 'bid9vol', 'ask9', 'ask9vol',
    'bid10', 'bid10vol', 'ask10', 'ask10vol'
]

dataset_ordered = df[columns_order]
target_ordered = df['mom']

In [13]:
del df
gc.collect()

13

In [14]:
def make_dataset(data, targets, T):
    time_dataset = []
    time_targets = []
    for i in range(T, len(data)):
        cur = data[i-T:i].values
        time_dataset.append(cur)
        time_targets.append(targets[i])
    return time_dataset, time_targets

# Preparing the dataset
T = 100  #  window size
time_dataset, time_targets = make_dataset(dataset_ordered, target_ordered, T)

In [15]:
del dataset_ordered
gc.collect()

del target_ordered
gc.collect()

0

In [16]:
class Dataset(data.Dataset):
    """Characterizes a dataset for PyTorch"""
    def __init__(self, time_dataset, time_targets, T):
        """Initialization"""
        self.T = T
        x, y = np.array(time_dataset), np.array(time_targets)
        self.length = len(x)
        x = torch.from_numpy(x)
        self.x = torch.unsqueeze(x, 1)
        self.y = torch.from_numpy(y)

    def __len__(self):
        """Denotes the total number of samples"""
        return self.length

    def __getitem__(self, index):
        """Generates samples of data"""
        return self.x[index], self.y[index]

In [17]:
len(time_dataset[0])

100

In [18]:
split_index_train = int(leng * 0.7)
split_index_val = int(leng * 0.8)


time_dataset_train = time_dataset[:split_index_train]
time_targets_train = time_targets[:split_index_train]
time_dataset_val = time_dataset[split_index_train:split_index_val]
time_targets_val = time_targets[split_index_train:split_index_val]
time_dataset_test = time_dataset[split_index_val:]
time_targets_test = time_targets[split_index_val:]

In [19]:
del time_dataset
gc.collect()

del time_targets
gc.collect()

0

In [20]:
batch_size = 32

dataset_train = Dataset(time_dataset_train, time_targets_train, 100)
print('here0')
dataset_val = Dataset(time_dataset_val, time_targets_val, 100)
print('here1')
dataset_test = Dataset(time_dataset_test, time_targets_test, 100)
print('here2')
train_loader = torch.utils.data.DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
print('here3')
val_loader = torch.utils.data.DataLoader(dataset=dataset_val, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=False)

print(dataset_train.x.shape, dataset_train.y.shape)

here0
here1
here2
here3
torch.Size([870531, 1, 100, 40]) torch.Size([870531])


In [None]:
torch.save(dataset_train, '/content/download/MyDrive/download/dataset_train.pt')
torch.save(dataset_val, '/content/download/MyDrive/download/dataset_val.pt')
torch.save(dataset_test, '/content/download/MyDrive/download/dataset_test.pt')

In [21]:
tmp_loader = torch.utils.data.DataLoader(dataset=dataset_train, batch_size=1, shuffle=True)
for x, y in tmp_loader:
    print(x)
    print(y)
    print(x.shape, y.shape)
    break

tensor([[[[1.7181e+04, 4.5305e+01, 1.7181e+04,  ..., 7.0300e-01,
           1.7182e+04, 1.3100e-01],
          [1.7181e+04, 1.6076e+01, 1.7181e+04,  ..., 9.3300e-01,
           1.7182e+04, 4.9010e+00],
          [1.7181e+04, 1.4175e+01, 1.7181e+04,  ..., 8.7300e-01,
           1.7182e+04, 4.9010e+00],
          ...,
          [1.7170e+04, 2.8713e+01, 1.7170e+04,  ..., 2.1270e+00,
           1.7171e+04, 9.0100e-01],
          [1.7170e+04, 3.5235e+01, 1.7170e+04,  ..., 2.1270e+00,
           1.7171e+04, 9.0100e-01],
          [1.7170e+04, 3.7121e+01, 1.7170e+04,  ..., 4.6880e+00,
           1.7171e+04, 9.0100e-01]]]], dtype=torch.float64)
tensor([2.], dtype=torch.float64)
torch.Size([1, 1, 100, 40]) torch.Size([1])


# Training DeepLOB


In [23]:
class deeplob(nn.Module):
    def __init__(self, y_len):
        super().__init__()
        self.y_len = y_len

        # convolution blocks
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(1,2), stride=(1,2)),
            nn.LeakyReLU(negative_slope=0.01),
#             nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,2), stride=(1,2)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.Tanh(),
            nn.BatchNorm2d(32),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1,10)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4,1)),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(32),
        )

        # inception moduels
        self.inp1 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )
        self.inp2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(5,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )
        self.inp3 = nn.Sequential(
            nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1,1), padding='same'),
            nn.LeakyReLU(negative_slope=0.01),
            nn.BatchNorm2d(64),
        )

        # lstm layers
        self.lstm = nn.LSTM(input_size=192, hidden_size=64, num_layers=1, batch_first=True)
        self.fc1 = nn.Linear(64, self.y_len)

    def forward(self, x):
        # h0: (number of hidden layers, batch size, hidden size)
        h0 = torch.zeros(1, x.size(0), 64).to(device)
        c0 = torch.zeros(1, x.size(0), 64).to(device)

        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)

        x_inp1 = self.inp1(x)
        x_inp2 = self.inp2(x)
        x_inp3 = self.inp3(x)

        x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)

#         x = torch.transpose(x, 1, 2)
        x = x.permute(0, 2, 1, 3)
        x = torch.reshape(x, (-1, x.shape[1], x.shape[2]))

        x, _ = self.lstm(x, (h0, c0))
        x = x[:, -1, :]
        x = self.fc1(x)
        forecast_y = torch.softmax(x, dim=1)

        return forecast_y

In [24]:
model = deeplob(3)
#model.to(device)

In [25]:
model.to(device)

deeplob(
  (conv1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(1, 2), stride=(1, 2))
    (1): LeakyReLU(negative_slope=0.01)
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(32, 32, kernel_size=(4, 1), stride=(1, 1))
    (4): LeakyReLU(negative_slope=0.01)
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Conv2d(32, 32, kernel_size=(4, 1), stride=(1, 1))
    (7): LeakyReLU(negative_slope=0.01)
    (8): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv2): Sequential(
    (0): Conv2d(32, 32, kernel_size=(1, 2), stride=(1, 2))
    (1): Tanh()
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Conv2d(32, 32, kernel_size=(4, 1), stride=(1, 1))
    (4): Tanh()
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Conv2d(32, 32, kernel_size=(4, 1), stride

In [25]:
summary(model, (1, 1, 100, 40))

  return F.conv2d(input, weight, bias, self.stride,


Layer (type:depth-idx)                   Output Shape              Param #
deeplob                                  [1, 3]                    --
├─Sequential: 1-1                        [1, 32, 94, 20]           --
│    └─Conv2d: 2-1                       [1, 32, 100, 20]          96
│    └─LeakyReLU: 2-2                    [1, 32, 100, 20]          --
│    └─BatchNorm2d: 2-3                  [1, 32, 100, 20]          64
│    └─Conv2d: 2-4                       [1, 32, 97, 20]           4,128
│    └─LeakyReLU: 2-5                    [1, 32, 97, 20]           --
│    └─BatchNorm2d: 2-6                  [1, 32, 97, 20]           64
│    └─Conv2d: 2-7                       [1, 32, 94, 20]           4,128
│    └─LeakyReLU: 2-8                    [1, 32, 94, 20]           --
│    └─BatchNorm2d: 2-9                  [1, 32, 94, 20]           64
├─Sequential: 1-2                        [1, 32, 88, 10]           --
│    └─Conv2d: 2-10                      [1, 32, 94, 10]           2,080
│    └

In [26]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [27]:
# A function to encapsulate the training loop
def batch_gd(model, criterion, optimizer, train_loader, test_loader, epochs):

    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)
    best_test_loss = np.inf
    best_test_epoch = 0

    for it in tqdm(range(epochs)):

        model.train()
        t0 = datetime.now()
        train_loss = []
        for inputs, targets in train_loader:
            # move data to GPU
            inputs, targets = inputs.to(device, dtype=torch.float), targets.to(device, dtype=torch.int64)
            # print("inputs.shape:", inputs.shape)
            # zero the parameter gradients
            optimizer.zero_grad()
            # Forward pass
            # print("about to get model output")
            outputs = model(inputs)
            # print("done getting model output")
            # print("outputs.shape:", outputs.shape, "targets.shape:", targets.shape)
            loss = criterion(outputs, targets)
            # Backward and optimize
            # print("about to optimize")
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        # Get train loss and test loss
        train_loss = np.mean(train_loss) # a little misleading

        model.eval()
        test_loss = []
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device, dtype=torch.float), targets.to(device, dtype=torch.int64)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss.append(loss.item())
        test_loss = np.mean(test_loss)

        # Save losses
        train_losses[it] = train_loss
        test_losses[it] = test_loss

        if test_loss < best_test_loss:
            torch.save(model, '/content/download/MyDrive/best_val_model_pytorch')
            best_test_loss = test_loss
            best_test_epoch = it
            print('model saved')

        dt = datetime.now() - t0
        print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, \
          Validation Loss: {test_loss:.4f}, Duration: {dt}, Best Val Epoch: {best_test_epoch}')

    return train_losses, test_losses

In [None]:
train_losses, val_losses = batch_gd(model, criterion, optimizer, train_loader, val_loader, epochs=50)

  return F.conv2d(input, weight, bias, self.stride,
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  2%|▏         | 1/50 [07:38<6:14:31, 458.60s/it]

model saved
Epoch 1/50, Train Loss: 0.9041,           Validation Loss: 0.9705, Duration: 0:07:38.595991, Best Val Epoch: 0


  4%|▍         | 2/50 [15:17<6:07:01, 458.78s/it]

Epoch 2/50, Train Loss: 0.8788,           Validation Loss: 0.9830, Duration: 0:07:38.909224, Best Val Epoch: 0


  6%|▌         | 3/50 [22:56<5:59:25, 458.84s/it]

Epoch 3/50, Train Loss: 0.8670,           Validation Loss: 1.0995, Duration: 0:07:38.909273, Best Val Epoch: 0


  8%|▊         | 4/50 [30:33<5:51:14, 458.13s/it]

Epoch 4/50, Train Loss: 0.8528,           Validation Loss: 1.2014, Duration: 0:07:37.049196, Best Val Epoch: 0


 10%|█         | 5/50 [38:11<5:43:40, 458.23s/it]

Epoch 5/50, Train Loss: 0.8372,           Validation Loss: 1.0000, Duration: 0:07:38.389241, Best Val Epoch: 0


 12%|█▏        | 6/50 [45:48<5:35:40, 457.73s/it]

Epoch 6/50, Train Loss: 0.8217,           Validation Loss: 1.0641, Duration: 0:07:36.774915, Best Val Epoch: 0


 14%|█▍        | 7/50 [53:28<5:28:36, 458.51s/it]

Epoch 7/50, Train Loss: 0.8089,           Validation Loss: 1.0223, Duration: 0:07:40.116336, Best Val Epoch: 0


 16%|█▌        | 8/50 [1:01:07<5:20:56, 458.49s/it]

Epoch 8/50, Train Loss: 0.7978,           Validation Loss: 1.0208, Duration: 0:07:38.441790, Best Val Epoch: 0


 18%|█▊        | 9/50 [1:08:44<5:13:01, 458.09s/it]

Epoch 9/50, Train Loss: 0.7884,           Validation Loss: 1.0278, Duration: 0:07:37.190254, Best Val Epoch: 0


 20%|██        | 10/50 [1:16:21<5:05:08, 457.71s/it]

Epoch 10/50, Train Loss: 0.7803,           Validation Loss: 1.0360, Duration: 0:07:36.875219, Best Val Epoch: 0


In [None]:
plt.plot(history.history['loss'],label='Train Loss')
plt.plot(history.history['val_loss'],label='Validation Loss')
plt.title('Loss Per epoch')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.show()

In [None]:
plt.plot(history.history['accuracy'],label='Train Accuracy')
plt.plot(history.history['val_accuracy'],label='Validation Accuracy')
plt.title('Accuracy Per epoch')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.show()

# Results

In [32]:
model = torch.load('/content/download/MyDrive/best_val_model_pytorch')

all_targets = []
all_predictions = []

for inputs, targets in test_loader:
    # Move to GPU
    inputs, targets = inputs.to(device, dtype=torch.float), targets.to(device, dtype=torch.int64)

    # Forward pass
    outputs = model(inputs)

    # Get prediction
    # torch.max returns both max and argmax
    _, predictions = torch.max(outputs, 1)

    all_targets.append(targets.cpu().numpy())
    all_predictions.append(predictions.cpu().numpy())

all_targets = np.concatenate(all_targets)
all_predictions = np.concatenate(all_predictions)

In [33]:
print('accuracy_score:', accuracy_score(all_targets, all_predictions))
print(classification_report(all_targets, all_predictions, digits=4))

accuracy_score: 0.5342887251431881
              precision    recall  f1-score   support

           0     0.5248    0.6477    0.5798     84287
           1     0.5713    0.4391    0.4966     80711
           2     0.5184    0.5118    0.5151     83626

    accuracy                         0.5343    248624
   macro avg     0.5382    0.5329    0.5305    248624
weighted avg     0.5377    0.5343    0.5310    248624

