# Overview
* A pipeline for a PyTorch DNN to predict stock movements for the Optiver Trading Challenge.
    * Customizable Neural Network
    * Preprocessing and Normalization of Input Data
    * Feature Engineering
    * Decaying Learning Rate and Early Stopping
* Possible Improvements
    * PyTorch Profiler for Bottlenecks
    * Hyperparameter Tuning
    * Flag Filled NaN for near_price and far_price
    * Regularization methods for deeper networks (batch norm)

In [19]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

# Default figure size applied to every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (6,3)
# Do not truncate the column list when pandas displays a DataFrame.
pd.set_option("display.max_columns", None)

In [20]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from fastai.tabular.all import *

In [21]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [22]:
# Data
# Load the raw training set and show the fraction of missing values per column.
df_raw = pd.read_csv('../../data/train.csv')
df_raw.isna().mean(axis=0)

stock_id                   0.000000
date_id                    0.000000
seconds_in_bucket          0.000000
imbalance_size             0.000042
imbalance_buy_sell_flag    0.000000
reference_price            0.000042
matched_size               0.000042
far_price                  0.552568
near_price                 0.545474
bid_price                  0.000042
bid_size                   0.000000
ask_price                  0.000042
ask_size                   0.000000
wap                        0.000042
target                     0.000017
time_id                    0.000000
row_id                     0.000000
dtype: float64

In [23]:
def add_historic_features(df, cols, shifts=3, add_first=True):
    """Add lagged copies (and optionally the group-wise first value) of columns.

    Rows are grouped by ``(stock_id, date_id)``. For every name in ``cols``
    the columns ``<col>_shift1`` ... ``<col>_shift<shifts>`` hold the value
    found 1..``shifts`` rows earlier in the same group; missing entries in
    those lagged columns are replaced by the mean of the whole column.
    When ``add_first`` is true, a ``<col>_first`` column carrying the
    group's ``first()`` value is merged in as well.

    The lag columns are assigned onto the frame that was passed in, whereas
    the merge produces a new frame, so always use the returned frame.
    """
    group_keys = ['stock_id', 'date_id']
    for name in cols:
        per_group = df[group_keys + [name]].groupby(group_keys)
        column_mean = df[name].mean()

        for lag in range(1, shifts + 1):
            df[f"{name}_shift{lag}"] = per_group.shift(lag).fillna(column_mean)

        if not add_first:
            continue
        # The overlapping value column coming from the right side gets the
        # "_first" suffix; the left one keeps its name.
        first_values = per_group.first().reset_index()
        df = df.merge(first_values, on=["date_id","stock_id"], suffixes=["","_first"])
    return df

In [24]:
def fill_mean(df, cols):
    """Replace the missing entries of each listed column with that column's mean.

    The columns are overwritten on ``df`` itself and the same frame is returned.
    """
    for name in cols:
        column = df[name]
        df[name] = column.fillna(column.mean())
    return df

In [25]:
def add_info_columns(df_):
    """Return a copy of ``df_`` extended with the engineered feature columns.

    Steps, in order:
      1. missing values in the price columns are filled with 1.0;
      2. four ratio/imbalance columns are derived from the size columns
         (a small epsilon in each denominator avoids division by zero);
      3. six lags and the group-wise first value are added for a fixed set
         of columns via ``add_historic_features``.

    The input frame is left untouched because all work happens on a copy.
    """
    price_cols = ["reference_price", "far_price","near_price","bid_price","ask_price","wap"]
    history_cols = ["imbalance","imbalance_ratio","reference_price","wap","matched_size","far_price","near_price"]
    eps = 1.0e-8

    df = df_.copy()
    df[price_cols] = df[price_cols].fillna(1.0)

    df['imbalance_ratio'] = df['imbalance_size'] / (df['matched_size'] + eps)
    # Signed imbalance: size multiplied by the buy/sell flag.
    df["imbalance"] = df["imbalance_size"] * df["imbalance_buy_sell_flag"]

    book_diff = df['bid_size'] - df['ask_size']
    book_total = df['bid_size'] + df['ask_size']
    df['ordersize_imbalance'] = book_diff / (book_total + eps)

    match_diff = df['imbalance_size'] - df['matched_size']
    match_total = df['imbalance_size'] + df['matched_size']
    df['matching_imbalance'] = match_diff / (match_total + eps)

    return add_historic_features(df, history_cols, shifts=6, add_first=True)

In [28]:
# Build the engineered feature frame and count the remaining NaNs per column.
df = add_info_columns(df_raw)
df.isna().sum(axis=0)

stock_id                     0
date_id                      0
seconds_in_bucket            0
imbalance_size             220
imbalance_buy_sell_flag      0
                          ... 
near_price_shift3            0
near_price_shift4            0
near_price_shift5            0
near_price_shift6            0
near_price_first             0
Length: 70, dtype: int64

In [30]:
# Drop every row that still contains a missing value, then summarise the frame.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5237760 entries, 0 to 5237979
Data columns (total 70 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   stock_id                 int64  
 1   date_id                  int64  
 2   seconds_in_bucket        int64  
 3   imbalance_size           float64
 4   imbalance_buy_sell_flag  int64  
 5   reference_price          float64
 6   matched_size             float64
 7   far_price                float64
 8   near_price               float64
 9   bid_price                float64
 10  bid_size                 float64
 11  ask_price                float64
 12  ask_size                 float64
 13  wap                      float64
 14  target                   float64
 15  time_id                  int64  
 16  row_id                   object 
 17  imbalance_ratio          float64
 18  imbalance                float64
 19  ordersize_imbalance      float64
 20  matching_imbalance       float64
 21  imbalance_shi

In [31]:
# Model inputs: every column except row_id, time_id, date_id and the target.
x_cols = [c for c in df.columns if c not in ['row_id', 'time_id', 'date_id', 'target']]
# Single regression target.
y_cols = ["target"]

In [35]:
# Per-feature mean and standard deviation, read by normalize_features.
# NOTE(review): these are computed over the whole frame, i.e. before
# get_dataloaders splits it into train and test parts — confirm this is intended.
means = df[x_cols].mean(axis=0)
stds = df[x_cols].std(axis=0)

In [36]:
def normalize_features(x):
    """Standardise ``x`` using the module-level ``means`` and ``stds``.

    A small epsilon is added to the standard deviations so that a feature
    with zero spread does not cause a division by zero.
    """
    centered = x - means
    scale = stds + 1e-8
    return centered / scale

In [37]:
def get_xy(df):
    """Split ``df`` into a normalised feature array and a target array.

    Returns:
        ``(x, y)`` where ``x`` holds the ``x_cols`` columns after
        ``normalize_features`` and ``y`` holds the ``y_cols`` columns, both
        taken from the frames' ``.values``.
    """
    features = normalize_features(df[x_cols])
    targets = df[y_cols]
    return features.values, targets.values

In [38]:
def get_dataloaders(df, batch_size=512):
    """Create the train and test ``DataLoader`` objects for ``df``.

    Features and targets are converted to tensors and moved to ``device`` as
    a whole, then split at random into 80% training and 20% test samples.

    Args:
        df: frame containing the ``x_cols`` and ``y_cols`` columns.
        batch_size: training batch size; the test loader uses four times
            that, capped at the size of the test split.

    Returns:
        ``(train_dataloader, test_dataloader)``. The training loader
        shuffles; both loaders drop an incomplete last batch.
    """
    features, targets = get_xy(df)

    dataset = TensorDataset(
        torch.Tensor(features).to(device),
        torch.Tensor(targets).to(device),
    )
    train_part, test_part = random_split(dataset, [0.8,0.2])

    eval_batch_size = min(batch_size*4, len(test_part))
    train_loader = DataLoader(train_part, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(test_part, batch_size=eval_batch_size, drop_last=True)
    return (train_loader, test_loader)

In [39]:
# Build the train/test loaders with the default batch size.
train_dataloader, test_dataloader = get_dataloaders(df)

In [None]:
# Model
layers = [512,256,128,64]
class NeuralNetwork(nn.Module):
    """Fully connected regression network with ReLU activations and dropout.

    The stack is ``Linear -> ReLU`` followed, for every further hidden
    width, by ``Dropout -> Linear -> ReLU``; it ends in a ``Linear`` layer
    with a single output. Layer order (and therefore the indices inside
    ``relu_stack``) is the same for the default arguments as it was when the
    sizes were hard-coded.

    Args:
        n_features: number of input features. Defaults to ``len(x_cols)``.
        hidden_layers: widths of the hidden layers. Defaults to the
            module-level ``layers`` list.
        dropout: dropout probability applied in front of every hidden layer
            except the first.

    Raises:
        ValueError: if ``hidden_layers`` is empty.
    """

    def __init__(self, n_features=None, hidden_layers=None, dropout=0.25):
        super().__init__()
        # Fall back to the notebook-level configuration so that a plain
        # ``NeuralNetwork()`` keeps building the same network as before.
        if n_features is None:
            n_features = len(x_cols)
        if hidden_layers is None:
            hidden_layers = layers
        widths = list(hidden_layers)
        if not widths:
            raise ValueError("hidden_layers must contain at least one width")

        modules = [nn.Linear(n_features, widths[0]), nn.ReLU()]
        # Pair each hidden width with the next one: (w0, w1), (w1, w2), ...
        for fan_in, fan_out in zip(widths, widths[1:]):
            modules.append(nn.Dropout(dropout))
            modules.append(nn.Linear(fan_in, fan_out))
            modules.append(nn.ReLU())
        modules.append(nn.Linear(widths[-1], 1))
        self.relu_stack = nn.Sequential(*modules)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        return self.relu_stack(x)
    
def init_weights(m):
    """Initialise a single module in place; intended for ``model.apply``.

    Only ``nn.Linear`` modules are touched: the weights are drawn with
    Kaiming-normal initialisation and the bias, when the layer has one, is
    set to 0.01. A layer with exactly one output feature gets its weights
    re-drawn with Xavier-normal initialisation instead.
    """
    if not isinstance(m, nn.Linear):
        return

    torch.nn.init.kaiming_normal_(m.weight)
    # A layer created with bias=False has ``m.bias`` set to None.
    if m.bias is not None:
        m.bias.data.fill_(0.01)
    if m.out_features == 1:
        # Kept as a second draw (rather than an if/else) so the sequence of
        # random numbers consumed stays exactly as before.
        torch.nn.init.xavier_normal_(m.weight)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, shortcut=0):
    """Train ``model`` on ``dataloader`` and return the mean batch loss.

    Args:
        dataloader: iterable yielding ``(X, y)`` batches.
        model: module to train; it is switched to training mode.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer stepping the model parameters.
        shortcut: when positive, stop after this many batches.

    Returns:
        The loss averaged over the batches that were actually processed
        (NaN when no batch was processed).

    Training also stops early, with a printed warning, as soon as the
    predictions of a batch have (almost) zero standard deviation.
    """
    model.train()

    train_loss = 0.0
    batches_done = 0

    for X, y in dataloader:
        pred = model(X)
        loss = loss_fn(pred, y)

        # Accumulate a detached value: summing the graph-attached loss would
        # keep autograd history of every batch alive for the whole epoch.
        train_loss = train_loss + loss.detach()
        batches_done += 1

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if pred.std() < 0.000001:
            print("WARNING: std() is zero, stopping")
            break

        if shortcut > 0 and batches_done == shortcut:
            break

    if batches_done == 0:
        return float("nan")
    # Divide by the batches really seen so that an early stop (shortcut or
    # zero-std warning) still yields a correct average.
    return train_loss.cpu().numpy() / batches_done


def test_loop(dataloader, model, loss_fn, scheduler=None):
    """Evaluate ``model`` on ``dataloader`` and return the mean batch loss.

    Args:
        dataloader: sized iterable yielding ``(X, y)`` batches.
        model: module to evaluate; it is switched to eval mode.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        scheduler: optional object whose ``step(metric)`` is called once
            with the summed loss. When omitted, a module-level variable
            named ``scheduler`` is used if one exists, so existing
            three-argument calls keep stepping the global scheduler.

    Returns:
        The loss averaged over all batches (NaN for an empty dataloader, in
        which case no scheduler is stepped).
    """
    model.eval()
    num_batches = len(dataloader)
    if num_batches == 0:
        return float("nan")

    if scheduler is None:
        # Backward compatibility with the implicit global dependency.
        scheduler = globals().get("scheduler")

    test_loss = 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).detach().cpu().numpy()

    if scheduler is not None:
        # The scheduler receives the summed (not averaged) loss.
        scheduler.step(test_loss)
    return test_loss / num_batches


# The name starts with "test_": mark it so pytest does not try to collect
# this helper as a test when it is imported into a test module.
test_loop.__test__ = False
        
def predict(X, model):
    """Return the model output for ``X`` as a flat NumPy array.

    The model is switched to eval mode and the forward pass runs without
    gradient tracking; the result is moved to the CPU before conversion.
    """
    model.eval()
    with torch.no_grad():
        output = model(X)
    as_numpy = output.detach().cpu().numpy()
    return as_numpy.flatten()

In [None]:
# Instantiate the network on the selected device, report the number of
# trainable parameters and apply the custom weight initialisation to every
# sub-module.
model = NeuralNetwork().to(device)
print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
model.apply(init_weights)