In [1]:
import time
import pandas as pd
import os
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [236]:
use_cuda = torch.cuda.is_available()
# use_cuda = False
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

In [3]:
DATA_DIRECTORY = os.path.join("..","input","optiver-realized-volatility-prediction")
TRADE_TRAIN_DIRECTORY = os.path.join(DATA_DIRECTORY,"trade_train.parquet")
TRADE_TEST_DIRECTORY = os.path.join(DATA_DIRECTORY,"trade_test.parquet")
BOOK_TRAIN_DIRECTORY = os.path.join(DATA_DIRECTORY,"book_train.parquet")
BOOK_TEST_DIRECTORY = os.path.join(DATA_DIRECTORY,"book_test.parquet")
OUTPUT_DIRECTORY = os.path.join("..","output")
os.makedirs(OUTPUT_DIRECTORY,exist_ok=True)

In [4]:
train_df = pd.read_csv(os.path.join(DATA_DIRECTORY,"train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIRECTORY,"test.csv"))

In [501]:
class OptiverRealizedVolatilityDataset(Dataset):
    def __init__(self, data_directory, mode="train"):
        """initializes Optiver Competition dataset
        `mode`: train|test
        `data_directory`: the datadirectory of the input data, where there are test.csv, train.csv, and parquet folders for trade_train.parquet and other relevant folders
        """
        if mode.lower() not in ['train','test']:
            raise Exception("Invalid mode passed for Optiver dataset. Valid values:train|test")
        self.data_directory = data_directory
        self.mode = mode.lower()
        self.main_df = pd.read_csv(os.path.join(self.data_directory,f'{self.mode}.csv'))
#         if self.mode == 'train':
#             self.main_df['row_id'] = self.main_df.apply(lambda x: f"{x['stock_id']:.0f}-{x['time_id']:.0f}", axis=1)
        if self.mode == 'test':
            self.main_df['target'] = 0
        
        self.cache_stocks_done_set = set()
        self.cache_rowid_feature_map = {}
        row_id_series = self.main_df['stock_id'].astype(str) + "-" +self.main_df['time_id'].astype(str)
        targets = self.main_df['target'].tolist()
        for idx, row_id in enumerate(row_id_series.tolist()):
            self.cache_rowid_feature_map[row_id] = {'target':targets[idx], 'stock_id':row_id.split('-')[0],'time_id':row_id.split('-')[1],'row_id':row_id}
    
    
    def __get_row_id(self, stock_id, time_id):
        return f"{stock_id:.0f}-{time_id:.0f}"
    
    def __generate_realized_volatility_of_book_df(self, book_df):
        """ expects the wap1 to be present and log_return to be present"""
        return np.sqrt(np.sum(book_df['log_return']**2))
    
    def __generate_per2min_wap1_book_df0(self, book_df):
        """`seconds_in_bucket_120s_groupkey` needs to be present"""
        
        prices_120s = []
        
        for gkey, gdata in book_df.groupby('seconds_in_bucket_120s_groupkey'):
            prices_120s.append(gdata['wap1'].mean())
        
#              
        if len(prices_120s) == 0:
            prices_120s += [0] * 5
        if len(prices_120s) < 5:
            prices_120s += ([prices_120s[0]] * (5-len(prices_120s)))
        return prices_120s
        
    
    def __cache_generate_features(self, main_stock_id, main_time_id):
            
            
            main_row_id = self.__get_row_id(main_stock_id, main_time_id)
            if main_stock_id not in self.cache_stocks_done_set:
#                 trade_df = pd.read_parquet(os.path.join(self.data_directory, f"trade_{self.mode}.parquet", f"stock_id={stock_id}"))
                
                book_df = pd.read_parquet(os.path.join(self.data_directory, f"book_{self.mode}.parquet", f"stock_id={main_stock_id}"))
                book_df['wap1'] = (book_df['bid_price1'] * book_df['ask_size1'] + book_df['ask_price1'] * book_df['bid_size1'])/(book_df['bid_size1'] + book_df['ask_size1'])
    #             book_df['wap2'] = (book_df['bid_price2'] * book_df['ask_size2'] + book_df['ask_price2'] * book_df['bid_size2'])/(book_df['bid_size2'] + book_df['ask_size2'])
                #NOTE: use wap1 ; until we figure out in 01. study which price wap1 closely resembles the trade price, or maybe wap1&wap2 mean
                book_df['log_return'] = book_df.groupby('time_id')['wap1'].apply(lambda x: np.log(x).diff())
                
                book_df['seconds_in_bucket_120s_groupkey'] = (book_df['seconds_in_bucket']/120).astype(int)
                
#                 print(book_df)
                # ACTUAL FEATURES HERE!
                for groupkey, groupdf in book_df.groupby('time_id'):
                    rowid = self.__get_row_id(main_stock_id, groupkey)
                    self.cache_rowid_feature_map[rowid]['realized_volatility'] = self.__generate_realized_volatility_of_book_df(groupdf)
                    
                    self.cache_rowid_feature_map[rowid]['wap_120s_interval'] = self.__generate_per2min_wap1_book_df0(groupdf)
                
                    
                self.cache_stocks_done_set.add(main_stock_id)
                
            return self.cache_rowid_feature_map[main_row_id]
        
    
    def __transform_to_01_realized_volatility_linear_data(self, features_dict):
        return {'row_id':features_dict['row_id'],
                'x_realized_volatility':torch.tensor([features_dict.get('realized_volatility',0)]),
                'x_wap_120s':torch.tensor(features_dict.get('wap_120s_interval',[0]*int(600/120)))
               }, torch.tensor([features_dict['target']])
    
    def __len__(self):
        return len(self.main_df)
    
    def __getitem__(self, idx):
        #TODO: handle for num_workers more than 0
        #      using https://pytorch.org/docs/stable/data.html
        #      using torch.util.data.get_worker_info()
        if torch.is_tensor(idx):
            idx = idx.tolist()
        stock_id = self.main_df.at[idx, 'stock_id']
        time_id = self.main_df.at[idx, 'time_id']
        features_dict = self.__cache_generate_features(stock_id,time_id)
        x, y = self.__transform_to_01_realized_volatility_linear_data(features_dict)
        return x,y

In [502]:
dataset = OptiverRealizedVolatilityDataset(DATA_DIRECTORY, mode="train")

In [503]:
dataset[0]

({'row_id': '0-5',
  'x_realized_volatility': tensor([0.0045], dtype=torch.float64),
  'x_wap_120s': tensor([1.0030, 1.0041, 1.0039, 1.0040, 1.0034], dtype=torch.float64)},
 tensor([0.0041]))

In [528]:
# for x in range(0,9):
#     print(dataset[x])

In [507]:
# for key, val in dataset.cache_rowid_feature_map.items():
#     dataset.main_df.at[0,'time_id']
#     dataset.main_df.at[0,'stock_id']
# for idx in range(0, len(dataset)):
#     if dataset[idx] is None:
#         print("______________")

In [529]:
# data where only 4 values for x_wap_120 was there
dataset[10000*11+6888]

({'row_id': '33-16526',
  'x_realized_volatility': tensor([0.0058], dtype=torch.float64),
  'x_wap_120s': tensor([1.0000, 0.9977, 0.9993, 1.0002, 1.0000], dtype=torch.float64)},
 tensor([0.0087]))

In [509]:
# buffer data for performance
dataloader = DataLoader(dataset, batch_size=10000,
                        shuffle=False, num_workers=0, pin_memory=True)
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
#     continue
#     print(X['row_id'][0])
    print(loss_fn(y, X['x_realized_volatility']))

tensor(0.3441, dtype=torch.float64)
tensor(0.3484, dtype=torch.float64)
tensor(0.3244, dtype=torch.float64)
tensor(0.3237, dtype=torch.float64)
tensor(0.3159, dtype=torch.float64)
tensor(0.3361, dtype=torch.float64)
tensor(0.3728, dtype=torch.float64)
tensor(0.3382, dtype=torch.float64)
tensor(0.3129, dtype=torch.float64)
tensor(0.3626, dtype=torch.float64)
tensor(0.5738, dtype=torch.float64)
tensor(0.5318, dtype=torch.float64)
tensor(0.2776, dtype=torch.float64)
tensor(0.3827, dtype=torch.float64)
tensor(0.3319, dtype=torch.float64)
tensor(0.2900, dtype=torch.float64)
tensor(0.3247, dtype=torch.float64)
tensor(0.2969, dtype=torch.float64)
tensor(0.3073, dtype=torch.float64)
tensor(0.3155, dtype=torch.float64)
tensor(0.3413, dtype=torch.float64)
tensor(0.2933, dtype=torch.float64)
tensor(0.3229, dtype=torch.float64)
tensor(0.3178, dtype=torch.float64)
tensor(0.3267, dtype=torch.float64)
tensor(0.3283, dtype=torch.float64)
tensor(0.2959, dtype=torch.float64)
tensor(0.3582, dtype=torch.f

In [531]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(6, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
        )

    def forward(self, x):
#         x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

def loss_fn(y, pred):
    return torch.sqrt(torch.mean(torch.square((y-pred)/y)))

### LEarning rate: our base line is 0.34 loss as that's what the optiver guys have when they use current 10 min realize vol and use it as target (copy to prediction). We create simplest neural network and work with learning rates to figure out what's best and when we see something in range of 0.35 then we've found good Learning rate
- #### GBM: 1e-7 works best
- #### ADAM: 1e-3 works best

In [534]:
learning_rate = 1e-3
batch_size = 64
epochs = 100

model = NeuralNetwork()
model.to(device)
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-11)


In [535]:

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    
    dataloader_train = DataLoader(train_dataset, batch_size=batch_size,
                        shuffle=True, num_workers=0, pin_memory=True)
    dataset_size = len(dataloader.dataset)
    model.train()
    losses_train = []
    for batch, (X, y) in enumerate(dataloader_train):
        X = torch.cat((X['x_realized_volatility'], X['x_wap_120s']), 1)
        
        
        X = X.type(torch.cuda.FloatTensor)
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        loss = loss_fn(y, pred)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses_train.append(loss.item())
        # we want 5 spread out output per epoch
        if batch % int(dataset_size/5/batch_size) == 0:
            loss, current = loss.item(), batch * len(X)
            
            print(f"loss: {loss:>7f}  [{current:>5d}/{dataset_size:>5d}]")
            
    dataloader_test = DataLoader(test_dataset, batch_size=batch_size,
                        shuffle=True, num_workers=0, pin_memory=True)
    dataset_size = len(dataloader.dataset)
    model.eval()

    losses_test = []
    for batch, (X, y) in enumerate(dataloader_test):
        with torch.no_grad():
            X = torch.cat((X['x_realized_volatility'], X['x_wap_120s']), 1)
            X = X.type(torch.cuda.FloatTensor)
            X = X.to(device)
            y = y.to(device)
            pred = model(X)
            loss = loss_fn(y, pred)
            losses_test.append(loss.item())
            
    print("LOSSES TRAIN:", np.mean(losses_train), "LOSSES TEST:", np.mean(losses_test))
            

Epoch 1
-------------------------------
loss: 117.719505  [    0/428932]
loss: 0.627800  [85760/428932]
loss: 0.409063  [171520/428932]
loss: 0.380505  [257280/428932]
loss: 0.280221  [343040/428932]
LOSSES TRAIN: 1.1016393478255315 LOSSES TEST: 0.28767515236677116
Epoch 2
-------------------------------
loss: 0.293525  [    0/428932]
loss: 0.402002  [85760/428932]
loss: 0.274830  [171520/428932]
loss: 0.502549  [257280/428932]
loss: 0.278513  [343040/428932]
LOSSES TRAIN: 0.36125285879294405 LOSSES TEST: 0.26691711317861727
Epoch 3
-------------------------------
loss: 0.298576  [    0/428932]
loss: 0.310002  [85760/428932]
loss: 0.402453  [171520/428932]
loss: 0.263494  [257280/428932]
loss: 0.401027  [343040/428932]
LOSSES TRAIN: 0.33885335577363535 LOSSES TEST: 0.34954684589399854
Epoch 4
-------------------------------
loss: 0.325481  [    0/428932]
loss: 0.324930  [85760/428932]
loss: 0.270447  [171520/428932]
loss: 0.903530  [257280/428932]
loss: 0.262508  [343040/428932]
LOSSES

KeyboardInterrupt: 

In [536]:
torch.save(model.state_dict(), os.path.join(OUTPUT_DIRECTORY,"03_wap120s_model_2hidden_layer.pth"))