### Can our model predict current volatility?  (forget future; first it should be capable of predicting current one with given features)

In [1]:
import os
import time
import multiprocessing
from multiprocessing import Pool

import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np

from optiver_features_handler import get_features_map_for_stock, get_row_id

In [2]:
DATA_DIRECTORY = os.path.join("..","input","optiver-realized-volatility-prediction")
OUTPUT_DIRECTORY = os.path.join("..","output")
os.makedirs(OUTPUT_DIRECTORY,exist_ok=True)

In [3]:
class OptiverRealizedVolatilityDataset(Dataset):
    def __init__(self, data_directory, mode="train", lazy_load=True):
        """initializes Optiver Competition dataset
        `mode`: train|test
        `data_directory`: the datadirectory of the input data, where there are test.csv, train.csv, and parquet folders for trade_train.parquet and other relevant folders
        """
        print("INIT: OptiverRealizedVolatilityDataset")
        if mode.lower() not in ['train','test']:
            raise Exception("Invalid mode passed for Optiver dataset. Valid values:train|test")
        self.data_directory = data_directory
        self.mode = mode.lower()
        self.main_df = pd.read_csv(os.path.join(self.data_directory,f'{self.mode}.csv'))
#         if self.mode == 'train':
#             self.main_df['row_id'] = self.main_df.apply(lambda x: f"{x['stock_id']:.0f}-{x['time_id']:.0f}", axis=1)
        if self.mode == 'test':
            self.main_df['target'] = 0
        
        self.cache_stocks_done_set = set()
        # this is our final features lookup where we park all our features which can be addressed by row_id
        # which is individual train/test.csv row id using 'stock_id`-`time_id`
        self.cache_rowid_feature_map = {}
        row_id_series = self.main_df['stock_id'].astype(str) + "-" +self.main_df['time_id'].astype(str)
        targets = self.main_df['target'].tolist()
        self.stock_possible_timeids_list = {}
        for idx, row_id in enumerate(row_id_series.tolist()):
            stock_id = int(row_id.split('-')[0])
            time_id = int(row_id.split('-')[1])
            self.cache_rowid_feature_map[row_id] = {'target':targets[idx], 'stock_id':stock_id,'time_id':time_id,'row_id':row_id}
            
            # below code is to make sure what timeids we expect from stock data extractor
            # in case of missing parquet files we'll have to know the keys to fill default values into
            if stock_id not in self.stock_possible_timeids_list:
                self.stock_possible_timeids_list[stock_id] = []
            self.stock_possible_timeids_list[stock_id].append(time_id)
            
        
        if lazy_load == False:
            worker_data = []
            for gkey, gdf in self.main_df.groupby(['stock_id']):
                worker_data.append((self.data_directory, self.mode, gkey))
#             print("---------- CPU COUNG:", multiprocessing.cpu_count())
            # NOTE: this was hell of a hunt; this windows and pytorch and jupyter combination is too tedious
            #       make sure the function that we distribute don't call pytorch
            with Pool(multiprocessing.cpu_count()) as p:
                feature_set_list = p.starmap(get_features_map_for_stock, worker_data)
                for feature_map in feature_set_list:
                    for rowid, features_dict in feature_map.items():
                        for fkey,fval in features_dict.items():
                            self.cache_rowid_feature_map[rowid][fkey] = fval
                        self.cache_rowid_feature_map[rowid]  = OptiverRealizedVolatilityDataset.transform_to_01_realized_volatility_linear_data(self.cache_rowid_feature_map[rowid])
                    # udpate the indications that we've already fetched this stock and the lazy loader code won't fetch this again
                    self.cache_stocks_done_set.add(int(rowid.split('-')[0]))
    
    def __cache_generate_features(self, main_stock_id, main_time_id):
            
            
            main_row_id = get_row_id(main_stock_id, main_time_id)
            if main_stock_id not in self.cache_stocks_done_set:
#                 trade_df = pd.read_parquet(os.path.join(self.data_directory, f"trade_{self.mode}.parquet", f"stock_id={stock_id}"))   
                # we'll combine the featureset with the bigger feature set of all stocks
                feature_map = get_features_map_for_stock(self.data_directory, self.mode, main_stock_id)
                # NOTE: sometime we might now have parquet files in that case we'll have 3 entried in .csv while only 1 gets returned in feature map
                # we need to cover for that disparity
                for time_id in self.stock_possible_timeids_list[main_stock_id]:
                    expected_row_id = get_row_id(main_stock_id, time_id)
                    if expected_row_id not in feature_map:
                        feature_map[expected_row_id] = {}
                for rowid, features_dict in feature_map.items():
                    for fkey,fval in features_dict.items():
                        self.cache_rowid_feature_map[rowid][fkey] = fval
                    self.cache_rowid_feature_map[rowid]  = OptiverRealizedVolatilityDataset.transform_to_01_realized_volatility_linear_data(self.cache_rowid_feature_map[rowid])
                self.cache_stocks_done_set.add(main_stock_id)
#             print(self.cache_rowid_feature_map[main_row_id])
#             print(torch.tensor([self.cache_rowid_feature_map[main_row_id].get('book_realized_volatility',0)]))
#             print(torch.tensor(self.cache_rowid_feature_map[main_row_id].get('log_return1_2s', [0]*(int(600/2)))))
#             print(torch.tensor(self.cache_rowid_feature_map.get('book_directional_volume1_2s', [0]*(int(600/2)))))
            return self.cache_rowid_feature_map[main_row_id]
        
    @staticmethod
    def transform_to_01_realized_volatility_linear_data(features_dict):
        return (
                {
                    'row_id':features_dict['row_id'],
                    'book_realized_volatility':torch.tensor([features_dict.get('book_realized_volatility',0)]),
                    'log_return1_2s':torch.tensor(features_dict.get('log_return1_2s', [0]*(int(600/2)))),
                    'book_directional_volume1_2s':torch.tensor(features_dict.get('book_directional_volume1_2s', [0]*(int(600/2)))) 
                },
                torch.tensor([features_dict['target']])
#                 [features_dict['target']]
        )
    
    def __len__(self):
        return len(self.main_df)
    
    def __getitem__(self, idx):
        #TODO: handle for num_workers more than 0
        #      using https://pytorch.org/docs/stable/data.html
        #      using torch.util.data.get_worker_info()
        if torch.is_tensor(idx):
            idx = idx.tolist()
        stock_id = self.main_df.at[idx, 'stock_id']
        time_id = self.main_df.at[idx, 'time_id']
        x,y = self.__cache_generate_features(stock_id,time_id)
#         x, y = self.__transform_to_01_realized_volatility_linear_data(features_dict)
        return x,y

In [None]:
if __name__=="__main__":
    dataset = OptiverRealizedVolatilityDataset(DATA_DIRECTORY, mode="train", lazy_load=False)

INIT: OptiverRealizedVolatilityDataset


In [9]:
# for x in range(0,9):
#     print(dataset[x])
dataset[10000] #[0]['book_wap1_1s'].tolist()

({'row_id': '2-19309',
  'book_realized_volatility': tensor([0.0022], dtype=torch.float64),
  'log_return1_2s': tensor([ 6.0413e-05,  6.0413e-05,  4.8357e-05, -1.2085e-04, -5.1354e-05,
          -3.7564e-05,  1.0946e-04,  3.5530e-05,  4.3171e-05,  3.3897e-07,
          -1.2371e-04, -3.6098e-05, -9.8656e-05,  1.0582e-04,  1.0305e-04,
          -1.9261e-05, -3.5741e-05,  1.8883e-05, -2.5424e-05, -8.3551e-05,
          -7.4092e-05,  2.4368e-04, -1.4165e-04,  3.5744e-05,  3.1450e-05,
           1.2085e-04, -1.2451e-04, -3.2420e-05, -3.4293e-05,  1.7588e-05,
          -5.4811e-06, -1.3497e-04, -1.7910e-04,  2.8383e-05, -2.4535e-05,
           5.5848e-05, -4.6457e-05,  7.8601e-06,  1.3846e-04,  1.1608e-04,
          -2.7190e-05, -5.9829e-05,  4.1800e-05, -4.7252e-05, -1.7348e-04,
          -4.8430e-06, -1.1340e-04,  5.7795e-05,  0.0000e+00,  8.4624e-05,
          -1.1776e-04, -2.5630e-05,  1.3132e-04,  1.6596e-05,  1.1527e-04,
           5.3982e-05,  8.1596e-06, -1.5717e-05,  5.7415e-05,  1.

In [10]:
# for key, val in dataset.cache_rowid_feature_map.items():
#     dataset.main_df.at[0,'time_id']
#     dataset.main_df.at[0,'stock_id']
tradevola = []
for idx in range(0, len(dataset)):
    X, y = dataset[idx]
    tradevola.append(X['book_realized_volatility'].item())
#     print('book',X['book_realized_volatility'].item(),'trade', X['trade_realized_volatility'].item(),'traget', y.item())
#     print('bookdiff', abs(X['book_realized_volatility'].item()-y.item()), 'tradediff', abs(X['trade_realized_volatility'].item()-y.item()))
#     break
#     input()
#     except:
#         print("ERRR")
#         print(idx)
#         print(dataset[idx])

In [11]:
pd.Series(tradevola).describe()

count    428932.000000
mean          0.004233
std           0.003586
min           0.000081
25%           0.002065
50%           0.003159
75%           0.005108
max           0.086421
dtype: float64

In [154]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
#         self.flatten = nn.Flatten()
#         self.cnn_stack = nn.Sequential(
#             nn.Conv1d(1, 10, kernel_size=8, stride=2, padding=0),
# #             nn.Linear(input_size, 1024),
#             nn.Hardswish(),
#             nn.Dropout(0.1),
#             nn.Conv1d(10, 10, kernel_size=2, stride=2, padding=0),
#             nn.Hardswish(),
#             nn.Dropout(0.2),
#             nn.Conv1d(10, 10, kernel_size=4, stride=1, padding=0), 
#             nn.Hardswish(),
#             nn.Dropout(0.1),
#         )
#         self.linear_stack = nn.Sequential(
#             nn.LazyLinear(128),
# #             nn.Hardswish(),
# #             nn.Linear(2048, 1024),
#             nn.Hardswish(),
#             nn.Dropout(),
#             nn.Linear(128, 64),
#             nn.Hardswish(),
#             nn.Dropout(),
#             nn.Linear(64, 1),
#         )
        self.basic_stack = nn.Sequential(
            nn.Linear(int(600/2),512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512,1024),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(1024,512),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512,256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256,128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128,1)
        )
        
    def forward(self, x):
        logits = self.basic_stack(x)
#         x = self.flatten(x)
#         logits = self.cnn_stack(x)
#         logits = self.flatten(logits)
#         logits = self.linear_stack(logits)
        return logits



def loss_fn_mse(y, pred):
    return torch.mean(torch.square((y-pred)))

def loss_fn_mspe(y, pred):
    return torch.mean(torch.square((y-pred)/y))

def loss_fn_orig(y, pred):
    return torch.sqrt(torch.mean(torch.square((y-pred)/y)))

In [155]:
use_cuda = torch.cuda.is_available()
# use_cuda = False
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

In [156]:
model = NeuralNetwork()
model.to(device)

NeuralNetwork(
  (basic_stack): Sequential(
    (0): Linear(in_features=300, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=512, out_features=1024, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.4, inplace=False)
    (6): Linear(in_features=1024, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Linear(in_features=512, out_features=256, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=256, out_features=128, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.1, inplace=False)
    (15): Linear(in_features=128, out_features=1, bias=True)
  )
)

#### analyze the initial weights (or change them)

In [157]:
# # @torch.no_grad()
# def init_weights(m):
# #     print(m)
#     if type(m) == nn.Linear:
# #         m.weight.fill_(1.0)
#         torch.nn.init.xavier_uniform_(m.weight,gain=10)
#         m.bias.data.uniform_(-1,1)
# #     elif type(m) == nn.ReLU:
# #         print(m.data)
#     else:
#         print(type(m))
# #         print(m.weight)
# model.apply(init_weights)
# # for param in model.parameters():
# # #     print(param)
# #       print(param.data.size(), param.data)

### LEarning rate: our base line is 0.34 loss as that's what the optiver guys have when they use current 10 min realize vol and use it as target (copy to prediction). We create simplest neural network and work with learning rates to figure out what's best and when we see something in range of 0.35 then we've found good Learning rate
- #### SGD: 1e-7 works best
- #### ADAM: 1e-3, 1e-4 works best

In [158]:
learning_rate = 1e-5
batch_size = 4096
epochs = 1000


# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-13)


In [None]:
print("DEVICE:", device)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
data_interval_len = int(600/2)
data_ohlc_sample_len = 1 # 1 for each of open high low close
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    
    dataloader_train = DataLoader(train_dataset, batch_size=batch_size,
                        shuffle=True, num_workers=0, pin_memory=True)
    dataset_size = len(dataloader_train.dataset)
    model.train()
    losses_train = []
    for batch, (Feature_X, feature_y) in enumerate(dataloader_train):

#         xlogreturn1 = Feature_X['log_return1_2s'].reshape(-1,data_interval_len,data_ohlc_sample_len)
#         xbookdirvolume1 = Feature_X['book_directional_volume1_2s'].reshape(-1,data_interval_len,data_ohlc_sample_len)
#         X = torch.stack([xlogreturn1, 
#                          xbookdirvolume1], dim=2)
#         X = X.reshape(-1,1,data_interval_len*data_ohlc_sample_len*2)
        X = Feature_X['log_return1_2s'] * 1000
        
        y = feature_y * 100
#         y = Feature_X['book_realized_volatility'] * 100
        X = X.type(torch.cuda.FloatTensor)    
        y = y.type(torch.cuda.FloatTensor)
        
        
        X = X.to(device)
        y = y.to(device)
        feature_y = feature_y.to(device)
        pred = model(X)
        pred.to(device)
#         print(X.size(), "input")
#         print(pred.size(), "pred")
#         print(y.size(),'y')
#         input()
#         loss_fn_mse
#         loss_mse = loss_fn_mse(y, pred)
        loss_orig = loss_fn_orig(y, pred)
        
        optimizer.zero_grad()
        loss_orig.backward()
        optimizer.step()
        
        
        losses_train.append(loss_orig.item())
        # we want 5 spread out output per epoch
        if batch % int(dataset_size/5/batch_size) == 0:
            loss, current = loss_orig.item(), batch * len(X)
            # NOTE: real loss is same as upscaled normalized loss as it's percentage loss (rmspe)
            print(f"loss: {loss:>7f}  [{current:>5d}/{dataset_size:>5d}]", 'real loss:', loss_fn_orig(feature_y.to('cuda:0'), (pred/100)).item())
            print(pred.reshape(-1).tolist()[:7])
            print(y.reshape(-1).tolist()[:7])
            
    dataloader_test = DataLoader(test_dataset, batch_size=batch_size,
                        shuffle=True, num_workers=0, pin_memory=True)
    dataset_size = len(dataloader_test.dataset)
    model.eval()

    losses_test = []
    for batch, (Feature_X, feature_y) in enumerate(dataloader_test):
        with torch.no_grad():
#             X = Feature_X['book_realized_volatility']
#         CNN approach
#             xbookwap1 = Feature_X['book_wap1_1s'].reshape(-1,data_interval_len,data_ohlc_sample_len)
#             xbookdirvolume1 = Feature_X['book_directional_volume1_1s'].reshape(-1,data_interval_len,data_ohlc_sample_len)
#             X = torch.stack([xbookwap1, 
#                              xbookdirvolume1], dim=2)
#             X = X.reshape(-1,1,data_interval_len*data_ohlc_sample_len*2)

#         print(X)
#         input()
#             X = X.reshape(-1,int(600/30)*4*2)
#             X = X.reshape(1,8,-1)
#         print(X)
#             X = torch.cat([X, Feature_X['book_realized_volatility'], Feature_X['trade_realized_volatility']],1)
#             y = feature_y
            X = Feature_X['log_return1_2s'] * 1000
        
            y = feature_y * 100
#             X = torch.cat((X['x_realized_volatility'], X['x_wap_120s']), 1)
#             X = Feature_X['x_wap_120s']
#             y = Feature_X['x_realized_volatility']
            X = X.type(torch.cuda.FloatTensor)
            X = X.to(device)
            y = y.to(device)
            pred = model(X)
            loss = loss_fn_orig(y, pred)
            losses_test.append(loss.item())
            
    print("LOSSES TRAIN:", np.mean(losses_train), "LOSSES TEST:", np.mean(losses_test))
            

DEVICE: cuda:0
Epoch 1
-------------------------------
loss: 0.878080  [    0/343145] real loss: 0.8780801892280579
[0.03448563069105148, 0.039495863020420074, 0.02333875186741352, 0.034538667649030685, 0.046411752700805664, 0.025815727189183235, 0.03722376748919487]
[0.20906119048595428, 0.1826799064874649, 0.16328570246696472, 0.4066798686981201, 0.8013405799865723, 0.4421999156475067, 0.5088809132575989]
loss: 0.847465  [65536/343145] real loss: 0.8474645018577576
[0.04636446014046669, 0.04427403584122658, 0.044182464480400085, 0.05121458321809769, 0.03596247732639313, 0.054810840636491776, 0.03703754395246506]
[0.4231337010860443, 0.13289479911327362, 0.8553152680397034, 0.5112099647521973, 0.15840519964694977, 1.3889795541763306, 0.08450010418891907]
loss: 0.816014  [131072/343145] real loss: 0.8160144090652466
[0.048889804631471634, 0.055429767817258835, 0.05332763493061066, 0.04161607846617699, 0.04115968570113182, 0.04331577569246292, 0.054540541023015976]
[0.1263555884361267, 

In [133]:
torch.save(model.state_dict(), os.path.join(OUTPUT_DIRECTORY,"05_2s_datanormalized_logreturn_sequencial.pth"))

In [25]:
dataloader_test = DataLoader(test_dataset, batch_size=5,
                        shuffle=True, num_workers=0, pin_memory=True)
dataset_size = len(dataloader_test.dataset)
model.eval()

losses_test = []
for batch, (Feature_X, feature_y) in enumerate(dataloader_test):
    with torch.no_grad():
        xlogreturn1 = Feature_X['log_return1_2s'].reshape(-1,data_interval_len,data_ohlc_sample_len)
        xbookdirvolume1 = Feature_X['book_directional_volume1_2s'].reshape(-1,data_interval_len,data_ohlc_sample_len)
        X = torch.stack([xlogreturn1, 
                         xbookdirvolume1], dim=2)
        X = X.reshape(-1,1,data_interval_len*data_ohlc_sample_len*2)
#         print(X)
#         print(X.size())
#         input()
#         print(X)
#         input()
#             X = torch.cat([X, Feature_X['book_realized_volatility'], Feature_X['trade_realized_volatility']],1)
        y = feature_y
#         y = Feature_X['book_realized_volatility']
#             X = torch.cat((X['x_realized_volatility'], X['x_wap_120s']), 1)
#             X = Feature_X['x_wap_120s']
#             y = Feature_X['x_realized_volatility']
        X = X.type(torch.cuda.FloatTensor)
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        print("curr", Feature_X['book_realized_volatility'].tolist())
        print('curr1s',[np.sqrt(np.sum(pd.Series(closep)**2)) for closep in [x[::] for x in xlogreturn1.reshape(-1,data_interval_len*data_ohlc_sample_len).tolist()]])
        print('pred', pred.tolist())
        print('actual',y.tolist())
#         input()
        
            
        input()

curr [[0.0036344075262357213], [0.0023030722939839593], [0.001974389597446672], [0.0053188394310924895], [0.002627147852732812]]
curr1s [0.0032973839266358745, 0.0018837148282878475, 0.001543782455083293, 0.0045055278991471435, 0.0024511532570696552]
pred [[0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903]]
actual [[0.0034380510915070772], [0.002843644004315138], [0.0016469400143250823], [0.0045009939931333065], [0.0022277499083429575]]


 


curr [[0.005505567859000631], [0.002583564806666279], [0.003342226784178032], [0.001825386429605389], [0.0023499431049795972]]
curr1s [0.004266288593053351, 0.002322206915645627, 0.0027153584052034044, 0.0014572253909269938, 0.0022752323247224055]
pred [[0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903]]
actual [[0.005065346136689186], [0.0032045149710029364], [0.0024034560192376375], [0.002544580027461052], [0.0019930589478462934]]


 


curr [[0.0036982866537401547], [0.001589579562834356], [0.00250350177478549], [0.0016810987015589993], [0.006155246519659815]]
curr1s [0.0033246328278157124, 0.0014030817000914185, 0.001522057847835663, 0.0013125854564212302, 0.005895381673349276]
pred [[0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903]]
actual [[0.005089106038212776], [0.0017484510317444801], [0.002240682952105999], [0.0014124299632385373], [0.004447725135833025]]


 


curr [[0.0055066300279102524], [0.007521367502576807], [0.014150298200341176], [0.0022619951320607184], [0.0012587560031778458]]
curr1s [0.004429237442109815, 0.005901219429802633, 0.012374638909804352, 0.0019100852788617752, 0.0011237069708421598]
pred [[0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903], [0.0015419030096381903]]
actual [[0.004927168134599924], [0.0065541500225663185], [0.011375034227967262], [0.0018453890224918723], [0.0010599199449643493]]


KeyboardInterrupt: Interrupted by user