### Can our model predict current volatility?  (forget future; first it should be capable of predicting current one with given features)

In [1]:
import os
import time
import multiprocessing
from multiprocessing import Pool

import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter

from optiver_features_handler import get_features_map_for_stock, get_row_id

In [2]:
DATA_DIRECTORY = os.path.join("..","input","optiver-realized-volatility-prediction")
OUTPUT_DIRECTORY = os.path.join("..","output")
MODEL_OUTPUT_DIRECTORY = os.path.join(OUTPUT_DIRECTORY,"models")
os.makedirs(OUTPUT_DIRECTORY,exist_ok=True)
os.makedirs(MODEL_OUTPUT_DIRECTORY,exist_ok=True)

In [3]:
data_interval_seconds = 5
data_intervals_count = int(600/data_interval_seconds)
class OptiverRealizedVolatilityDataset(Dataset):
    def __init__(self, data_directory, mode="train", lazy_load=True):
        """initializes Optiver Competition dataset
        `mode`: train|test
        `data_directory`: the datadirectory of the input data, where there are test.csv, train.csv, and parquet folders for trade_train.parquet and other relevant folders
        """
        print("INIT: OptiverRealizedVolatilityDataset")
        if mode.lower() not in ['train','test']:
            raise Exception("Invalid mode passed for Optiver dataset. Valid values:train|test")
        self.data_directory = data_directory
        self.mode = mode.lower()
        self.main_df = pd.read_csv(os.path.join(self.data_directory,f'{self.mode}.csv'))
#         if self.mode == 'train':
#             self.main_df['row_id'] = self.main_df.apply(lambda x: f"{x['stock_id']:.0f}-{x['time_id']:.0f}", axis=1)
        if self.mode == 'test':
            self.main_df['target'] = 0
        
        self.cache_stocks_done_set = set()
        # this is our final features lookup where we park all our features which can be addressed by row_id
        # which is individual train/test.csv row id using 'stock_id`-`time_id`
        self.cache_rowid_feature_map = {}
        row_id_series = self.main_df['stock_id'].astype(str) + "-" +self.main_df['time_id'].astype(str)
        targets = self.main_df['target'].tolist()
        self.stock_possible_timeids_list = {}
        for idx, row_id in enumerate(row_id_series.tolist()):
            stock_id = int(row_id.split('-')[0])
            time_id = int(row_id.split('-')[1])
            self.cache_rowid_feature_map[row_id] = {'target':targets[idx], 'stock_id':stock_id,'time_id':time_id,'row_id':row_id}
            
            # below code is to make sure what timeids we expect from stock data extractor
            # in case of missing parquet files we'll have to know the keys to fill default values into
            if stock_id not in self.stock_possible_timeids_list:
                self.stock_possible_timeids_list[stock_id] = []
            self.stock_possible_timeids_list[stock_id].append(time_id)
            
        
        if lazy_load == False:
            worker_data = []
            for gkey, gdf in self.main_df.groupby(['stock_id']):
                worker_data.append((self.data_directory, self.mode, gkey))
#             print("---------- CPU COUNG:", multiprocessing.cpu_count())
            # NOTE: this was hell of a hunt; this windows and pytorch and jupyter combination is too tedious
            #       make sure the function that we distribute don't call pytorch
            chunksize = multiprocessing.cpu_count() * 1
            processed = 0
            for worker_data_chunk in [worker_data[i * chunksize:(i + 1) * chunksize] for i in range((len(worker_data) + chunksize - 1) // chunksize )]:
                with Pool(multiprocessing.cpu_count()) as p:
                    
                    feature_set_list = p.starmap(get_features_map_for_stock, worker_data_chunk)
                    
                    for feature_map in feature_set_list:
                        for rowid, features_dict in feature_map.items():
                            for fkey,fval in features_dict.items():
                                self.cache_rowid_feature_map[rowid][fkey] = fval
                            self.cache_rowid_feature_map[rowid]  = OptiverRealizedVolatilityDataset.transform_to_01_realized_volatility_linear_data(self.cache_rowid_feature_map[rowid])
                        # udpate the indications that we've already fetched this stock and the lazy loader code won't fetch this again
                        self.cache_stocks_done_set.add(int(rowid.split('-')[0]))
                    
                    processed += chunksize
                    print(f"Processed and loaded {processed} stocks features.")
    
    def __cache_generate_features(self, main_stock_id, main_time_id):
            
            main_row_id = get_row_id(main_stock_id, main_time_id)
            if main_stock_id not in self.cache_stocks_done_set:
#                 trade_df = pd.read_parquet(os.path.join(self.data_directory, f"trade_{self.mode}.parquet", f"stock_id={stock_id}"))   
                # we'll combine the featureset with the bigger feature set of all stocks
                feature_map = get_features_map_for_stock(self.data_directory, self.mode, main_stock_id)
                # NOTE: sometime we might now have parquet files in that case we'll have 3 entried in .csv while only 1 gets returned in feature map
                # we need to cover for that disparity
                for time_id in self.stock_possible_timeids_list[main_stock_id]:
                    expected_row_id = get_row_id(main_stock_id, time_id)
                    if expected_row_id not in feature_map:
                        feature_map[expected_row_id] = {}
                for rowid, features_dict in feature_map.items():
                    for fkey,fval in features_dict.items():
                        self.cache_rowid_feature_map[rowid][fkey] = fval
                    self.cache_rowid_feature_map[rowid]  = OptiverRealizedVolatilityDataset.transform_to_01_realized_volatility_linear_data(self.cache_rowid_feature_map[rowid])
                self.cache_stocks_done_set.add(main_stock_id)
#             print(self.cache_rowid_feature_map[main_row_id])
#             print(torch.tensor([self.cache_rowid_feature_map[main_row_id].get('book_realized_volatility',0)]))
#             print(torch.tensor(self.cache_rowid_feature_map[main_row_id].get('log_return1_2s', [0]*(int(600/2)))))
#             print(torch.tensor(self.cache_rowid_feature_map.get('book_directional_volume1_2s', [0]*(int(600/2)))))
            return self.cache_rowid_feature_map[main_row_id]
        
    @staticmethod
    def transform_to_01_realized_volatility_linear_data(features_dict):
        return (
                {
                    'row_id':features_dict['row_id'],
                    'stock_id':torch.tensor(features_dict['stock_id'], dtype=torch.float32),
                    'seconds_in_bucket_xs': torch.tensor(np.nan_to_num(features_dict.get('seconds_in_bucket_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
#                     'book_realized_volatility':torch.tensor([features_dict.get('book_realized_volatility',0)]),
                    # TRADE FEATURES
                    'logrett_xs': torch.tensor(np.nan_to_num(features_dict.get('logrett_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'trade_volume_xs': torch.tensor(np.nan_to_num(features_dict.get('trade_volume_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'trade_ordercount_xs': torch.tensor(np.nan_to_num(features_dict.get('trade_ordercount_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'trade_money_turnover_xs': torch.tensor(np.nan_to_num(features_dict.get('trade_money_turnover_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'trade_money_turnover_per_order_xs': torch.tensor(np.nan_to_num(features_dict.get('trade_money_turnover_per_order_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    
#                     'trade_money_turnover_mean': torch.tensor(np.nan_to_num(features_dict.get('trade_money_turnover_mean', 0)), dtype=torch.float32),
#                     'trade_money_turnover_std': torch.tensor(np.nan_to_num(features_dict.get('trade_money_turnover_std', 0)), dtype=torch.float32),
                    # BOOK FEATURES
                    'logret1_xs': torch.tensor(np.nan_to_num(features_dict.get('logret1_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'logret2_xs': torch.tensor(np.nan_to_num(features_dict.get('logret2_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_directional_volume1_xs': torch.tensor(np.nan_to_num(features_dict.get('book_directional_volume1_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_directional_volume2_xs': torch.tensor(np.nan_to_num(features_dict.get('book_directional_volume2_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_price_spread1_xs': torch.tensor(np.nan_to_num(features_dict.get('book_price_spread1_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_price_spread2_xs': torch.tensor(np.nan_to_num(features_dict.get('book_price_spread2_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_bid_spread_xs': torch.tensor(np.nan_to_num(features_dict.get('book_bid_spread_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_ask_spread_xs': torch.tensor(np.nan_to_num(features_dict.get('book_ask_spread_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_total_volume_xs': torch.tensor(np.nan_to_num(features_dict.get('book_total_volume_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_volume_imbalance_xs': torch.tensor(np.nan_to_num(features_dict.get('book_volume_imbalance_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    'book_money_turnover1_xs': torch.tensor(np.nan_to_num(features_dict.get('book_money_turnover1_xs', [0]*(int(600/data_interval_seconds)))), dtype=torch.float32),
                    
#                     'askp2_1s':torch.tensor(features_dict.get('askp2_1s', [0]*(int(600/1)))),
#                     'book_directional_volume1_1s':torch.tensor(features_dict.get('book_directional_volume1_1s', [0]*(int(600/1)))) 
                },
                torch.tensor([features_dict['target']])
#                 [features_dict['target']]
        )
    
    def __len__(self):
        return len(self.main_df)
    
    def __getitem__(self, idx):
        #TODO: handle for num_workers more than 0
        #      using https://pytorch.org/docs/stable/data.html
        #      using torch.util.data.get_worker_info()
        if torch.is_tensor(idx):
            idx = idx.tolist()
        stock_id = self.main_df.at[idx, 'stock_id']
        time_id = self.main_df.at[idx, 'time_id']
        x,y = self.__cache_generate_features(stock_id,time_id)
#         x, y = self.__transform_to_01_realized_volatility_linear_data(features_dict)
        return x,y

In [4]:
if __name__=="__main__":
    dataset = OptiverRealizedVolatilityDataset(DATA_DIRECTORY, mode="train", lazy_load=False)

INIT: OptiverRealizedVolatilityDataset
Processed and loaded 16 stocks features.
Processed and loaded 32 stocks features.
Processed and loaded 48 stocks features.
Processed and loaded 64 stocks features.
Processed and loaded 80 stocks features.
Processed and loaded 96 stocks features.
Processed and loaded 112 stocks features.


In [None]:
# for x in range(0,9):
#     print(dataset[x])
# dataset[10000] #[0]['bidp1_1s']
for key,val in dataset[10000][0].items():
    print(key)
    print(val)
    input()

In [5]:
dataloader_train = DataLoader(dataset, batch_size=4096,shuffle=True, num_workers=0, pin_memory=True)
sizes = set()
for train_batch_idx, (feature_dict, feature_y) in enumerate(dataloader_train):
    sizes.add(f"{feature_dict['logrett_xs'].size()}")
        
        
#         print(val)
#         input()
#     print(x)
#     input()

In [6]:
sizes

{'torch.Size([2948, 120])', 'torch.Size([4096, 120])'}

### Learnings about model CNN input
- it's better to use multiple channel for logreturn1 and logreturn2 than stacking it and using as one channel
- 2 channels input for CNN is better than stacking it(dim 2, which is logret1_t1, logret2_t1, logret1_t2, logret2_t2...) and using it as one channel

In [7]:
use_cuda = torch.cuda.is_available()
# use_cuda = False
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True
model = None


def loss_fn_mse(y, pred):
    return torch.mean(torch.square((y-pred)))

def loss_fn_mspe(y, pred):
    return torch.mean(torch.square((y-pred)/y))

def loss_fn_orig(y, pred):
    return torch.sqrt(torch.mean(torch.square((y-pred)/y)))

In [8]:
class NeuralNetwork(nn.Module):
    def __init__(self, mode='both', use_stock_id = True):
        super(NeuralNetwork, self).__init__()
        self.use_stock_id = use_stock_id
        self.flatten = nn.Flatten()
        self.mode = mode
        self.cnn_stack = nn.Sequential(
            nn.Conv1d(9, 24, kernel_size=6, stride=2, padding=0),
            nn.GELU(),
#             nn.BatchNorm1d(4),
#             nn.Dropout(0.1),
            nn.Conv1d(24, 32, kernel_size=4, stride=2, padding=0),
            nn.GELU(),
            nn.Conv1d(32, 48, kernel_size=3, stride=1, padding=0),
            nn.GELU(),
            nn.Conv1d(48, 64, kernel_size=2, stride=1, padding=0),
#             nn.ReLU(),
#             nn.BatchNorm1d(8),
#             nn.Conv1d(4, 4, kernel_size=6, stride=3, padding=0),
#             nn.GELU(),
#             nn.Conv1d(4, 4, kernel_size=4, stride=2, padding=0),
#             nn.GELU(),
#             nn.BatchNorm1d(4),
#             nn.Conv1d(4, 4, kernel_size=6, stride=2, padding=0),
#             nn.GELU(),
#             nn.Dropout(0.1),
#             nn.Dropout(0.1),
#             nn.Conv1d(8, 8, kernel_size=4, stride=2, padding=0), 
#             nn.ReLU(),
#             nn.Dropout(0.1),
        )
        self.linear_stack = nn.Sequential(
            nn.Linear(1600, 512),
            nn.GELU(),
#             nn.Dropout(0.3),
            nn.Linear(512, 512),
            nn.GELU(),
            nn.Linear(512, 512),
            nn.GELU(),
            nn.Linear(512, 256),
            nn.GELU(),
#             nn.Dropout(0.3),
#             nn.Linear(256, 64),
#             nn.ReLU(),
#             nn.Linear(64, 16),
#             nn.ReLU()
        )
        self.linear_hybrid = nn.Sequential(
            nn.Linear(256+6, 256),
            nn.GELU(),
            nn.Linear(256, 256),
            nn.GELU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        )
#         self.basic_stack = nn.Sequential(
#             nn.Linear(int(600/2)*1,512),
#             nn.ReLU(),
#             nn.Dropout(0.4),
#             nn.Linear(512,1024),
#             nn.ReLU(),
#             nn.Dropout(0.4),
# #             nn.Linear(2048,1024),
# #             nn.ReLU(),
# #             nn.Dropout(),
#             nn.Linear(1024,512),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(512,128),
#             nn.ReLU(),
#             nn.Dropout(0.2),
#             nn.Linear(128,128),
#             nn.ReLU(),
#             nn.Linear(128,1),
#         )
        
    def forward(self, feature_dict):
#         logits = self.basic_stack(x)
#         x = self.flatten(x)
        x = torch.cat([
#                             feature_dict['logrett_xs'].to(device)*10000, 
#                                torch.log(feature_dict['trade_volume_xs'].to(device)+0.001),
#                               torch.log(feature_dict['trade_ordercount_xs'].to(device)+0.001),
#                             feature_dict['trade_volume_xs'].to(device),
#                                         feature_dict['trade_ordercount_xs'].to(device),
#                                         feature_dict['book_total_volume_xs'].to(device),
#                                         feature_dict['book_volume_imbalance_xs'].to(device)
#                             feature_dict['logrett_xs'].to(device)*10000, 
                               torch.log(feature_dict['trade_volume_xs'].to(device)+0.001),
                              torch.log(feature_dict['trade_ordercount_xs'].to(device)+0.001),
                                feature_dict['logret1_xs'].to(device)*10000,
                            
                                 feature_dict['logret2_xs'].to(device)*10000,
                                    feature_dict['book_price_spread1_xs'].to(device)*10000, 
#                                 feature_dict['book_price_spread2_xs'].to(device)*10000, 
                                feature_dict['book_bid_spread_xs'].to(device)*10000, 
                                feature_dict['book_ask_spread_xs'].to(device)*10000, 
#                                  feature_dict['book_directional_volume1_xs'].to(device),
#                                 feature_dict['book_price_spread1_xs'].to(device)*1000,
                                torch.log(feature_dict['book_total_volume_xs'].to(device)+0.001),
                                torch.log(feature_dict['book_volume_imbalance_xs'].to(device)+0.001),
# #                              feature_dict['book_dirvolume_xs'],
                          ], 1)

#         x = torch.nan_to_num(feature_dict['logrett_xs']).type(torch.cuda.FloatTensor)
        
        
#         print(x)
#         input()
#         if torch.isnan(x).any():
# #             print(x)
#             print(feature_dict)
#             input()
        x = x.to(device)
        x = x.reshape(-1,9,data_intervals_count)
        
        logits = self.cnn_stack(x)
        logits = self.flatten(logits)
        
       
        
        logits = self.linear_stack(logits)
        logits = torch.cat( [logits, 
                             torch.log(feature_dict['trade_money_turnover_mean'].type(torch.cuda.FloatTensor).to(device).reshape(-1,1)+0.001), 
                                           torch.log(feature_dict['trade_money_turnover_std'].type(torch.cuda.FloatTensor).to(device).reshape(-1,1)+0.001),
                                           torch.log(feature_dict['trade_price_mean'].type(torch.cuda.FloatTensor).to(device).reshape(-1,1)+0.001),
                                           torch.log(feature_dict['book_money_turnover_mean'].type(torch.cuda.FloatTensor).to(device).reshape(-1,1)+0.001),
                                           torch.log(feature_dict['book_money_turnover_std'].type(torch.cuda.FloatTensor).to(device).reshape(-1,1)+0.001),
                                           torch.log(feature_dict['book_price_mean'].type(torch.cuda.FloatTensor).to(device).reshape(-1,1)+0.001)
                                      ], 1)
        
        if self.use_stock_id:
            stock_id = torch.tensor(feature_dict['stock_id']).reshape(-1,1)
            stock_id = stock_id.to(device)
            logits = torch.cat([logits, stock_id], 1)
            
        logits = self.linear_hybrid(logits)
        return logits





In [127]:
# class VolatilityGRU(nn.Module):
#     def __init__(self, input_size=1, hidden_size=64, repeated_cells=1):
#         self.input_size = input_size
#         self.hidden_size = hidden_size

class SingleFetGRU(nn.Module):
    def __init__(self, hidden_size=64, layers=1, dropout=0, features_out=32, mode="train"):
        """single feature, feature learner
        `mode`: train|feature_generator
        """
        super(SingleFetGRU, self).__init__()
        self.input_size_ = 1
        self.hidden_size_ = hidden_size
        self.repeated_lstm_cells_ = layers
        self.dropout_ = dropout
        self.features_out = features_out
        
        self.rnn_ = nn.GRU(self.input_size_, self.hidden_size_, self.repeated_lstm_cells_, batch_first=True, dropout=self.dropout_)
        
        self.linear_feature_stack_ = nn.Sequential(
            nn.Linear(self.hidden_size_*self.repeated_lstm_cells_, 128),
            nn.GELU(),
            nn.Linear(128, 128),
            nn.GELU(),
            nn.Linear(128, self.features_out),
        )
        
        self.linear_trainer_stack_ = nn.Sequential(
            nn.Linear(self.features_out, 128),
            nn.GELU(),
            nn.Linear(128, 64),
            nn.GELU(),
            nn.Linear(64, 32),
            nn.GELU(),
            nn.Linear(32, 1),   
        )
        
    def set_mode(self, mode):
        self.mode = mode
        
    def forward(self, feature_tensor):
        if self.mode in ["feature_generator","train"]:
            h_0_ = torch.rand(self.repeated_lstm_cells_, feature_tensor.size(0), self.hidden_size_, device=device) #hidden state
            output_, hn_ = self.rnn_(feature_tensor, h_0_) #lstm with input, hidden, and internal state
            hn_ = hn_.reshape(-1, self.hidden_size_*self.repeated_lstm_cells_) #reshaping the data for Dense layer next  
            
            out_ = self.linear_feature_stack_(hn_)
            
            if self.mode == "train":
                out_ = self.linear_trainer_stack_(out_)
            
            return out_
            
            
            
class VolatilityBSModel(nn.Module):
    def __init__(self, mode="hybrid", use_stock_id=False):
        """various rnn features' fusion with fully connected nn
        `mode`: hybrid|<feature_name>
        """
        super(VolatilityBSModel, self).__init__()
        self.mode = mode
        self.use_stock_id = use_stock_id
        self.feature_list = ['logrett_xs','trade_volume_xs','trade_ordercount_xs','trade_money_turnover_xs','trade_money_turnover_per_order_xs',
                             'logret1_xs','logret2_xs','book_directional_volume1_xs','book_directional_volume2_xs',
                             'book_price_spread1_xs','book_price_spread2_xs','book_bid_spread_xs','book_ask_spread_xs',
                             'book_total_volume_xs','book_volume_imbalance_xs','book_money_turnover1_xs']
        self.feature_gen_feature_size = 32
        self.feature_gen_models = {}
        for k in self.feature_list:
            self.feature_gen_models[k]=SingleFetGRU(hidden_size=64, layers=1, dropout=0, features_out=self.feature_gen_feature_size) 
            self.feature_gen_models[k].to(device)
        
        
        self.linear_fusion = nn.Sequential(
            nn.Linear(self.feature_gen_feature_size*len(self.feature_list) + (1 if self.use_stock_id else 0), 512),
            nn.GELU(),
            nn.Linear(512,512),
            nn.GELU(),
            nn.Linear(512,256),
            nn.GELU(),
            nn.Linear(256,128),
            nn.GELU(),
            nn.Linear(128,128),
            nn.ReLU(),
            nn.Linear(128,1)
        )
        self.set_mode(self.mode)
    
    def get_feature_gen_train_modes(self):
        return self.feature_list
    
    def set_mode(self, mode):
        print(f"------- set mode : {mode} -----------")
        self.mode = mode
        for feature_gen_model in self.feature_gen_models.values():
            feature_gen_model.set_mode('feature_generator' if self.mode in ['hybrid','hybrid_feature_out'] else 'train')
    
    def parameters(self):
        
        generator_sources_map = {k:[v] for k,v in self.feature_gen_models.items()}
        generator_sources_map['hybrid']= [self.linear_fusion]
        
        params = []
        if self.mode in generator_sources_map:
            for generator_source in generator_sources_map[self.mode]:
                for param in generator_source.parameters():
                    params.append(param)
        else:
            return super(VolatilityBSModel,self).parameters()
        return params
    
    def feature_transform(self, feature_x, feature_name):
        if feature_name in ['logrett_xs',
                             'logret1_xs','logret2_xs',
                             'book_price_spread1_xs','book_price_spread2_xs','book_bid_spread_xs','book_ask_spread_xs']:
            return feature_x * 10000
        if feature_name in ['trade_volume_xs','trade_money_turnover_xs','trade_money_turnover_per_order_xs',
                             'book_total_volume_xs','book_volume_imbalance_xs','book_money_turnover1_xs']:
            return torch.log(feature_x + 0.001)
        return feature_x
    
    def forward(self, feature_dict):
        
        if self.mode in self.feature_list:
            feature_x = feature_dict[self.mode].to(device).reshape(-1, data_intervals_count ,1)
            feature_x = self.feature_transform(feature_x, self.mode)
            out = self.feature_gen_models[self.mode](feature_x)
            return out
        
        if self.mode in ['hybrid','hybrid_feature_out']:
            generated_features = []
            for feature_name, feature_gen_model in self.feature_gen_models.items():
                feature_x = feature_dict[feature_name].to(device).reshape(-1, data_intervals_count ,1)
                feature_x = self.feature_transform(feature_x, feature_name)
                features_out = feature_gen_model(feature_x)
                generated_features.append(features_out)
                
                
            combined_features = torch.cat(generated_features, 1).reshape(-1, self.feature_gen_feature_size*len(self.feature_list))
            
            if self.use_stock_id:
                stock_id = feature_dict['stock_id'].to(device).reshape(-1,1)
                combined_features = torch.cat([combined_features, stock_id], 1)
                
            if self.mode == 'hybrid_feature_out':
                return combined_features
            
            out = self.linear_fusion(combined_features)
            return out
        
#         input("--- out got")
        


        

In [10]:
# model = VolatilityBSModel(use_stock_id=use_stock_id)
# # model = NeuralNetwork(use_stock_id=False)
# model.to(device)
# adam_for_modes = {}

# for modeidx, mode in enumerate(['yoyo','trade','experiment','book','hybrid']*1):
#     epochs = 1
#     model.mode = mode
#     print(model.parameters())
#     input()

In [11]:
# model = VolatilityBSModel()

In [81]:
# for param in model.parameters():
#     print(param)

In [None]:
for layer in model.children():
    print()
    print()
    print(layer)
    print("-------")
    for l in layer.children():
        print([x for x in l.parameters()])

#### analyze the initial weights (or change them)

In [86]:
# # @torch.no_grad()
# def init_weights(m):
# #     print(m)
#     if type(m) == nn.Linear:
# #         m.weight.fill_(1.0)
#         torch.nn.init.xavier_uniform_(m.weight,gain=10)
#         m.bias.data.uniform_(-1,1)
# #     elif type(m) == nn.ReLU:
# #         print(m.data)
#     else:
#         print(type(m))
# #         print(m.weight)
# model.apply(init_weights)
# # for param in model.parameters():
# # #     print(param)
# #       print(param.data.size(), param.data)

### LEarning rate: our base line is 0.34 loss as that's what the optiver guys have when they use current 10 min realize vol and use it as target (copy to prediction). We create simplest neural network and work with learning rates to figure out what's best and when we see something in range of 0.35 then we've found good Learning rate
- #### SGD: 1e-7 works best
- #### ADAM: 1e-5, (NOTE: 1e-3 makes it behave dumb where some deep local minima gets stuck and produces constant output!)
- TODO: analyze that constant output phenomenon more

In [87]:
# learning_rate = 1e-4
# batch_size = 4096
# epochs = 100

# input_scaling = 1
# output_scaling = 1

# # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-8)
# strategyname = "ret1_n_ret2"
# summary_writer = SummaryWriter(f'../output/training_tensorboard/{strategyname}_scaleIn{input_scaling}Out{output_scaling}_{learning_rate}_{batch_size}')

In [96]:
model = None

### Learnings about training
- (non scaling)logreturns input and volatility output; non scaled makes the model predict constant output with no variety(close to 0 std dev)
- scaling input rids of variety issue, 
- scaling output makes the model start with low rmse initially so there's less ground to cover and we can iterate over ideas rapidly due to less epochs needed to achieve

In [126]:
# model = None
strategyname = "ultimate_hybridEXP3_RNN_5s"


print("DEVICE:", device)
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
test_size = dataset_size - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    
training_configs = []
learning_rates_to_try = [1e-3]# 1e-4]
batch_sizes_to_try = [64]#, 512]#,10000, 128]
# input_scalings_to_try = [1000]
# output_scalings_to_try = [10000]
output_scaling = 10000
for learning_rate in learning_rates_to_try:
    for batch_size in batch_sizes_to_try:
        for use_stock_id in [False, True]:
            training_configs.append({
                'learning_rate':learning_rate,
                'batch_size':batch_size,
                'use_stock_id': use_stock_id
            })

epochs = 200
for training_config in training_configs:
    
    learning_rate = training_config['learning_rate']
    batch_size = training_config['batch_size']
    use_stock_id = training_config['use_stock_id']
    # TRAINING SETUP
    
    #refresh the model
    
    STRATEGY_NAME_WITH_ATTRS = f"{strategyname}_Stk{use_stock_id}_{learning_rate}_{batch_size}"
    summary_writer = SummaryWriter(f'../output/training_tensorboard/{STRATEGY_NAME_WITH_ATTRS}')

    del model
    torch.cuda.empty_cache()
    model = VolatilityBSModel(use_stock_id=use_stock_id)
#     model = NeuralNetwork(use_stock_id=False)
    model.to(device)
    optimizer_for_modes = {}
    
    
    
    for modeidx, mode in enumerate(['hybrid'] + model.get_feature_gen_train_modes() + ['hybrid']):
        epochs = 1
        model.set_mode(mode)
#         print(model.parameters())
#         input()
#         continue
        if mode not in optimizer_for_modes:
            optimizer_for_modes[mode] = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-8)
#             optimizer_for_modes[mode] = torch.optim.SGD(model.parameters(), lr=learning_rate)
        optimizer = optimizer_for_modes[mode]

        
        # TRAINING SETUP DONE

        

        data_ohlc_sample_len = 1 # 1 for each of open high low close
        losses_train = []
        for t in range(epochs):
            t = modeidx*epochs + t
            print(f"Epoch {t+1}\n-------------------------------")
            print("----------", STRATEGY_NAME_WITH_ATTRS, mode,"----------")

            dataloader_train = DataLoader(train_dataset, batch_size=batch_size,
                                shuffle=True, num_workers=0, pin_memory=True)
#             model.train()

            for train_batch_idx, (Feature_X, feature_y) in enumerate(dataloader_train):

                y = feature_y.to(device) * output_scaling 

                pred = model(Feature_X)
#                 pred.to(device)
#                 print(pred)
#                 input()
                loss_orig = loss_fn_orig(y, pred)
                
                optimizer.zero_grad()
                loss_orig.backward()
                optimizer.step()


                losses_train.append(loss_orig.item())

                if (t*int(train_size/batch_size) + train_batch_idx + 1) % int(train_size/20/batch_size) == 0:

                    # NOTE: real loss is same as upscaled normalized loss as it's percentage loss (rmspe)
                    prediction_variety = np.std((pred/output_scaling).reshape(-1).tolist()) * 100
                    #NOTE: prediction variety is important as model sometimes predits a constant value! regardless of the input, then per batch variety is lowest(0 std dev)


                    summary_writer.add_scalar("Prediction Variety", prediction_variety, t*(train_size) + (train_batch_idx*batch_size))
                    summary_writer.add_scalar("Training Loss", np.mean(losses_train), t*(train_size) + (train_batch_idx*batch_size))

                    print("train:", np.mean(losses_train), f"[{train_batch_idx*batch_size:>5d}/{train_size:>5d}]")
                    losses_train = []

            dataloader_test = DataLoader(test_dataset, batch_size=batch_size,
                                    shuffle=True, num_workers=0, pin_memory=True)
            dataset_size = len(dataloader_test.dataset)
            
#             model.eval()

            losses_test = []
            for _, (Feature_X, feature_y) in enumerate(dataloader_test):
                with torch.no_grad():
                    y = feature_y.to(device) * output_scaling
                    pred = model(Feature_X)
                    loss = loss_fn_orig(y, pred)
                    losses_test.append(loss.item())


    #                 summary_writer.add_scalar("Epoch Training Loss", np.mean(losses_train), (t+1)*train_size)
            summary_writer.add_scalar("Test Loss", np.mean(losses_test), t*(train_size) + (train_batch_idx*batch_size))
            print("train:", np.mean(losses_train), "test:", np.mean(losses_test), f"[{train_batch_idx*batch_size:>5d}/{train_size:>5d}]")
            losses_test = []
            if (t+1)%50==0:
                torch.save(model.state_dict(), os.path.join(MODEL_OUTPUT_DIRECTORY,f"{STRATEGY_NAME_WITH_ATTRS}_epoch_{t}_tloss_{loss:.4f}.pth"))
    
            

DEVICE: cuda:0
------- set mode : hybrid -----------
------- set mode : hybrid -----------
Epoch 1
-------------------------------
---------- ultimate_hybridEXP3_RNN_5s_StkFalse_0.001_64 hybrid ----------
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 32])
torch.Size([64, 512])


 


RuntimeError: Function AddmmBackward returned an invalid gradient at index 1 - got [64, 128] but expected shape compatible with [64, 512]

In [19]:
torch.save(model.state_dict(), os.path.join(MODEL_OUTPUT_DIRECTORY,f"11_{STRATEGY_NAME_WITH_ATTRS}_epoch_{t}_tloss_{np.mean(losses_test):.4f}.pth"))

In [114]:
del model
torch.cuda.empty_cache()
import gc
gc.collect()

1633

In [65]:
def optiver_custom_collate_func(batch):
    output_x = {}
    for k,v in batch[0][0].items():
        output_x[k] = []
    
    for x_dict in [x[0] for x in batch]:
        for k,v in x_dict.items():
            output_x[k].append(v)
    
    for k,v in batch[0][0].items():
        if type(output_x[k][0]) != str:
            output_x[k] = torch.stack(output_x[k])
        
    output_y = []
    for y in [x[1] for x in batch]:
        output_y.append(y)
    output_y = torch.stack(output_y)
    
    return (output_x, output_y)
#     input()
#     print(batch)
# #     return batch
#     input()
#     return batch

In [70]:
stime = time.time()

    
dataloader_train = DataLoader(dataset, batch_size=3,
                                shuffle=False, num_workers=0, pin_memory=False)#, collate_fn=optiver_custom_collate_func)
# #             model.train()
# i = 0
# stockid = set()
for train_batch_idx, (Feature_X, feature_y) in enumerate(dataloader_train):
    i += 1
#     print(feature_y)
#     input()
# batch = []
# for idx in range(len(dataset)):
#     batch.append(dataset[idx])
#     if idx % 128 == 0:
#         features_x = [x[0] for x in batch]
#         features_y = [x[1] for x in batch]
#         features_y = torch.tensor(features_y).reshape(-1,1)
# #         print(features_y)
# #         input()
#         batch = []
    
#     y = feature_y.to(device) * output_scaling 
#     print(Feature_X['logret1_xs'].type())
#     pred = model(Feature_X)
#     print(pred.type())
#     input()
#     for stk in Feature_X['row_id']:
        
#         stockid.add(stk.split("-")[0])
# for i in range(len(dataset)-10):
#     dataset[i]
print("-->", (time.time()-stime))

--> 27.83697199821472


In [None]:
torch.cuda.close()

In [None]:
torch.cuda.memory_allocated(device)/1024/1024/1024
# model.to("cpu")
# torch.cuda.memory_stats()

In [None]:
torch.cuda.init()