In [1]:
import os
import time
import multiprocessing
from multiprocessing import Pool
import types

import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter

# from optiver_features_handler import get_features_map_for_stock, get_row_id, realized_volatility

In [2]:
# Root folder of the competition input data, relative to the notebook.
DATA_DIRECTORY = os.path.join("..", "input", "optiver-realized-volatility-prediction")

# Partitioned parquet folders: trade/book data for the train/test splits.
(
    TRADE_TRAIN_DIRECTORY,
    TRADE_TEST_DIRECTORY,
    BOOK_TRAIN_DIRECTORY,
    BOOK_TEST_DIRECTORY,
) = (
    os.path.join(DATA_DIRECTORY, f"{parquet_name}.parquet")
    for parquet_name in ("trade_train", "trade_test", "book_train", "book_test")
)

# Folder for generated artifacts; created up front so later cells can write to it.
OUTPUT_DIRECTORY = os.path.join("..", "output")
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)

In [3]:

def get_row_id(stock_id, time_id):
    """Build the `<stock_id>-<time_id>` key used to identify one stock/time bucket.

    `stock_id` is rendered with zero decimals (so `5.0` becomes `5`) and
    `time_id` is coerced to a plain `int` before formatting.
    """
    # int() is a no-op for values that are already ints, so it can be applied unconditionally.
    return f"{stock_id:.0f}-{int(time_id)}"
    

def realized_volatility(series):
    """Return the square root of the sum of squared values in `series`.

    Used as a groupby aggregation over log returns; the function's `__name__`
    ends up in the aggregated column names, so it must not be renamed.
    """
    squared_values = series ** 2
    return np.sqrt(np.sum(squared_values))



def _store_window_aggregates(frame, aggregation_spec, stock_id, feature_map, suffix=''):
    """Aggregate `frame` per `time_id` and write the scalar results into `feature_map`.

    `aggregation_spec` maps a column name to a list of aggregations (strings or
    functions). Each result is stored under `feature_map[row_id]` as
    `<column>_<aggregation><suffix>`; missing values are replaced by -0.01.
    """
    aggregated = frame.groupby('time_id').agg(aggregation_spec).reset_index(drop=False)
    aggregated = aggregated.fillna(-0.01)
    # Flatten the (column, aggregation) MultiIndex; `time_id` becomes `time_id_`.
    aggregated.columns = ['_'.join(col).strip() for col in aggregated.columns.values]
    for _, row in aggregated.iterrows():
        row = row.to_dict()
        rowid = get_row_id(stock_id, row['time_id_'])
        row_features = feature_map.setdefault(rowid, {})
        for key, aggs in aggregation_spec.items():
            for agg in aggs:
                # Function aggregations appear in the column name under their __name__.
                if isinstance(agg, types.FunctionType):
                    agg = agg.__name__
                row_features[f'{key}_{agg}{suffix}'] = row[f'{key}_{agg}']


def get_features_map_for_stock(data_directory, mode, main_stock_id):
    """Build the per-row feature map for one stock.

    `data_directory`: folder containing the `book_<mode>.parquet` and
        `trade_<mode>.parquet` folders.
    `mode`: train|test
    `main_stock_id`: id of the stock whose `stock_id=<id>` partition is read.

    Returns a dict keyed by row id (`<stock_id>-<time_id>`, see `get_row_id`).
    Each value is a dict holding:
      * `<column>_<aggregation>`: scalar aggregates over the whole bucket,
      * `<column>_<aggregation>_400`: the same aggregates restricted to
        `seconds_in_bucket >= 400`,
      * `<name>_xs`: per-interval sequences (6-second intervals, 100 entries)
        padded with zeros, plus `sequence_mask_xs` which is True on padding.
    """
    interval_second = 6
    intervals_count = 600 // interval_second

    feature_map = {}
    book_df = pd.read_parquet(os.path.join(data_directory, f"book_{mode}.parquet", f"stock_id={main_stock_id}"))
    trade_df = pd.read_parquet(os.path.join(data_directory, f"trade_{mode}.parquet", f"stock_id={main_stock_id}"))

    # --- order book features -------------------------------------------------
    # NOTE(review): every `.diff()` below runs over the whole stock frame, so the
    # first row of a time_id is differenced against the last row of the previous
    # one. Left as is; confirm whether a per-time_id diff was intended.
    book_df['wap1'] = (book_df['bid_price1'] * book_df['ask_size1'] + book_df['ask_price1'] * book_df['bid_size1']) / (book_df['bid_size1'] + book_df['ask_size1'])
    book_df['logret1'] = np.log(book_df['wap1']).diff()
    book_df['wap2'] = (book_df['bid_price2'] * book_df['ask_size2'] + book_df['ask_price2'] * book_df['bid_size2']) / (book_df['bid_size2'] + book_df['ask_size2'])
    book_df['logret2'] = np.log(book_df['wap2']).diff()
    book_df['wap_balance'] = (book_df['wap1'] - book_df['wap2']).abs()
    for price_column in ('bid_price1', 'ask_price1', 'bid_price2', 'ask_price2'):
        book_df[f'logret_{price_column}'] = np.log(book_df[price_column]).diff()

    book_df['price_spread1'] = (book_df['ask_price1'] - book_df['bid_price1']) / ((book_df['ask_price1'] + book_df['bid_price1']) / 2)
    book_df['bid_spread'] = (book_df['bid_price1'] - book_df['bid_price2']).abs() / ((book_df['bid_price1'] + book_df['bid_price2']) / 2)
    book_df['ask_spread'] = (book_df['ask_price1'] - book_df['ask_price2']).abs() / ((book_df['ask_price1'] + book_df['ask_price2']) / 2)
    book_df['bid_ask_spread'] = (book_df['bid_spread'] - book_df['ask_spread']).abs()
    book_df['directional_volume1'] = book_df['bid_size1'] - book_df['ask_size1']
    book_df['directional_volume2'] = book_df['bid_size2'] - book_df['ask_size2']
    # Shift by the minimum (+1) so the logarithm only sees strictly positive values.
    book_df['logret_directional_volume1'] = np.log(book_df['directional_volume1'] - book_df['directional_volume1'].min() + 1).diff()
    book_df['logret_directional_volume2'] = np.log(book_df['directional_volume2'] - book_df['directional_volume2'].min() + 1).diff()

    book_df['total_volume'] = book_df['ask_size1'] + book_df['bid_size1'] + book_df['ask_size2'] + book_df['bid_size2']
    book_df['volume_imbalance'] = (book_df['ask_size1'] - book_df['bid_size1'] + book_df['ask_size2'] - book_df['bid_size2']).abs()

    # --- trade features ------------------------------------------------------
    trade_df['trade_money_turnover'] = trade_df['size'] * trade_df['price']
    trade_df['logret_trade_money_turnover'] = np.log(trade_df['trade_money_turnover']).diff()
    trade_df['logret_price'] = np.log(trade_df['price']).diff()

    # Left join: every book snapshot is kept, trade columns are NaN where no trade happened.
    merged_df = book_df.merge(trade_df, how='left', on=['time_id', 'seconds_in_bucket']).reset_index(drop=False)

    merged_df['nwap1'] = (merged_df['ask_price1'] + merged_df['bid_price1']) / 2
    # NOTE(review): evaluates as ((price - nwap1) / (price + nwap1)) / 2. If the
    # mean of the two prices was meant as the denominator, parentheses are
    # missing. Left unchanged so existing feature values stay the same.
    merged_df['trade_price_push_on_book'] = (merged_df['price'] - merged_df['nwap1']) / (merged_df['price'] + merged_df['nwap1']) / 2

    del book_df, trade_df

    # --- scalar aggregates per time_id ----------------------------------------
    overview_aggregations = {
        'wap1': ['sum', 'std'],
        'wap2': ['sum', 'std'],
        'logret1': [realized_volatility],
        'logret2': [realized_volatility],
        'logret_price': [realized_volatility],
        'wap_balance': ['sum', 'max'],
        'price_spread1': ['sum', 'max'],
        'bid_spread': ['sum', 'max'],
        'ask_spread': ['sum', 'max'],
        'total_volume': ['sum', 'max'],
        'volume_imbalance': ['sum', 'max'],
        'bid_ask_spread': ['sum', 'max'],
        'size': ['sum', 'max', 'min'],
        'order_count': ['sum', 'max'],
        'trade_money_turnover': ['sum', 'max', 'min'],
    }
    # Whole bucket first, then only the tail of the bucket (seconds 400 and later).
    _store_window_aggregates(merged_df, overview_aggregations, main_stock_id, feature_map)
    _store_window_aggregates(
        merged_df[merged_df['seconds_in_bucket'] >= 400],
        overview_aggregations, main_stock_id, feature_map, suffix='_400',
    )

    # --- per-interval sequences ------------------------------------------------
    sequence_feature_names = [
        'logret1', 'logret_bid_price1', 'logret_ask_price1', 'logret_bid_price2', 'logret_ask_price2',
        'price_spread1', 'bid_spread', 'ask_spread',
        'logret_directional_volume1', 'logret_directional_volume2',
        'logret_trade_money_turnover', 'trade_price_push_on_book', 'logret_price', 'order_count',
    ]
    # These features use 0.0 instead of -0.01 as the replacement for NaN/inf.
    zero_filled_feature_names = [
        'logret_bid_price1', 'logret_ask_price1', 'logret_bid_price2', 'logret_ask_price2',
        'trade_price_push_on_book', 'logret_price',
    ]

    # Collapse seconds into interval indices and sum every column per interval.
    merged_df['seconds_in_bucket'] = merged_df['seconds_in_bucket'] // interval_second
    merge_prepared_df = merged_df.groupby(['time_id', 'seconds_in_bucket']).agg('sum').reset_index(drop=False)
    del merged_df

    for time_id, groupdf in merge_prepared_df.groupby('time_id'):
        rowid = get_row_id(main_stock_id, time_id)
        row_features = feature_map.setdefault(rowid, {})

        sequence_length = len(groupdf)
        padding = [0] * (intervals_count - sequence_length)

        # Computed as a local array instead of a new column on `groupdf`, which
        # would assign into a frame produced by groupby iteration.
        # NOTE(review): `price` has already been summed per interval above; with
        # pandas' default sum an all-NaN group becomes 0, so this flag looks like
        # it is always 1 for present intervals — confirm the intent.
        has_trade_data = (~groupdf['price'].isnull()).astype(int).to_numpy()

        # Transformer-style mask: True marks padded positions to be ignored.
        row_features['sequence_mask_xs'] = [False] * sequence_length + [True] * (intervals_count - sequence_length)
        row_features['has_trade_data_xs'] = np.concatenate([has_trade_data, padding])
        row_features['seconds_in_bucket_xs'] = np.concatenate([groupdf['seconds_in_bucket'].to_numpy(), padding])

        for feature_name in sequence_feature_names:
            nan_replace_val = 0.0 if feature_name in zero_filled_feature_names else -0.01
            cleaned_values = np.nan_to_num(
                groupdf[feature_name].to_numpy(),
                nan=nan_replace_val, neginf=nan_replace_val, posinf=nan_replace_val,
            )
            row_features[f'{feature_name}_xs'] = np.concatenate([cleaned_values, padding])

    del merge_prepared_df
    return feature_map
    

    

In [4]:
# Width, in seconds, of one step of the per-interval feature sequences.
data_interval_seconds = 6
# Number of such steps in a 600-second bucket.
data_intervals_count = int(600 / data_interval_seconds)

def transform_to_01_realized_volatility_linear_data(features_dict):
    """Convert one row's raw feature dict into `(inputs, targets)` dicts of tensors.

    `features_dict` must contain `row_id`, `stock_id` and
    `target_realized_volatility`. Every other feature is optional and falls
    back to a default: -0.01 for scalar aggregates and sequence features, a
    mask that keeps only the first step, and step indices 0..N-1 for
    `seconds_in_bucket_xs`.

    Returns a tuple `(feature_x, feature_y)` where `feature_x` maps feature
    names to float32 tensors (`sequence_mask_xs` is bool, `row_id` is passed
    through unchanged) and `feature_y` holds `target_realized_volatility` as a
    one-element tensor.
    """
    steps = int(600/data_interval_seconds)

    def as_float_tensor(value):
        return torch.tensor(value, dtype=torch.float32)

    default_mask = [False] + [True]*(steps - 1)
    default_step_indices = list(range(0, int(data_intervals_count)))
    default_sequence = [-0.01]*steps

    feature_x = {
        'row_id': features_dict['row_id'],
        'stock_id': as_float_tensor(features_dict['stock_id']),
        'sequence_mask_xs': torch.tensor(features_dict.get('sequence_mask_xs', default_mask), dtype=torch.bool),
        'seconds_in_bucket_xs': as_float_tensor(features_dict.get('seconds_in_bucket_xs', default_step_indices)),
        'has_trade_data_xs': as_float_tensor(features_dict.get('has_trade_data_xs', [0]*steps)),
    }

    # Scalar aggregates: (column, aggregations) pairs; functions are named by __name__.
    overview_aggregations = (
        ('wap1', ('sum', 'std')),
        ('wap2', ('sum', 'std')),
        ('logret1', (realized_volatility,)),
        ('logret2', (realized_volatility,)),
        ('logret_price', (realized_volatility,)),
        ('wap_balance', ('sum', 'max')),
        ('price_spread1', ('sum', 'max')),
        ('bid_spread', ('sum', 'max')),
        ('ask_spread', ('sum', 'max')),
        ('total_volume', ('sum', 'max')),
        ('volume_imbalance', ('sum', 'max')),
        ('bid_ask_spread', ('sum', 'max')),
        ('size', ('sum', 'max', 'min')),
        ('order_count', ('sum', 'max')),
        ('trade_money_turnover', ('sum', 'max', 'min')),
    )
    for column, aggregations in overview_aggregations:
        for aggregation in aggregations:
            aggregation_name = aggregation.__name__ if isinstance(aggregation, types.FunctionType) else aggregation
            # Whole-bucket value followed by its `_400` (late window) counterpart.
            for scalar_name in (f'{column}_{aggregation_name}', f'{column}_{aggregation_name}_400'):
                feature_x[scalar_name] = as_float_tensor(features_dict.get(scalar_name, -0.01))

    sequence_feature_names = (
        'logret1', 'logret_bid_price1', 'logret_ask_price1', 'logret_bid_price2', 'logret_ask_price2',
        'price_spread1', 'bid_spread', 'ask_spread',
        'logret_directional_volume1', 'logret_directional_volume2',
        'logret_trade_money_turnover', 'trade_price_push_on_book', 'logret_price', 'order_count',
    )
    for feature_name in sequence_feature_names:
        sequence_name = f'{feature_name}_xs'
        feature_x[sequence_name] = as_float_tensor(features_dict.get(sequence_name, default_sequence))

    feature_y = {'target_realized_volatility': torch.tensor([features_dict['target_realized_volatility']])}
    return (feature_x, feature_y)

In [5]:
def iterate_optiver_dataset_for_evaluation(mode):
    """Yield `(x, y)` feature dicts, one row of `<mode>.csv` at a time, stock by stock.

    `mode`: train|test. In test mode the target column is filled with 0.

    For every stock the rows listed in the csv are combined with the features
    returned by `get_features_map_for_stock`, converted with
    `transform_to_01_realized_volatility_linear_data`, and given a leading
    batch dimension of 1: tensors are reshaped to `(1, -1)` and other values
    are wrapped in a one-element list.

    Raises `ValueError` (on first iteration) when `mode` is not train|test.
    """
    if mode not in ['train', 'test']:
        raise ValueError("Invalid mode passed for Optiver dataset. Valid values:train|test")

    main_df = pd.read_csv(os.path.join(DATA_DIRECTORY, f'{mode}.csv'))
    if mode == 'test':
        main_df['target'] = 0
    main_df['row_id'] = main_df['stock_id'].astype(str) + "-" + main_df['time_id'].astype(str)

    # Group by the column name (not a one-element list) so the key is a scalar
    # on every pandas version.
    for stock_id, stock_df in main_df.groupby('stock_id'):
        stock_id = int(stock_id)

        # Targets are taken from this stock's own rows; indexing a list built
        # from the whole file with a per-stock position would pick the wrong rows.
        rows_by_id = {}
        for row_id, time_id, target in zip(
            stock_df['row_id'].tolist(),
            stock_df['time_id'].tolist(),
            stock_df['target'].tolist(),
        ):
            rows_by_id[row_id] = {
                'target_realized_volatility': target,
                'stock_id': stock_id,
                'time_id': int(time_id),
                'row_id': row_id,
            }

        stock_features = get_features_map_for_stock(DATA_DIRECTORY, mode, stock_id)
        for rowid, features_dict in stock_features.items():
            rows_by_id[rowid].update(features_dict)

        for datadict in rows_by_id.values():
            x, y = transform_to_01_realized_volatility_linear_data(datadict)

            # Add a batch dimension of 1 so each row looks like a single-item batch.
            for fedict in (x, y):
                for k, v in fedict.items():
                    fedict[k] = v.reshape(1, -1) if torch.is_tensor(v) else [v]

            yield x, y
   
            

In [6]:
# for feature_x, feature_y in iterate_optiver_dataset_for_evaluation('test'):
#     for k,v in feature_x.items():
#         print(k)
#         input(v)
#     print(feature_y)
#     input()

In [7]:
# Run on the first GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

# Let cuDNN benchmark its algorithms for the shapes it sees.
torch.backends.cudnn.benchmark = True

# Placeholder; the model is assigned in a later cell.
model = None


def loss_fn_mse(y, pred):
    """Mean squared error between targets `y` and predictions `pred`."""
    residual = y - pred
    return torch.mean(torch.square(residual))

def loss_fn_mspe(y, pred):
    """Mean squared percentage error: mean of ((y - pred) / y) squared."""
    relative_error = (y - pred) / y
    return torch.mean(torch.square(relative_error))

def loss_fn_orig(y, pred):
    """Root mean squared percentage error: sqrt of the mean of ((y - pred) / y) squared."""
    relative_error = (y - pred) / y
    mean_squared_relative_error = torch.mean(torch.square(relative_error))
    return torch.sqrt(mean_squared_relative_error)

In [8]:
# Multiplier for realized-volatility values.
# NOTE(review): only referenced from commented-out code in this part of the notebook — confirm it is still needed.
realize_volatility_scale_factor = 1_000
# Per-feature mean / standard deviation used for standard scaling.
# Built once at definition time instead of on every call.
_STANDARD_SCALING_FEATURE_MAP = {
    'stock_id': {'mean': 62.43794250488281, 'std': 37.12644958496094},
    'seconds_in_bucket_xs': {'mean': 46.53627014160156, 'std': 30.269084930419922},
    'has_trade_data_xs': {'mean': 0.9432891011238098, 'std': 0.23128946125507355},
    'wap1_sum': {'mean': 389.92791748046875, 'std': 135.835205078125},
    'wap1_sum_400': {'mean': 128.18423461914062, 'std': 47.224849700927734},
    'wap1_std': {'mean': 0.0011102678254246712, 'std': 0.0010516541078686714},
    'wap1_std_400': {'mean': 0.0006360253901220858, 'std': 0.0005713916034437716},
    'wap2_sum': {'mean': 389.9277038574219, 'std': 135.83547973632812},
    'wap2_sum_400': {'mean': 128.1841583251953, 'std': 47.2249641418457},
    'wap2_std': {'mean': 0.0011489872122183442, 'std': 0.0010650980984792113},
    'wap2_std_400': {'mean': 0.0006859369459562004, 'std': 0.0005950198974460363},
    'logret1_realized_volatility': {'mean': 0.005850940477102995, 'std': 0.004778958857059479},
    'logret1_realized_volatility_400': {'mean': 0.0022959925699979067, 'std': 0.0019109743880107999},
    'logret2_realized_volatility': {'mean': 0.007210279814898968, 'std': 0.005797804333269596},
    'logret2_realized_volatility_400': {'mean': 0.003152207238599658, 'std': 0.0026645385660231113},
    'logret_price_realized_volatility': {'mean': 0.004610072355717421, 'std': 0.004089121241122484},
    'logret_price_realized_volatility_400': {'mean': 0.0014677889412268996, 'std': 0.0011549298651516438},
    'wap_balance_sum': {'mean': 0.09234429150819778, 'std': 0.08365236222743988},
    'wap_balance_sum_400': {'mean': 0.029098447412252426, 'std': 0.026479562744498253},
    'wap_balance_max': {'mean': 0.0010880399495363235, 'std': 0.0012415354140102863},
    'wap_balance_max_400': {'mean': 0.0008600918226875365, 'std': 0.000884830835275352},
    'price_spread1_sum': {'mean': 0.2226376235485077, 'std': 0.20077848434448242},
    'price_spread1_sum_400': {'mean': 0.07035906612873077, 'std': 0.06335863471031189},
    'price_spread1_max': {'mean': 0.0014055456267669797, 'std': 0.001588768558576703},
    'price_spread1_max_400': {'mean': 0.0011413421016186476, 'std': 0.0011644071200862527},
    'bid_spread_sum': {'mean': 0.07556163519620895, 'std': 0.06961646676063538},
    'bid_spread_sum_400': {'mean': 0.024438833817839622, 'std': 0.02328931912779808},
    'bid_spread_max': {'mean': 0.0007034957525320351, 'std': 0.0008316159946843982},
    'bid_spread_max_400': {'mean': 0.0005391522427089512, 'std': 0.0005698652239516377},
    'ask_spread_sum': {'mean': 0.07635528594255447, 'std': 0.06991078704595566},
    'ask_spread_sum_400': {'mean': 0.024674607440829277, 'std': 0.023346172645688057},
    'ask_spread_max': {'mean': 0.0007134961779229343, 'std': 0.0008447545696981251},
    'ask_spread_max_400': {'mean': 0.0005458995583467185, 'std': 0.0005785721004940569},
    'total_volume_sum': {'mean': 1629942.5, 'std': 9067553.0},
    'total_volume_sum_400': {'mean': 547846.375, 'std': 3054834.25},
    'total_volume_max': {'mean': 6443.4765625, 'std': 27511.509765625},
    'total_volume_max_400': {'mean': 5374.6015625, 'std': 24850.201171875},
    'volume_imbalance_sum': {'mean': 396575.46875, 'std': 2442117.5},
    'volume_imbalance_sum_400': {'mean': 132286.90625, 'std': 828395.75},
    'volume_imbalance_max': {'mean': 3689.99755859375, 'std': 13807.1123046875},
    'volume_imbalance_max_400': {'mean': 2674.690185546875, 'std': 10599.6044921875},
    'bid_ask_spread_sum': {'mean': 0.038354359567165375, 'std': 0.050127506256103516},
    'bid_ask_spread_sum_400': {'mean': 0.011937146075069904, 'std': 0.01599636673927307},
    'bid_ask_spread_max': {'mean': 0.000663700804580003, 'std': 0.0009456104598939419},
    'bid_ask_spread_max_400': {'mean': 0.0004903593799099326, 'std': 0.000652861432172358},
    'size_sum': {'mean': 31860.21484375, 'std': 70259.2109375},
    'size_sum_400': {'mean': 10241.53515625, 'std': 23358.8828125},
    'size_max': {'mean': 3035.258544921875, 'std': 8191.23974609375},
    'size_max_400': {'mean': 1780.333984375, 'std': 4887.39892578125},
    'size_min': {'mean': 6.389898777008057, 'std': 352.6045227050781},
    'size_min_400': {'mean': 22.84056282043457, 'std': 639.7129516601562},
    'order_count_sum': {'mean': 373.43682861328125, 'std': 608.5436401367188},
    'order_count_sum_400': {'mean': 120.78227996826172, 'std': 201.4743194580078},
    'order_count_max': {'mean': 26.246469497680664, 'std': 50.72904968261719},
    'order_count_max_400': {'mean': 16.569400787353516, 'std': 30.438138961791992},
    'trade_money_turnover_sum': {'mean': 31859.123046875, 'std': 70262.2265625},
    'trade_money_turnover_sum_400': {'mean': 10240.9248046875, 'std': 23348.404296875},
    'trade_money_turnover_max': {'mean': 3035.2578125, 'std': 8194.078125},
    'trade_money_turnover_max_400': {'mean': 1780.24267578125, 'std': 4885.64453125},
    'trade_money_turnover_min': {'mean': 6.387502670288086, 'std': 352.42071533203125},
    'trade_money_turnover_min_400': {'mean': 22.84111213684082, 'std': 639.8861694335938},
    'logret1_xs': {'mean': -2.4693693756461244e-09, 'std': 0.0006871781661175191},
    'logret_bid_price1_xs': {'mean': -1.9815225016373006e-09, 'std': 0.0006457031704485416},
    'logret_ask_price1_xs': {'mean': -2.840470081366675e-09, 'std': 0.0006457548006437719},
    'logret_bid_price2_xs': {'mean': -2.0001771350308672e-09, 'std': 0.0006615926395170391},
    'logret_ask_price2_xs': {'mean': -2.7945690206365725e-09, 'std': 0.0006625199457630515},
    'price_spread1_xs': {'mean': 0.0022263764403760433, 'std': 0.0026683351024985313},
    'bid_spread_xs': {'mean': 0.0007556164055131376, 'std': 0.0009447085903957486},
    'ask_spread_xs': {'mean': 0.0007635528454557061, 'std': 0.0009570387774147093},
    'logret_directional_volume1_xs': {'mean': -1.8204282525857707e-09, 'std': 0.022533627226948738},
    'logret_directional_volume2_xs': {'mean': 1.4707172146799508e-09, 'std': 0.02200956828892231},
    'logret_trade_money_turnover_xs': {'mean': -2.9323007311177207e-07, 'std': 1.6745622158050537},
    'trade_price_push_on_book_xs': {'mean': -1.7589563583442214e-07, 'std': 5.7160297728842124e-05},
    'logret_price_xs': {'mean': -2.801246123951273e-09, 'std': 0.0006271468009799719},
    'order_count_xs': {'mean': 3.734368324279785, 'std': 10.645439147949219},
}


def scale_optiver_feature(feature_name, feature_tensor):
    """Scale one feature tensor according to its name.

    Rules, checked in this order:
      * `sequence_mask_xs` and `has_trade_data_xs`: cast to float32, no scaling.
      * `seconds_in_bucket_xs`: divided by its standard deviation and by 100
        (the mean is not subtracted).
      * any other name present in the scaling table: `(x - mean) / std`.
      * `trade_price_local_standardized_xs` / `book_wap1_local_standardized_xs`:
        infinite values are replaced by 0.
      * anything else: returned unchanged.
    """
    # Flag-like features bypass the table even though `has_trade_data_xs` has an entry.
    if feature_name in ('sequence_mask_xs', 'has_trade_data_xs'):
        return feature_tensor.type(torch.float32)

    stats = _STANDARD_SCALING_FEATURE_MAP.get(feature_name)
    if feature_name == 'seconds_in_bucket_xs':
        return feature_tensor / stats['std'] / 100
    if stats is not None:
        return (feature_tensor - stats['mean']) / stats['std']

    if feature_name in ['trade_price_local_standardized_xs', 'book_wap1_local_standardized_xs']:
        # masked_fill is used because older torch builds have no nan_to_num.
        feature_tensor = torch.masked_fill(feature_tensor, torch.isinf(feature_tensor), 0)
    return feature_tensor

In [9]:
class StockIdEmbedding(nn.Module):
    """Learned embedding of the stock id followed by a small MLP head.

    `number_of_stock_embeddings`: size of the embedding table; ids are clamped
        into `[0, number_of_stock_embeddings - 1]` before lookup.
    `number_of_stock_embedding_dimention`: width of each embedding vector.
    `mode`: when set to `'stock_id_embedding'`, `forward` returns the raw
        embedding vectors; for any other value it returns the MLP output.
    """

    def __init__(self, number_of_stock_embeddings=126+10, number_of_stock_embedding_dimention=2, mode='train'):
        super().__init__()

        self.number_of_stock_embeddings = number_of_stock_embeddings
        self.number_of_stock_embedding_dimention = number_of_stock_embedding_dimention
        self.stock_embedding = nn.Embedding(self.number_of_stock_embeddings, self.number_of_stock_embedding_dimention)
        self.mode = mode
        self.linear_stack = nn.Sequential(
            nn.Linear(self.number_of_stock_embedding_dimention, 32),
            nn.Hardswish(),
            nn.Linear(32, 16),
            nn.Hardswish(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def get_feature_gen_train_modes(self):
        """Return the list of feature-generation training modes (none for this module)."""
        return []

    def set_mode(self, mode):
        """Switch between embedding output (`'stock_id_embedding'`) and MLP output."""
        self.mode = mode

    def forward(self, feature_dict):
        """Look up the embedding for `feature_dict['stock_id']`.

        Returns a tensor of shape `(N, embedding_dim)` in `'stock_id_embedding'`
        mode, otherwise `(N, 1)`, where N is the number of ids in the input.
        """
        # Out-of-range ids are clamped onto the first/last embedding row.
        stock_id_clamped = torch.clamp(feature_dict['stock_id'], 0, self.number_of_stock_embeddings - 1)
        stock_id_clamped = stock_id_clamped.type(torch.int64)
        # Follow the device of the embedding table rather than a notebook-level
        # global, so the lookup works wherever the module has been moved.
        stock_id_clamped = stock_id_clamped.to(self.stock_embedding.weight.device).reshape(-1, 1)
        embedding_logits = self.stock_embedding(stock_id_clamped)
        embedding_logits = embedding_logits.reshape(-1, self.number_of_stock_embedding_dimention)

        if self.mode == 'stock_id_embedding':
            return embedding_logits

        return self.linear_stack(embedding_logits)


In [10]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence, then apply dropout.

    Even feature indices hold sines and odd indices hold cosines of
    `position * div_term`. Both even and odd `d_model` are supported.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd index than even index, so
        # the cosine block only takes the first d_model // 2 frequencies.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Registered as a buffer: it follows the module across devices and is
        # saved in the state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
class MultiFetTransformer(nn.Module):
    """Two-branch regressor over a feature dict.

    * Transformer branch: per-interval sequence features are stacked, offset by
      a scaled position value, run through a transformer encoder and projected
      to `output_dimensions` values.
    * Overview branch: the stock-id embedding is concatenated with scalar
      overview features and projected to `output_dimensions` values.

    `mode` selects what `forward` returns and which sub-modules `parameters()`
    exposes; see `get_possible_modes()`.

    Relies on names defined elsewhere in this notebook: `StockIdEmbedding`,
    `realized_volatility`, `scale_optiver_feature`, `data_intervals_count`
    and `device`.
    """

    # Sub-modules (by attribute name) that belong to each part of the model.
    _TRANSFORMER_PARTS = ('encoder_stack', 'transformer_output_feature', 'transformer_train')
    _OVERVIEW_PARTS = ('stock_id_embedding', 'overviewff_output_feature', 'overviewff_train')
    _HYBRID_PARTS = ('hybrid_stack',)

    # mode -> sub-modules whose parameters are exposed in that mode.
    _MODE_PARTS = {
        'transformer_train': _TRANSFORMER_PARTS,
        'overviewff_train': _OVERVIEW_PARTS,
        'hybrid': _HYBRID_PARTS,
        'full_train': _TRANSFORMER_PARTS + _OVERVIEW_PARTS + _HYBRID_PARTS,
    }

    def __init__(self,feature_names=None, overview_feature_names=None, mask_feature_name='sequence_mask_xs',sequence_feature_name='seconds_in_bucket_xs', mode="hybrid"):
        """Build both branches and the hybrid head.

        Args:
            feature_names: names of the per-interval sequence features (a
                single string is accepted); defaults to the built-in `*_xs` list.
            overview_feature_names: names of the scalar overview features (a
                single string is accepted); defaults to `<key>_<agg>` and
                `<key>_<agg>_400` for the built-in aggregation table.
            mask_feature_name: key of the tensor passed to the encoder as
                `src_key_padding_mask`.
            sequence_feature_name: key of the tensor used as the position offset.
            mode: one of `get_possible_modes()`.
        """
        super(MultiFetTransformer,self).__init__()

        if feature_names is None:
            feature_names = [f'{feature_name}_xs' for feature_name in [
                'has_trade_data',
                'logret_bid_price1', 'logret_ask_price1', 'logret_bid_price2', 'logret_ask_price2',
                'price_spread1', 'bid_spread', 'ask_spread',
                'logret_directional_volume1', 'logret_directional_volume2',
                'logret_trade_money_turnover', 'trade_price_push_on_book',
                'logret_price', 'order_count',
            ]]

        if overview_feature_names is None:
            overview_feature_names = []
            overview_aggregations = {
                'wap1': ['sum', 'std'],
                'wap2': ['sum', 'std'],
                'logret1': [realized_volatility],
                'logret2': [realized_volatility],
                'logret_price': [realized_volatility],
                'wap_balance': ['sum', 'max'],
                'price_spread1': ['sum', 'max'],
                'bid_spread': ['sum', 'max'],
                'ask_spread': ['sum', 'max'],
                'total_volume': ['sum', 'max'],
                'volume_imbalance': ['sum', 'max'],
                "bid_ask_spread": ['sum', 'max'],
                'size': ['sum', 'max', 'min'],
                'order_count': ['sum', 'max'],
                'trade_money_turnover': ['sum', 'max', 'min'],
            }
            for key, aggs in overview_aggregations.items():
                for agg in aggs:
                    # Function aggregations are named after the function itself.
                    if isinstance(agg, types.FunctionType):
                        agg = agg.__name__
                    overview_feature_names.append(f'{key}_{agg}')
                    overview_feature_names.append(f'{key}_{agg}_400')

        # Accept a single name for either list; a bare string would otherwise
        # be iterated character by character.
        if isinstance(feature_names, str):
            feature_names = [feature_names]
        if isinstance(overview_feature_names, str):
            overview_feature_names = [overview_feature_names]

        self.use_stock_embedding = True
        self.use_overview_features = True
        self.stock_id_embedding_dimension = 3
        self.stock_id_embedding = StockIdEmbedding(number_of_stock_embedding_dimention=self.stock_id_embedding_dimension, mode='stock_id_embedding')

        self.feature_names = feature_names
        self.overview_feature_names = overview_feature_names
        self.mask_feature_name = mask_feature_name
        self.sequence_feature_name = sequence_feature_name
        self.features_count = len(self.feature_names)

        self.output_dimensions = 128
        # NOTE: `forward` feeds the stacked sequence features straight into the
        # encoder, so the transformer branch needs len(feature_names) to equal
        # this value (the default list has 14 entries).
        self.transformer_input_dimension = 14
        self.mode = mode

        # `encoder_layer` stays registered as its own attribute so state_dict
        # keys match existing checkpoints.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=self.transformer_input_dimension, nhead=7, dropout=0.01, activation='gelu')
        self.encoder_stack = nn.TransformerEncoder(self.encoder_layer, num_layers=2)

        # Not used by `forward`; kept so state_dict keys match existing checkpoints.
        self.feature_to_feature_embedding = nn.Sequential(
            nn.Linear(self.features_count + self.stock_id_embedding_dimension, 128),
            nn.GELU(),
            nn.Linear(128, self.transformer_input_dimension)
        )

        self.transformer_output_feature = self._feature_projection(
            data_intervals_count * self.transformer_input_dimension, self.output_dimensions)
        self.overviewff_output_feature = self._feature_projection(
            len(self.overview_feature_names) + self.stock_id_embedding_dimension, self.output_dimensions)

        self.transformer_train = self._regression_head(self.output_dimensions)
        self.overviewff_train = self._regression_head(self.output_dimensions)
        # The hybrid head consumes both branch outputs concatenated.
        self.hybrid_stack = self._regression_head(self.output_dimensions * 2)

    @staticmethod
    def _feature_projection(in_features, out_features):
        """Linear(in, 512) -> GELU -> Linear(512, out) stack used to summarise a branch."""
        return nn.Sequential(
            nn.Linear(in_features, 512),
            nn.GELU(),
            nn.Linear(512, out_features),
        )

    @staticmethod
    def _regression_head(in_features):
        """Linear(in, 256) -> GELU -> Linear(256, 128) -> ReLU -> Linear(128, 1) head."""
        return nn.Sequential(
            nn.Linear(in_features, 256),
            nn.GELU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        )

    def _require_known_mode(self):
        """Raise ValueError when `self.mode` is not one of the supported modes."""
        if self.mode not in self._MODE_PARTS:
            raise ValueError(
                f"unknown mode {self.mode!r}; expected one of {self.get_possible_modes()}")

    def set_mode(self, mode):
        """Store `mode`; it is read by `forward` and `parameters`."""
        self.mode = mode

    def get_possible_modes(self):
        """Return the supported mode names."""
        return list(self._MODE_PARTS)

    def parameters(self):
        """Return, as a list, the parameters of the sub-modules tied to the current mode.

        This deliberately narrows `nn.Module.parameters`: only the returned
        parameters are exposed to whoever consumes this list.

        Raises:
            ValueError: if `self.mode` is not one of `get_possible_modes()`.
        """
        self._require_known_mode()
        params = []
        for part_name in self._MODE_PARTS[self.mode]:
            params.extend(getattr(self, part_name).parameters())
        return params

    def _transformer_features(self, feature_dict):
        """Run the sequence features through the encoder and project them to `output_dimensions`."""
        sequence_columns = []
        for feature_name in self.feature_names:
            feature_tensor = scale_optiver_feature(feature_name, feature_dict[feature_name]).to(device)
            sequence_columns.append(feature_tensor.reshape(-1, feature_tensor.size(1), 1))

        # (batch, intervals, features)
        feature_x = torch.cat(sequence_columns, dim=2)

        # Positional signal: the sequence feature scaled by 1/601 is added to
        # every feature column (broadcast over the last dimension).
        # NOTE(review): presumably seconds_in_bucket lies in 0..599 - confirm.
        position_encodings = (feature_dict[self.sequence_feature_name].to(device) + 1) / 601
        position_encodings = position_encodings.reshape(-1, feature_x.size(1), 1)
        feature_x = torch.add(feature_x, position_encodings)

        mask = feature_dict[self.mask_feature_name].to(device)

        # The encoder is built without batch_first, so go sequence-first for
        # the call and back to batch-first afterwards.
        feature_x = feature_x.transpose(0, 1).contiguous()
        encoded = self.encoder_stack(feature_x, src_key_padding_mask=mask)
        encoded = encoded.transpose(0, 1)
        encoded = encoded.reshape(-1, self.transformer_input_dimension * data_intervals_count)

        return self.transformer_output_feature(encoded)

    def _overview_features(self, feature_dict):
        """Concatenate the stock embedding with the overview features and project them to `output_dimensions`."""
        overview_columns = [self.stock_id_embedding(feature_dict)]
        for feature_name in self.overview_feature_names:
            feature_tensor = scale_optiver_feature(feature_name, feature_dict[feature_name]).to(device)
            overview_columns.append(feature_tensor.reshape(-1, 1))

        return self.overviewff_output_feature(torch.cat(overview_columns, dim=1))

    def forward(self, feature_dict, h0_tensor=None):
        """Compute the prediction for the current mode.

        Args:
            feature_dict: mapping from feature name to tensor; must contain the
                keys required by the branches the current mode uses.
            h0_tensor: unused; kept for call compatibility.

        Returns:
            'transformer_train': output of the transformer-only head.
            'overviewff_train': output of the overview-only head.
            'hybrid' / 'full_train': output of the hybrid head applied to both
            branch outputs concatenated along dim 1.

        Raises:
            ValueError: if `self.mode` is not one of `get_possible_modes()`.
        """
        self._require_known_mode()
        branch_outputs = []

        if self.mode != 'overviewff_train':
            transformer_features = self._transformer_features(feature_dict)
            if self.mode == 'transformer_train':
                return self.transformer_train(transformer_features)
            branch_outputs.append(transformer_features)

        if self.mode != 'transformer_train':
            overview_features = self._overview_features(feature_dict)
            if self.mode == 'overviewff_train':
                return self.overviewff_train(overview_features)
            branch_outputs.append(overview_features)

        return self.hybrid_stack(torch.cat(branch_outputs, dim=1))

# model = MultiFetTransformer()
# model.to(device)
# dataloader_train = DataLoader(dataset, batch_size=2,
#                                 shuffle=True, num_workers=0, pin_memory=False)#, collate_fn=optiver_custom_collate_func)


 
# >>> src = torch.rand(10, 32, 512)
# >>> out = encoder_layer(src)   
# for train_batch_idx, (Feature_X, feature_y) in enumerate(dataloader_train):
#     model(Feature_X)
#     input()

In [11]:
model = MultiFetTransformer()
# NOTE(review): the checkpoint file name records `tloss_nan`; confirm this is
# the checkpoint intended for inference.
modelpath = "../output/models/18_MultiMod5_4klr_16Atten_2Layer_0.001_256_epoch_3_tloss_nan.pth"
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU still loads on a CPU-only kernel.
checkpoint = torch.load(modelpath, map_location=device)
model.load_state_dict(checkpoint['base'])
model.to(device)
# Switch to inference behaviour (disables dropout); as the last expression it
# also displays the module summary.
model.eval()

MultiFetTransformer(
  (stock_id_embedding): StockIdEmbedding(
    (stock_embedding): Embedding(136, 3)
    (linear_stack): Sequential(
      (0): Linear(in_features=3, out_features=32, bias=True)
      (1): Hardswish()
      (2): Linear(in_features=32, out_features=16, bias=True)
      (3): Hardswish()
      (4): Linear(in_features=16, out_features=16, bias=True)
      (5): ReLU()
      (6): Linear(in_features=16, out_features=1, bias=True)
    )
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=14, out_features=14, bias=True)
    )
    (linear1): Linear(in_features=14, out_features=2048, bias=True)
    (dropout): Dropout(p=0.01, inplace=False)
    (linear2): Linear(in_features=2048, out_features=14, bias=True)
    (norm1): LayerNorm((14,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((14,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.01, inplace=False)
  

In [14]:
mode = 'test'
main_df = pd.read_csv(os.path.join(DATA_DIRECTORY,f'{mode}.csv'))
submission_data = []

# Model outputs are divided by this factor to get the value written to the submission.
output_scaling = realize_volatility_scale_factor

# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for Feature_X, feature_y in iterate_optiver_dataset_for_evaluation(mode):
        row_ids = Feature_X['row_id']

        pred = model(Feature_X)
        predicted_volatility = (pred/output_scaling).tolist()
        for idx, row_id in enumerate(row_ids):
            # Each prediction row is a one-element list; take the value itself.
            submission_data.append({'row_id':row_id, 'target':predicted_volatility[idx][0]})

submission_df = pd.DataFrame(submission_data)
# Left merge keeps every row of main_df, in its order; rows without a
# prediction end up with NaN in the target column.
submission_df = main_df.merge(submission_df,on='row_id',how='left')
# If main_df already had a 'target' column, the merge suffixes the predicted
# one as 'target_y'; otherwise this rename is a no-op.
submission_df = submission_df.rename(columns={'target_y':'target'})
submission_df[['row_id','target']].to_csv("submission.csv", index=False)
# for idx, (X,y) in enumerate(dataset):
#     print(idx, X)

In [15]:
# Read the written file back to display what was actually saved.
pd.read_csv("submission.csv")

Unnamed: 0,row_id,target
0,0-4,0.000537
1,0-32,0.001509
2,0-34,0.001509


In [14]:
import torch.version

In [15]:
# Display the installed PyTorch version.
torch.__version__

'1.9.0+cu111'