In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use the fully-qualified option names: the bare 'max_rows' / 'max_columns'
# patterns match more than one option on recent pandas releases
# (display.* and styler.render.*), which makes set_option raise OptionError.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)

import os
import glob

import warnings
warnings.filterwarnings("ignore")

import pickle

def nans(df):
    """Return only the rows of ``df`` that contain at least one null value."""
    row_has_null = df.isnull().any(axis=1)
    return df[row_has_null]



In [None]:
# Data directory.
# Set the ORVP_DATA_DIR environment variable to point the notebook at another
# location; without it the original machine-specific path is used.
# os.path.join(..., "") guarantees a trailing separator, which the
# `data_dir + "<file>"` concatenations in later cells rely on.
data_dir = os.path.join(
    os.environ.get("ORVP_DATA_DIR", "/home/ash/Documents/Projects/kaggle/ORVP/data/"),
    "",
)
# DATA_DIR = "/home/ash/Documents/Projects/kaggle/ORVP/data/"


In [None]:
def _is_lossless_cast(original, narrowed):
    """Return True when casting ``narrowed`` back to ``original``'s dtype reproduces every value (NaN matches NaN)."""
    restored = narrowed.astype(original.dtype)
    unchanged = (restored == original) | (np.isnan(restored) & np.isnan(original))
    return bool(unchanged.all())


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned.

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns get the smallest integer type of the
      same signedness whose range contains the column's min and max
      (bounds are inclusive, so e.g. [-128, 127] fits ``int8``).
    * Float columns are narrowed to ``float16`` / ``float32`` only when the
      cast is lossless. An unconditional float16 cast keeps only ~3 decimal
      digits, which would wipe out small differences between values.
    * Every other dtype (bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    narrow_floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / float columns with data can be downcast safely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf' or len(df) == 0:
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            values = df[col].to_numpy()
            for candidate in narrow_floats:
                # Never widen: stop once the candidate is not smaller than the current dtype.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                # Out-of-range values become inf here; the lossless check rejects them.
                with np.errstate(over='ignore', invalid='ignore'):
                    narrowed = values.astype(candidate)
                if _is_lossless_cast(values, narrowed):
                    df[col] = narrowed
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file, ftype="csv"):
    """Read ``file`` into a DataFrame and shrink it with ``reduce_mem_usage``.

    Parameters
    ----------
    file : str or path-like
        Location passed straight to the pandas reader.
    ftype : {"csv", "parquet"}, default "csv"
        Which pandas reader to use.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage``.

    Raises
    ------
    ValueError
        If ``ftype`` is not one of the supported formats (previously this
        surfaced as an UnboundLocalError on ``df``).
    """
    # Validate first so an unsupported type fails with a clear message before any I/O.
    if ftype not in ("csv", "parquet"):
        raise ValueError(
            "Unsupported ftype {!r}; expected 'csv' or 'parquet'".format(ftype)
        )

    if ftype == "csv":
        df = pd.read_csv(file)
    else:
        df = pd.read_parquet(file)
    return reduce_mem_usage(df)

## Functions for preprocessing

In [None]:
def calc_wap1(df):
    """Weighted average price of book level 1: each side's price weighted by the opposite side's size."""
    bid_leg = df['bid_price1'] * df['ask_size1']
    ask_leg = df['ask_price1'] * df['bid_size1']
    depth = df['bid_size1'] + df['ask_size1']
    return (bid_leg + ask_leg) / depth
def calc_wap2(df):
    """Weighted average price of book level 2: each side's price weighted by the opposite side's size."""
    bid_leg = df['bid_price2'] * df['ask_size2']
    ask_leg = df['ask_price2'] * df['bid_size2']
    depth = df['bid_size2'] + df['ask_size2']
    return (bid_leg + ask_leg) / depth

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices (first element is NaN)."""
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series):
    """Return the square root of the sum of squared values in ``series``."""
    squared = series ** 2
    total = np.sum(squared)
    return np.sqrt(total)

def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = np.unique(series)
    return distinct_values.size

## Main function for preprocessing book data

In [None]:
def preprocessor_book(file_path):
    """Build per-``time_id`` order-book features for one stock partition.

    Parameters
    ----------
    file_path : str
        Path of the parquet partition; it must contain ``=`` followed by the
        stock id (e.g. ``.../book_train.parquet/stock_id=0``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with a ``row_id`` column formatted as
        ``"<stock_id>-<time_id>"`` and the aggregated features.
        Columns without a numeric suffix are computed over all rows of the
        bucket; columns ending in ``_150`` / ``_300`` / ``_450`` use only
        rows with ``seconds_in_bucket`` >= that value.
    """
    df = import_data(file_path, "parquet")

    # Per-row quantities. transform() (rather than apply()) returns a result
    # indexed like df on every pandas version, so the assignment aligns;
    # on pandas >= 2 apply() prepends the group key to the index and breaks it.
    df['wap'] = calc_wap1(df)
    df['log_return'] = df.groupby('time_id')['wap'].transform(log_return)

    df['wap2'] = calc_wap2(df)
    df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)

    df['wap_balance'] = abs(df['wap'] - df['wap2'])

    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1'])/2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Aggregations applied per time_id. The string 'mean' yields the same
    # '<col>_mean' column names as np.mean without the pandas deprecation warning.
    create_feature_dict = {
        'log_return': [realized_volatility],
        'log_return2': [realized_volatility],
        'wap_balance': ['mean'],
        'price_spread': ['mean'],
        'bid_spread': ['mean'],
        'ask_spread': ['mean'],
        'volume_imbalance': ['mean'],
        'total_volume': ['mean'],
        'wap': ['mean'],
    }

    ##### groupby / all seconds
    df_feature = df.groupby('time_id').agg(create_feature_dict).reset_index()
    # Flatten the (column, stat) header; ('time_id', '') becomes 'time_id_'.
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    ###### groupby / last XX seconds
    last_seconds = [450, 300, 150]

    for window in last_seconds:
        # Start second of the "last `window` seconds" slice; it is also the column suffix.
        second = 600 - window

        df_feature_sec = (
            df.query(f'seconds_in_bucket >= {second}')
            .groupby('time_id')
            .agg(create_feature_dict)
            .reset_index()
        )
        df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
        df_feature_sec = df_feature_sec.add_suffix('_' + str(second))

        # Left merge keeps time_ids that have no rows in this slice (their features are NaN).
        df_feature = pd.merge(df_feature, df_feature_sec, how='left',
                              left_on='time_id_', right_on=f'time_id__{second}')
        df_feature = df_feature.drop([f'time_id__{second}'], axis=1)

    # create row_id; the stock id is whatever follows the last '=' in the path
    stock_id = file_path.rstrip('/').rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['time_id_'], axis=1)

    return df_feature

## Main function for preprocessing trade data

In [None]:
def preprocessor_trade(file_path):
    """Build per-``time_id`` trade features for one stock partition.

    Parameters
    ----------
    file_path : str
        Path of the parquet partition; it must contain ``=`` followed by the
        stock id (e.g. ``.../trade_train.parquet/stock_id=0``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with a ``row_id`` column formatted as
        ``"<stock_id>-<time_id>"`` and feature columns prefixed with
        ``trade_``. Columns ending in ``_150`` / ``_300`` / ``_450`` use only
        rows with ``seconds_in_bucket`` >= that value.
    """
    df = import_data(file_path, "parquet")
    # transform() (rather than apply()) returns a result indexed like df on
    # every pandas version; on pandas >= 2 apply() prepends the group key to
    # the index and the assignment no longer aligns.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # The string 'sum' / 'mean' yield the same column names as np.sum / np.mean
    # without the pandas deprecation warning.
    aggregate_dictionary = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': ['sum'],
        'order_count': ['mean'],
    }

    df_feature = df.groupby('time_id').agg(aggregate_dictionary).reset_index()
    # Flatten the (column, stat) header; ('time_id', '') becomes 'time_id_'.
    df_feature.columns = ['_'.join(col) for col in df_feature.columns]

    ###### groupby / last XX seconds
    last_seconds = [450, 300, 150]

    for window in last_seconds:
        # Start second of the "last `window` seconds" slice; it is also the column suffix.
        second = 600 - window

        df_feature_sec = (
            df.query(f'seconds_in_bucket >= {second}')
            .groupby('time_id')
            .agg(aggregate_dictionary)
            .reset_index()
        )
        df_feature_sec.columns = ['_'.join(col) for col in df_feature_sec.columns]
        df_feature_sec = df_feature_sec.add_suffix('_' + str(second))

        # Left merge keeps time_ids that have no trades in this slice (their features are NaN).
        df_feature = pd.merge(df_feature, df_feature_sec, how='left',
                              left_on='time_id_', right_on=f'time_id__{second}')
        df_feature = df_feature.drop([f'time_id__{second}'], axis=1)

    df_feature = df_feature.add_prefix('trade_')
    # The stock id is whatever follows the last '=' in the path.
    stock_id = file_path.rstrip('/').rsplit('=', 1)[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature = df_feature.drop(['trade_time_id_'], axis=1)

    return df_feature

## Combined preprocessor function

In [None]:
def preprocessor(list_stock_ids, is_train = True):
    """Compute book + trade features for every stock id, in parallel.

    Parameters
    ----------
    list_stock_ids : iterable
        Stock ids whose partitions should be processed.
    is_train : bool, default True
        Read the ``*_train.parquet`` partitions when True, ``*_test.parquet``
        otherwise.

    Returns
    -------
    pandas.DataFrame
        Concatenation (fresh integer index) of the per-stock frames, each
        being the book features left-joined with the trade features on
        ``row_id``. For an empty id list, an empty frame with only a
        ``row_id`` column is returned so that later merges on ``row_id``
        still work.
    """
    stock_ids = list(list_stock_ids)
    if not stock_ids:
        # pd.concat([]) raises ValueError; give callers a mergeable empty frame instead.
        return pd.DataFrame(columns=['row_id'])

    from joblib import Parallel, delayed # parallel computing to save time

    split = "train" if is_train else "test"

    def for_joblib(stock_id):
        file_path_book = data_dir + "book_" + split + ".parquet/stock_id=" + str(stock_id)
        file_path_trade = data_dir + "trade_" + split + ".parquet/stock_id=" + str(stock_id)
        # Return the per-stock frame directly; concatenating it with a
        # closed-over empty DataFrame added nothing.
        return pd.merge(preprocessor_book(file_path_book),
                        preprocessor_trade(file_path_trade),
                        on='row_id', how='left')

    per_stock_frames = Parallel(n_jobs=-1, verbose=1)(
        delayed(for_joblib)(stock_id) for stock_id in stock_ids
        )

    return pd.concat(per_stock_frames, ignore_index = True)


## Training set

In [None]:
# Load the training labels from train.csv in the data directory.
train = pd.read_csv(data_dir + 'train.csv')

In [None]:
# Build the feature table for every stock id that appears in the training labels.
train_ids = train['stock_id'].unique()
df_train = preprocessor(train_ids, is_train=True)

In [None]:
# Keep a second name for the freshly computed features so they can be restored
# without re-running the preprocessing. This is an alias, not a copy:
# in-place edits through either name affect both.
df_train_original = df_train

In [None]:
# Restore df_train from the saved reference (lets the merge cell below be re-run).
df_train = df_train_original

In [None]:
# Key the labels by "<stock_id>-<time_id>" and attach the engineered features to them.
row_ids = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
train = train.assign(row_id=row_ids)[['row_id', 'target']]
df_train = pd.merge(train, df_train, on=['row_id'], how='left')
df_train

## Test set

In [None]:
# Same pipeline for the test rows: read the ids, build features, join on row_id.
test = pd.read_csv(data_dir + 'test.csv')
test_ids = test['stock_id'].unique()
df_test = preprocessor(test_ids, is_train=False)
df_test = pd.merge(test, df_test, on=['row_id'], how='left')
df_test

## Target encoding by stock_id

In [None]:
from sklearn.model_selection import KFold

# stock_id target encoding
# Recover the stock id (as a string) from the "<stock_id>-<time_id>" row_id.
df_train['stock_id'] = df_train['row_id'].map(lambda row_id: row_id.split('-')[0])
df_test['stock_id'] = df_test['row_id'].map(lambda row_id: row_id.split('-')[0])

# Test rows are encoded with the per-stock target mean over the full training set.
stock_id_target_mean = df_train.groupby('stock_id')['target'].mean()
df_test['stock_id_target_enc'] = df_test['stock_id'].map(stock_id_target_mean)

# Training rows are encoded out-of-fold: each row only sees means computed
# from the other nine folds.
tmp = np.full(df_train.shape[0], np.nan)
kf = KFold(n_splits=10, shuffle=True, random_state=19911109)
for idx_1, idx_2 in kf.split(df_train):
    target_mean = df_train.iloc[idx_1].groupby('stock_id')['target'].mean()
    held_out_ids = df_train['stock_id'].iloc[idx_2]
    tmp[idx_2] = held_out_ids.map(target_mean)
df_train['stock_id_target_enc'] = tmp

## Model Building

In [None]:
# stock_id was parsed out of row_id as a string; store it as an integer in both frames.
for frame in (df_train, df_test):
    frame['stock_id'] = frame['stock_id'].astype(int)
df_train

In [None]:
# Impute missing training values with the column means.
# numeric_only=True is required because df_train also holds the string
# row_id column, for which recent pandas raise instead of skipping it.
df_train = df_train.fillna(df_train.mean(numeric_only=True))

# Persist both frames so the modelling part can start from a fresh kernel.
df_train.to_pickle("df_train.pkl")
df_test.to_pickle("df_test.pkl")

# Read the pickled data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use the fully-qualified option names: the bare 'max_rows' / 'max_columns'
# patterns match more than one option on recent pandas releases
# (display.* and styler.render.*), which makes set_option raise OptionError.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 300)

import os
import glob

import warnings
warnings.filterwarnings("ignore")

import pickle

def nans(df):
    """Return only the rows of ``df`` that contain at least one null value."""
    row_has_null = df.isnull().any(axis=1)
    return df[row_has_null]



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F

import random


from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
np.random.seed(0)


import os
from pathlib import Path

In [3]:
import random
SEED = 42


def random_seed(SEED):
    """Seed Python's ``random``, ``PYTHONHASHSEED``, NumPy and PyTorch with ``SEED``.

    When CUDA is available the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(SEED)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


random_seed(SEED)

In [20]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

class RMSPEMetric(Metric):
    """Evaluation metric named "rmspe" that delegates to ``rmspe``; flagged as not to be maximized."""

    def __init__(self):
        self._maximize = False
        self._name = "rmspe"

    def __call__(self, y_true, y_score):
        """Return the RMSPE between ``y_true`` and ``y_score``."""
        score = rmspe(y_true, y_score)
        return score

In [5]:
# Reload the feature tables written by the preprocessing part of the notebook.
# NOTE: unpickling executes code embedded in the file; only load pickles you created yourself.
df_train = pd.read_pickle("df_train.pkl")
df_test = pd.read_pickle("df_test.pkl")
# Report the row counts of both frames as a quick sanity check.
print("training sample: " + str(len(df_train)))
print("testing sample: " + str(len(df_test)))

training sample: 428932
testing sample: 3


# Normalize the dataframe

In [6]:
# Get the non-categorical features
# Columns to standardize: everything except the row key, the label and the categorical stock_id.
scales = df_train.columns.drop(['row_id', 'target', 'stock_id']).to_list()

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the standardizer on the non-categorical training features.
scaler = StandardScaler()
scaler.fit(df_train[scales])

# Save the Scaler model. The context manager closes the file handle, which the
# previous bare open() call left open.
scaler_name = "scaler"
with open(scaler_name, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [8]:
# Encode the categorical labels

from sklearn.preprocessing import LabelEncoder

# Fit on the training stock ids, then replace them with their encoded codes.
le = LabelEncoder().fit(df_train["stock_id"])
df_train["stock_id"] = le.transform(df_train["stock_id"])

# Persist the fitted encoder (pickle format, despite the .txt extension).
with open('stock_id_encoder.txt', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

In [15]:
from sklearn.model_selection import KFold
# 5-fold shuffled split with a fixed random_state, so fold membership is the same on every run.
kf = KFold(n_splits=5, random_state=19901028, shuffle=True)
# Accumulators filled by the training loop in a later cell.
models = []                          # fitted model of each fold
scores = 0.0                         # accumulates each fold's validation RMSPE divided by 5
bestscores=[]                        # model.best_cost of each fold

In [16]:
# Keyword arguments passed to TabNetRegressor(**tabnet_params) in the training loop.
tabnet_params = dict(
    n_d = 16,
    n_a = 16,
    n_steps = 3,
    gamma = 1.3,
    lambda_sparse = 0,
    optimizer_fn = optim.Adam,
    optimizer_params = dict(lr = 1e-2, weight_decay = 1e-5),
    mask_type = "entmax",
    # Arguments forwarded to the ReduceLROnPlateau scheduler named below.
    scheduler_params = dict(
        mode = "min", patience = 5, min_lr = 1e-5, factor = 0.9),
    scheduler_fn = ReduceLROnPlateau,
    seed = 42,
    #verbose = 5,
    # stock_id is the single categorical feature: one category per encoded class,
    # embedded in 10 dimensions. The training loop appends it as the last column
    # of the feature matrix, hence the index -1.
    # NOTE(review): confirm the installed pytorch_tabnet accepts a negative
    # cat_idxs everywhere; the explicit column position may be safer.
    cat_dims=[len(le.classes_)], cat_emb_dim=[10], cat_idxs=[-1] # define categorical features
)

In [None]:
%time
for fold, (trn_idx, val_idx) in enumerate(kf.split(df_train)):

    print("Fold :", fold+1)
    
    # create dataset
    traindf, validdf = df_train.loc[trn_idx], df_train.loc[val_idx]
    
    traindf = traindf.reset_index(drop=True)
    validdf = validdf.reset_index(drop=True)
    
    ## Normalization except stock id ; stock id is used as categoral features

    X_train = traindf.drop(["row_id","target","stock_id"], axis = 1).values

    X_train = scaler.transform(X_train)
    X_traindf = pd.DataFrame(X_train) 

    X_traindf["stock_id"] = traindf["stock_id"]


    X_train = X_traindf.values
    y_train = traindf["target"].values.reshape(-1, 1)

    # validation is same
    X_valid = validdf.drop(["row_id","target","stock_id"], axis = 1).values
    X_valid = scaler.transform(X_valid)

    X_validdf = pd.DataFrame(X_valid)

    X_validdf["stock_id"]=validdf["stock_id"]

    X_valid = X_validdf.values
    y_valid = validdf["target"].values.reshape(-1, 1)

    # calculate weight

    y_weight = 1/np.square(y_train)
    
    print("training samples: " + str(len(y_train)))
    print("validation samples: " + str(len(y_valid)))

    # initialize random seed

    random_seed(SEED)
    
    print("--------Start training--------")
    
    # model 
    
    model = TabNetRegressor(**tabnet_params)
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
#         eval_metric=['rmse'],
        eval_metric=[RMSPEMetric],
        max_epochs=100,
        patience=15,
        batch_size=1024*2, virtual_batch_size=128*2,
        num_workers=4,
        drop_last=False,
#         weights = y_weight,
        loss_fn=nn.L1Loss(),
    )
    
    # save tabnet model
    saving_path_name = "tabnet_model_test_" + str(fold)
    saved_filepath = model.save_model(saving_path_name)
    bestscores.append(model.best_cost)
    
    
    # validation 
    y_pred = model.predict(X_valid)

    RMSPE = round(rmspe(y_true = y_valid, y_pred = y_pred),4)
    print(f'Performance of the　prediction: , RMSPE: {RMSPE}')

    #keep scores and models
    scores += RMSPE / 5
    models.append(model)
    print("*" * 100)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
Fold : 1
training samples: 343145
validation samples: 85787
--------Start training--------
Device used : cuda
epoch 0  | loss: 0.05464 | train_rmspe: 1.35422 | valid_rmspe: 1.27686 |  0:00:09s
epoch 1  | loss: 0.00161 | train_rmspe: 0.85748 | valid_rmspe: 0.96829 |  0:00:17s
epoch 2  | loss: 0.00123 | train_rmspe: 0.63591 | valid_rmspe: 0.52873 |  0:00:26s
epoch 3  | loss: 0.00105 | train_rmspe: 0.52356 | valid_rmspe: 0.40656 |  0:00:35s
epoch 4  | loss: 0.001   | train_rmspe: 0.82277 | valid_rmspe: 0.41624 |  0:00:44s
epoch 5  | loss: 0.00096 | train_rmspe: 0.51095 | valid_rmspe: 0.37339 |  0:00:53s
epoch 6  | loss: 0.00093 | train_rmspe: 0.37975 | valid_rmspe: 0.35702 |  0:01:02s
epoch 7  | loss: 0.00091 | train_rmspe: 0.46656 | valid_rmspe: 0.44007 |  0:01:11s
epoch 8  | loss: 0.00089 | train_rmspe: 0.36905 | valid_rmspe: 0.33272 |  0:01:20s
epoch 9  | loss: 0.00088 | train_rmspe: 0.60255 | valid_rmspe: 0.33541 |  0:01:

epoch 29 | loss: 0.00078 | train_rmspe: 0.29546 | valid_rmspe: 0.27683 |  0:04:48s
epoch 30 | loss: 0.00078 | train_rmspe: 0.31263 | valid_rmspe: 0.30216 |  0:04:59s
epoch 31 | loss: 0.00078 | train_rmspe: 0.3182  | valid_rmspe: 0.2974  |  0:05:08s
epoch 32 | loss: 0.00078 | train_rmspe: 0.29461 | valid_rmspe: 0.28062 |  0:05:17s
epoch 33 | loss: 0.00078 | train_rmspe: 0.30771 | valid_rmspe: 0.29508 |  0:05:26s
epoch 34 | loss: 0.00078 | train_rmspe: 0.28449 | valid_rmspe: 0.27153 |  0:05:35s
epoch 35 | loss: 0.00077 | train_rmspe: 0.34693 | valid_rmspe: 0.33273 |  0:05:44s
epoch 36 | loss: 0.00077 | train_rmspe: 0.33278 | valid_rmspe: 0.30024 |  0:05:53s
epoch 37 | loss: 0.00077 | train_rmspe: 0.34102 | valid_rmspe: 0.32496 |  0:06:03s
epoch 38 | loss: 0.00078 | train_rmspe: 0.28459 | valid_rmspe: 0.2718  |  0:06:12s
epoch 39 | loss: 0.00077 | train_rmspe: 0.2885  | valid_rmspe: 0.27132 |  0:06:21s
epoch 40 | loss: 0.00077 | train_rmspe: 0.30726 | valid_rmspe: 0.29586 |  0:06:30s
epoc

# Test set

In [None]:
## fillna for test data ##

df_test = pd.read_pickle("df_test.pkl")

# Impute missing test features with the training-set means. `scales` lists
# exactly the columns the scaler was fitted on, so selecting by name is safer
# than relying on column positions.
for col in scales:
    df_test[col] = df_test[col].fillna(df_train[col].mean())


# ### normalize ###

# Same column order as at fit time, then stock_id appended as the last column
# (label-encoded), matching the matrices built in the training loop.
x_test = scaler.transform(df_test[scales].values)
X_testdf = pd.DataFrame(x_test)

# Label encoding
X_testdf["stock_id"] = le.transform(df_test["stock_id"])
x_test = X_testdf.values

# Average the predictions of the fold models.
# Fixes: the accumulator was sized from an undefined name (X_test), was
# overwritten inside the loop, and `pred` was never assigned.
target = np.zeros(len(x_test))

for model in models:
    pred = model.predict(x_test)
    # ravel(): the accumulator is 1-D, while the validation code above
    # compares model.predict output against (n, 1) targets.
    target += np.ravel(pred) / len(models)
target

In [None]:
# Submission frame: the test row ids paired with the averaged predictions.
y_pred = df_test[['row_id']].assign(target=target)
y_pred

In [None]:
# Display the submission frame again before writing it to disk.
y_pred

In [None]:
# Write the predictions to submission.csv without the index column.
y_pred.to_csv('submission.csv',index = False)

In [None]:
#  import os
# os.listdir()