### Change Log
* Version 40: LGBM: stock clust + time clust; NN: stock clust; TabNet: time clust
* Version 41: LGBM: time clust; NN: stock clust; TabNet: time clust
* Version 42: Clipping min=0
* Version 44: Ensembling - Random Forest
* Version 45: Ensembling - ElasticNet with correlated stocks
* Version 46: FE with enhanced error handling
* Version 47: Ensembling - ElasticNet with correlated stocks (not forcing positive weights)
* Version 48: Ensembling - Random Forest with correlated stocks
* Version 49: Ensembling - ElasticNet with correlated stocks (not forcing positive weights)
* Version 50: Testing of new FE dataset with syn testing (only run LGBM)
* Version 51: Removing >0.99 correlated features
* Version 53:
    - Optimized LGBM + TabNet params
    - Remove correlated feats for TabNet
    - Ensembling - Random Forest with correlated stocks
* Version 54: ElasticNet

Feature:
* Stock / Time clustering - directly use the cluster centroid as a feature
* Correlation between time series as a feature
* Remove highly correlated features
* Quarticity https://www.kaggle.com/c/optiver-realized-volatility-prediction/discussion/267096

Model:
* RF meta model
* Add correlated stock prediction to meta model
* Final HP Tuning of all models
* AutoEncoder+1dCNN from public
* 1D CNN
* 2D CNN https://towardsdatascience.com/how-to-encode-time-series-into-images-for-financial-forecasting-using-convolutional-neural-networks-5683eb5c53d9
    - GAF for time series to image transformation
    - 11 features corresponding to 11 channels
* LSTM + Dense layer regression

Ensembling:

In [1]:
# Run mode: False trains every model in this notebook; True makes the
# training functions load pre-trained artefacts from BASE_MODEL_PATH instead.
INFERENCE = False
# NOTE(review): TEST_MODE is not referenced in the visible part of the notebook — confirm it is still needed.
TEST_MODE = 'test'

# Folder holding the feature-engineering outputs (train*.f / test*.f feather files).
FE_PATH = '/kaggle/input/volatility-fe-output-version-15'
BASE_MODEL_PATH = '/kaggle/input/volatility-model-training-output-version-22' # this is for fixing the pre-trained base models (for submission only)

# Seed shared by all models and number of cross-validation folds.
SEED = 1111
N_FOLD = 5

# Upper bounds on training length; early stopping normally ends training sooner.
LGBM_NUM_BOOST = 3000
NN_EPOCH = 1000
TABNET_EPOCH = 1000
# Smoke-test settings: uncomment to run the whole pipeline for a single iteration.
# LGBM_NUM_BOOST = 1
# NN_EPOCH = 1
# TABNET_EPOCH = 1

In [2]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np
import glob
import os
import gc
import datetime
import pickle

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib
import random
pd.set_option('display.max_columns', None)

# set seed
def seed_everything(seed=SEED):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, the hash seed environment variable, NumPy,
    PyTorch (CPU and current CUDA device) and TensorFlow, and switches cuDNN
    to deterministic mode.

    Parameters
    ----------
    seed : int
        Seed value applied to all libraries (defaults to the notebook SEED).
    """
    import torch
    import random
    import os
    import numpy as np
    import tensorflow as tf
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Use the argument rather than the global SEED so a caller-supplied seed
    # reaches TensorFlow as well.
    tf.random.set_seed(seed)
seed_everything()

In [3]:
# df = pd.DataFrame({'a':[1,2,3,4,5], 'b':['a','a','b','b','c']})
# df.groupby('b')['a'].agg(lambda s:s.max()-s.min())

In [4]:
# import from FE script
# Loads the engineered train/test feature tables used by the LGBM model.
# NOTE(review): nothing is loaded when INFERENCE is True, yet later cells read
# `train` / `test` unconditionally — confirm how the inference run gets its data.
if INFERENCE==False:
    train = pd.read_feather(os.path.join(FE_PATH, 'train.f'))
    test = pd.read_feather(os.path.join(FE_PATH, 'test.f'))
    print(f'Train data shape is {train.shape}')
    print(f'Test data shape is {test.shape}')

Train data shape is (428932, 305)
Test data shape is (3, 304)


In [5]:
# define the keys of each dataset which are preserved in the whole notebook
# train_keys / test_keys are later concatenated column-wise with the model
# predictions, so they are re-indexed from 0 here.
train_key_cols = ['stock_id','time_id','row_id','target']
test_key_cols = ['stock_id','time_id','row_id']
train_keys = train[train_key_cols].reset_index(drop=True)
test_keys = test[test_key_cols].reset_index(drop=True)
# Training target; kept as a separate Series because `train` is deleted after the LGBM section.
y = train['target']

In [6]:
# Sanity check: the feature table and the target must have the same number of rows.
print(train.shape)
print(y.shape)

(428932, 305)
(428932,)


# LGBM Model

In [7]:
%%time
from sklearn.model_selection import KFold, GroupKFold
import lightgbm as lgb

# Feature list for LightGBM: every column except the row keys / target and
# any column whose name contains 'ts_ae'.
lgb_features = [col for col in train.columns if col not in ["time_id", "target", "row_id"] and 'ts_ae' not in col]

# LightGBM hyper-parameters. The change log at the top of the notebook
# mentions optimized LGBM params; the search itself is not part of this notebook.
params0 = {
    'objective': 'rmse',
    'boosting_type': 'gbdt',
    'max_depth': 60,
    'max_bin': 250,
    'min_data_in_leaf':422,
    'learning_rate': 0.014565641020775516,
    'subsample': 0.6563801192981948,
    'subsample_freq': 1,
    'feature_fraction': 0.525513036358404,
    'lambda_l1': 8.177995270216595,
    'lambda_l2': 3.8822889556906657,
    # Positions (within lgb_features) of the columns treated as categorical.
    # list.index raises ValueError if any of these columns is missing.
    'categorical_column': [lgb_features.index('stock_id'),
                           lgb_features.index('clust_wap1_sma50'),
                           lgb_features.index('clust_wap1_sms50'),
                           lgb_features.index('clust_total_volume_sma60'),
                           lgb_features.index('clust_volume_imbalance_sma80'),
                           lgb_features.index('stock_clustering_label'),
                           lgb_features.index('time_clustering_label')],
    # All LightGBM random sources share the notebook seed for reproducibility.
    'seed': SEED,
    'feature_fraction_seed': SEED,
    'bagging_seed': SEED,
    'drop_seed': SEED,
    'data_random_seed': SEED,
    'n_jobs':-1,
    'verbose': -1}

# Metric helpers: RMSPE and its LightGBM `feval` wrapper (used for early stopping)
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of `y_pred` against `y_true`."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Returns the (metric name, value, is_higher_better) triple; the last
    element is False because a lower RMSPE is better.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

def train_and_evaluate_lgb(train, test, params):
    """Train (or load) one LightGBM model per fold and predict OOF / test targets.

    Folds come from GroupKFold grouped on `train_keys.time_id`, so all rows of
    a time_id fall in the same fold. Uses the notebook globals `y`,
    `train_keys`, `lgb_features`, `N_FOLD`, `INFERENCE`, `LGBM_NUM_BOOST`
    and `BASE_MODEL_PATH`.

    Parameters
    ----------
    train : DataFrame with at least the `lgb_features` columns.
    test : DataFrame with at least the `lgb_features` columns.
    params : dict of LightGBM parameters.

    Returns
    -------
    oof_pred : ndarray, one out-of-fold prediction per training row
        (left at zero when INFERENCE is True, as no validation is run).
    test_pred : list with one array of test predictions per fold.
    """
    print(f'Number of features considered for LightGBM is {len(lgb_features)}')
    oof_pred = np.zeros(train.shape[0])
    test_pred = []
    kfold = GroupKFold(n_splits=N_FOLD)
    for fold, (trn_idx, val_idx) in enumerate(kfold.split(train, y, train_keys.time_id)):

        if INFERENCE==False:
            print(f'Training fold {fold}')
            x_train, x_val = train.iloc[trn_idx], train.iloc[val_idx]
            y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]
            # Weighting the squared error by 1/y^2 turns it into a squared
            # percentage error, matching the RMSPE metric.
            train_weights = 1 / np.square(y_train)
            val_weights = 1 / np.square(y_val)
            train_dataset = lgb.Dataset(x_train[lgb_features], y_train, weight=train_weights)
            val_dataset = lgb.Dataset(x_val[lgb_features], y_val, weight=val_weights)
            model = lgb.train(params=params,
                              num_boost_round=LGBM_NUM_BOOST,
                              train_set=train_dataset,
                              valid_sets=[train_dataset, val_dataset],
                              verbose_eval=250,
                              early_stopping_rounds=50,
                              feval=feval_rmspe)
            # predictions
            oof_pred[val_idx] = model.predict(x_val[lgb_features])
            test_pred.append(model.predict(test[lgb_features]))
            # save model (context manager guarantees the file is flushed and closed)
            with open(f'lgbm_fold{fold}.p', 'wb') as model_file:
                pickle.dump(model, model_file)

        elif INFERENCE==True:
            print(f'Inferring fold {fold}')
            # NOTE: pickle.load can execute arbitrary code — only point
            # BASE_MODEL_PATH at model files you produced yourself.
            with open(os.path.join(BASE_MODEL_PATH, f'lgbm_fold{fold}.p'), 'rb') as model_file:
                model = pickle.load(model_file)
            test_pred.append(model.predict(test[lgb_features]))
    # Return test predictions
    return oof_pred, test_pred

# Train and evaluate
oof_pred_lgb, test_pred_lgb = train_and_evaluate_lgb(train, test, params0)
# Overall CV score: RMSPE of the out-of-fold predictions over all training rows.
cv_score_lgb = round(rmspe(train['target'], oof_pred_lgb), 5)
print(f'LGBM averaged CV score is {cv_score_lgb}')

# Free the LGBM feature table; `y` and `train_keys` are kept for later cells.
del train
gc.collect()

Number of features considered for LightGBM is 270
Training fold 0
Training until validation scores don't improve for 50 rounds
[250]	training's rmse: 0.000460666	training's RMSPE: 0.213388	valid_1's rmse: 0.000487132	valid_1's RMSPE: 0.224663
[500]	training's rmse: 0.000435272	training's RMSPE: 0.201625	valid_1's rmse: 0.000473884	valid_1's RMSPE: 0.218553
[750]	training's rmse: 0.000422306	training's RMSPE: 0.195619	valid_1's rmse: 0.000470266	valid_1's RMSPE: 0.216884
[1000]	training's rmse: 0.000413041	training's RMSPE: 0.191327	valid_1's rmse: 0.000468468	valid_1's RMSPE: 0.216055
[1250]	training's rmse: 0.000405777	training's RMSPE: 0.187962	valid_1's rmse: 0.000467177	valid_1's RMSPE: 0.21546
[1500]	training's rmse: 0.000400151	training's RMSPE: 0.185357	valid_1's rmse: 0.0004664	valid_1's RMSPE: 0.215101
[1750]	training's rmse: 0.000395209	training's RMSPE: 0.183067	valid_1's rmse: 0.000465934	valid_1's RMSPE: 0.214886
[2000]	training's rmse: 0.000391	training's RMSPE: 0.181118	

171

# NN Model

In [8]:
# Load the feature tables prepared for the neural network.
# NOTE(review): skipped when INFERENCE is True although later cells use
# `train_nn` / `test_nn` unconditionally — confirm the inference path.
if INFERENCE==False:
    train_nn = pd.read_feather(os.path.join(FE_PATH, 'train_nn.f'))
    test_nn = pd.read_feather(os.path.join(FE_PATH, 'test_nn.f'))

In [9]:
from numpy.random import seed
import tensorflow as tf
from tensorflow import keras
import numpy as np
from keras import backend as K
# Re-seed NumPy and TensorFlow right before the NN section.
seed(SEED)
tf.random.set_seed(SEED)

# Keras loss: root mean squared percentage error, built from backend ops.
def root_mean_squared_per_error(y_true, y_pred):
         return K.sqrt(K.mean(K.square( (y_true - y_pred)/ y_true )))

# Stop after 20 epochs without val_loss improvement and restore the best weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=20, verbose=0,
    mode='min',restore_best_weights=True)

# Multiply the learning rate by 0.2 after 7 epochs without val_loss improvement.
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.2, patience=7, verbose=0,
    mode='min')

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [10]:
# kfold based on the knn++ algorithm
# Builds N_FOLD groups of time_ids (stored in `values`) by sampling rows of the
# time_id x stock_id target matrix with probability proportional to their
# accumulated squared distance from the rows already placed in a fold.
# The result depends on the NumPy global random state.
out_train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
out_train = out_train[out_train.time_id.isin(train_nn.time_id.unique())]
# One row per time_id, one column per stock; missing targets get the stock's mean.
out_train = out_train.pivot(index='time_id', columns='stock_id', values='target')
out_train = out_train.fillna(out_train.mean())
# code to add the just the read data after first execution
# data separation based on knn ++
index = []    # not used in this cell
totDist = []  # per fold: accumulated squared distance of every remaining row
values = []   # per fold: row positions (later time_ids) assigned to the fold
# generates a matrix with the target values, scaled per stock to [-1, 1]
mat = out_train.values
scaler = MinMaxScaler(feature_range=(-1, 1))
mat = scaler.fit_transform(mat)
# Each fold receives nind rows; if the row count is not a multiple of N_FOLD
# the remainder is never assigned to any fold.
nind = int(mat.shape[0] / N_FOLD) # number of individuals
# adds index in the last column (keeps the original row position while rows are deleted)
mat = np.c_[mat,np.arange(mat.shape[0])]
# One random starting row per fold, sorted descending so that deleting them
# one after another does not shift the positions still to be deleted.
lineNumber = np.random.choice(np.array(mat.shape[0]), size=N_FOLD, replace=False)
lineNumber = np.sort(lineNumber)[::-1]
for n in range(N_FOLD):
    totDist.append(np.zeros(mat.shape[0]-N_FOLD))
# saves index
for n in range(N_FOLD):
    values.append([lineNumber[n]])
# s[k] holds the row most recently added to fold k
s=[]
for n in range(N_FOLD):
    s.append(mat[lineNumber[n],:])
    mat = np.delete(mat, obj=lineNumber[n], axis=0)
for n in range(nind-1):
    # one uniform draw per fold for the roulette-wheel selection below
    luck = np.random.uniform(0,1,N_FOLD)
    for cycle in range(N_FOLD):
        # tile the fold's latest row so it can be subtracted from every remaining row
        s[cycle] = np.matlib.repmat(s[cycle], mat.shape[0], 1)
        # squared distance to the latest row (last column = row index, excluded)
        sumDist = np.sum( (mat[:,:-1] - s[cycle][:,:-1])**2 , axis=1)
        totDist[cycle] += sumDist
        # probabilities
        f = totDist[cycle]/np.sum(totDist[cycle]) # normalizing the totdist
        j = 0
        kn = 0
        # walk the cumulative distribution until it exceeds the random draw
        # NOTE(review): if rounding keeps the cumulative sum below the draw,
        # kn ends one past the last row — confirm this cannot happen in practice.
        for val in f:
            j += val
            if (j > luck[cycle]): # the row was selected
                break
            kn +=1
        lineNumber[cycle] = kn
        # delete the selected row's entry from every fold's distance vector
        for n_iter in range(N_FOLD):
            totDist[n_iter] = np.delete(totDist[n_iter],obj=lineNumber[cycle], axis=0)
            j= 0
        s[cycle] = mat[lineNumber[cycle],:]
        # record the original row position (stored in the last column)
        values[cycle].append(int(mat[lineNumber[cycle],-1]))
        mat = np.delete(mat, obj=lineNumber[cycle], axis=0)
# translate row positions into the corresponding time_ids
for n_mod in range(N_FOLD):
    values[n_mod] = out_train.index[values[n_mod]]

In [11]:
# NN Model Architecture

#https://bignerdranch.com/blog/implementing-swish-activation-function-in-keras/
from keras.backend import sigmoid
from keras.utils.generic_utils import get_custom_objects
from keras.layers import Activation

def swish(x, beta = 1):
    """Swish activation: the input scaled by sigmoid(beta * input)."""
    gate = sigmoid(beta * x)
    return x * gate
# Register the activation so layers can refer to it by the name 'swish'.
get_custom_objects().update({'swish': Activation(swish)})

def create_nn_model(hidden_units):
    """Build and compile the feed-forward network used as a base model.

    Inputs: a dense vector of `len(numerical_feats)` numerical features plus
    two integer inputs (`stock_id`, `stock_clustering_label`) that go through
    embedding layers. Embedding sizes are read from the notebook globals
    `stock_id_orig_dim` / `stock_id_emb_dim` and
    `stock_clustering_label_orig_dim` / `stock_clustering_label_emb_dim`.

    Parameters
    ----------
    hidden_units : iterable of int
        Width of each hidden Dense layer, in order.

    Returns
    -------
    A compiled keras.Model with a single linear output, trained with the
    RMSPE loss and Adam (learning rate 0.006).
    """
    # initialize input list (later to be concatenated)
    raw_input_list = []
    concat_input_list = []

    # Numerical features enter the network unchanged
    num_input = keras.Input(shape=(len(numerical_feats),), name='num_data')
    raw_input_list.append(num_input)
    concat_input_list.append(num_input)

    # stock_id embedding
    stock_id_input = keras.Input(shape=(1,), name='stock_id')
    stock_id_embedded = keras.layers.Embedding(stock_id_orig_dim, stock_id_emb_dim, input_length=1, name='stock_id_embedding')(stock_id_input)
    stock_id_flattened = keras.layers.Flatten()(stock_id_embedded)
    raw_input_list.append(stock_id_input)
    concat_input_list.append(stock_id_flattened)

    # stock clustering embedding
    stock_clustering_label_input = keras.Input(shape=(1,), name='stock_clustering_label')
    stock_clustering_label_embedded = keras.layers.Embedding(stock_clustering_label_orig_dim, stock_clustering_label_emb_dim, input_length=1, name='stock_clustering_label_embedding')(stock_clustering_label_input)
    stock_clustering_label_flattened = keras.layers.Flatten()(stock_clustering_label_embedded)
    raw_input_list.append(stock_clustering_label_input)
    concat_input_list.append(stock_clustering_label_flattened)

    # Disabled: time clustering embedding (the NN uses stock clustering only)
#     # time clustering embedding
#     time_clustering_label_input = keras.Input(shape=(1,), name='time_clustering_label')
#     time_clustering_label_embedded = keras.layers.Embedding(time_clustering_label_orig_dim, time_clustering_label_emb_dim, input_length=1, name='time_clustering_label_embedding')(time_clustering_label_input)
#     time_clustering_label_flattened = keras.layers.Flatten()(time_clustering_label_embedded)
#     raw_input_list.append(time_clustering_label_input)
#     concat_input_list.append(time_clustering_label_flattened)

    # concatenate the numerical input with the flattened embeddings
    x = keras.layers.Concatenate()(concat_input_list)

    # Add one hidden layer per entry of hidden_units ('swish' is the custom activation registered above)
    for n_hidden in hidden_units:
        x = keras.layers.Dense(n_hidden, activation='swish')(x)

    # A single linear output: the predicted target
    out = keras.layers.Dense(1, activation='linear', name='prediction')(x)

    model = keras.Model(inputs=raw_input_list, outputs=out)
    model.compile(keras.optimizers.Adam(learning_rate=0.006), loss=root_mean_squared_per_error)
    return model

In [12]:
%%time
# One NN variant is trained per tuple; each tuple lists the hidden layer widths.
hidden_units_list = [(240,200,160,120,80,40,20), (220,180,140,100,50,25), (200,150,100,50,25), (180,120,60,30)]
# hidden_units_list = [(4,2), (4,2), (4,2), (4,2)]
# hidden_units_list = [(128,64,32)]
# Embedding sizes as (input dimension, embedding dimension); the input dimension
# must exceed the largest encoded value — presumably verified against the data, TODO confirm.
stock_id_orig_dim, stock_id_emb_dim = 112, 24
stock_clustering_label_orig_dim, stock_clustering_label_emb_dim = 6, 4
# time_clustering_label_orig_dim, time_clustering_label_emb_dim = 8, 4
# Numerical inputs: every column that is neither a key/target column nor a clustering label.
numerical_feats = [c for c in train_nn if c not in train_key_cols and 'clustering_label' not in c]
train_nn_stock_id = train_nn['stock_id']
train_nn_stock_clustering_label = train_nn['stock_clustering_label']
# train_nn_time_clustering_label = train_nn['time_clustering_label']

def train_and_eval_nn():
    """Train (or load) one NN per hidden-layer configuration and fold.

    Folds are the time_id groups in the notebook global `values`. Also uses
    the globals `train_nn`, `test_nn`, `numerical_feats`, `hidden_units_list`,
    `N_FOLD`, `NN_EPOCH`, `INFERENCE`, `BASE_MODEL_PATH`, `es` and `plateau`.
    Row labels of `train_nn` are used as positions, so its index is expected
    to be the default 0..n-1 range.

    Returns
    -------
    oof_pred_nn_list : list with one out-of-fold prediction array per
        configuration (zeros when INFERENCE is True).
    test_pred_nn_list : list with, per configuration, a list of per-fold test
        prediction arrays (clipped to be non-negative).
    """
    oof_pred_nn_list = []
    test_pred_nn_list = []
    for h in range(len(hidden_units_list)):
        # initialize predictions and scores
        oof_pred_nn = np.zeros(train_nn.shape[0])
        fold_scores = []
        test_pred_nn = []
        for fold in range(N_FOLD):
            if INFERENCE==False:
                print(f'Training fold {fold}...')
                # train-test split: train on the time_ids of every fold except
                # `fold`. Built for any N_FOLD (the previous version indexed
                # exactly four training folds and only worked for N_FOLD == 5).
                indexes = np.concatenate([np.asarray(values[i]) for i in range(N_FOLD) if i != fold])
                trn_idx = train_nn[train_nn.time_id.isin(indexes)].index.tolist()
                val_idx = train_nn[train_nn.time_id.isin(values[fold])].index.tolist()
                X_train = train_nn.iloc[trn_idx,:][numerical_feats + ['stock_id','stock_clustering_label']]
                y_train = train_nn.iloc[trn_idx,:]['target']
                X_valid = train_nn.iloc[val_idx,:][numerical_feats + ['stock_id','stock_clustering_label']]
                y_valid = train_nn.iloc[val_idx,:]['target']

                # define numerical and categorical data for train set
                X_train_num = X_train[numerical_feats].values
                X_train_stock_id = X_train['stock_id']
                X_train_stock_clustering_label = X_train['stock_clustering_label']
#                 X_train_time_clustering_label = X_train['time_clustering_label']

                # define numerical and categorical data for validation set
                X_valid_num = X_valid[numerical_feats].values
                X_valid_stock_id = X_valid['stock_id']
                X_valid_stock_clustering_label = X_valid['stock_clustering_label']
#                 X_valid_time_clustering_label = X_valid['time_clustering_label']

                # model training
                model = create_nn_model(hidden_units=hidden_units_list[h])
                model.fit([X_train_num, X_train_stock_id, X_train_stock_clustering_label],
                          y_train,
                          batch_size=2048,
                          epochs=NN_EPOCH,
                          validation_data=([X_valid_num, X_valid_stock_id, X_valid_stock_clustering_label], y_valid),
                          callbacks=[es, plateau],
                          # whole validation set in one batch so val_loss is the RMSPE of the full fold
                          validation_batch_size=len(y_valid),
                          shuffle=True,
                          verbose=1)

                # validation result
                val_pred = model.predict([X_valid_num, X_valid_stock_id, X_valid_stock_clustering_label]).reshape(1,-1)[0]
                oof_pred_nn[val_idx] = val_pred
                score = round(rmspe(y_valid, val_pred),5)
                fold_scores.append(score)
                print('Fold {}: {}'.format(fold, score))

                # test data prediction
                test_pred_nn.append(model.predict([test_nn[numerical_feats].values, test_nn['stock_id'], test_nn['stock_clustering_label']]).reshape(1,-1)[0].clip(0,1e10))
                # save model
                model.save(f'nn_layer{h}_fold{fold}')

            elif INFERENCE==True:
                print(f'Inferring layer {h} fold {fold}')
                path = os.path.join(BASE_MODEL_PATH, f'nn_layer{h}_fold{fold}')
                # compile=False: the custom loss is not needed for prediction
                model = tf.keras.models.load_model(path, compile=False)
                test_pred_nn.append(model.predict([test_nn[numerical_feats].values, test_nn['stock_id'], test_nn['stock_clustering_label']]).reshape(1,-1)[0].clip(0,1e10))

            # release the Keras graph of this fold before building the next model
            tf.keras.backend.clear_session()
            gc.collect()

        # check OOF data quality
        print('Hidden units is ', hidden_units_list[h])
        print('Individual folds score is', fold_scores)
        oof_pred_nn_list.append(oof_pred_nn)
        test_pred_nn_list.append(test_pred_nn)
    return oof_pred_nn_list, test_pred_nn_list

# Train every NN variant on every fold and collect OOF / test predictions.
oof_pred_nn_list, test_pred_nn_list = train_and_eval_nn()

# The NN training table is no longer needed after this point.
del train_nn
gc.collect()

Training fold 0...
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Fold 0: 0.2163
Training fold 1...
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4

0

# TabNet

In [13]:
# Load the feature tables prepared for TabNet.
# NOTE(review): skipped when INFERENCE is True although later cells use
# `train_tbn` / `test_tbn` unconditionally — confirm the inference path.
if INFERENCE==False:
    train_tbn = pd.read_feather(os.path.join(FE_PATH, 'train_tbn.f'))
    test_tbn = pd.read_feather(os.path.join(FE_PATH, 'test_tbn.f'))

In [14]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.matlib

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

from scipy import stats
from scipy.stats import norm
from joblib import Parallel, delayed

import shutil
import glob

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
import psutil
# Report the hardware of the session: CPU count and the nvidia-smi summary.
print(psutil.cpu_count())
# IPython shell capture: returns the command output as a list of lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

2
Mon Sep 27 13:07:46 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    32W / 250W |  15425MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proc

In [15]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error (re-declared here with the same formula as in the LGBM section)."""
    pct_error = (y_true - y_pred) / y_true
    squared = np.square(pct_error)
    return np.sqrt(np.mean(squared))

class RMSPE(Metric):
    """TabNet evaluation metric: root mean squared percentage error (minimized)."""
    def __init__(self):
        # lower is better, so the metric must not be maximized
        self._maximize = False
        self._name = "rmspe"
    def __call__(self, y_true, y_score):
        pct_error = (y_true - y_score) / y_true
        return np.sqrt(np.mean(np.square(pct_error)))

def RMSPELoss(y_pred, y_true):
    """Torch loss: root mean squared percentage error of `y_pred` against `y_true`.

    Note the argument order (prediction first, target second).
    """
    pct_error = (y_true - y_pred) / y_true
    mean_squared = torch.mean(pct_error ** 2)
    return torch.sqrt(mean_squared).clone()

In [16]:
# identify categorical and numerical columns
# The list of highly correlated features to drop is produced outside this notebook.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
# The context manager closes the file handle (it was previously left open).
with open(os.path.join('/kaggle/input/volatility-correlated-features', f'correlated_features.p'), 'rb') as corr_feats_file:
    correlated_features = pickle.load(corr_feats_file)
cat_cols = ['stock_id','time_clustering_label']
# numerical columns: no keys/target, no clustering labels, no highly correlated features
num_cols = [c for c in train_tbn if c not in ['stock_id','time_id','row_id','target'] and 'clustering_label' not in c and c not in correlated_features]
# keep the column order of train_tbn so that cat_idxs match the model input
tabnet_feats = [c for c in train_tbn if c in cat_cols + num_cols]

# define categorical features index and dimensions for TabNet params
# (cat_dims / cat_emb_dim are listed in the same order as cat_cols)
cat_idxs = [tabnet_feats.index(c) for c in cat_cols]
cat_dims = [112, 8]
cat_emb_dim = [24, 4]

# Keyword arguments passed to TabNetRegressor in the training cell.
tabnet_params = dict(
    # categorical feature positions, cardinalities and embedding sizes
    cat_idxs = cat_idxs,
    cat_dims = cat_dims,
    cat_emb_dim = cat_emb_dim,
    # network architecture
    n_d = 24,
    n_a = 24,
    n_steps = 1,
    gamma = 2.0,
    n_independent = 2,
    n_shared = 2,
    lambda_sparse = 2.166158737727093e-06,
    mask_type = "entmax",
    # optimisation: Adam, learning rate halved after 3 epochs without improvement (floor 1e-5)
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    scheduler_params=dict(mode="min", patience=3, min_lr=1e-5, factor=0.5),
    seed = SEED,
    # log every 10 epochs
    verbose = 10
)

In [17]:
%%time

import os
import zipfile
def zip_directory(folder_path, zip_path):
    """Write every file under `folder_path` into a new zip archive at `zip_path`.

    Archive member names are the file paths with the `folder_path` prefix
    removed, so the archive mirrors the folder's internal layout.
    """
    prefix_length = len(folder_path)
    with zipfile.ZipFile(zip_path, mode='w') as archive:
        for current_dir, _subdirs, filenames in os.walk(folder_path):
            for filename in filenames:
                source = os.path.join(current_dir, filename)
                archive.write(source, source[prefix_length:])
                
def train_and_eval_tabnet():
    """Train (or load) one TabNet regressor per fold and predict OOF / test targets.

    Folds come from GroupKFold grouped on `train_tbn.time_id`. Reads the
    notebook globals `train_tbn`, `test_tbn`, `tabnet_feats`, `tabnet_params`,
    `N_FOLD`, `TABNET_EPOCH`, `INFERENCE` and `BASE_MODEL_PATH`.

    Returns the out-of-fold prediction array (zeros when INFERENCE is True)
    and a list holding one test prediction array per fold.
    """
    test_pred_tbn = []
    oof_pred_tbn = np.zeros(train_tbn.shape[0])
    splitter = GroupKFold(n_splits=N_FOLD)
    fold_splits = splitter.split(train_tbn, train_tbn.target, train_tbn.time_id)
    for fold, (trn_idx, val_idx) in enumerate(fold_splits):
        if INFERENCE==True:
            # The saved model is stored as an unpacked folder; re-zip it
            # locally because load_model is given a zip archive.
            input_path = os.path.join(BASE_MODEL_PATH, f'TabNet_fold{fold}')
            output_filename = f'TabNet_fold{fold}.zip'
            zip_directory(input_path, output_filename)
            model = TabNetRegressor()
            model.load_model(output_filename)
            fold_test_pred = model.predict(test_tbn[tabnet_feats].values).flatten()
            test_pred_tbn.append(fold_test_pred)

        elif INFERENCE==False:
            print(f'Training fold {fold}......')
            X_train = train_tbn[tabnet_feats].iloc[trn_idx].values
            X_val = train_tbn[tabnet_feats].iloc[val_idx].values
            # targets reshaped into a single column
            y_train = train_tbn.target.iloc[trn_idx].values.reshape(-1,1)
            y_val = train_tbn.target.iloc[val_idx].values.reshape(-1,1)

            fit_kwargs = dict(
                eval_set=[(X_val, y_val)],
                max_epochs = TABNET_EPOCH,
                patience = 10,
                batch_size = 1024,
                virtual_batch_size = 128,
                num_workers = 0,
                drop_last = False,
                eval_metric=[RMSPE],
                loss_fn=RMSPELoss,
            )
            model = TabNetRegressor(**tabnet_params)
            model.fit(X_train, y_train, **fit_kwargs)
            # saving model
            model.save_model(f"TabNet_fold{fold}")
            # predictions
            oof_pred = model.predict(X_val).flatten()
            oof_pred_tbn[val_idx] = oof_pred
            val_rmspe = rmspe(y_val.flatten(), oof_pred)
            print(f'TabNet fold {fold} RMSPE is {val_rmspe}')
            fold_test_pred = model.predict(test_tbn[tabnet_feats].values).flatten()
            test_pred_tbn.append(fold_test_pred)

    return oof_pred_tbn, test_pred_tbn

# Train TabNet on every fold and collect OOF / test predictions.
oof_pred_tbn, test_pred_tbn = train_and_eval_tabnet()
# Score against the labels of the frame the OOF predictions were produced on
# (previously `y`, which was taken from the separately loaded LGBM table and
# only matches if both tables share the same row order).
print(f'OOF score across folds: {rmspe(train_tbn.target, oof_pred_tbn)}')

# The TabNet training table is no longer needed after this point.
del train_tbn
gc.collect()

Training fold 0......
Device used : cuda
epoch 0  | loss: 12.91826| val_0_rmspe: 1.67432 |  0:00:17s
epoch 10 | loss: 0.22326 | val_0_rmspe: 0.28279 |  0:03:09s
epoch 20 | loss: 0.20608 | val_0_rmspe: 0.21445 |  0:06:05s

Early stopping occurred at epoch 27 with best_epoch = 17 and best_val_0_rmspe = 0.213
Best weights from best epoch are automatically used!
Successfully saved model at TabNet_fold0.zip
TabNet fold 0 RMSPE is 0.213002423997988
Training fold 1......
Device used : cuda
epoch 0  | loss: 10.06206| val_0_rmspe: 0.29637 |  0:00:16s
epoch 10 | loss: 0.22492 | val_0_rmspe: 0.23883 |  0:03:09s
epoch 20 | loss: 0.21748 | val_0_rmspe: 0.22099 |  0:06:03s
epoch 30 | loss: 0.21325 | val_0_rmspe: 0.22037 |  0:08:58s
epoch 40 | loss: 0.20847 | val_0_rmspe: 0.2221  |  0:11:52s
epoch 50 | loss: 0.2045  | val_0_rmspe: 0.21874 |  0:14:46s

Early stopping occurred at epoch 55 with best_epoch = 45 and best_val_0_rmspe = 0.21755
Best weights from best epoch are automatically used!
Successful

904

# Ensembling

In [18]:
# combining oof set
# One column of out-of-fold predictions per base model, joined positionally
# with the training keys.
# NOTE(review): this assumes the LGBM, NN and TabNet training tables share the
# same row order as `train_keys` — confirm against the FE script.
train_meta = pd.DataFrame(np.column_stack([oof_pred_lgb] + [oof for oof in oof_pred_nn_list] + [oof_pred_tbn]),
                                columns=['lgb'] + [f'nn_layer{i}' for i in range(len(hidden_units_list))] + ['tabnet'])
train_meta = pd.concat([train_keys, train_meta], axis=1).reset_index(drop=True)

# combining test set
# Each model's test prediction is the mean over its per-fold predictions.
# These assignments replace the per-fold lists with DataFrames, so this cell
# cannot be re-run without re-running the training cells first.
test_pred_lgb = pd.DataFrame(np.mean(np.column_stack(test_pred_lgb), axis=1), columns=['lgb'])
test_pred_nn = [pd.DataFrame(np.mean(np.column_stack(test_pred_nn_list[i]), axis=1), columns=[f'nn_layer{i}']) for i in range(len(hidden_units_list))]
test_pred_tbn = pd.DataFrame(np.mean(np.column_stack(test_pred_tbn), axis=1), columns=['tabnet'])
test_meta = pd.concat([test_keys] + [test_pred_lgb] + test_pred_nn + [test_pred_tbn], axis=1).reset_index(drop=True)
print(f'Shape of meta test is {test_meta.shape}')

Shape of meta test is (3, 9)


In [19]:
'''
Getting predictions of correlated stocks
'''
# correlation based on realized volatility (prediction target)
def get_sxt_corr_stock_mapping(data, metric, n_top, log_transform, show_distance):
    """Map every stock to the stocks whose `metric` series correlates most with it.

    Parameters
    ----------
    data : DataFrame with columns `stock_id`, `time_id` and `metric`.
    metric : name of the column to correlate across stocks.
    n_top : number of nearest stocks to keep per stock (capped at the number
        of other stocks available).
    log_transform : if truthy, correlate the logarithm of the metric.
    show_distance : if truthy, add the correlation value of each pair.

    Returns
    -------
    DataFrame with columns `nearest_stocks`, optionally `nearest_stocks_corr`,
    and `stock_id`; rows are ordered by stock and by decreasing correlation.
    """
    n_top = max(0, min(len(data.stock_id.unique())-1, n_top))
    # calculate correlations of the time_id x stock_id matrix
    pivot = pd.pivot_table(data, values=metric, index='time_id', columns='stock_id', aggfunc='sum')
    if log_transform:
        pivot = np.log(pivot)
    corr = pivot.corr()
    # compile mapping table
    mapping = []
    for stock_id in corr.columns:
        # Exclude the stock itself by label. Skipping the first sorted entry
        # instead is wrong when another stock ties with the self-correlation
        # (the stock could then be listed as its own neighbour).
        nearest = corr[stock_id].drop(stock_id).sort_values(ascending=False).head(n_top)
        df = pd.DataFrame({'nearest_stocks': nearest.index.tolist()})
        if show_distance:
            df['nearest_stocks_corr'] = nearest.tolist()
        df['stock_id'] = stock_id
        mapping.append(df)
    mapping = pd.concat(mapping, axis=0).reset_index(drop=True)
    return mapping

# generate mapping table
target = pd.read_csv('/kaggle/input/optiver-realized-volatility-prediction/train.csv')
corr_stock_mapping = get_sxt_corr_stock_mapping(data=target, metric='target', n_top=5, log_transform=True, show_distance=False)

# Base-model prediction columns, derived from the NN configurations actually
# trained (same construction as the meta tables) instead of a hard-coded list
# that only fits four NN variants.
base_pred_cols = ['lgb'] + [f'nn_layer{i}' for i in range(len(hidden_units_list))] + ['tabnet']
corr_pred_cols = [f'{c}_corr' for c in base_pred_cols]

def _mean_corr_stock_pred(mapping_with_time, meta):
    """Average, per (stock_id, time_id), the base predictions of the stock's correlated stocks."""
    neighbour_pred = meta[['stock_id','time_id'] + base_pred_cols].rename(columns={'stock_id':'nearest_stocks'})
    mean_pred = pd.merge(mapping_with_time, neighbour_pred, how='inner', on=['nearest_stocks','time_id']).\
                groupby(['stock_id','time_id'])[base_pred_cols].\
                mean().\
                reset_index()
    mean_pred.columns = ['stock_id','time_id'] + corr_pred_cols
    return mean_pred

# cross join with time_id
train_id_list = train_meta[['stock_id','time_id']].drop_duplicates()
corr_stock_mapping_train = pd.merge(corr_stock_mapping, train_id_list, how='inner',on='stock_id')[['stock_id','time_id','nearest_stocks']]
test_id_list = test_meta[['stock_id','time_id']].drop_duplicates()
corr_stock_mapping_test = pd.merge(corr_stock_mapping, test_id_list, how='inner',on='stock_id')[['stock_id','time_id','nearest_stocks']]

# calculate mean predictions of correlated stocks
corr_stock_pred_train = _mean_corr_stock_pred(corr_stock_mapping_train, train_meta)
print(f'Shape of train correlated stocks mean prediciton is {corr_stock_pred_train.shape}')
# The result is empty when none of a test stock's correlated stocks appear in
# the test set for the same time_id.
corr_stock_pred_test = _mean_corr_stock_pred(corr_stock_mapping_test, test_meta)
print(f'Shape of test correlated stocks mean prediciton is {corr_stock_pred_test.shape}')

# add to prediction table
train_meta = pd.merge(train_meta, corr_stock_pred_train, how='left', on=['stock_id','time_id'])
test_meta = pd.merge(test_meta, corr_stock_pred_test, how='left', on=['stock_id','time_id'])
# fillna (in case): column mean first, then 0 for columns that are entirely missing.
# numeric_only=True keeps the string row_id column out of the mean, which
# otherwise raises on recent pandas versions.
train_meta = train_meta.fillna(train_meta.mean(numeric_only=True)).fillna(0)
test_meta = test_meta.fillna(test_meta.mean(numeric_only=True)).fillna(0)
# release memory
del corr_stock_mapping, corr_stock_mapping_train, corr_stock_mapping_test, corr_stock_pred_train, corr_stock_pred_test
gc.collect()
# save data
if INFERENCE==False:
    train_meta.to_csv('train_meta.csv', index=False)
    test_meta.to_csv('test_meta.csv', index=False)

Shape of train correlated stocks mean prediciton is (428932, 8)
Shape of test correlated stocks mean prediciton is (0, 8)


### Ensemble method 1: Random Forest

In [20]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters of the Random Forest meta-model (passed as **rf_params).
# NOTE(review): criterion 'mse' was renamed 'squared_error' in scikit-learn 1.0 and the old
# spelling was removed in 1.2 - update this value when the environment is upgraded.
rf_params = {
    'n_estimators': 80,
    'max_depth': 9,
    'min_samples_split': 14,
    'min_samples_leaf': 32,
    'max_features': 'sqrt',
    'max_samples': 0.45,
    'criterion': 'mse',
    'random_state': SEED,
}

def train_and_eval_meta_rf():
    """Fit (or load) the Random Forest meta-model and predict on train_meta / test_meta.

    Training mode (INFERENCE is falsy): fits a RandomForestRegressor on every column of
    train_meta that is not in train_key_cols, and saves it to 'meta_rf.p'.
    Inference mode: loads 'meta_rf.p' from BASE_MODEL_PATH instead.

    Returns:
        (test_pred_meta, train_pred_meta) - note the order. train_pred_meta is predicted on
        the same rows the meta-model was fitted on, i.e. it is an in-sample prediction.
    """
    train_features = train_meta[[c for c in train_meta if c not in train_key_cols]]
    if not INFERENCE:
        # model training
        # weight = 1 / target^2, so the weighted squared error equals the squared *relative* error
        train_weights = 1 / np.square(train_meta.target)
        model = RandomForestRegressor(**rf_params)
        model.fit(X=train_features,
                  y=train_meta.target,
                  sample_weight=train_weights)
        # context manager guarantees the file is flushed and closed
        with open('meta_rf.p', 'wb') as f:
            pickle.dump(model, f)
    else:
        # pickle can execute arbitrary code on load: BASE_MODEL_PATH must be a trusted artefact
        with open(os.path.join(BASE_MODEL_PATH, 'meta_rf.p'), 'rb') as f:
            model = pickle.load(f)
    # predictions
    train_pred_meta = model.predict(train_features)
    # NOTE(review): test features are selected independently from test_meta; this assumes
    # they come out with the same columns, in the same order, as the training features.
    test_pred_meta = model.predict(test_meta[[c for c in test_meta if c not in train_key_cols]])
    return test_pred_meta, train_pred_meta

# test_pred_meta, train_pred_meta = train_and_eval_meta_rf()

CPU times: user 13 µs, sys: 0 ns, total: 13 µs
Wall time: 16.5 µs


### Ensemble method 2: Linear Regression (OLS)

In [21]:
%%time
from sklearn.linear_model import LinearRegression
def train_and_eval_meta_ols():
    """Fit (or load) the no-intercept linear-regression meta-model and predict.

    Training mode (INFERENCE is falsy): fits LinearRegression(fit_intercept=False) on every
    column of train_meta that is not in train_key_cols, prints the coefficients and saves the
    model to 'meta_ols.p'. Inference mode: loads 'meta_ols.p' from BASE_MODEL_PATH instead.

    Returns:
        (test_pred_meta, train_pred_meta) - note the order. train_pred_meta is predicted on
        the same rows the meta-model was fitted on, i.e. it is an in-sample prediction.
    """
    train_features = train_meta[[c for c in train_meta if c not in train_key_cols]]
    if not INFERENCE:
        # model training
        # weight = 1 / target^2, so the weighted squared error equals the squared *relative* error
        train_weights = 1 / np.square(train_meta.target)
        model = LinearRegression(fit_intercept=False)
        model.fit(X=train_features,
                  y=train_meta.target,
                  sample_weight=train_weights)
        print(f'Fitted coefficients are {model.coef_}')
        # context manager guarantees the file is flushed and closed
        with open('meta_ols.p', 'wb') as f:
            pickle.dump(model, f)
    else:
        # pickle can execute arbitrary code on load: BASE_MODEL_PATH must be a trusted artefact
        with open(os.path.join(BASE_MODEL_PATH, 'meta_ols.p'), 'rb') as f:
            model = pickle.load(f)
    # predictions
    train_pred_meta = model.predict(train_features)
    # NOTE(review): test features are selected independently from test_meta; this assumes
    # they come out with the same columns, in the same order, as the training features.
    test_pred_meta = model.predict(test_meta[[c for c in test_meta if c not in train_key_cols]])
    return test_pred_meta, train_pred_meta

# test_pred_meta, train_pred_meta = train_and_eval_meta_ols()

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 14.1 µs


### Ensemble method 3: Equal Weight

In [22]:
%%time
def equal_weight_meta():
    """Equal-weight blend of the three model families: NN (averaged over its variants), LGBM, TabNet.

    The NN columns are every non-key column of test_meta whose name contains 'nn', excluding
    the '*_corr' columns. Those hold the mean prediction of *correlated stocks* rather than
    the model's own prediction (and are 0-filled when no correlated stock matched), so they
    must not enter the NN average - 'lgb' and 'tabnet' likewise use only their own column.

    Returns:
        (test_pred_meta, train_pred_meta) - note the order; both are pandas Series.
    """
    nn_cols = [c for c in test_meta
               if 'nn' in c and not c.endswith('_corr') and c not in train_key_cols]
    train_pred_meta = (train_meta[nn_cols].mean(axis=1) + train_meta['lgb'] + train_meta['tabnet']) / 3
    test_pred_meta = (test_meta[nn_cols].mean(axis=1) + test_meta['lgb'] + test_meta['tabnet']) / 3
    return test_pred_meta, train_pred_meta

# test_pred_meta, train_pred_meta = equal_weight_meta()

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.82 µs


### Ensemble method 4: Weighted by CV score

In [23]:
%%time
def _inverse_cv_softmax(cv, k):
    """Softmax over 1 / (k * cv): lower CV error gives a larger weight, smaller k a sharper contrast."""
    score = (1 / k) * 1 / cv
    # Shifting by the max leaves the normalised weights unchanged but keeps exp() from
    # overflowing to inf (and the weights from turning into NaN) when cv or k is small.
    imp = np.exp(score - score.max())
    return imp / np.sum(imp)


def cv_score_weighting():
    """Blend the base models with weights derived from their out-of-fold RMSPE.

    Training mode (INFERENCE is falsy), two-level weighting:
      1. the NN variants are weighted against each other (contrast k_nn);
      2. the NN family (at its weighted-average score), LGBM and TabNet are weighted against
         each other (contrast k_base);
      3. each NN variant's final weight is its level-1 weight times the family's level-2 weight.
    The resulting {model column -> weight} dict is saved to 'model_weights.p'.
    Inference mode: the dict is loaded from BASE_MODEL_PATH instead.

    Returns:
        (test_pred_meta, train_pred_meta) - note the order.
    """
    if not INFERENCE:
        # initialize cv score tables
        base_cv = []
        nn_cv = []
        # calculate OOF score of each model
        base_cv.append(('lgb', rmspe(train_keys['target'], oof_pred_lgb)))
        for i in range(len(hidden_units_list)):
            nn_cv.append((f'nn_layer{i}', rmspe(train_keys['target'], oof_pred_nn_list[i])))
        base_cv.append(('tabnet', rmspe(train_keys['target'], oof_pred_tbn)))
        # transform list to table
        base_cv = pd.DataFrame(base_cv, columns=['model','cv'])
        nn_cv = pd.DataFrame(nn_cv, columns=['model','cv'])
        # define contrast parameter
        k_nn = 0.015
        k_base = 0.2
        # calculate NN weights
        nn_cv['weight'] = _inverse_cv_softmax(nn_cv['cv'], k_nn)
        nn_avg_cv = pd.DataFrame({'model':['nn'], 'cv':[np.sum(np.multiply(nn_cv.cv, nn_cv.weight))]})
        # calculate base model weight (including the overall NN)
        base_cv = pd.concat([base_cv, nn_avg_cv], axis=0).reset_index(drop=True)
        base_cv['weight'] = _inverse_cv_softmax(base_cv['cv'], k_base)
        # derive final weight for all models
        # .iloc[0] instead of float(<Series>), which is deprecated in pandas 2.x
        nn_family_weight = base_cv.loc[base_cv.model=='nn', 'weight'].iloc[0]
        nn_cv['weight'] = nn_cv['weight'] * nn_family_weight
        nn_cv = nn_cv[['model','weight']]
        base_cv = base_cv[base_cv.model!='nn'][['model','weight']]
        base_cv = pd.concat([base_cv, nn_cv], axis=0).reset_index(drop=True)
        model_weights = dict(base_cv.to_records(index=False))
        # context manager guarantees the file is flushed and closed
        with open('model_weights.p', 'wb') as f:
            pickle.dump(model_weights, f)
    else:
        # pickle can execute arbitrary code on load: BASE_MODEL_PATH must be a trusted artefact
        with open(os.path.join(BASE_MODEL_PATH, 'model_weights.p'), 'rb') as f:
            model_weights = pickle.load(f)
    # make predictions: weighted sum of the base-model prediction columns
    train_pred_meta = sum([train_meta[m] * model_weights[m] for m in model_weights])
    test_pred_meta = sum([test_meta[m] * model_weights[m] for m in model_weights])
    return test_pred_meta, train_pred_meta

# test_pred_meta, train_pred_meta = cv_score_weighting()

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


### Ensemble method 5: ElasticNet

In [24]:
%%time
from sklearn.linear_model import ElasticNet
def train_and_eval_meta_elasticnet():
    """Fit (or load) the ElasticNet meta-model and predict on train_meta / test_meta.

    Training mode (INFERENCE is falsy): fits a no-intercept ElasticNet with l1_ratio=0 and
    alpha=1e-10, coefficients not constrained to be positive, on every column of train_meta
    that is not in train_key_cols; prints the coefficients and saves the model to
    'meta_elasticnet.p'. Inference mode: loads that file from BASE_MODEL_PATH instead.

    Returns:
        (test_pred_meta, train_pred_meta) - note the order. train_pred_meta is predicted on
        the same rows the meta-model was fitted on, i.e. it is an in-sample prediction.
    """
    train_features = train_meta[[c for c in train_meta if c not in train_key_cols]]
    if not INFERENCE:
        # model training
        # weight = 1 / target^2, so the weighted squared error equals the squared *relative* error
        train_weights = 1 / np.square(train_meta.target)
        model = ElasticNet(fit_intercept=False, alpha=1e-10, l1_ratio=0, positive=False, random_state=SEED, max_iter=10000)
        model.fit(X=train_features,
                  y=train_meta.target,
                  sample_weight=train_weights)
        print(f'Fitted coefficients are {model.coef_}')
        # context manager guarantees the file is flushed and closed
        with open('meta_elasticnet.p', 'wb') as f:
            pickle.dump(model, f)
    else:
        # pickle can execute arbitrary code on load: BASE_MODEL_PATH must be a trusted artefact
        with open(os.path.join(BASE_MODEL_PATH, 'meta_elasticnet.p'), 'rb') as f:
            model = pickle.load(f)
    # predictions
    train_pred_meta = model.predict(train_features)
    # NOTE(review): test features are selected independently from test_meta; this assumes
    # they come out with the same columns, in the same order, as the training features.
    test_pred_meta = model.predict(test_meta[[c for c in test_meta if c not in train_key_cols]])
    return test_pred_meta, train_pred_meta

# Active ensemble: ElasticNet is the only meta-model whose call is not commented out.
# Its outputs are the train_pred_meta / test_pred_meta used by the clipping,
# scoring and submission cells that follow.
test_pred_meta, train_pred_meta = train_and_eval_meta_elasticnet()

Fitted coefficients are [ 0.06530569  0.09766328  0.22771501  0.08149597  0.19969643  0.32674456
  0.15052084  0.21272125 -0.09283636 -0.01153631 -0.19988715 -0.05826142]
CPU times: user 2min 2s, sys: 133 ms, total: 2min 2s
Wall time: 2min 3s


  positive)


# Clipping

In [25]:
# clipping: bound every prediction to the [min, max] range of the training labels
target = pd.read_csv('/kaggle/input/optiver-realized-volatility-prediction/train.csv')
min_target = target['target'].min()
max_target = target['target'].max()
# (alternative kept for reference: use 0 as the lower bound instead of the observed minimum)
# min_target, max_target = 0, target.target.max()
print(min_target, max_target)

# base-model out-of-fold predictions
oof_pred_lgb = np.clip(oof_pred_lgb, a_min=min_target, a_max=max_target)
for i in range(len(hidden_units_list)):
    clipped_nn = np.clip(oof_pred_nn_list[i], a_min=min_target, a_max=max_target)
    oof_pred_nn_list[i] = clipped_nn
oof_pred_tbn = np.clip(oof_pred_tbn, a_min=min_target, a_max=max_target)
# ensemble predictions
train_pred_meta = np.clip(train_pred_meta, a_min=min_target, a_max=max_target)
test_pred_meta = np.clip(test_pred_meta, a_min=min_target, a_max=max_target)

0.000105263 0.07032062


# Final score and submission

In [26]:
# evaluation: RMSPE of every base model's out-of-fold predictions and of the ensemble
# NOTE(review): train_pred_meta is produced by the meta-model on the rows it was fitted on,
# so the "Meta Model CV score" is an in-sample figure for the meta learner - confirm intended.
if not INFERENCE:
    y_true = train_keys['target']
    # LightGBM
    lgb_cv_score = round(rmspe(y_true, oof_pred_lgb), 5)
    print(f'LGBM CV score is {lgb_cv_score}')
    # NN (one score per hidden-layer configuration)
    for i in range(len(hidden_units_list)):
        nn_oof = oof_pred_nn_list[i]
        cv_score = round(rmspe(y_true, nn_oof), 5)
        print(f'NN{i} CV score is {cv_score}')
    # TabNet
    tbn_cv_score = round(rmspe(y_true, oof_pred_tbn), 5)
    print(f'TabNet CV score is {tbn_cv_score}')
    # Ensembled
    meta_cv_score = round(rmspe(y_true, train_pred_meta), 5)
    print(f'Meta Model CV score is {meta_cv_score}')


# submission: keep only the two columns of the submission format
test['target'] = test_pred_meta
submission_cols = ['row_id', 'target']
test = test[submission_cols].reset_index(drop=True)

LGBM CV score is 0.21466
NN0 CV score is 0.20927
NN1 CV score is 0.20983
NN2 CV score is 0.20959
NN3 CV score is 0.20975
TabNet CV score is 0.21051
Meta Model CV score is 0.2069


In [27]:
# Write submission.csv.
# - training run: always written;
# - inference run: written only when TEST_MODE is 'test', and only after the row_id order has
#   been checked against the official test.csv (a mismatch aborts with AssertionError);
# - inference run with any other TEST_MODE: nothing is written.
write_submission = not INFERENCE
if INFERENCE and TEST_MODE=='test':
    orig_test = pd.read_csv('/kaggle/input/optiver-realized-volatility-prediction/test.csv')
    assert test['row_id'].tolist()==orig_test['row_id'].tolist()
    write_submission = True
if write_submission:
    test.to_csv('submission.csv', index=False)
    display(test.head())

Unnamed: 0,row_id,target
0,0-4,0.001644
1,0-32,0.001659
2,0-34,0.001659
