In [2]:
# Random seed; used below as `random_state` when the training rows are shuffled.
SEED = 42
import numpy as np
import pandas as pd
from utils import read_train_test, get_time_stock
from optiver_lofo import OptiverLOFO, plot_importance
import itertools
import gc

  from tqdm.autonotebook import tqdm


In [3]:
# Base training frame; the second value returned by read_train_test() is not used here.
train, _ = read_train_test()

# Pre-computed book and trade feature tables, joined to each other on row_id.
book_features = pd.read_csv('../input/processed-book-ffill/df_book.csv')
trade_features = pd.read_csv('../input/processed-book-ffill/df_trade.csv')
book_trade_features = book_features.merge(trade_features, how='left', on=['row_id'])

# Left join keeps every row of the base training frame.
train = train.merge(book_trade_features, how='left', on=['row_id'])

# Drop the intermediates and reclaim their memory before building more columns.
del _, book_features, trade_features, book_trade_features
gc.collect()

# Get group stats of time_id and stock_id, then shuffle rows reproducibly.
train = (
    get_time_stock(train)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
train

Our training set has 428932 rows


Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_mean,wap1_std,wap1_median,wap2_sum,wap2_mean,...,trade_log_return_realized_volatility_300_mean_time,trade_log_return_realized_volatility_300_std_time,trade_log_return_realized_volatility_300_max_time,trade_log_return_realized_volatility_300_min_time,trade_log_return_realized_volatility_300_median_time,trade_log_return_realized_volatility_150_mean_time,trade_log_return_realized_volatility_150_std_time,trade_log_return_realized_volatility_150_max_time,trade_log_return_realized_volatility_150_min_time,trade_log_return_realized_volatility_150_median_time
0,16,9787,0.005983,16-9787,600.47920,1.000799,0.001474,1.001224,600.47980,1.000800,...,0.002483,0.000756,0.004954,0.001089,0.002451,0.003181,0.000859,0.006486,0.001580,0.003066
1,105,5773,0.002415,105-5773,600.64197,1.001070,0.000766,1.000880,600.63720,1.001062,...,0.001704,0.000593,0.004029,0.000843,0.001570,0.002196,0.000715,0.005209,0.001141,0.002004
2,58,22076,0.002863,58-22076,598.30756,0.997179,0.000907,0.996946,598.41360,0.997356,...,0.000838,0.000478,0.003798,0.000237,0.000727,0.001039,0.000544,0.004101,0.000335,0.000903
3,77,5817,0.005279,77-5817,601.05817,1.001764,0.002921,1.001746,601.09640,1.001827,...,0.004393,0.001471,0.009818,0.001796,0.004206,0.005367,0.001700,0.011079,0.002390,0.005120
4,56,14572,0.002328,56-14572,598.78840,0.997981,0.000927,0.997882,598.76544,0.997942,...,0.000915,0.000401,0.002313,0.000221,0.000834,0.001135,0.000482,0.002866,0.000348,0.001030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,76,21439,0.002647,76-21439,600.85600,1.001427,0.000982,1.001238,600.83440,1.001391,...,0.001346,0.000629,0.003775,0.000000,0.001217,0.001646,0.000698,0.004178,0.000520,0.001442
428928,108,16733,0.001089,108-16733,599.87500,0.999792,0.000372,0.999724,599.80200,0.999670,...,0.000877,0.000352,0.002123,0.000000,0.000857,0.001113,0.000443,0.002451,0.000000,0.001026
428929,37,14273,0.002531,37-14273,601.11096,1.001852,0.000367,1.001833,601.35460,1.002258,...,0.000989,0.000426,0.002689,0.000000,0.000896,0.001216,0.000498,0.002931,0.000000,0.001113
428930,41,11351,0.001864,41-11351,600.30000,1.000500,0.000194,1.000479,600.28180,1.000470,...,0.001173,0.000534,0.003743,0.000489,0.001038,0.001426,0.000638,0.004545,0.000618,0.001287


In [21]:
# Identifier and target columns are not model inputs; every other column is a feature.
non_feature_cols = {'row_id', 'target', 'time_id', 'stock_id', 'logtarget'}
feature_cols = [c for c in train.columns if c not in non_feature_cols]
print(f"# features: {len(feature_cols)}")


def _columns_with_prefix(cols, prefix, skip_agg=False):
    """Select columns whose leading '_'-separated tokens equal those of ``prefix``.

    Args:
        cols: iterable of column names (strings).
        prefix: group prefix such as 'wap1' or 'trade_log_return'. The match is
            token by token, so 'wap1' matches 'wap1_sum' but not 'wap10_sum'.
        skip_agg: when True, columns whose last token is 'time' or 'stock' are
            left out so they are only collected by 'timeagg' / 'stockagg'.

    Returns:
        List of the matching column names, in their original order.
    """
    prefix_tokens = prefix.split('_')
    selected = []
    for col in cols:
        tokens = col.split('_')  # split once per column instead of once per test
        if tokens[:len(prefix_tokens)] != prefix_tokens:
            continue
        if skip_agg and tokens[-1] in ('time', 'stock'):
            continue
        selected.append(col)
    return selected


# (prefix, skip_agg) pairs, in the order the groups should appear in the dict.
# The three log-return families set skip_agg=True so that their *_time / *_stock
# columns land only in the aggregate groups.
prefix_specs = [
    ('wap1', False),
    ('wap2', False),
    ('log_return1', True),
    ('log_return2', True),
    ('wap_balance', False),
    ('price_spread', False),
    ('bid_spread', False),
    ('ask_spread', False),
    ('total_volume', False),
    ('volume_imbalance', False),
    ('trade_log_return', True),
    ('trade_seconds_in_bucket', False),
    ('trade_size', False),
    ('trade_order_count', False),
]
feature_groups = {
    prefix: _columns_with_prefix(feature_cols, prefix, skip_agg)
    for prefix, skip_agg in prefix_specs
}
# Aggregate groups are keyed on the trailing token rather than on a prefix.
feature_groups['timeagg'] = [c for c in feature_cols if c.split('_')[-1] == 'time']
feature_groups['stockagg'] = [c for c in feature_cols if c.split('_')[-1] == 'stock']

grouped_features = list(itertools.chain.from_iterable(feature_groups.values()))
# Fail loudly instead of relying on eyeballing the counts: every feature must be
# in exactly one group (nothing missing, nothing duplicated).
assert sorted(grouped_features) == sorted(feature_cols), (
    f"feature groups do not partition feature_cols: "
    f"missing={sorted(set(feature_cols) - set(grouped_features))}, "
    f"n_grouped={len(grouped_features)}, n_unique={len(set(grouped_features))}"
)
len(grouped_features), len(set(grouped_features))

# features: 328


(328, 328)

In [22]:
# Second level: split every group into an un-windowed bucket plus one bucket per
# window tag ('150', '300', '450') found in the column name.
lags = ['150', '300', '450']
feature_groups_level2 = {}
for group_name, cols in feature_groups.items():
    # Register the base bucket first and then the lag buckets, so the key order is
    # group, group_150, group_300, group_450.
    lag_keys = {lag: group_name + '_' + lag for lag in lags}
    feature_groups_level2[group_name] = []
    for lag_key in lag_keys.values():
        feature_groups_level2[lag_key] = []
    for col in cols:
        # First lag tag (in `lags` order) that occurs anywhere in the column name.
        matched_lag = next((lag for lag in lags if lag in col), None)
        bucket_key = lag_keys.get(matched_lag, group_name)
        feature_groups_level2[bucket_key].append(col)
grouped_features = list(itertools.chain.from_iterable(feature_groups_level2.values()))
len(grouped_features), len(set(grouped_features))

(328, 328)

In [25]:
# Third level: the time/stock aggregate buckets are further split by the
# log-return family named in each column; all other buckets are carried over as-is.
cats = ['log_return1', 'log_return2', 'trade_log_return']
tosplit = ['timeagg', 'stockagg']
feature_groups_level3 = {}
for group_name, cols in feature_groups_level2.items():
    if group_name.split('_')[0] in tosplit:
        # One sub-bucket per family, keyed '<group>_<family>'.
        feature_groups_level3.update({
            group_name + '_' + cat: [col for col in cols if cat in col]
            for cat in cats
        })
    else:
        feature_groups_level3[group_name] = cols
grouped_features = list(itertools.chain.from_iterable(feature_groups_level3.values()))
len(grouped_features), len(set(grouped_features))

(328, 328)

In [27]:
# Display the final group -> columns mapping (passed to OptiverLOFO as group_dict below).
feature_groups_level3

{'wap1': ['wap1_sum', 'wap1_mean', 'wap1_std', 'wap1_median'],
 'wap1_150': ['wap1_sum_150',
  'wap1_mean_150',
  'wap1_std_150',
  'wap1_median_150'],
 'wap1_300': ['wap1_sum_300',
  'wap1_mean_300',
  'wap1_std_300',
  'wap1_median_300'],
 'wap1_450': ['wap1_sum_450',
  'wap1_mean_450',
  'wap1_std_450',
  'wap1_median_450'],
 'wap2': ['wap2_sum', 'wap2_mean', 'wap2_std', 'wap2_median'],
 'wap2_150': ['wap2_sum_150',
  'wap2_mean_150',
  'wap2_std_150',
  'wap2_median_150'],
 'wap2_300': ['wap2_sum_300',
  'wap2_mean_300',
  'wap2_std_300',
  'wap2_median_300'],
 'wap2_450': ['wap2_sum_450',
  'wap2_mean_450',
  'wap2_std_450',
  'wap2_median_450'],
 'log_return1': ['log_return1_sum',
  'log_return1_realized_volatility',
  'log_return1_mean',
  'log_return1_std',
  'log_return1_median'],
 'log_return1_150': ['log_return1_sum_150',
  'log_return1_realized_volatility_150',
  'log_return1_mean_150',
  'log_return1_std_150',
  'log_return1_median_150'],
 'log_return1_300': ['log_return1_

In [None]:
# Build the project's OptiverLOFO object from the training frame, the flat feature
# list and the level-3 grouping, then compute and plot the importances.
# NOTE(review): presumably leave-one-feature(-group)-out importance; the recorded
# progress bar has max=80, which equals the number of level-3 groups
# (14 plain groups x 4 buckets + 2 aggregate groups x 4 buckets x 3 families).
# Confirm against optiver_lofo.
lofo = OptiverLOFO(train, feature_cols, group_dict=feature_groups_level3)
lofo_df = lofo.get_importance()
plot_importance(lofo_df)

base cv mean: 0.22662093098679223


HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))