In [1]:
# Random seed passed as `random_state` when the training rows are shuffled,
# so the row order is reproducible between runs.
SEED = 42
import numpy as np
import pandas as pd
from utils import read_train_test, get_time_stock
from optiver_lofo import OptiverLOFO, plot_importance
from optiver_lofo import OptiverFLOFO
import itertools
import gc

  from tqdm.autonotebook import tqdm


In [2]:
# Base training frame; the second value returned by read_train_test()
# (presumably the test frame) is not needed in this notebook.
train, _ = read_train_test()

# Book and trade feature tables are read from CSV and joined on row_id.
# Left joins keep every row of the left-hand table even when the
# right-hand table has no match for it.
book_features = pd.read_csv('../input/processed-book-ffill/df_book.csv')
trade_features = pd.read_csv('../input/processed-book-ffill/df_trade.csv')
book_and_trade = book_features.merge(trade_features, on=['row_id'], how='left')
train = train.merge(book_and_trade, on=['row_id'], how='left')

# Drop the intermediates before continuing and ask the collector to
# reclaim them, since the kernel keeps every name alive across cells.
del _, book_features, trade_features, book_and_trade
gc.collect()

# Get group stats of time_id and stock_id
train = get_time_stock(train)

# Shuffle the rows reproducibly and rebuild a clean 0..n-1 index.
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)
train

Our training set has 428932 rows


Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_mean,wap1_std,wap1_median,wap2_sum,wap2_mean,...,trade_log_return_realized_volatility_300_mean_time,trade_log_return_realized_volatility_300_std_time,trade_log_return_realized_volatility_300_max_time,trade_log_return_realized_volatility_300_min_time,trade_log_return_realized_volatility_300_median_time,trade_log_return_realized_volatility_150_mean_time,trade_log_return_realized_volatility_150_std_time,trade_log_return_realized_volatility_150_max_time,trade_log_return_realized_volatility_150_min_time,trade_log_return_realized_volatility_150_median_time
0,16,9787,0.005983,16-9787,600.47920,1.000799,0.001474,1.001224,600.47980,1.000800,...,0.002483,0.000756,0.004954,0.001089,0.002451,0.003181,0.000859,0.006486,0.001580,0.003066
1,105,5773,0.002415,105-5773,600.64197,1.001070,0.000766,1.000880,600.63720,1.001062,...,0.001704,0.000593,0.004029,0.000843,0.001570,0.002196,0.000715,0.005209,0.001141,0.002004
2,58,22076,0.002863,58-22076,598.30756,0.997179,0.000907,0.996946,598.41360,0.997356,...,0.000838,0.000478,0.003798,0.000237,0.000727,0.001039,0.000544,0.004101,0.000335,0.000903
3,77,5817,0.005279,77-5817,601.05817,1.001764,0.002921,1.001746,601.09640,1.001827,...,0.004393,0.001471,0.009818,0.001796,0.004206,0.005367,0.001700,0.011079,0.002390,0.005120
4,56,14572,0.002328,56-14572,598.78840,0.997981,0.000927,0.997882,598.76544,0.997942,...,0.000915,0.000401,0.002313,0.000221,0.000834,0.001135,0.000482,0.002866,0.000348,0.001030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,76,21439,0.002647,76-21439,600.85600,1.001427,0.000982,1.001238,600.83440,1.001391,...,0.001346,0.000629,0.003775,0.000000,0.001217,0.001646,0.000698,0.004178,0.000520,0.001442
428928,108,16733,0.001089,108-16733,599.87500,0.999792,0.000372,0.999724,599.80200,0.999670,...,0.000877,0.000352,0.002123,0.000000,0.000857,0.001113,0.000443,0.002451,0.000000,0.001026
428929,37,14273,0.002531,37-14273,601.11096,1.001852,0.000367,1.001833,601.35460,1.002258,...,0.000989,0.000426,0.002689,0.000000,0.000896,0.001216,0.000498,0.002931,0.000000,0.001113
428930,41,11351,0.001864,41-11351,600.30000,1.000500,0.000194,1.000479,600.28180,1.000470,...,0.001173,0.000534,0.003743,0.000489,0.001038,0.001426,0.000638,0.004545,0.000618,0.001287


In [4]:
# Identifier / label columns that must never be fed to the model as features.
_NON_FEATURE_COLS = {'row_id', 'target', 'time_id', 'stock_id', 'logtarget'}

# Final name token carried by the per-time_id / per-stock_id aggregate columns.
_AGG_SUFFIXES = ('time', 'stock')

# (group name, drop aggregate columns?) in the order the groups are built.
# Each group name doubles as the '_'-separated token prefix that selects it.
_PREFIX_GROUPS = [
    ('wap2', False),
    ('log_return1', True),
    ('log_return2', True),
    ('wap_balance', False),
    ('price_spread', False),
    ('bid_spread', False),
    ('ask_spread', False),
    ('total_volume', False),
    ('volume_imbalance', False),
    ('trade_log_return', True),
    ('trade_seconds_in_bucket', False),
    ('trade_size', False),
    ('trade_order_count', False),
]


def _columns_with_prefix(cols, prefix, drop_aggregates=False):
    """Select the columns whose leading '_'-separated tokens equal `prefix`.

    Matching is done on whole tokens, so the prefix 'log_return1' matches
    'log_return1_std' but not 'log_return10_std'.

    Args:
        cols: iterable of column names (strings); its order is preserved.
        prefix: '_'-joined token prefix, e.g. 'trade_log_return'.
        drop_aggregates: when True, columns whose last token is 'time' or
            'stock' are left out, so they are not picked up here as well as
            by the suffix-based 'timeagg' / 'stockagg' groups.

    Returns:
        list of the matching column names.
    """
    head = prefix.split('_')
    selected = []
    for col in cols:
        tokens = col.split('_')
        if tokens[:len(head)] != head:
            continue
        if drop_aggregates and tokens[-1] in _AGG_SUFFIXES:
            continue
        selected.append(col)
    return selected


def _columns_with_suffix(cols, suffix):
    """Select the columns whose last '_'-separated token equals `suffix`."""
    return [col for col in cols if col.split('_')[-1] == suffix]


# Every column that is not an identifier or a label is a candidate feature.
feature_cols = [c for c in train.columns if c not in _NON_FEATURE_COLS]

# The wap1 family is removed from the feature set, so it gets no group below.
wap1_cols = _columns_with_prefix(feature_cols, 'wap1')
feature_cols = [c for c in feature_cols if c not in wap1_cols]
print(f"# features: {len(feature_cols)}")

# Map each group name to its member columns; insertion order follows
# _PREFIX_GROUPS, then the two suffix-based aggregate groups.
feature_groups = {
    group: _columns_with_prefix(feature_cols, group, drop_aggregates=skip_agg)
    for group, skip_agg in _PREFIX_GROUPS
}
feature_groups['timeagg'] = _columns_with_suffix(feature_cols, 'time')
feature_groups['stockagg'] = _columns_with_suffix(feature_cols, 'stock')

# Sanity check shown as the cell output: the two numbers are equal when no
# column has been placed in more than one group.
grouped_features = list(itertools.chain.from_iterable(feature_groups.values()))
len(grouped_features), len(set(grouped_features))

# features: 312


(312, 312)

In [5]:
# Build the importance estimator over the shuffled training frame and the
# selected feature columns. NOTE(review): OptiverFLOFO is defined in
# optiver_lofo and not shown here; presumably a fast leave-one-feature-out
# importance -- confirm against that module.
flofo = OptiverFLOFO(train, feature_cols)

# Compute the importances and show the resulting table as the cell output.
flofo_importance = flofo.get_importance()
flofo_importance

fitting base model on fold 0


HBox(children=(FloatProgress(value=0.0, max=312.0), HTML(value='')))




Unnamed: 0,feature,importance_mean,importance_std,val_imp_0,val_imp_1,val_imp_2,val_imp_3,val_imp_4,val_imp_5,val_imp_6,val_imp_7,val_imp_8,val_imp_9
5,log_return1_realized_volatility,0.060880,0.032654,0.079672,0.014042,0.108128,0.076622,0.073556,0.058565,0.096755,0.028505,0.067648,0.005304
119,log_return1_realized_volatility_150,0.025120,0.012239,0.015260,0.018627,0.040507,0.037301,0.012355,0.001962,0.024676,0.033335,0.039199,0.027983
7,log_return1_std,0.003493,0.000963,0.002230,0.003055,0.003404,0.003594,0.004622,0.003148,0.003100,0.005782,0.003306,0.002686
293,trade_log_return_realized_volatility_std_time,0.002501,0.000418,0.002415,0.002569,0.002450,0.002699,0.002132,0.002965,0.001853,0.002971,0.001887,0.003072
206,log_return1_realized_volatility_450_median_stock,0.002245,0.001076,0.003153,0.002454,0.001349,0.000603,0.002668,0.000177,0.002518,0.003226,0.003343,0.002957
...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,log_return1_realized_volatility_450_min_time,-0.000049,0.000041,-0.000048,-0.000033,-0.000055,-0.000045,-0.000115,-0.000080,-0.000081,-0.000072,0.000009,0.000029
264,log_return1_realized_volatility_450_max_time,-0.000073,0.000032,-0.000119,-0.000066,-0.000106,-0.000068,-0.000065,-0.000079,-0.000015,-0.000049,-0.000044,-0.000117
281,log_return2_realized_volatility_300_median_time,-0.000074,0.000015,-0.000054,-0.000064,-0.000066,-0.000091,-0.000105,-0.000070,-0.000066,-0.000081,-0.000083,-0.000061
284,log_return1_realized_volatility_150_max_time,-0.000075,0.000018,-0.000094,-0.000090,-0.000053,-0.000094,-0.000060,-0.000039,-0.000079,-0.000073,-0.000080,-0.000088
