In [7]:
# Seed used as the random_state of the shuffled KFold split further down
SEED = 42

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from lofo import LOFOImportance, Dataset, plot_importance
%matplotlib inline

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time summaries of the volatility features.

    For every realized-volatility column listed below, the mean, std, max and
    min are computed once per ``stock_id`` and once per ``time_id`` and then
    left-joined back onto each row. The added columns are named
    ``<feature>_<stat>_stock`` and ``<feature>_<stat>_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``time_id`` and all twelve volatility
        columns listed in ``vol_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns, followed by the stock-level
        statistics and then the time-level statistics. The frame passed in
        is not modified.
    """
    vol_cols = [
        'log_return1_realized_volatility',
        'log_return2_realized_volatility',
        'log_return1_realized_volatility_450',
        'log_return2_realized_volatility_450',
        'log_return1_realized_volatility_300',
        'log_return2_realized_volatility_300',
        'log_return1_realized_volatility_150',
        'log_return2_realized_volatility_150',
        'trade_log_return_realized_volatility',
        'trade_log_return_realized_volatility_450',
        'trade_log_return_realized_volatility_300',
        'trade_log_return_realized_volatility_150',
    ]
    stat_names = ['mean', 'std', 'max', 'min']

    def summarise(key, tag):
        # One row per distinct `key`; columns come back as (feature, stat) pairs
        grouped = df.groupby([key])[vol_cols].agg(stat_names).reset_index()
        # Flatten the pairs to "<feature>_<stat>"; the key column turns into
        # "<key>_" because the second element of its pair is an empty string
        grouped.columns = ['_'.join(pair) for pair in grouped.columns]
        # Tag every column (key included, giving "<key>__<tag>") with its grouping
        return grouped.add_suffix('_' + tag)

    # Both summaries are computed from the untouched input frame
    per_stock = summarise('stock_id', 'stock')
    per_time = summarise('time_id', 'time')

    # Left joins keep every input row, in its original order
    enriched = df.merge(per_stock, how='left', left_on=['stock_id'], right_on=['stock_id__stock'])
    enriched = enriched.merge(per_time, how='left', left_on=['time_id'], right_on=['time_id__time'])

    # The suffixed join keys only duplicate stock_id / time_id, so discard them
    return enriched.drop(['stock_id__stock', 'time_id__time'], axis=1)

# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Root mean squared percentage error between targets and predictions.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Both inputs are converted to float NumPy arrays first, so the comparison
    is purely positional: plain Python sequences are accepted, and pandas
    Series are compared element by element regardless of their index labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Each error is divided by the matching entry, so
        a zero here makes the result ``inf`` or ``nan``.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        The RMSPE; 0.0 means the predictions match the targets exactly.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    relative_error = (actual - predicted) / actual
    return np.sqrt(np.mean(np.square(relative_error)))

In [8]:
# Read each pre-processed table and enrich it straight away with the
# stock_id / time_id group statistics (computed on that table alone)
train = get_time_stock(pd.read_csv("../input/processed/processed_train.csv"))
test = get_time_stock(pd.read_csv("../input/processed/processed_test.csv"))
# Display the enriched training frame as the cell output
train

Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,...,trade_log_return_realized_volatility_450_max_time,trade_log_return_realized_volatility_450_min_time,trade_log_return_realized_volatility_300_mean_time,trade_log_return_realized_volatility_300_std_time,trade_log_return_realized_volatility_300_max_time,trade_log_return_realized_volatility_300_min_time,trade_log_return_realized_volatility_150_mean_time,trade_log_return_realized_volatility_150_std_time,trade_log_return_realized_volatility_150_max_time,trade_log_return_realized_volatility_150_min_time
0,0,5,0.004136,0-5,303.12506,1.003725,0.000693,303.105530,1.003661,0.000781,...,0.003242,0.000543,0.001820,0.000692,0.004595,0.000710,0.002286,0.000836,0.005362,0.000888
1,0,11,0.001445,0-11,200.04778,1.000239,0.000262,200.041170,1.000206,0.000272,...,0.002701,0.000000,0.000906,0.000460,0.002783,0.000000,0.001140,0.000583,0.002851,0.000000
2,0,16,0.002168,0-16,187.91385,0.999542,0.000864,187.939820,0.999680,0.000862,...,0.002751,0.000114,0.001100,0.000428,0.003082,0.000497,0.001347,0.000484,0.003414,0.000717
3,0,31,0.002195,0-31,119.85978,0.998831,0.000757,119.835945,0.998633,0.000656,...,0.003404,0.000000,0.001052,0.000600,0.004218,0.000000,0.001349,0.000698,0.004974,0.000269
4,0,62,0.001747,0-62,175.93286,0.999619,0.000258,175.934250,0.999626,0.000317,...,0.001936,0.000158,0.000812,0.000372,0.002470,0.000278,0.001036,0.000466,0.003281,0.000317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126,32751,0.003461,126-32751,309.87045,0.999582,0.000486,309.871370,0.999585,0.000613,...,0.003079,0.000000,0.001261,0.000485,0.004049,0.000534,0.001576,0.000651,0.004783,0.000647
428928,126,32753,0.003113,126-32753,223.55214,1.002476,0.001264,223.580320,1.002602,0.001303,...,0.003531,0.000000,0.001008,0.000687,0.006310,0.000329,0.001241,0.000814,0.007915,0.000404
428929,126,32758,0.004070,126-32758,256.27704,1.001082,0.000466,256.255070,1.000996,0.000599,...,0.001669,0.000000,0.001055,0.000376,0.001995,0.000000,0.001306,0.000422,0.002566,0.000000
428930,126,32763,0.003357,126-32763,399.72174,1.001809,0.000456,399.714320,1.001790,0.000507,...,0.003270,0.000400,0.001474,0.000591,0.005284,0.000686,0.001839,0.000731,0.006914,0.001004


In [9]:
# Identifier and label columns that must not be offered to the model as features
not_used_cols = ['row_id', 'target', 'time_id', 'stock_id']
# Every other column of `train` is a candidate feature
feature_cols = [col for col in train.columns if col not in not_used_cols]
# Wrap the frame for LOFO, naming the label column and the feature columns
dataset = Dataset(df=train, target="target", features=feature_cols)
# Shuffled 5-fold split; the fixed seed keeps the fold assignment repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

In [11]:
# `scoring` must be a scorer name scikit-learn recognises (or a callable with
# the scorer(estimator, X, y) signature). The built-in RMSE scorer is negated
# because scorers follow a higher-is-better convention, hence the "neg_" prefix.
# To score with rmspe instead, pass a callable scorer built around it.
lofo_imp = LOFOImportance(dataset, cv=kfold, scoring="neg_root_mean_squared_error")