In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns 
import glob
import gc
from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
import seaborn as sns
import numpy.matlib


In [2]:
import numpy.matlib  # NOTE(review): duplicate of the import at the end of the first cell


# NOTE(review): presumably the folder submissions are written to; it is not
# referenced in the visible part of the notebook - confirm before relying on it.
path_submissions = '/'

# Name of the label column.
target_name = 'target'
# Starts empty; presumably filled with per-fold scores further down - TODO confirm.
scores_folds = {}

In [3]:
# Root directory of the competition data. `preprocessor` builds the per-stock
# book/trade parquet paths by appending to this prefix, so keep the trailing slash.
data_dir = '../input/optiver-realized-volatility-prediction/'

# Function to calculate first WAP
def calc_wap1(df):
    """Weighted average price of book level 1, cross-weighted by size.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the combined level-1 size.
    """
    bid_leg = df['bid_price1'] * df['ask_size1']
    ask_leg = df['ask_price1'] * df['bid_size1']
    depth = df['bid_size1'] + df['ask_size1']
    return (bid_leg + ask_leg) / depth

# Function to calculate second WAP
def calc_wap2(df):
    """Weighted average price of book level 2, cross-weighted by size.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the combined level-2 size.
    """
    bid_leg = df['bid_price2'] * df['ask_size2']
    ask_leg = df['ask_price2'] * df['bid_size2']
    depth = df['bid_size2'] + df['ask_size2']
    return (bid_leg + ask_leg) / depth

def calc_wap3(df):
    """Size-weighted price of book level 1 using same-side sizes.

    Unlike ``calc_wap1``, each price is weighted by the size quoted on its own
    side (bid price by bid size, ask price by ask size).
    """
    bid_leg = df['bid_price1'] * df['bid_size1']
    ask_leg = df['ask_price1'] * df['ask_size1']
    depth = df['bid_size1'] + df['ask_size1']
    return (bid_leg + ask_leg) / depth

def calc_wap4(df):
    """Size-weighted price of book level 2 using same-side sizes.

    Unlike ``calc_wap2``, each price is weighted by the size quoted on its own
    side (bid price by bid size, ask price by ask size).
    """
    bid_leg = df['bid_price2'] * df['bid_size2']
    ask_leg = df['ask_price2'] * df['ask_size2']
    depth = df['bid_size2'] + df['ask_size2']
    return (bid_leg + ask_leg) / depth

# Function to calculate the log of the return
# Remember that logb(x / y) = logb(x) - logb(y)
def log_return(series):
    """Return the first difference of the natural log of ``series``.

    The first element of the result is NaN because it has no predecessor.
    """
    log_values = np.log(series)
    return log_values.diff()

# Calculate the realized volatility
def realized_volatility(series):
    """Square root of the sum of squared values of ``series``."""
    squared = series**2
    total = np.sum(squared)
    return np.sqrt(total)

# Function to count unique elements of a series
def count_unique(series):
    """Number of distinct values in ``series``."""
    distinct = np.unique(series)
    return distinct.size

# Function to read our base train and test set
def read_train_test():
    """Load the competition test set and add the ``row_id`` merge key.

    Despite its name, this function reads and returns only the test frame.
    ``row_id`` is ``"<stock_id>-<time_id>"``, the key the book and trade
    features are merged on.
    """
    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
    stock_part = test['stock_id'].astype(str)
    time_part = test['time_id'].astype(str)
    # Create a key to merge with book and trade data
    test['row_id'] = stock_part + '-' + time_part
    return test

# Function to preprocess book data (for each stock id)
def book_preprocessor(file_path):
    """Build per-``time_id`` order-book features for a single stock.

    Parameters
    ----------
    file_path : str
        Path to the stock's book parquet partition. It must contain
        ``stock_id=<id>``; the text after the first ``=`` is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with the aggregated features and a ``row_id``
        column (``"<stock_id>-<time_id>"``). Features computed on the tail of the
        bucket (``seconds_in_bucket >= 500/400/300/200/100``) carry the window
        start as a suffix, e.g. ``log_return1_realized_volatility_300``.
    """
    df = pd.read_parquet(file_path)
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['wap3'] = calc_wap3(df)
    df['wap4'] = calc_wap4(df)
    # Calculate log returns within each time_id.
    # `transform` always returns a result aligned to the original row index.
    # `groupby(...).apply(...)` prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    for level in (1, 2, 3, 4):
        df[f'log_return{level}'] = df.groupby('time_id')[f'wap{level}'].transform(log_return)
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    # Calculate spread (relative to the mid price of the level)
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations over the whole bucket.
    # NOTE(review): the output column names are derived from the aggregation
    # callables (the notebook later refers to e.g. 'bid_ask_spread_amax'), so
    # the callables are intentionally left as they were.
    create_feature_dict = {
        'wap1': [np.sum, np.std],
        'wap2': [np.sum, np.std],
        'wap3': [np.sum, np.std],
        'wap4': [np.sum, np.std],
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
        'wap_balance': [np.sum, np.max],
        'price_spread':[np.sum, np.max],
        'price_spread2':[np.sum, np.max],
        'bid_spread':[np.sum, np.max],
        'ask_spread':[np.sum, np.max],
        'total_volume':[np.sum, np.max],
        'volume_imbalance':[np.sum, np.max],
        "bid_ask_spread":[np.sum,  np.max],
    }
    # Dict for aggregations over the tail windows.
    create_feature_dict_time = {
        'log_return1': [realized_volatility],
        'log_return2': [realized_volatility],
        'log_return3': [realized_volatility],
        'log_return4': [realized_volatility],
    }

    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, aggregation) MultiIndex; 'time_id' becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Whole-bucket stats first, then left-merge each tail window in the same
    # order as before (500 -> 100) so the output column order is unchanged.
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)
    for window in (500, 400, 300, 200, 100):
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket = window, add_suffix = True)
        window_key = f'time_id__{window}'
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own time_id copy is redundant once merged
        df_feature = df_feature.drop(columns = [window_key])

    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    return df_feature

# Function to preprocess trade data (for each stock id)
def trade_preprocessor(file_path):
    """Build per-``time_id`` trade features for a single stock.

    Parameters
    ----------
    file_path : str
        Path to the stock's trade parquet partition. It must contain
        ``stock_id=<id>``; the text after the first ``=`` is used as the stock id.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id``. Every feature column is prefixed with
        ``trade_``; tail-window features additionally carry the window start as a
        suffix. A ``row_id`` column (``"<stock_id>-<time_id>"``) is added.
    """
    df = pd.read_parquet(file_path)
    # `transform` always returns a result aligned to the original row index.
    # `groupby(...).apply(...)` prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)
    df['amount']=df['price']*df['size']
    # Dict for aggregations over the whole bucket
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, np.max, np.min],
        'order_count':[np.sum,np.max],
        'amount':[np.sum,np.max,np.min],
    }
    # Dict for aggregations over the tail windows
    create_feature_dict_time = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum],
        'order_count':[np.sum],
    }
    # Function to get group stats for different windows (seconds in bucket)
    def get_stats_window(fe_dict,seconds_in_bucket, add_suffix = False):
        # Keep only rows at or after the window start, then aggregate per time_id
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(fe_dict).reset_index()
        # Flatten the (column, aggregation) MultiIndex; 'time_id' becomes 'time_id_'
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    def tendency(price, vol):
        # Sum of percentage price changes, each weighted by the traded size
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)

    # Whole-bucket stats
    df_feature = get_stats_window(create_feature_dict,seconds_in_bucket = 0, add_suffix = False)

    # Hand-rolled per-time_id statistics. Iterating the groupby visits each
    # time_id once instead of re-scanning the whole frame with a boolean mask
    # per id. Within-group row order is preserved.
    lis = []
    for n_time_id, df_id in df.groupby('time_id', sort = False):
        price = df_id['price'].values
        size = df_id['size'].values
        price_mean = np.mean(price)
        size_mean = np.mean(size)
        price_diff = np.diff(price)

        tendencyV = tendency(price, size)
        # Number of trades above / below the mean price
        f_max = np.sum(price > price_mean)
        f_min = np.sum(price < price_mean)
        # Number of up-ticks / down-ticks
        df_max =  np.sum(price_diff > 0)
        df_min =  np.sum(price_diff < 0)
        # Dispersion of price: median absolute deviation from the mean,
        # mean squared price, inter-quartile range
        abs_diff = np.median(np.abs(price - price_mean))
        energy = np.mean(price**2)
        iqr_p = np.percentile(price,75) - np.percentile(price,25)

        # Same for size (note: energy_v is a sum, not a mean)
        abs_diff_v = np.median(np.abs(size - size_mean))
        energy_v = np.sum(size**2)
        iqr_p_v = np.percentile(size,75) - np.percentile(size,25)

        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})

    df_lr = pd.DataFrame(lis)

    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    df_feature = df_feature.drop(columns = ['time_id'])

    # Left-merge each tail window in the same order as before (500 -> 100)
    # so the output column order is unchanged.
    for window in (500, 400, 300, 200, 100):
        df_window = get_stats_window(create_feature_dict_time, seconds_in_bucket = window, add_suffix = True)
        window_key = f'time_id__{window}'
        df_feature = df_feature.merge(df_window, how = 'left', left_on = 'time_id_', right_on = window_key)
        # The window's own time_id copy is redundant once merged
        df_feature = df_feature.drop(columns = [window_key])

    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    return df_feature

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time_id summary statistics of the volatility features.

    For each realized-volatility column (book levels 1 and 2 and trades, over
    the whole bucket and the 400/300/200 windows) the mean, std, max and min are
    computed once grouped by ``stock_id`` (suffix ``_stock``) and once grouped
    by ``time_id`` (suffix ``_time``). They are left-merged back onto every
    row. The input frame is not modified; a new frame is returned.
    """
    windows = ('', '_400', '_300', '_200')
    vol_cols = [f'log_return{level}_realized_volatility{window}'
                for window in windows for level in (1, 2)]
    vol_cols += [f'trade_log_return_realized_volatility{window}' for window in windows]

    def _group_stats(key, tag):
        # Aggregate, flatten the (column, stat) MultiIndex and tag every column.
        # The grouping key itself ends up named '<key>__<tag>'.
        stats = df.groupby([key])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
        stats.columns = ['_'.join(parts) for parts in stats.columns]
        return stats.add_suffix('_' + tag)

    # Group by the stock id
    per_stock = _group_stats('stock_id', 'stock')
    # Group by the time id
    per_time = _group_stats('time_id', 'time')

    # Merge with original dataframe
    merged = df.merge(per_stock, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    return merged.drop(['stock_id__stock', 'time_id__time'], axis = 1)
    
# Function to run the per-stock preprocessing in parallel (one task per stock id)
def preprocessor(list_stock_ids, is_train = True):
    """Compute book + trade features for every stock id and stack them.

    ``is_train`` selects the ``*_train.parquet`` partitions, otherwise the
    ``*_test.parquet`` ones are read. Each stock is processed as a separate
    joblib task; the per-stock frames are concatenated with a fresh index.
    """
    # Pick the partition prefixes once; the stock id is appended per task
    if is_train:
        book_prefix = data_dir + "book_train.parquet/stock_id="
        trade_prefix = data_dir + "trade_train.parquet/stock_id="
    else:
        book_prefix = data_dir + "book_test.parquet/stock_id="
        trade_prefix = data_dir + "trade_test.parquet/stock_id="

    # Work done for a single stock id
    def for_joblib(stock_id):
        book_features = book_preprocessor(book_prefix + str(stock_id))
        trade_features = trade_preprocessor(trade_prefix + str(stock_id))
        # Book features are the left side: time_ids without trades keep NaN trade features
        return pd.merge(book_features, trade_features, on = 'row_id', how = 'left')

    # Fan out over all available cores
    per_stock_frames = Parallel(n_jobs = -1, verbose = 1)(
        delayed(for_joblib)(stock_id) for stock_id in list_stock_ids
    )
    # Concatenate all the dataframes returned by Parallel
    return pd.concat(per_stock_frames, ignore_index = True)

# Function to calculate the root mean squared percentage error
def rmspe(y_true, y_pred):
    """Root mean squared percentage error: sqrt(mean(((y_true - y_pred) / y_true) ** 2))."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Function to early stop with root mean squared percentage error
def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation callback returning ``(name, value, is_higher_better)``.

    The labels are read from ``lgb_train.get_label()``; the third element is
    ``False`` because a lower RMSPE is better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

In [4]:
# Read train and test.
# The train features are loaded from a pickle rather than rebuilt here.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
train =pd.read_pickle("../input/optiver006/train.pkl")
# The test set is read from the competition CSV and gets its row_id merge key.
test = read_train_test()

In [5]:
# Get unique stock ids present in the test set
test_stock_ids = test['stock_id'].unique()
# Build book + trade features per stock in parallel, then attach them by row_id
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id (test only).
# NOTE(review): the same call for train is disabled; presumably the pickled train
# frame already contains these columns - confirm.
#train = get_time_stock(train)
test = get_time_stock(test)

# These are plain aliases, not copies: later in-place changes to train/test
# are visible through train1/test1 as well.
train1=train
test1=test

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.2s finished


In [6]:
# replace by order sum (tau)
# size_tau<suffix> = sqrt(1 / number of distinct seconds with a trade), for the
# whole bucket ('') and for the windows starting at 400, 300 and 200 seconds.
for _suffix in ('', '_400', '_300', '_200'):
    _count_col = 'trade_seconds_in_bucket_count_unique' + _suffix
    for _frame in (train, test):
        _frame['size_tau' + _suffix] = np.sqrt( 1/ _frame[_count_col] )

In [7]:
# size_tau2<suffix> = sqrt(scale / total order count). Every variant divides by the
# whole-bucket 'trade_order_count_sum'; only the numerator changes per window.
for _suffix, _scale in (('', 1), ('_400', 0.33), ('_300', 0.5), ('_200', 0.66)):
    for _frame in (train, test):
        _frame['size_tau2' + _suffix] = np.sqrt( _scale/ _frame['trade_order_count_sum'] )

# delta tau
for _frame in (train, test):
    _frame['size_tau2_d'] = _frame['size_tau2_400'] - _frame['size_tau2']

In [8]:
# Feature columns: everything except the identifiers, the merge key and the label
colNames = [name for name in train.columns
            if name not in ("stock_id", "time_id", "target", "row_id")]
len(colNames)

194

In [9]:
from sklearn.cluster import KMeans
# making agg features

# Stocks are clustered by how their targets co-move: pivot the targets to a
# time_id x stock_id table and cluster the rows of its correlation matrix.
train_p = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')

corr = train_p.corr()

ids = corr.index

# n_init is pinned to the historical default (10). Newer scikit-learn releases
# default to 'auto', which would silently change the cluster assignment.
kmeans = KMeans(n_clusters=7, random_state=0, n_init=10).fit(corr.values)
print(kmeans.labels_)

# One list of stock ids per cluster label
l = []
for n in range(kmeans.n_clusters):
    l.append(ids[kmeans.labels_ == n].tolist())


mat = []
matTest = []

# For every cluster, average all numeric features of its member stocks per
# time_id and label the result with a synthetic stock id '<cluster>c1'.
n = 0
for ind in l:
    print(ind)
    newDf = train.loc[train['stock_id'].isin(ind) ]
    # numeric_only=True skips the string row_id column; without it the mean
    # raises on current pandas.
    newDf = newDf.groupby(['time_id']).mean(numeric_only=True)
    # Plain column assignment: replaces the averaged numeric stock_id with the label
    newDf['stock_id'] = str(n)+'c1'
    mat.append ( newDf )

    newDf = test.loc[test['stock_id'].isin(ind) ]
    newDf = newDf.groupby(['time_id']).mean(numeric_only=True)
    newDf['stock_id'] = str(n)+'c1'
    matTest.append ( newDf )

    n+=1

mat1 = pd.concat(mat).reset_index()
# The averaged target must not become a feature
mat1.drop(columns=['target'],inplace=True)

mat2 = pd.concat(matTest).reset_index()

[1 0 4 2 1 1 2 4 6 2 1 0 4 4 1 1 1 2 4 4 4 0 1 1 3 1 1 4 3 4 3 4 4 1 3 3 4
 3 4 1 4 1 4 4 1 0 4 4 1 0 0 3 3 3 2 0 2 4 1 4 4 1 4 1 0 3 3 0 3 0 6 5 3 3
 0 1 2 0 3 3 3 4 1 1 0 2 3 3 1 0 1 4 4 4 4 4 1 3 1 0 1 4 1 0 1 4 1 0 4 0 4
 0]
[1, 11, 22, 50, 55, 56, 62, 73, 76, 78, 84, 87, 96, 101, 112, 116, 122, 124, 126]
[0, 4, 5, 10, 15, 16, 17, 23, 26, 28, 29, 36, 42, 44, 48, 53, 66, 69, 72, 85, 94, 95, 100, 102, 109, 111, 113, 115, 118, 120]
[3, 6, 9, 18, 61, 63, 86, 97]
[27, 31, 33, 37, 38, 40, 58, 59, 60, 74, 75, 77, 82, 83, 88, 89, 90, 98, 99, 110]
[2, 7, 13, 14, 19, 20, 21, 30, 32, 34, 35, 39, 41, 43, 46, 47, 51, 52, 64, 67, 68, 70, 93, 103, 104, 105, 107, 108, 114, 119, 123, 125]
[81]
[8, 80]


In [10]:
# Append the train cluster rows of time_id 5 to the test cluster rows.
# NOTE(review): presumably this guarantees that every cluster column exists after
# the pivot even when the test set lacks some clusters - confirm. If the test set
# itself contains time_id 5, the pivot below sees duplicate keys.
mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])

# Wide layout: one row per time_id, one column per (feature, cluster) pair.
# The MultiIndex is iterated directly; it yields the same tuples as the
# deprecated `.ravel()` without the warning.
mat1 = mat1.pivot(index='time_id', columns='stock_id')
mat1.columns = ["_".join(x) for x in mat1.columns]
mat1.reset_index(inplace=True)

mat2 = mat2.pivot(index='time_id', columns='stock_id')
mat2.columns = ["_".join(x) for x in mat2.columns]
mat2.reset_index(inplace=True)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [11]:
# Display the train frame after the size_tau features were added (shown truncated)
train

Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_std,wap2_sum,wap2_std,wap3_sum,wap3_std,...,trade_log_return_realized_volatility_200_min_time,size_tau,size_tau_400,size_tau_300,size_tau_200,size_tau2,size_tau2_400,size_tau2_300,size_tau2_200,size_tau2_d
0,0,5,0.004136,0-5,303.125061,0.000693,303.105530,0.000781,303.134857,0.000637,...,0.000826,0.158114,0.250000,0.218218,0.192450,0.095346,0.054772,0.067420,0.077460,-0.040574
1,0,11,0.001445,0-11,200.047775,0.000262,200.041168,0.000272,200.035614,0.000298,...,0.000000,0.182574,0.301511,0.250000,0.213201,0.132453,0.076089,0.093659,0.107606,-0.056365
2,0,16,0.002168,0-16,187.913849,0.000864,187.939819,0.000862,187.923065,0.000670,...,0.000698,0.200000,0.316228,0.288675,0.235702,0.121268,0.069663,0.085749,0.098518,-0.051605
3,0,31,0.002195,0-31,119.859779,0.000757,119.835945,0.000656,119.870163,0.000606,...,0.000260,0.258199,0.577350,0.333333,0.316228,0.130189,0.074788,0.092057,0.105766,-0.055401
4,0,62,0.001747,0-62,175.932861,0.000258,175.934250,0.000317,175.928284,0.000215,...,0.000306,0.213201,0.408248,0.301511,0.267261,0.106000,0.060892,0.074953,0.086115,-0.045108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126,32751,0.003461,126-32751,309.870453,0.000486,309.871368,0.000613,309.778625,0.000596,...,0.000619,0.164399,0.277350,0.235702,0.204124,0.098533,0.056603,0.069673,0.080049,-0.041930
428928,126,32753,0.003113,126-32753,223.552139,0.001264,223.580322,0.001303,223.505493,0.001340,...,0.000367,0.152499,0.277350,0.223607,0.188982,0.082479,0.047380,0.058321,0.067006,-0.035098
428929,126,32758,0.004070,126-32758,256.277039,0.000466,256.255066,0.000599,256.146057,0.000520,...,0.000000,0.169031,0.267261,0.204124,0.192450,0.101015,0.058029,0.071429,0.082065,-0.042986
428930,126,32763,0.003357,126-32763,399.721741,0.000456,399.714325,0.000507,399.775391,0.000424,...,0.000927,0.111803,0.192450,0.152499,0.136083,0.065372,0.037553,0.046225,0.053109,-0.027819


In [12]:
# Cluster-level columns to merge back onto train/test: every listed feature for
# clusters 0, 1, 3, 4 and 6 (clusters 2 and 5 are not used), keyed by time_id.
nnn = ['time_id'] + [
    f'{feature}_{cluster}c1'
    for feature in (
        'log_return1_realized_volatility',
        'total_volume_sum',
        'trade_size_sum',
        'trade_order_count_sum',
        'price_spread_sum',
        'bid_spread_sum',
        'ask_spread_sum',
        'volume_imbalance_sum',
        'bid_ask_spread_sum',
        'size_tau2',
    )
    for cluster in (0, 1, 3, 4, 6)
]
# Attach the selected cluster-level features to every row sharing the time_id
train = train.merge(mat1[nnn], how='left', on='time_id')
test = test.merge(mat2[nnn], how='left', on='time_id')

In [13]:
import gc  # NOTE(review): already imported in the first cell
# The pivoted cluster matrices were merged into train/test above; release them
del mat1,mat2
gc.collect()

21

In [14]:
# train2=pd.read_csv('../input/optiver-volatility-predictionaggregated-data/train_agg_final_df.csv')

In [15]:
# Display the train frame again, now including the merged cluster columns (shown truncated)
train

Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_std,wap2_sum,wap2_std,wap3_sum,wap3_std,...,bid_ask_spread_sum_0c1,bid_ask_spread_sum_1c1,bid_ask_spread_sum_3c1,bid_ask_spread_sum_4c1,bid_ask_spread_sum_6c1,size_tau2_0c1,size_tau2_1c1,size_tau2_3c1,size_tau2_4c1,size_tau2_6c1
0,0,5,0.004136,0-5,303.125061,0.000693,303.105530,0.000781,303.134857,0.000637,...,0.118397,0.113150,0.174687,0.155552,0.175661,0.058550,0.057267,0.078471,0.054691,0.050700
1,0,11,0.001445,0-11,200.047775,0.000262,200.041168,0.000272,200.035614,0.000298,...,0.072559,0.071506,0.122967,0.110758,0.099451,0.081235,0.078955,0.122289,0.078616,0.045740
2,0,16,0.002168,0-16,187.913849,0.000864,187.939819,0.000862,187.923065,0.000670,...,0.079010,0.091842,0.158230,0.142469,0.088431,0.078550,0.087378,0.116278,0.074977,0.080722
3,0,31,0.002195,0-31,119.859779,0.000757,119.835945,0.000656,119.870163,0.000606,...,0.072684,0.075466,0.133869,0.109418,0.139281,0.100382,0.089673,0.105948,0.094684,0.055447
4,0,62,0.001747,0-62,175.932861,0.000258,175.934250,0.000317,175.928284,0.000215,...,0.076716,0.073103,0.109770,0.110781,0.177286,0.087285,0.089068,0.112663,0.086381,0.046358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126,32751,0.003461,126-32751,309.870453,0.000486,309.871368,0.000613,309.778625,0.000596,...,0.088530,0.091100,0.148065,0.140075,0.163514,0.064380,0.064007,0.089665,0.061596,0.041157
428928,126,32753,0.003113,126-32753,223.552139,0.001264,223.580322,0.001303,223.505493,0.001340,...,0.077133,0.072731,0.128888,0.121852,0.100756,0.073644,0.065909,0.085298,0.062453,0.062479
428929,126,32758,0.004070,126-32758,256.277039,0.000466,256.255066,0.000599,256.146057,0.000520,...,0.070713,0.086828,0.148556,0.138048,0.080120,0.098654,0.092117,0.124247,0.090574,0.120507
428930,126,32763,0.003357,126-32763,399.721741,0.000456,399.714325,0.000507,399.775391,0.000424,...,0.110701,0.127982,0.204786,0.185053,0.194994,0.051648,0.054263,0.067649,0.051358,0.040747


In [16]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is included when the absolute value of its correlation with any
    column that precedes it in ``dataset.corr()`` exceeds ``threshold``. The
    first column of each correlated pair is never added on account of that pair.

    Returns a ``set`` of column names.
    """
    corr_matrix = dataset.corr()
    # we are interested in absolute coeff value
    above = np.abs(corr_matrix.to_numpy()) > threshold
    # Strict lower triangle = pairs (i, j) with j < i, i.e. "an earlier column"
    flagged = np.tril(above, k=-1).any(axis=1)
    return set(corr_matrix.columns[flagged])

In [17]:
# train3=train2[['stock_id','time_id','median_bid_size1','median_bid_size2']]

In [18]:
# trainf=pd.merge(train3,train)

In [19]:
# trainf=train.drop(['bid_ask_spread_amax',
#  'bid_ask_spread_sum',
#  'bid_ask_spread_sum_0c1',
#  'bid_ask_spread_sum_1c1',
#  'bid_ask_spread_sum_3c1',
#  'bid_ask_spread_sum_4c1',
#  'bid_ask_spread_sum_6c1',
#  'log_return1_realized_volatility_0c1',
#  'log_return1_realized_volatility_100',
#  'log_return1_realized_volatility_1c1',
#  'log_return1_realized_volatility_200',
#  'log_return1_realized_volatility_200_max_stock',
#  'log_return1_realized_volatility_200_max_time',
#  'log_return1_realized_volatility_200_mean_stock',
#  'log_return1_realized_volatility_200_mean_time',
#  'log_return1_realized_volatility_200_min_time',
#  'log_return1_realized_volatility_200_std_stock',
#  'log_return1_realized_volatility_200_std_time',
#  'log_return1_realized_volatility_300',
#  'log_return1_realized_volatility_300_max_stock',
#  'log_return1_realized_volatility_300_max_time',
#  'log_return1_realized_volatility_300_mean_stock',
#  'log_return1_realized_volatility_300_mean_time',
#  'log_return1_realized_volatility_300_min_time',
#  'log_return1_realized_volatility_300_std_stock',
#  'log_return1_realized_volatility_300_std_time',
#  'log_return1_realized_volatility_3c1',
#  'log_return1_realized_volatility_400',
#  'log_return1_realized_volatility_400_mean_stock',
#  'log_return1_realized_volatility_400_mean_time',
#  'log_return1_realized_volatility_400_min_time',
#  'log_return1_realized_volatility_400_std_stock',
#  'log_return1_realized_volatility_400_std_time',
#  'log_return1_realized_volatility_4c1',
#  'log_return1_realized_volatility_max_time',
#  'log_return2_realized_volatility',
#  'log_return2_realized_volatility_100',
#  'log_return2_realized_volatility_200',
#  'log_return2_realized_volatility_200_max_stock',
#  'log_return2_realized_volatility_200_max_time',
#  'log_return2_realized_volatility_200_mean_stock',
#  'log_return2_realized_volatility_200_mean_time',
#  'log_return2_realized_volatility_200_min_time',
#  'log_return2_realized_volatility_200_std_stock',
#  'log_return2_realized_volatility_200_std_time',
#  'log_return2_realized_volatility_300',
#  'log_return2_realized_volatility_300_max_stock',
#  'log_return2_realized_volatility_300_max_time',
#  'log_return2_realized_volatility_300_mean_stock',
#  'log_return2_realized_volatility_300_mean_time',
#  'log_return2_realized_volatility_300_min_time',
#  'log_return2_realized_volatility_300_std_stock',
#  'log_return2_realized_volatility_300_std_time',
#  'log_return2_realized_volatility_400',
#  'log_return2_realized_volatility_400_mean_stock',
#  'log_return2_realized_volatility_400_mean_time',
#  'log_return2_realized_volatility_400_min_time',
#  'log_return2_realized_volatility_400_std_stock',
#  'log_return2_realized_volatility_400_std_time',
#  'log_return2_realized_volatility_max_time',
#  'log_return2_realized_volatility_mean_stock',
#  'log_return2_realized_volatility_mean_time',
#  'log_return2_realized_volatility_min_time',
#  'log_return2_realized_volatility_std_stock',
#  'log_return2_realized_volatility_std_time',
#  'log_return3_realized_volatility',
#  'log_return3_realized_volatility_100',
#  'log_return3_realized_volatility_200',
#  'log_return3_realized_volatility_300',
#  'log_return3_realized_volatility_400',
#  'log_return4_realized_volatility',
#  'log_return4_realized_volatility_100',
#  'log_return4_realized_volatility_200',
#  'log_return4_realized_volatility_300',
#  'log_return4_realized_volatility_400',
#  'price_spread2_amax',
#  'price_spread_amax',
#  'price_spread_sum',
#  'price_spread_sum_0c1',
#  'price_spread_sum_1c1',
#  'price_spread_sum_3c1',
#  'price_spread_sum_4c1',
#  'size_tau2_200',
#  'size_tau2_300',
#  'size_tau2_400',
#  'size_tau2_d',
#  'size_tau_200',
#  'total_volume_amax',
#  'trade_abs_diff',
#  'trade_amount_amax',
#  'trade_amount_amin',
#  'trade_amount_sum',
#  'trade_df_max',
#  'trade_df_min',
#  'trade_f_max',
#  'trade_f_min',
#  'trade_iqr_p',
#  'trade_log_return_realized_volatility_100',
#  'trade_log_return_realized_volatility_200',
#  'trade_log_return_realized_volatility_200_max_time',
#  'trade_log_return_realized_volatility_200_mean_stock',
#  'trade_log_return_realized_volatility_200_mean_time',
#  'trade_log_return_realized_volatility_200_min_time',
#  'trade_log_return_realized_volatility_200_std_stock',
#  'trade_log_return_realized_volatility_200_std_time',
#  'trade_log_return_realized_volatility_300',
#  'trade_log_return_realized_volatility_300_max_time',
#  'trade_log_return_realized_volatility_300_mean_stock',
#  'trade_log_return_realized_volatility_300_mean_time',
#  'trade_log_return_realized_volatility_300_std_stock',
#  'trade_log_return_realized_volatility_300_std_time',
#  'trade_log_return_realized_volatility_400_mean_stock',
#  'trade_log_return_realized_volatility_400_mean_time',
#  'trade_log_return_realized_volatility_400_std_stock',
#  'trade_log_return_realized_volatility_400_std_time',
#  'trade_log_return_realized_volatility_mean_time',
#  'trade_order_count_sum_100',
#  'trade_order_count_sum_1c1',
#  'trade_order_count_sum_200',
#  'trade_order_count_sum_300',
#  'trade_order_count_sum_400',
#  'trade_order_count_sum_4c1',
#  'trade_seconds_in_bucket_count_unique_100',
#  'trade_seconds_in_bucket_count_unique_200',
#  'trade_seconds_in_bucket_count_unique_300',
#  'trade_seconds_in_bucket_count_unique_400',
#  'trade_seconds_in_bucket_count_unique_500',
#  'trade_size_sum_100',
#  'trade_size_sum_200',
#  'trade_size_sum_300',
#  'volume_imbalance_sum_6c1',
#  'wap3_std',
#  'wap3_sum',
#  'wap4_std',
#  'wap4_sum','trade_tendency',
#      ]
#  ,axis=1)

In [20]:
# Feature pruning: build `trainf` as a copy of `train` without the columns
# listed below. The labels and their order are unchanged; a missing label
# still raises KeyError, exactly as with the positional `axis=1` form.
trainf = train.drop(
    columns=[
        # --- bid/ask spread aggregates ---
        'bid_ask_spread_amax',
        'bid_ask_spread_sum',
        'bid_ask_spread_sum_0c1',
        'bid_ask_spread_sum_1c1',
        'bid_ask_spread_sum_3c1',
        'bid_ask_spread_sum_4c1',
        'bid_ask_spread_sum_6c1',
        # --- log_return1 realized volatility ---
        'log_return1_realized_volatility_0c1',
        'log_return1_realized_volatility_100',
        'log_return1_realized_volatility_1c1',
        'log_return1_realized_volatility_200',
        'log_return1_realized_volatility_200_max_stock',
        'log_return1_realized_volatility_200_max_time',
        'log_return1_realized_volatility_200_mean_stock',
        'log_return1_realized_volatility_200_mean_time',
        'log_return1_realized_volatility_200_min_time',
        'log_return1_realized_volatility_200_std_stock',
        'log_return1_realized_volatility_200_std_time',
        'log_return1_realized_volatility_300',
        'log_return1_realized_volatility_300_max_stock',
        'log_return1_realized_volatility_300_max_time',
        'log_return1_realized_volatility_300_mean_stock',
        'log_return1_realized_volatility_300_mean_time',
        'log_return1_realized_volatility_300_min_time',
        'log_return1_realized_volatility_300_std_stock',
        'log_return1_realized_volatility_300_std_time',
        'log_return1_realized_volatility_3c1',
        'log_return1_realized_volatility_400',
        'log_return1_realized_volatility_400_mean_stock',
        'log_return1_realized_volatility_400_mean_time',
        'log_return1_realized_volatility_400_min_time',
        'log_return1_realized_volatility_400_std_stock',
        'log_return1_realized_volatility_400_std_time',
        'log_return1_realized_volatility_4c1',
        'log_return1_realized_volatility_max_time',
        # --- log_return2 realized volatility ---
        'log_return2_realized_volatility',
        'log_return2_realized_volatility_100',
        'log_return2_realized_volatility_200',
        'log_return2_realized_volatility_200_max_stock',
        'log_return2_realized_volatility_200_max_time',
        'log_return2_realized_volatility_200_mean_stock',
        'log_return2_realized_volatility_200_mean_time',
        'log_return2_realized_volatility_200_min_time',
        'log_return2_realized_volatility_200_std_stock',
        'log_return2_realized_volatility_200_std_time',
        'log_return2_realized_volatility_300',
        'log_return2_realized_volatility_300_max_stock',
        'log_return2_realized_volatility_300_max_time',
        'log_return2_realized_volatility_300_mean_stock',
        'log_return2_realized_volatility_300_mean_time',
        'log_return2_realized_volatility_300_min_time',
        'log_return2_realized_volatility_300_std_stock',
        'log_return2_realized_volatility_300_std_time',
        'log_return2_realized_volatility_400',
        'log_return2_realized_volatility_400_mean_stock',
        'log_return2_realized_volatility_400_mean_time',
        'log_return2_realized_volatility_400_min_time',
        'log_return2_realized_volatility_400_std_stock',
        'log_return2_realized_volatility_400_std_time',
        'log_return2_realized_volatility_max_time',
        'log_return2_realized_volatility_mean_stock',
        'log_return2_realized_volatility_mean_time',
        'log_return2_realized_volatility_min_time',
        'log_return2_realized_volatility_std_stock',
        'log_return2_realized_volatility_std_time',
        # --- log_return3 / log_return4 realized volatility ---
        'log_return3_realized_volatility',
        'log_return3_realized_volatility_100',
        'log_return3_realized_volatility_200',
        'log_return3_realized_volatility_300',
        'log_return3_realized_volatility_400',
        'log_return4_realized_volatility',
        'log_return4_realized_volatility_100',
        'log_return4_realized_volatility_200',
        'log_return4_realized_volatility_300',
        'log_return4_realized_volatility_400',
        # --- price spread / size_tau / volume ---
        'price_spread2_amax',
        'price_spread_amax',
        'price_spread_sum',
        'price_spread_sum_0c1',
        'price_spread_sum_1c1',
        'price_spread_sum_3c1',
        'price_spread_sum_4c1',
        'size_tau2_200',
        'size_tau2_300',
        'size_tau2_400',
        'size_tau2_d',
        'size_tau_200',
        'total_volume_amax',
        # --- trade-derived features ---
        'trade_abs_diff',
        'trade_amount_amax',
        'trade_amount_amin',
        'trade_amount_sum',
        'trade_df_max',
        'trade_df_min',
        'trade_f_max',
        'trade_f_min',
        'trade_iqr_p',
        'trade_log_return_realized_volatility_100',
        'trade_log_return_realized_volatility_200',
        'trade_log_return_realized_volatility_200_max_time',
        'trade_log_return_realized_volatility_200_mean_stock',
        'trade_log_return_realized_volatility_200_mean_time',
        'trade_log_return_realized_volatility_200_min_time',
        'trade_log_return_realized_volatility_200_std_stock',
        'trade_log_return_realized_volatility_200_std_time',
        'trade_log_return_realized_volatility_300',
        'trade_log_return_realized_volatility_300_max_time',
        'trade_log_return_realized_volatility_300_mean_stock',
        'trade_log_return_realized_volatility_300_mean_time',
        'trade_log_return_realized_volatility_300_std_stock',
        'trade_log_return_realized_volatility_300_std_time',
        'trade_log_return_realized_volatility_400_mean_stock',
        'trade_log_return_realized_volatility_400_mean_time',
        'trade_log_return_realized_volatility_400_std_stock',
        'trade_log_return_realized_volatility_400_std_time',
        'trade_log_return_realized_volatility_mean_time',
        'trade_order_count_sum_100',
        'trade_order_count_sum_1c1',
        'trade_order_count_sum_200',
        'trade_order_count_sum_300',
        'trade_order_count_sum_400',
        'trade_order_count_sum_4c1',
        'trade_seconds_in_bucket_count_unique_100',
        'trade_seconds_in_bucket_count_unique_200',
        'trade_seconds_in_bucket_count_unique_300',
        'trade_seconds_in_bucket_count_unique_400',
        'trade_seconds_in_bucket_count_unique_500',
        'trade_size_sum_100',
        'trade_size_sum_200',
        'trade_size_sum_300',
        'volume_imbalance_sum_6c1',
        # --- wap3 / wap4 aggregates ---
        'wap3_std',
        'wap3_sum',
        'wap4_std',
        'wap4_sum',
        # --- additional trade features ---
        'trade_tendency',
        'trade_size_sum',
        'trade_size_amax',
        'trade_size_amin',
        'trade_order_count_sum',
        'trade_order_count_amax',
        'trade_energy',
        'trade_abs_diff_v',
        'trade_energy_v',
        'trade_iqr_p_v',
        'trade_log_return_realized_volatility_500',
        'trade_size_sum_500',
        'trade_order_count_sum_500',
        'trade_log_return_realized_volatility_400',
        'trade_size_sum_400',
        'trade_log_return_realized_volatility_mean_stock',
        'trade_log_return_realized_volatility_std_stock',
        'trade_log_return_realized_volatility_max_stock',
        'trade_log_return_realized_volatility_min_stock',
        'trade_log_return_realized_volatility_400_max_stock',
        'trade_log_return_realized_volatility_400_min_stock',
        'trade_log_return_realized_volatility_300_max_stock',
        'trade_log_return_realized_volatility_300_min_stock',
        'trade_log_return_realized_volatility_200_max_stock',
        'trade_log_return_realized_volatility_200_min_stock',
        'trade_log_return_realized_volatility_std_time',
        'trade_log_return_realized_volatility_max_time',
        'trade_log_return_realized_volatility_min_time',
        'trade_log_return_realized_volatility_400_max_time',
        'trade_log_return_realized_volatility_400_min_time',
        'trade_log_return_realized_volatility_300_min_time',
        'trade_size_sum_0c1',
        'trade_size_sum_1c1',
        'trade_size_sum_3c1',
        'trade_size_sum_4c1',
        'trade_size_sum_6c1',
        'trade_order_count_sum_0c1',
        'trade_order_count_sum_3c1',
        'trade_order_count_sum_6c1',
        'trade_seconds_in_bucket_count_unique',
        'trade_log_return_realized_volatility',
    ],
)

In [21]:
# Names of the `trainf` columns that do not contain the substring '_tau'.
columns = list(filter(lambda col_name: '_tau' not in col_name, trainf.columns))

In [22]:
# Column subset of `trainf` restricted to the labels collected in `columns`.
trainfi = trainf.loc[:, columns]

In [23]:
# Trial 8 finished with value: 0.2909154539994846 and parameters: {'num_leaves': 313, 'max_depth': -1, 'max_bin': 112, 'min_data_in_leaf': 618, 'reg_alpha': 1.5003475984254964, 'reg_lambda': 2.5618208457226173, 'colsample_bytree': 0.7011948193792243, 'subsample': 0.7363229960848949, 'subsample_freq': 2}. Best is trial 4 with value: 0.2849588122600494.

In [24]:
# 'num_leaves': 511, 'max_depth': 0, 'max_bin': 75, 'min_data_in_leaf': 480, 'lambda_l1': 0, 'lambda_l2': 15, 'reg_alpha': 4.641617961984979, 'reg_lambda': 4.062066200661617, 'colsample_bytree': 0.7603890287663787, 'subsample': 0.948429579313745, 'bagging_freq': 1, 'subsample_freq': 6, 'feature_fraction': 0.8}. Best is trial 450 with value: 0.25626889257570545.

In [25]:
import optuna
from lightgbm import LGBMRegressor

In [26]:
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

def _make_lgb_params(seed, data_random_seed, runtime=None):
    """Build one LightGBM parameter dict for the seed-averaged ensemble.

    All configurations in this cell share the same tree/regularisation
    settings and differ only in their seeds (and, for `paramss`, in the
    trailing logging keys), so they are generated from this single template
    instead of being copy-pasted.

    Args:
        seed: value stored under 'seed', 'feature_fraction_seed',
            'bagging_seed' and 'drop_seed'.
        data_random_seed: value stored under 'data_random_seed'. Kept as a
            separate argument because the configurations below do not always
            tie it to `seed`.
        runtime: optional dict of trailing keys appended after the seeds.
            Defaults to {'n_jobs': -1, 'verbose': -1}.

    Returns:
        A new dict on every call (including a fresh 'categorical_column'
        list), so the configurations never share mutable state.
    """
    params = {
        'num_leaves': 320,
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'max_depth': -1,
        'max_bin': 112,
        'min_data_in_leaf': 618,
        'learning_rate': 0.05,
        'subsample': 0.735,
        'subsample_freq': 2,
        # NOTE(review): LightGBM documents 'reg_alpha', 'reg_lambda' and
        # 'colsample_bytree' as aliases of 'lambda_l1', 'lambda_l2' and
        # 'feature_fraction'. Both spellings are passed with different
        # values, so only one of each pair can take effect (presumably the
        # primary name - TODO confirm and delete the losing entries).
        'feature_fraction': 0.5,
        'lambda_l1': 0.5,
        'lambda_l2': 1.0,
        'categorical_column': [0],
        'reg_alpha': 1.5,
        'reg_lambda': 2.5,
        'colsample_bytree': 0.71,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': data_random_seed,
    }
    params.update({'n_jobs': -1, 'verbose': -1} if runtime is None else runtime)
    return params


seed0 = 2021
params0 = _make_lgb_params(seed0, data_random_seed=seed0)

# NOTE(review): params1..params4 keep 'data_random_seed' at seed0 rather than
# their own seed. This is preserved as-is; presumably it is deliberate because
# the training function reuses one lgb.Dataset for several models - confirm
# before changing.
seed1 = 23
params1 = _make_lgb_params(seed1, data_random_seed=seed0)

seed2 = 51
params2 = _make_lgb_params(seed2, data_random_seed=seed0)

seed3 = 101
params3 = _make_lgb_params(seed3, data_random_seed=seed0)

seed4 = 371
params4 = _make_lgb_params(seed4, data_random_seed=seed0)

# NOTE(review): `seeds` is defined but `paramss` is seeded with seed1, exactly
# as in the original cell - confirm which one was intended.
seeds = 42
paramss = _make_lgb_params(
    seed1,
    data_random_seed=seed1,
    # This configuration spells the logging key 'verbosity' and additionally
    # sets the 'boosting' key next to 'boosting_type'.
    runtime={'boosting': 'gbdt', 'verbosity': -1, 'n_jobs': -1},
)
# Root mean squared percentage error, used for early stopping and scoring
def rmspe(y_true, y_pred):
    """Return sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    Each error is divided by the matching `y_true` value, so zeros in
    `y_true` are not guarded against.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation function reporting RMSPE.

    Args:
        y_pred: predictions for the dataset being evaluated.
        lgb_train: dataset object exposing `get_label()`; its labels are used
            as the ground truth.

    Returns:
        The tuple ('RMSPE', score, False); the trailing False marks the
        metric as lower-is-better.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

def train_and_evaluate_lgb(train, test, params0, params1, params2, params3, params4):
    """Train a seed-averaged LightGBM ensemble with grouped 5-fold CV.

    For every fold, three boosters are fitted (with `params0`, `params1` and
    `params4`) on the same training/validation datasets; their predictions
    are averaged for the out-of-fold array and for the test set.

    Args:
        train: DataFrame holding the features plus the 'target' and 'time_id'
            columns ('row_id' is ignored if present).
        test: DataFrame providing every feature column found in `train`.
        params0, params1, params4: LightGBM parameter dicts of the three
            trained models.
        params2, params3: accepted for interface compatibility but not used.

    Returns:
        1-D numpy array of test predictions, averaged over folds and models.
        The overall out-of-fold RMSPE is printed as a side effect.
    """
    excluded = {"time_id", "target", "row_id"}
    features = [name for name in train.columns if name not in excluded]
    y = train['target']
    # Rows sharing a time_id always land in the same fold
    groups = train["time_id"]

    # Accumulators for out-of-fold and test predictions
    oof_predictions = np.zeros(train.shape[0])
    test_predictions = np.zeros(test.shape[0])

    # Parameter sets actually trained, in training order
    ensemble_params = (params0, params1, params4)

    kfold = GroupKFold(n_splits = 5)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, y, groups)):
        print(f'Training fold {fold + 1}')

        x_train = train.iloc[trn_ind]
        x_val = train.iloc[val_ind]
        y_train = y.iloc[trn_ind]
        y_val = y.iloc[val_ind]

        # Root mean squared percentage error weights (1 / target^2)
        train_dataset = lgb.Dataset(x_train[features], y_train,
                                    weight = 1 / np.square(y_train))
        val_dataset = lgb.Dataset(x_val[features], y_val,
                                  weight = 1 / np.square(y_val))

        # One booster per parameter set, all sharing the fold's datasets
        boosters = [
            lgb.train(params = model_params,
                      num_boost_round = 1000,
                      train_set = train_dataset,
                      valid_sets = [train_dataset, val_dataset],
                      verbose_eval = 250,
                      early_stopping_rounds = 50,
                      feval = feval_rmspe)
            for model_params in ensemble_params
        ]
        first, second, third = boosters

        # Out-of-fold prediction: plain mean of the three boosters
        val_features = x_val[features]
        oof_predictions[val_ind] = (first.predict(val_features)
                                    + second.predict(val_features)
                                    + third.predict(val_features)) / 3

        # Test prediction: each fold contributes one fifth of the model mean
        test_features = test[features]
        test_predictions += (first.predict(test_features) / 5
                             + second.predict(test_features) / 5
                             + third.predict(test_features) / 5) / 3

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    return test_predictions
# Train and evaluate
test_predictions = train_and_evaluate_lgb(
    train=trainf,
    test=test,
    params0=params0,
    params1=params1,
    params2=params2,
    params3=params3,
    params4=params4,
)

Training fold 1
Training until validation scores don't improve for 50 rounds
[250]	training's rmse: 0.000369808	training's RMSPE: 0.171301	valid_1's rmse: 0.000469014	valid_1's RMSPE: 0.216307
Early stopping, best iteration is:
[296]	training's rmse: 0.000362033	training's RMSPE: 0.1677	valid_1's rmse: 0.000468816	valid_1's RMSPE: 0.216216
Training until validation scores don't improve for 50 rounds
[250]	training's rmse: 0.000369592	training's RMSPE: 0.171201	valid_1's rmse: 0.000469016	valid_1's RMSPE: 0.216308
Early stopping, best iteration is:
[251]	training's rmse: 0.000369431	training's RMSPE: 0.171126	valid_1's rmse: 0.000468989	valid_1's RMSPE: 0.216295
Training until validation scores don't improve for 50 rounds
[250]	training's rmse: 0.000369306	training's RMSPE: 0.171068	valid_1's rmse: 0.000470253	valid_1's RMSPE: 0.216878
Early stopping, best iteration is:
[202]	training's rmse: 0.000378527	training's RMSPE: 0.17534	valid_1's rmse: 0.000470072	valid_1's RMSPE: 0.216795
Tra

In [27]:

# from optuna.visualization.matplotlib import plot_optimization_history

In [28]:
# plot_optimization_history(study);

In [29]:
# from optuna.visualization.matplotlib import plot_param_importances

# plot_param_importances(study);

In [30]:
# Optuna experiment settings
TIME = 2 * 1800  # presumably a time budget in seconds - not referenced in the visible cells
N_SPLITS = 5
RANDOM_STATE = 2021
# LightGBM settings that are held constant across all Optuna trials
FIXED_PARAMS = dict(
    learning_rate=0.05,
    metric='rmse',
    verbosity=-1,
    n_jobs=-1,
    # max_bin=127,
    seed=RANDOM_STATE,
    max_depth=-1,
)

In [31]:
def objective(trial,X,y):
    """Optuna objective: mean grouped-CV RMSPE of LightGBM for one trial.

    Samples a LightGBM configuration from `trial`, overlays `FIXED_PARAMS`,
    trains one booster per GroupKFold split (grouped by 'time_id') and
    scores each validation fold with `rmspe`.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.
        X: DataFrame with the feature columns and a 'time_id' column.
            'time_id', 'target' and 'row_id' are never used as features.
        y: target Series aligned positionally with `X`.

    Returns:
        The mean of the per-fold RMSPE scores (lower is better).
    """
    params = {"device": "gpu",
             "metric": "rmse",
#         'n_estimators': trial.suggest_int('n_estimators',50,200),
        'num_leaves': trial.suggest_int('num_leaves', 100, 400,step=10),
        'max_bin':trial.suggest_int('max_bin', 50, 120,step=10),
        'min_data_in_leaf':trial.suggest_int('min_data_in_leaf',400,700,step=10),
        #'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 5,step=0.001),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 5,step=0.001),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1,step=0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9,step=0.1),
        'subsample_freq': trial.suggest_int('subsample_freq',1,10,step=1),
        #'cat_smooth': trial.suggest_float('cat_smooth', 10, 100.0),
        #'feature_fraction': trial.suggest_float('feature_fraction',0.3,0.99),
    }

    # Fixed settings override any sampled key of the same name
    params.update(FIXED_PARAMS)

    # Use the shared fold-count constant instead of a second hard-coded 5
    kfold = GroupKFold(n_splits = N_SPLITS)
    # NOTE(review): one callback instance is shared by every fold, so the
    # iteration numbers it reports restart with each fold - confirm this is
    # the intended pruning behaviour.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse", valid_name='valid_1')
    rmspe_list = []
    groups = X['time_id']
    # Selecting `features` everywhere makes a separate drop of 'time_id'
    # unnecessary and keeps training and prediction on identical columns.
    features = [col for col in X.columns if col not in {"time_id", "target", "row_id"}]

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X,y,groups)):

        X_train = X.iloc[train_idx][features]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx][features]
        y_val = y.iloc[val_idx]
        # Root mean squared percentage error weights (1 / target^2)
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        d_train = lgb.Dataset(X_train, y_train, weight = train_weights)
        d_valid = lgb.Dataset(X_val, y_val, weight = val_weights)
        model = lgb.train(params,num_boost_round=1000,
                      train_set=d_train,
                      valid_sets=[d_train, d_valid],
                      verbose_eval=250,
                      early_stopping_rounds=50,
                      callbacks=[pruning_callback])

        # Predict on exactly the columns the booster was trained on
        preds = model.predict(X_val)
        score = rmspe(y_val, preds)
        print(f'Our out of folds RMSPE is {score}')
        rmspe_list.append(score)

    return np.mean(rmspe_list)
        
       
    

In [32]:
from functools import partial
from optuna.samplers import TPESampler
import warnings

In [33]:
# Inputs for the Optuna study: features (still including 'time_id') and target
X_display = trainf.drop(columns=['row_id', 'target'])
y = trainf['target']
# `X` is an interpolated copy; `X_display` keeps the un-interpolated values
X = X_display.interpolate(method='index')

In [34]:
# study1=optuna.study.create_study(direction='minimize',load_if_exists=False)
# func = lambda trial: objective(trial, X, y)
# study1.optimize(func,n_trials=20)

In [35]:
# print(study1.best_trial)

In [36]:
# Attach the ensemble predictions and write the two-column submission file
test['target'] = test_predictions
test.to_csv('submission.csv', columns=['row_id', 'target'], index=False)