In [None]:
import glob
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# The data path

# glob performs no tilde expansion, so '~' has to be resolved to the home
# directory before the patterns are built; otherwise every list below is empty.
path = os.path.expanduser('~/Datasets/Kaggle Optiver')

# Sorted so the file order does not depend on the file system's listing order.
list_order_book_file_train = sorted(glob.glob(f'{path}/book_train.parquet/*'))
list_order_book_file_test = sorted(glob.glob(f'{path}/book_test.parquet/*'))

list_order_trade_file_train = sorted(glob.glob(f'{path}/trade_train.parquet/*'))
list_order_trade_file_test = sorted(glob.glob(f'{path}/trade_test.parquet/*'))

In [None]:
# Generate new features

 # Bid ask spread
def bid_ask_spread1(df):
    """Relative level-1 spread: ``ask_price1 / bid_price1 - 1``, element-wise."""
    level1_ratio = df['ask_price1'] / df['bid_price1']
    return level1_ratio - 1

def bid_ask_spread2(df):
    """Relative level-2 spread: ``ask_price2 / bid_price2 - 1``, element-wise."""
    level2_ratio = df['ask_price2'] / df['bid_price2']
    return level2_ratio - 1
    
def bid_ask_spread_all(df):
    """Relative spread of the summed level-1 and level-2 quotes.

    Returns ``(ask_price2 + ask_price1) / (bid_price2 + bid_price1) - 1``,
    element-wise.
    """
    ask_total = df['ask_price2'] + df['ask_price1']
    bid_total = df['bid_price2'] + df['bid_price1']
    return ask_total / bid_total - 1

# Weighted average price
def wap_1(df):
    """Weighted average price from the level-1 quotes.

    Each side's price is weighted by the size quoted on the opposite side:
    ``(bid_price1 * ask_size1 + ask_price1 * bid_size1) / (ask_size1 + bid_size1)``.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    weighted_sum = bid_px * ask_sz + ask_px * bid_sz
    total_size = ask_sz + bid_sz
    return weighted_sum / total_size

def wap_2(df):
    """Weighted average price from the level-2 quotes.

    Each side's price is weighted by the size quoted on the opposite side:
    ``(bid_price2 * ask_size2 + ask_price2 * bid_size2) / (ask_size2 + bid_size2)``.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    weighted_sum = bid_px * ask_sz + ask_px * bid_sz
    total_size = ask_sz + bid_sz
    return weighted_sum / total_size

def wap_3(df):
    """Cross-level weighted average price: level-1 bid against level-2 ask.

    ``(bid_price1 * ask_size2 + ask_price2 * bid_size1) / (ask_size2 + bid_size1)``.
    """
    bid_px, bid_sz = df['bid_price1'], df['bid_size1']
    ask_px, ask_sz = df['ask_price2'], df['ask_size2']
    weighted_sum = bid_px * ask_sz + ask_px * bid_sz
    total_size = ask_sz + bid_sz
    return weighted_sum / total_size

def wap_4(df):
    """Cross-level weighted average price: level-2 bid against level-1 ask.

    ``(bid_price2 * ask_size1 + ask_price1 * bid_size2) / (ask_size1 + bid_size2)``.
    """
    bid_px, bid_sz = df['bid_price2'], df['bid_size2']
    ask_px, ask_sz = df['ask_price1'], df['ask_size1']
    weighted_sum = bid_px * ask_sz + ask_px * bid_sz
    total_size = ask_sz + bid_sz
    return weighted_sum / total_size

 # Log return
def log_return(price):
    """Return the first difference of ``log(price)``.

    ``price`` must expose ``.diff()`` after ``np.log`` (e.g. a pandas Series);
    the first element of the result is NaN because it has no predecessor.
    """
    log_price = np.log(price)
    return log_price.diff()

 # Volatility
def volatility_single_step(log_return):
    """Square root of the sum of squared values in ``log_return``."""
    squared = np.square(log_return)
    return np.sqrt(np.sum(squared))

# Deal with book data

def book_feature_eng(stock_data_all):
    """Aggregate one stock's order-book rows into per-``time_id`` features.

    The same feature set is computed twice: first on the rows with
    ``seconds_in_bucket < 301`` and then on every row.  The two results are
    inner-merged on ``time_id``; because both carry identical column names,
    pandas appends its default ``_x`` (first window) and ``_y`` (all rows)
    suffixes to the feature columns.

    Parameters
    ----------
    stock_data_all : pandas.DataFrame
        Order-book rows.  Must contain ``time_id``, ``seconds_in_bucket`` and
        the level-1/level-2 bid/ask price and size columns read by the
        ``wap_*`` and ``bid_ask_spread*`` helpers.  The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` that survives in both windows.
    """
    wap_funcs = [wap_1, wap_2, wap_3, wap_4]
    spread_funcs = [bid_ask_spread1, bid_ask_spread2, bid_ask_spread_all]

    # Column -> aggregation applied within each time_id.
    agg_spec = {
        'log_return1': [volatility_single_step],
        'log_return2': [volatility_single_step],
        'log_return3': [volatility_single_step],
        'log_return4': [volatility_single_step],
        'log_return_mean': [volatility_single_step],
        'ba_spread1': ['mean'],
        'ba_spread2': ['mean'],
        'total_volumn': ['mean'],
        'WAP_mean': ['mean'],
    }

    first_window = stock_data_all[stock_data_all['seconds_in_bucket'] < 301]
    merged = None

    for window in (first_window, stock_data_all):
        # Work on a private copy: avoids writing into a filtered slice and
        # keeps the caller's frame free of the helper columns added below.
        frame = window.copy()

        for idx, wap_func in enumerate(wap_funcs, start=1):
            frame[f'WAP{idx}'] = wap_func(frame)
            # transform() keeps the original row index, so the result lines up
            # with the frame regardless of how groupby.apply labels its output.
            frame[f'log_return{idx}'] = (
                frame.groupby(['time_id'])[f'WAP{idx}'].transform(log_return)
            )

        frame['WAP_mean'] = frame[['WAP1', 'WAP2', 'WAP3', 'WAP4']].mean(axis=1)
        frame['log_return_mean'] = (
            frame.groupby(['time_id'])['WAP_mean'].transform(log_return)
        )

        # The first row of every time_id has no previous price, hence a NaN
        # log return; those rows are excluded from all aggregates below.
        # (The original filtered on the last WAP's return via a leaked loop
        # variable; the column is named explicitly here.)
        frame = frame[frame['log_return4'].notnull()].copy()

        # ba_spread3 is computed but is not part of agg_spec.
        for idx, spread_func in enumerate(spread_funcs, start=1):
            frame[f'ba_spread{idx}'] = spread_func(frame)

        frame['total_volumn'] = (
            frame['ask_size1'] + frame['ask_size2']
            + frame['bid_size1'] + frame['bid_size2']
        )

        per_bucket = frame.groupby(['time_id']).agg(agg_spec).reset_index()
        # Flatten the (column, aggregation) MultiIndex back to plain names.
        per_bucket.columns = ['time_id', *agg_spec]

        if merged is None:
            merged = per_bucket
        else:
            merged = merged.merge(per_bucket, on=['time_id'])

    return merged
        
def eng_all_book(file):
    """Load one order-book parquet path and build its per-``time_id`` features.

    Parameters
    ----------
    file : str
        Path readable by ``pd.read_parquet``.  The text after the last ``=``
        in the path is parsed as the integer stock id (e.g. a path ending in
        ``...=0``).

    Returns
    -------
    pandas.DataFrame
        Output of ``book_feature_eng`` plus an ``int32`` ``stock_id`` column.

    Raises
    ------
    IndexError
        If ``file`` contains no ``=``.
    ValueError
        If the text after the last ``=`` is not an integer.
    """
    # Split on the LAST '=' so an '=' in a parent directory name cannot shift
    # which segment is taken as the id.
    stock_id = int(file.rsplit('=', 1)[1])

    book_rows = pd.read_parquet(file)
    features = book_feature_eng(book_rows)

    return features.assign(stock_id=stock_id).astype({'stock_id': 'int32'})

# Deal with trade data

def trade_feature_eng(stock_data_all):
    """Aggregate one stock's trade rows into per-``time_id`` features.

    The same feature set is computed twice: first on the rows with
    ``seconds_in_bucket < 301`` and then on every row.  The two results are
    inner-merged on ``time_id``; because both carry identical column names,
    pandas appends its default ``_x`` (first window) and ``_y`` (all rows)
    suffixes to the feature columns.

    Parameters
    ----------
    stock_data_all : pandas.DataFrame
        Trade rows.  Must contain ``time_id``, ``seconds_in_bucket``,
        ``price``, ``size`` and ``order_count``.  The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` that survives in both windows.
    """
    # Column -> aggregation applied within each time_id.
    agg_spec = {
        'log_return_trade': [volatility_single_step],
        'size': ['mean'],
        'order_count': ['mean'],
    }

    first_window = stock_data_all[stock_data_all['seconds_in_bucket'] < 301]
    merged = None

    for window in (first_window, stock_data_all):
        # Work on a private copy: avoids writing into a filtered slice and
        # keeps the caller's frame free of the helper column added below.
        frame = window.copy()

        # transform() keeps the original row index, so the result lines up
        # with the frame regardless of how groupby.apply labels its output.
        frame['log_return_trade'] = (
            frame.groupby(['time_id'])['price'].transform(log_return)
        )
        # The first trade of every time_id has no previous price, hence a NaN
        # log return; those rows are excluded from all aggregates below.
        frame = frame[frame['log_return_trade'].notnull()]

        per_bucket = frame.groupby(['time_id']).agg(agg_spec).reset_index()
        # Flatten the (column, aggregation) MultiIndex back to plain names.
        per_bucket.columns = ['time_id', *agg_spec]

        if merged is None:
            merged = per_bucket
        else:
            merged = merged.merge(per_bucket, on=['time_id'])

    return merged

def eng_all_trade(file):
    """Load one trade parquet path and build its per-``time_id`` features.

    Parameters
    ----------
    file : str
        Path readable by ``pd.read_parquet``.  The text after the last ``=``
        in the path is parsed as the integer stock id (e.g. a path ending in
        ``...=0``).

    Returns
    -------
    pandas.DataFrame
        Output of ``trade_feature_eng`` plus an ``int32`` ``stock_id`` column.

    Raises
    ------
    IndexError
        If ``file`` contains no ``=``.
    ValueError
        If the text after the last ``=`` is not an integer.
    """
    # Split on the LAST '=' so an '=' in a parent directory name cannot shift
    # which segment is taken as the id.
    stock_id = int(file.rsplit('=', 1)[1])

    trade_rows = pd.read_parquet(file)
    features = trade_feature_eng(trade_rows)

    return features.assign(stock_id=stock_id).astype({'stock_id': 'int32'})

# Generate final features for all data

def train_and_test_data(book_list, test_list):
    """Build the combined book + trade feature table for a set of files.

    Parameters
    ----------
    book_list : iterable of str
        Order-book paths, each processed by ``eng_all_book``.
    test_list : iterable of str
        Trade paths (despite the name), each processed by ``eng_all_trade``.

    Returns
    -------
    pandas.DataFrame
        Book features sorted by ``stock_id`` and ``time_id``, left-joined with
        the trade features on those two keys, so every book row is kept.
    """
    key_cols = ['stock_id', 'time_id']

    def _collect(worker, files):
        # Run the per-file worker on all cores, stack the results and order by key.
        per_file = Parallel(n_jobs=-1)(delayed(worker)(file) for file in files)
        stacked = pd.concat(per_file, ignore_index = True)
        return stacked.sort_values(by=key_cols).reset_index(drop=True)

    book_features = _collect(eng_all_book, book_list)
    trade_features = _collect(eng_all_trade, test_list)

    return pd.merge(book_features, trade_features, how='left', on=key_cols)

# The root mean square percentage error

def rmspe(y_true, y_pred):
    """Root mean square percentage error, rounded to three decimals.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    score = np.sqrt(np.mean(np.square(relative_error)))
    return np.round(score, 3)

# Convert a dataframe to the submission format

def submit_format(df):
    """Return a two-column submission frame built from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id``, ``time_id`` and ``target``.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` (``"<stock_id>-<time_id>"``) and ``target``.
        ``df`` itself is left unchanged.
    """
    row_id = df['stock_id'].astype(str) + '-' + df['time_id'].astype(str)
    # assign() builds a new frame, so the caller's frame (often a slice of a
    # larger one) is never written to.
    return df.assign(row_id=row_id)[['row_id', 'target']]

In [None]:
# Training X data

df_train = train_and_test_data(list_order_book_file_train,list_order_trade_file_train)

# Training y data
#
# Attach the target by (stock_id, time_id) rather than by row position: the
# feature builders can drop a time_id (inner merge of the two windows, removal
# of NaN log-return rows), which would silently shift or mismatch the labels.
# NOTE(review): assumes train.csv carries stock_id and time_id columns next to
# target - confirm against the file.
df_train_target = pd.read_csv(f'{path}/train.csv')
df_train = df_train.merge(
    df_train_target[['stock_id', 'time_id', 'target']],
    how='inner',
    on=['stock_id', 'time_id'],
    validate='one_to_one',
)

# Features exclude the time_id key and the label; the target frame is rebuilt
# from the merged rows so both stay row-aligned.
df_train_features = df_train.drop(['time_id', 'target'], axis=1)
df_train_target = df_train[['stock_id', 'time_id', 'target']]
dtrain = lgb.Dataset(df_train_features, label=df_train_target['target'])

# LightGBM Model

para = {
    'metric' : '',
    'max_depth' : 30,
    'num_leaves' : 100,
    'learning_rate' : 0.05,
    'feature_fraction' : 0.9,
    'min_child_samples' : 19,
    'min_child_weight' : 0.002,
    'bagging_fraction' : 0.5,
    'bagging_freq' : 1,
    'reg_alpha' : 0.01,
    'reg_lambda' : 4,
    'cat_smooth' : 12,
    'num_iterations' : 100,
    'device':'gpu'
}

model = lgb.train(para, dtrain)

In [None]:
# GridSearchCV for finding best parameters

# parameters = {
    
# }

# gbm = lgb.LGBMRegressor(
#     metric = '',
#     max_depth = 30,
#     num_leaves = 100,
#     learning_rate = 0.05,
#     feature_fraction = 0.9,
#     min_child_samples = 19,
#     min_child_weight = 0.002,
#     bagging_fraction = 0.5,
#     bagging_freq = 1,
#     reg_alpha = 0.01,
#     reg_lambda = 4,
#     cat_smooth = 12,
#     num_iterations = 100,
#     device = 'gpu'
# )
# gsearch = GridSearchCV(gbm, param_grid=parameters, scoring='neg_median_absolute_error', cv=3, n_jobs = -1)
# gsearch.fit(df_train_features.values, df_train_target['target'].values)
# print(f'Best: {gsearch.best_score_} using {gsearch.best_params_}')
# print('-----------------------------------------------------------')
# print(gsearch.cv_results_['mean_test_score'])
# print('-----------------------------------------------------------')
# print(gsearch.cv_results_['params'])

In [None]:
# Test X data

df_test = train_and_test_data(
    list_order_book_file_test,
    list_order_trade_file_test,
)
# time_id is a join key, not a model input
df_test_features = df_test.drop(columns=['time_id'])

# Using LightGBM to predict test y

ypred = np.asarray(model.predict(df_test_features))

In [None]:
# Generate dataframe for submitting

# Attach the predictions, keep only the key and target columns, convert to the
# row_id/target layout and drop any rows containing NaN.
df_test = submit_format(
    df_test.assign(target=ypred)[['stock_id', 'time_id', 'target']]
).dropna()
df_test.columns = ['row_id','target']

In [None]:
# Write the submission frame to 'submission.csv' in the working directory,
# without the index column.
df_test.to_csv('submission.csv',index = False)