In [None]:
import glob
import os

import cudf
import cupy
import xgboost as xgb
from cuml import train_test_split

In [None]:
# Root folder of the competition data. glob.glob() does not expand "~", so the
# home directory is resolved here; otherwise both listings below come back empty.
path = os.path.expanduser('~/Datasets/Kaggle Optiver')

# One entry per item directly under each order-book parquet folder. Sorted so
# the processing order does not depend on the file system's listing order.
list_order_book_file_train = sorted(glob.glob(f'{path}/book_train.parquet/*'))
list_order_book_file_test = sorted(glob.glob(f'{path}/book_test.parquet/*'))

In [None]:
def bid_ask_spread(df):
    """Return the relative gap between best ask and best bid.

    Computed as ``ask_price1 / bid_price1 - 1`` from the columns of ``df``.
    """
    best_ask = df['ask_price1']
    best_bid = df['bid_price1']
    ask_over_bid = best_ask / best_bid
    return ask_over_bid - 1
    
def wap_1(df):
    """Return the size-weighted price built from the level-1 quotes.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the combined level-1 size.
    """
    weighted_sum = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    total_size = df['ask_size1'] + df['bid_size1']
    return weighted_sum / total_size

def wap_2(df):
    """Return the size-weighted price built from the level-2 quotes.

    The bid price is weighted by the ask size and the ask price by the bid
    size; the sum is divided by the combined level-2 size.
    """
    weighted_sum = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['ask_size2'] + df['bid_size2']
    return weighted_sum / total_size

def log_return(price):
    """Return the successive differences of the natural log of ``price``."""
    log_price = cupy.log(price)
    return cupy.diff(log_price)

def volatility(log_return):
    """Return the square root of the sum of squared ``log_return`` values."""
    squared = cupy.square(log_return)
    sum_of_squares = cupy.sum(squared)
    return cupy.sqrt(sum_of_squares)

def data_eng(stock_data):
    """Compute one volatility value per ``time_id`` from an order-book frame.

    For every ``time_id`` the level-1 weighted price (``wap_1``) is logged,
    differenced against the previous row of the same ``time_id``, squared,
    summed, and square-rooted.

    Parameters
    ----------
    stock_data : cudf.DataFrame
        Must contain ``time_id`` plus the level-1 price/size columns read by
        ``wap_1``. It is not modified.

    Returns
    -------
    cudf.DataFrame
        Columns ``time_id`` and ``first_10_vol``, sorted by ``time_id`` with a
        fresh 0..n-1 index. A ``time_id`` with a single row has no difference
        to take and therefore does not appear in the result.
    """
    # Work on a narrow private frame so no scratch columns leak into the
    # caller's DataFrame.
    book = stock_data[['time_id']].copy()
    book['log_wap'] = cupy.log(wap_1(stock_data))

    # Previous log price within the same time_id; the first row of each group
    # has no predecessor and comes back null.
    # NOTE(review): the positional re-index assumes stock_data carries the
    # default 0..n-1 index -- confirm for frames not read straight from parquet.
    prev_log_wap = book.groupby('time_id')['log_wap'].shift(1)
    book['prev_log_wap'] = prev_log_wap.reset_index(drop=True)
    book = book[book['prev_log_wap'].notnull()]

    # The sign of the difference is irrelevant because it is squared.
    book['sq_return'] = cupy.square(book['prev_log_wap'] - book['log_wap'])

    df_vol = book.groupby('time_id')['sq_return'].sum()
    df_vol = cudf.DataFrame(df_vol).rename(columns={'sq_return': 'first_10_vol'})
    # Fallback rename covers the case where the group key comes back unnamed.
    df_vol = df_vol.reset_index().rename(columns={'index': 'time_id'})
    df_vol['first_10_vol'] = cupy.sqrt(df_vol['first_10_vol'])

    # sort_values/reset_index return new frames; the result has to be returned
    # (the original computed it and threw it away).
    return df_vol.sort_values(by=['time_id']).reset_index(drop=True)

def eng_all_stock(file_list):
    """Build the per-stock volatility table for every order-book file.

    Parameters
    ----------
    file_list : iterable of str
        Parquet paths. The text after the last ``=`` in each path must be the
        integer stock id (e.g. ``.../stock_id=12``).

    Returns
    -------
    cudf.DataFrame
        Columns ``stock_id`` (int32), ``time_id`` and ``first_10_vol``, one
        row per (stock, time_id) produced by ``data_eng``.

    Raises
    ------
    ValueError
        If ``file_list`` is empty, which usually means the glob pattern did
        not match anything.
    """
    per_stock_frames = []

    for file in file_list:
        stock_vol = data_eng(cudf.read_parquet(file))
        # Split on the LAST "=" so an "=" in a parent folder name cannot be
        # mistaken for the stock-id separator.
        stock_vol['stock_id'] = int(file.rsplit('=', 1)[-1])
        stock_vol['stock_id'] = stock_vol['stock_id'].astype('int32')
        per_stock_frames.append(stock_vol)

    if not per_stock_frames:
        raise ValueError('eng_all_stock received an empty file list; check the data path / glob pattern')

    # Concatenate once at the end instead of re-copying the growing frame on
    # every iteration.
    stock_file_df = cudf.concat(per_stock_frames)
    return stock_file_df[['stock_id', 'time_id', 'first_10_vol']]

def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error, rounded to 3 decimals.

    The error of each element is ``(y_true - y_pred) / y_true``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = cupy.mean(cupy.square(relative_error))
    root = cupy.sqrt(mean_squared)
    return cupy.round(root, 3)

def submit_format(df):
    """Return a two-column submission frame built from ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``stock_id``, ``time_id`` and ``first_10_vol``. It is
        left untouched.

    Returns
    -------
    DataFrame
        ``row_id`` (``"<stock_id>-<time_id>"``) and ``target`` (the values of
        ``first_10_vol``), in that column order.
    """
    row_id = df['stock_id'].astype(str) + '-' + df['time_id'].astype(str)
    # Build the result on a fresh frame so no 'row_id' column is added to the
    # caller's DataFrame as a side effect.
    out = df[['first_10_vol']].rename(columns={'first_10_vol': 'target'})
    out['row_id'] = row_id
    return out[['row_id', 'target']]

In [None]:
# Engineer the training features, then order rows by stock and time bucket.
df_train = (
    eng_all_stock(list_order_book_file_train)
    .sort_values(by=['stock_id', 'time_id'])
    .reset_index(drop=True)
)

In [None]:
# Training labels; the 'target' column is used as y further down.
train_csv_path = f'{path}/train.csv'
df_train_target = cudf.read_csv(train_csv_path)

In [None]:
# Features and labels are paired purely by row position, so the two tables
# must at least have the same number of rows.
# NOTE(review): this also assumes train.csv is ordered like df_train
# (stock_id, then time_id) -- confirm, or join on the keys instead.
assert len(df_train) == len(df_train_target), 'feature and label row counts differ'

# Fixed seed so the hold-out split is the same after every kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    df_train, df_train_target['target'], test_size=0.2, random_state=42
)

In [None]:
# Wrap the training split in XGBoost's native container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)

In [None]:
params = {'tree_method': 'gpu_hist', 'max_depth': 20, 'learning_rate': 0.55}
model = xgb.train(params, dtrain)

# Score the hold-out split. The original predicted on `dtest`, which is only
# created in a later cell, so a fresh "Run All" stopped here with a NameError.
dvalid = xgb.DMatrix(X_test, label=y_test)
ypred_valid = model.predict(dvalid)

# Hold-out RMSPE, shown as the cell output.
rmspe(cupy.asarray(y_test), cupy.asarray(ypred_valid))

In [None]:
# Engineer the test features with the same pipeline and row ordering as training.
df_test = (
    eng_all_stock(list_order_book_file_test)
    .sort_values(by=['stock_id', 'time_id'])
    .reset_index(drop=True)
)

In [None]:
# Predict on the engineered test features and convert the result to a cupy array.
dtest = xgb.DMatrix(data=df_test)
ypred = cupy.asarray(model.predict(dtest))

In [None]:
# Replace the engineered value with the model prediction, then reshape the
# frame into the row_id/target submission layout.
df_test = submit_format(df_test.assign(first_10_vol=ypred))
df_test

In [None]:
# Write the submission file without the index column.
df_test.to_csv(path_or_buf='submission.csv', index=False)