In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob

# https://zhuanlan.zhihu.com/p/180347090
from joblib import Parallel, delayed

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the current working directory

import os
# Walk the current working directory and print the relative path of every file found.
for dirname, _, filenames in os.walk('.'):
    for file_path in (os.path.join(dirname, entry) for entry in filenames):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./__notebook__.ipynb


In [2]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The input must support ``.diff()`` after ``np.log`` is applied (e.g. a
    pandas Series); the first element of the result is NaN because it has no
    predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

def calculate_wap(df):
    """Return the mean of the level-1 and level-2 weighted average prices.

    For each book level, the bid price is weighted by the ask size and the ask
    price by the bid size, and the sum is divided by the total size at that
    level. The two per-level prices are then averaged.

    ``df`` must provide ``bid_price{1,2}``, ``ask_price{1,2}``,
    ``bid_size{1,2}`` and ``ask_size{1,2}``.
    """
    wap_level1 = (
        (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1'])
        / (df['bid_size1'] + df['ask_size1'])
    )
    wap_level2 = (
        (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2'])
        / (df['bid_size2'] + df['ask_size2'])
    )
    return (wap_level1 + wap_level2) / 2

In [3]:
def get_stock_stat(stock_id : int, dataType = 'train'):
    """Compute per-``time_id`` features for one stock from its order-book data.

    Parameters
    ----------
    stock_id : int
        Stock whose ``stock_id=<id>`` parquet partition is read.
    dataType : str, default 'train'
        Selects the ``book_<dataType>.parquet`` dataset.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with columns ``stock_id``, ``time_id``,
        ``log_return`` (realized volatility of the WAP log returns within the
        time bucket) and ``bas`` (mean relative bid-ask spread).
    """
    book = pd.read_parquet(f'../input/optiver-realized-volatility-prediction/book_{dataType}.parquet/stock_id={stock_id}/')

    # sort_values returns a new frame; the result must be kept for the sort to
    # take effect. Resetting the index gives a clean 0..n-1 index in time order.
    book = (
        book.sort_values(by=['time_id', 'seconds_in_bucket'])
        .reset_index(drop=True)
    )

    # Relative spread between the lowest ask and the highest bid over both levels.
    lowest_ask = book[['ask_price1', 'ask_price2']].min(axis = 1)
    highest_bid = book[['bid_price1', 'bid_price2']].max(axis = 1)
    book['bas'] = lowest_ask / highest_bid - 1

    book['wap'] = calculate_wap(book)

    # transform keeps the per-group result aligned with the frame's own index,
    # so the assignment does not rely on row order or on how groupby.apply
    # shapes its output index. The first row of each time_id has no previous
    # price, so its NaN return is replaced with 0.
    book['log_return'] = (
        book.groupby(by = ['time_id'])['wap']
        .transform(log_return)
        .fillna(0)
    )

    by_time = book.groupby(by = ['time_id'])
    stock_stat = pd.merge(
        by_time['log_return'].agg(realized_volatility).reset_index(),
        by_time['bas'].mean().reset_index(),
        on = ['time_id'],
        how = 'left'
    )

    stock_stat.insert(0, "stock_id", stock_id)  # insert as the first column

    return stock_stat

In [4]:
def get_dataSet(stock_ids : list, dataType = 'train'):
    """Build the per-stock feature frames in parallel and stack them.

    ``get_stock_stat`` is called once per entry of ``stock_ids`` using all
    available workers (``n_jobs=-1``); the resulting frames are concatenated
    with a fresh 0..n-1 index.
    """
    tasks = (delayed(get_stock_stat)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(tasks)
    return pd.concat(per_stock_frames, ignore_index = True)

In [5]:
#train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

#train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), dataType = 'train')
#train_dataSet = pd.merge(train, train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')

In [6]:
# Load the cached training table from CSV instead of rebuilding it in this session.
train_dataSet = pd.read_csv(
    "../input/optiverrealizedvolatilitydatasets/optiver-realized-volatility-datasets.csv"
)
train_dataSet.head()

Unnamed: 0,stock_id,time_id,target,log_return,bas
0,0,5,0.004136,0.004115,0.000852
1,0,11,0.001445,0.001268,0.000394
2,0,16,0.002168,0.002719,0.000725
3,0,31,0.002195,0.002625,0.000861
4,0,62,0.001747,0.001901,0.000397


In [7]:



# Features are every column except the identifiers and the label itself.
X_train = train_dataSet.drop(columns=['stock_id', 'time_id', 'target'])
y_train = train_dataSet['target']
X_train.head()

Unnamed: 0,log_return,bas
0,0.004115,0.000852
1,0.001268,0.000394
2,0.002719,0.000725
3,0.002625,0.000861
4,0.001901,0.000397


In [8]:
#clf = XGBRegressor(random_state = 0
                   #,n_estimators = 200
                   #,learning_rate = 0.1
                   #,subsample = 0.8
                   #,colsample_bytree = 0.8,n_jobs= - 1)

#clf.fit(X_train,y_train.to_numpy().ravel())

In [9]:
# Fixed hyperparameters for the regressor (provenance of the values is not recorded in this notebook).
params = {
    'lambda': 1.00952957200908,
    'alpha': 0.006342908262784082,
    'colsample_bytree': 1.0,
    'subsample': 0.4,
    'learning_rate': 0.02,
    'n_estimators': 617,
    'max_depth': 7,
    'random_state': 2020,
    'min_child_weight': 101,
}

# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled runtime — confirm before running on CPU.
clf = XGBRegressor(tree_method='gpu_hist', **params)
clf.fit(X_train, y_train.to_numpy().ravel())

XGBRegressor(alpha=0.006342908262784082, base_score=0.5, booster='gbtree',
             callbacks=None, colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=1.0, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, gamma=0, gpu_id=0,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', lambda=1.00952957200908,
             learning_rate=0.02, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=101,
             missing=nan, monotone_constraints='()', n_estimators=617, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=2020, ...)

In [10]:
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

# Compute book features only for the stocks that appear in the test listing.
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')

# Left join keeps every test row; the identifier columns are dropped after the join.
test_dataSet = (
    test.merge(test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
    .drop(columns = ['stock_id', 'time_id'])
)

# Rows without matching book features get 0 for every feature.
X_test = test_dataSet.drop(columns = ['row_id']).fillna(0)
y_pred = test_dataSet[['row_id']]

In [11]:
# Attach the model predictions as the `target` column next to `row_id`.
y_pred = y_pred.assign(target = clf.predict(X_test))
# Write the submission file without the DataFrame index.
y_pred.to_csv('submission.csv',index = False)

In [12]:
# Display the submission frame (row_id, target) as the cell output.
y_pred

Unnamed: 0,row_id,target
0,0-4,0.001522
1,0-32,0.000662
2,0-34,0.000662
