<a href="https://colab.research.google.com/github/sajacaros/optiver/blob/main/sajacaros/04_model_conv1d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gc
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader
from tqdm.auto import tqdm

warnings.filterwarnings(action='ignore')
# warnings.filterwarnings(action='default')

In [2]:
running_colab = 'google.colab' in str(get_ipython()) if hasattr(__builtins__,'__IPYTHON__') else False
if running_colab:
    from google.colab import drive
    drive.mount('/content/drive')
if running_colab:
    data_path = '/content/drive/MyDrive/Colab Notebooks/ai6th/data/optiver/'
else:
    data_path = '../../data/'

In [3]:
train_df = pd.read_csv(data_path+'train.csv')
test_df = pd.read_csv(data_path+'test.csv')
submission_df = pd.read_csv(data_path+'sample_submission.csv')
target_df = pd.read_csv(data_path+'revealed_targets.csv')

In [4]:
# null 값 처리
train_df['far_price'] = train_df['far_price'].fillna(0)
train_df['near_price'] = train_df['near_price'].fillna(1)
train_df.isnull().sum()

stock_id                     0
date_id                      0
seconds_in_bucket            0
imbalance_size             220
imbalance_buy_sell_flag      0
reference_price            220
matched_size               220
far_price                    0
near_price                   0
bid_price                  220
bid_size                     0
ask_price                  220
ask_size                     0
wap                        220
target                      88
time_id                      0
row_id                       0
dtype: int64

In [5]:
cols_group_by = ['date_id', 'seconds_in_bucket']
cols_fill_nan = [
    'imbalance_size', 'reference_price', 'matched_size', 'wap',
    'bid_price', 'bid_size', 'ask_price', 'ask_size',
    'stock_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag']
train_grouped_median = train_df.groupby(cols_group_by)[cols_fill_nan].transform('median')
train_df[cols_fill_nan] = train_df[cols_fill_nan].fillna(train_grouped_median)
train_df.loc[train_df.isnull().any(axis=1)] # target 88

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
369508,131,35,0,1381981.10,0,0.999950,9723622.86,0.0,1.0,0.999688,0.0,1.000242,0.0,1.000000,,1925,35_0_131
369700,131,35,10,1371886.54,0,1.000252,9961197.49,0.0,1.0,0.999969,0.0,1.000485,0.0,1.000223,,1926,35_10_131
369892,131,35,20,1331838.54,0,1.000122,9999133.11,0.0,1.0,0.999883,0.0,1.000328,0.0,1.000149,,1927,35_20_131
370084,131,35,30,1350584.58,0,0.999910,10133596.07,0.0,1.0,0.999757,0.0,1.000186,0.0,0.999971,,1928,35_30_131
370276,131,35,40,1327284.70,0,0.999926,10133596.07,0.0,1.0,0.999758,0.0,1.000203,0.0,0.999984,,1929,35_40_131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225338,158,388,510,339056.84,0,1.001074,19640140.62,0.0,1.0,1.000867,0.0,1.001250,0.0,1.001027,,21391,388_510_158
4225538,158,388,520,437746.70,0,1.001183,19767348.90,0.0,1.0,1.001007,0.0,1.001290,0.0,1.001163,,21392,388_520_158
4225738,158,388,530,483535.46,0,1.000994,19800447.29,0.0,1.0,1.000905,0.0,1.001187,0.0,1.001032,,21393,388_530_158
4225938,158,388,540,453894.62,0,1.000989,20020720.89,0.0,1.0,1.000824,0.0,1.001024,0.0,1.000911,,21394,388_540_158


In [6]:
# (stock_id=131, date_id=35), (stock_id=158, date_id=388) target이 없으므로 제거
train_df = train_df.drop(index=train_df.loc[((train_df['stock_id']==131) & (train_df['date_id']==35))  | ((train_df['stock_id']==158) & (train_df['date_id']==388))].index)
train_df[train_df.isnull().any(axis=1)]

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
3555081,101,328,0,3683316.08,0,0.999681,7405651.01,0.0,1.0,0.999647,0.0,1.000356,0.0,1.0,,18040,328_0_101
4764999,19,438,0,2201071.62,-1,1.000146,6841177.82,0.0,1.0,0.999749,0.0,1.000277,0.0,1.0,,24090,438_0_19


In [7]:
# stock_id=101, date_id=328, seconds_in_bucket=0의 target값을 stock_id=101, date_id=328, seconds_in_bucket=1의 target값으로 세팅
train_df.loc[train_df['row_id']=='328_0_101', 'target'] = train_df.loc[(train_df['stock_id']==101) & (train_df['time_id']==18041), 'target'].values
# stock_id=19, date_id=438, seconds_in_bucket=0의 target값을 stock_id=19, date_id=438, seconds_in_bucket=1의 target값으로 세팅
train_df.loc[train_df['row_id']=='438_0_19', 'target'] = train_df.loc[(train_df['stock_id']==19) & (train_df['time_id']==24091), 'target'].values
train_df.isnull().sum()

stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
target                     0
time_id                    0
row_id                     0
dtype: int64

In [8]:
selected_features = [
    'stock_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price',
    'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size','wap'
]
categorical_features = ['stock_id']

cat_idxs = [idx for idx, cat in enumerate(selected_features) if cat in categorical_features]
cat_dims = [train_df.loc[:, [cat]].nunique()[0] for cat in categorical_features]
cat_idxs, cat_dims

([0], [200])

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_df[selected_features] = scaler.fit_transform(train_df[selected_features])

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(train_df.loc[:, selected_features].values, train_df.loc[:,['target']].values, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((4190296, 13), (4190296, 1), (1047574, 13), (1047574, 1))

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [13]:
# TabNet
from pytorch_tabnet.tab_model import TabNetRegressor

reg = TabNetRegressor(
    cat_dims=cat_dims,
    cat_emb_dim=10,
    cat_idxs=cat_idxs,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    verbose=10,
)
reg

In [14]:
# 모델 학습
max_epochs = 100
model = reg.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['mae'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024, # default
    virtual_batch_size=128, # default
)

epoch 0  | loss: 89.17242| train_mae: 6.38784 | valid_mae: 6.38603 |  0:03:47s


KeyboardInterrupt: 

In [None]:
train_mae = reg.history['train_mae']
val_mae = reg.history['valid_mae']

x_len = np.arange(len(val_mae))
plt.figure(figsize=(15,5))
plt.plot(x_len, train_mae, marker='.', c='blue', label='Train-set MAE')
plt.plot(x_len, val_mae, marker='.', c='red', label='Validation-set MAE')
plt.legend(loc='upper right')
plt.grid()
plt.xlabel('epoch')
plt.ylabel('MAE')
plt.show()