In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the training table, order it by date_id then time_id, and drop every
# row whose `wap` value is missing.
train = (
    pd.read_csv("./data/train.csv")
    .sort_values(['date_id', 'time_id'])
    .dropna(subset=['wap'])
)

# Stock identifiers 0..199 that the later cells iterate over.
stocks = list(range(200))

# Preview the first rows that belong to stock 0.
train.loc[train['stock_id'] == 0].head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
191,0,0,10,1299772.7,1,1.000026,15261106.63,,,0.999812,13996.5,1.000026,23519.16,0.999892,0.389814,1,0_10_0
382,0,0,20,1299772.7,1,0.999919,15261106.63,,,0.999812,4665.5,0.999919,12131.6,0.999842,4.220009,2,0_20_0
573,0,0,30,1299772.7,1,1.000133,15261106.63,,,1.000026,55998.0,1.000133,46203.3,1.000085,5.450249,3,0_30_0
764,0,0,40,1218204.43,1,1.000455,15342674.9,,,1.000241,14655.95,1.000455,26610.45,1.000317,3.169775,4,0_40_0


In [3]:
# Per-column count of missing values in `train`.
train.isna().sum()

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                   0
imbalance_buy_sell_flag          0
reference_price                  0
matched_size                     0
far_price                  2894122
near_price                 2856960
bid_price                        0
bid_size                         0
ask_price                        0
ask_size                         0
wap                              0
target                           0
time_id                          0
row_id                           0
dtype: int64

In [4]:
def create_lagged_data(data, var, window_size, forecast_dist):
    """Build a supervised (X, y) pair of lag windows from one column of ``data``.

    Rows are consumed in the order they appear in ``data``; the caller is
    responsible for sorting and for any grouping (windows are built across
    the whole frame, so they span whatever boundaries exist between rows).

    For sample ``k``:

    * ``X[k]`` holds the ``window_size`` consecutive values at positions
      ``k, k + 1, ..., k + window_size - 1`` of the column.
    * ``y[k]`` is the value at position ``k + window_size + forecast_dist``,
      i.e. ``forecast_dist + 1`` rows after the last value in the window.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column ``var``.
    var : str
        Name of the column to window.
    window_size : int
        Number of lagged values per sample (must be >= 1).
    forecast_dist : int
        Number of rows skipped between the end of the window and the target
        (must be >= 0).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n_samples, window_size)`` and ``y`` with shape
        ``(n_samples,)`` where
        ``n_samples = max(len(data) - window_size - forecast_dist, 0)``.
        When the series is too short both arrays are empty.

    Raises
    ------
    ValueError
        If ``window_size < 1`` or ``forecast_dist < 0``.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if forecast_dist < 0:
        raise ValueError("forecast_dist must be non-negative")

    values = data[var].to_numpy()
    n_samples = values.shape[0] - window_size - forecast_dist

    # Too few rows for even one (window, target) pair. Return consistently
    # shaped empty arrays instead of relying on negative slice bounds, which
    # previously produced mismatched X / y lengths and an exception.
    if n_samples <= 0:
        return np.empty((0, window_size), dtype=values.dtype), values[:0]

    # Column i is the series shifted by i rows, so row k reads
    # values[k], values[k + 1], ..., values[k + window_size - 1].
    X = np.column_stack([values[i:i + n_samples] for i in range(window_size)])
    y = values[window_size + forecast_dist:]
    return X, y

In [5]:
# --- Settings for the per-stock forecasters ---------------------------------
window_size = 5        # number of consecutive lagged WAP values per sample
FORECAST_DIST = 6      # forecast_dist argument passed to create_lagged_data
TRAIN_FRACTION = .9    # leading share of each stock's samples used for fitting
RANDOM_STATE = 0       # fixed seed so repeated runs fit identical models
n_features = 1         # not used in this cell; kept in case later cells read it

# Create the output directory up front so the cell also runs on a fresh checkout.
os.makedirs('./models/gboost', exist_ok=True)

# NOTE(review): the scaler is fitted on every row of `train`, including the rows
# that end up in each stock's held-out tail, and it is not saved by this cell.
# The pickled models expect inputs scaled with this same scaler.
scaler = MinMaxScaler()
train['wap_scaled'] = scaler.fit_transform(train['wap'].to_numpy().reshape(-1, 1)).squeeze()


for stock in stocks:
    print("creating forecaster for stock: ",stock)
    X, y = create_lagged_data(train[train['stock_id'] == stock],'wap_scaled',window_size,FORECAST_DIST)

    # Ordered split: the first TRAIN_FRACTION of the samples fit the model,
    # the remaining tail is used for scoring.
    size = int(X.shape[0]*TRAIN_FRACTION)
    if size == 0:
        # Fewer than two samples: nothing to fit on, so report and move on
        # instead of aborting the whole loop inside `fit`.
        print("skipping stock {}: not enough samples".format(stock))
        continue

    X_train = X[:size]
    y_train = y[:size]

    X_test = X[size:]
    y_test = y[size:]

    forecaster = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X_train,y_train)

    # `score` is the regressor's R^2 on the held-out tail.
    print("score: ",forecaster.score(X_test,y_test))

    # One pickle per stock, named by stock id.
    with open('./models/gboost/model_{}'.format(stock), 'wb') as handle:
        pickle.dump(forecaster, handle, protocol=pickle.HIGHEST_PROTOCOL)

creating forecaster for stock:  0
score:  0.6188867375222868
creating forecaster for stock:  1
score:  0.5431612101592789
creating forecaster for stock:  2
score:  0.527267763539716
creating forecaster for stock:  3
score:  0.6124656426484918
creating forecaster for stock:  4
score:  0.6140481802051205
creating forecaster for stock:  5
score:  0.5618386001825766
creating forecaster for stock:  6
score:  0.5738052058712491
creating forecaster for stock:  7
score:  0.5877927602362565
creating forecaster for stock:  8
score:  0.5342321919951488
creating forecaster for stock:  9
score:  0.5858242913989474
creating forecaster for stock:  10
score:  0.6557305204884657
creating forecaster for stock:  11
score:  0.6267408839327612
creating forecaster for stock:  12
score:  0.5868122628297128
creating forecaster for stock:  13
score:  0.669791607929535
creating forecaster for stock:  14
score:  0.5884246869526231
creating forecaster for stock:  15
score:  0.5923462466097864
creating forecaster 