In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import MinMaxScaler 
from sklearn.ensemble import VotingRegressor
import torch
from torch import nn

In [2]:
# Read the raw training table, order it by (date_id, time_id) and discard rows
# whose `wap` is missing. Later cells slice this frame per stock and rely on
# the row order when building lagged windows.
train = (
    pd.read_csv("./data/train.csv")
    .sort_values(['date_id','time_id'])
    .loc[lambda frame: frame['wap'].notna()]
)

# Stock ids processed by this notebook: 74 through 199 inclusive.
stocks = list(range(74,200))

# Preview: first rows belonging to stock 0.
train.loc[train['stock_id'] == 0].head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
191,0,0,10,1299772.7,1,1.000026,15261106.63,,,0.999812,13996.5,1.000026,23519.16,0.999892,0.389814,1,0_10_0
382,0,0,20,1299772.7,1,0.999919,15261106.63,,,0.999812,4665.5,0.999919,12131.6,0.999842,4.220009,2,0_20_0
573,0,0,30,1299772.7,1,1.000133,15261106.63,,,1.000026,55998.0,1.000133,46203.3,1.000085,5.450249,3,0_30_0
764,0,0,40,1218204.43,1,1.000455,15342674.9,,,1.000241,14655.95,1.000455,26610.45,1.000317,3.169775,4,0_40_0


In [3]:
def create_lagged_data(data,var,window_size,forecast_dist):
    """Build a supervised ``(X, y)`` pair of lagged windows from one column.

    Row ``k`` of ``X`` holds the ``window_size`` consecutive values of
    ``data[var]`` at positions ``k .. k + window_size - 1``. ``y[k]`` is the
    value of the same column at position ``k + window_size + forecast_dist``,
    i.e. ``forecast_dist + 1`` rows after the last value of the window.
    NOTE(review): with ``forecast_dist=6`` the label is therefore 7 rows past
    the window's last row -- confirm this is the intended horizon.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; rows are used in their existing order (assumed to be
        chronological -- the caller is responsible for sorting).
    var : hashable
        Column used for both the windows and the labels.
    window_size : int
        Number of lagged values per sample; must be at least 1.
    forecast_dist : int
        Extra gap, in rows, between the end of a window and its label;
        must be non-negative.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size)
    y : numpy.ndarray, shape (n_samples,)
        ``n_samples = max(0, len(data) - window_size - forecast_dist)``. When
        the series is too short for a single complete sample both arrays are
        empty instead of being silently misaligned.

    Raises
    ------
    ValueError
        If ``window_size < 1`` or ``forecast_dist < 0``.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if forecast_dist < 0:
        raise ValueError("forecast_dist must be non-negative")

    values = data[var].to_numpy()
    offset = window_size + forecast_dist
    # Clamp at zero: a negative count would otherwise turn into a negative
    # slice bound and truncate from the wrong end.
    n_samples = max(values.shape[0] - offset, 0)

    # window_idx[k, i] == k + i, so fancy indexing yields every window at once.
    window_idx = np.arange(n_samples)[:, None] + np.arange(window_size)[None, :]
    X = values[window_idx]
    y = values[offset:offset + n_samples]
    return X, y

In [4]:
def create_lagged_with_target(data,var,window_size,forecast_dist,target):
    """Build lagged windows of one column paired with labels from another.

    Row ``k`` of ``X`` holds the ``window_size`` consecutive values of
    ``data[var]`` at positions ``k .. k + window_size - 1``. ``y[k]`` is the
    value of ``data[target]`` at position ``k + window_size + forecast_dist``,
    i.e. ``forecast_dist + 1`` rows after the last value of the window.
    NOTE(review): confirm that this row offset is the intended alignment
    between the feature window and the ``target`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; rows are used in their existing order (assumed to be
        chronological -- the caller is responsible for sorting).
    var : hashable
        Column the feature windows are cut from.
    window_size : int
        Number of lagged values per sample; must be at least 1.
    forecast_dist : int
        Extra gap, in rows, between the end of a window and its label;
        must be non-negative.
    target : hashable
        Column the labels are read from.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size)
    y : numpy.ndarray, shape (n_samples,)
        ``n_samples = max(0, len(data) - window_size - forecast_dist)``. When
        the frame is too short for a single complete sample both arrays are
        empty instead of being silently misaligned.

    Raises
    ------
    ValueError
        If ``window_size < 1`` or ``forecast_dist < 0``.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if forecast_dist < 0:
        raise ValueError("forecast_dist must be non-negative")

    feature_values = data[var].to_numpy()
    target_values = data[target].to_numpy()
    offset = window_size + forecast_dist
    # Clamp at zero: a negative count would otherwise turn into a negative
    # slice bound and truncate from the wrong end.
    n_samples = max(feature_values.shape[0] - offset, 0)

    # window_idx[k, i] == k + i, so fancy indexing yields every window at once.
    window_idx = np.arange(n_samples)[:, None] + np.arange(window_size)[None, :]
    X = feature_values[window_idx]
    y = target_values[offset:offset + n_samples]
    return X, y

In [5]:
class TimeModel(nn.Module):
    """Single-layer LSTM followed by a linear read-out, with float64 weights.

    Parameters
    ----------
    window_size : int
        Size of the last input dimension (passed to ``nn.LSTM`` as
        ``input_size``).
    hidden_size : int, default 50
        Width of the LSTM hidden state and of the linear layer's input. The
        default reproduces the original fixed architecture, so state dicts
        saved from it still load.
    """
    def __init__(self,window_size,hidden_size=50):
        super().__init__()
        self.lstm = nn.LSTM(input_size=window_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        # Convert every parameter to float64.
        self.double()
    def forward(self, x):
        """Run the LSTM over ``x`` and map each step's hidden state to one value.

        ``x`` must have ``window_size`` as its last dimension; because the LSTM
        is built with ``batch_first=True`` a batched input is laid out as
        (batch, sequence, window_size). Returns a float64 tensor with the same
        leading dimensions and a last dimension of 1.
        """
        # Match the parameter dtype so that e.g. float32 inputs do not trigger
        # a dtype-mismatch error inside the LSTM.
        x = x.to(self.linear.weight.dtype)
        x, _ = self.lstm(x)
        x = self.linear(x)
        return x.type(torch.float64)

In [6]:
window_size = 5

# One MinMaxScaler is fitted on the `wap` values of all rows in `train`
# (every stock together); the scaled values are stored as a new column.
scaler = MinMaxScaler()
train['wap_scaled'] = scaler.fit_transform(
    train['wap'].to_numpy().reshape(-1, 1)
).squeeze()


for stock in stocks:
    print("creating forecaster for stock: ",stock)

    # Lagged windows of the scaled wap, built from this stock's rows only.
    stock_rows = train[train['stock_id'] == stock]
    X, y = create_lagged_data(stock_rows,'wap_scaled',window_size,6)

    # Ordered split: the first 90% of the samples train, the remainder tests.
    size = int(X.shape[0]*.9)
    X_train, X_test = X[:size], X[size:]
    y_train, y_test = y[:size], y[size:]

    n_features = 1

    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # that this project produced itself.
    with open('./models/random_foresest/model_{}'.format(stock), 'rb') as handle:
        rf = pickle.load(handle)
    with open('./models/gboost/model_{}'.format(stock), 'rb') as handle:
        gb = pickle.load(handle)

    # The LSTM member is currently not part of the ensemble:
    #lstm = TimeModel(window_size=window_size)
    #lstm.load_state_dict(torch.load('./models/lstm/model_{}'.format(stock)))

    # VotingRegressor.fit trains *clones* of the estimators it is given, so
    # the loaded models contribute their hyper-parameters, not their fitted
    # state; both members are refitted on X_train here.
    forecaster = VotingRegressor(estimators=[('gb', gb), ('rf', rf)])
    forecaster.fit(X_train,y_train)

    # Score on the held-out tail (R^2 for scikit-learn regressors).
    print("score: ",forecaster.score(X_test,y_test))

    with open('./models/ensembler/model_{}'.format(stock), 'wb') as handle:
        pickle.dump(forecaster, handle, protocol=pickle.HIGHEST_PROTOCOL)

creating forecaster for stock:  74
score:  0.5751791708023085
creating forecaster for stock:  75
score:  0.5965521449928693
creating forecaster for stock:  76
score:  0.559678833470707
creating forecaster for stock:  77
score:  0.6022061667189348
creating forecaster for stock:  78
score:  0.5843106574977222
creating forecaster for stock:  79
score:  0.6338415166991956
creating forecaster for stock:  80
score:  0.536183629408861
creating forecaster for stock:  81
score:  0.4322691692125802
creating forecaster for stock:  82
score:  0.5526315544021274
creating forecaster for stock:  83
score:  0.5212723337699467
creating forecaster for stock:  84
score:  0.5753109155018441
creating forecaster for stock:  85
score:  0.6207417115736527
creating forecaster for stock:  86
score:  0.6495523347500924
creating forecaster for stock:  87
score:  0.5996715930300054
creating forecaster for stock:  88
score:  0.6386421822143017
creating forecaster for stock:  89
score:  0.5977509806511128
creating f

In [7]:
window_size = 5

# Load the per-stock ensemble written by the training cell.
# NOTE: pickle.load can execute arbitrary code -- only load model files that
# this project produced itself.
forecasters = {}
for stock in stocks:
    with open('./models/ensembler/model_{}'.format(stock), 'rb') as handle:
        forecasters[stock] = pickle.load(handle)

forecasts = {}

# Per-stock containers keyed by stock id. They have to be created here: the
# training cell leaves X_train / y_train / X_test / y_test bound to NumPy
# arrays, and `X_train[stock] = ...` on those would write into (or fail to
# broadcast into) an array row instead of storing one entry per stock.
X_train, y_train = {}, {}
X_test, y_test = {}, {}

for stock in stocks:
    print("creating forecaster for stock: ",stock)
    # Features: lagged scaled wap windows; labels: the `target` column.
    X, y = create_lagged_with_target(train[train['stock_id'] == stock],'wap_scaled',window_size,6,'target')

    # Same ordered 90/10 split as used when the forecasters were fitted.
    size = int(X.shape[0]*.9)
    # The ensemble's wap forecast is what gets stored as the feature for this
    # stock, for both the train and the test portion.
    X_train[stock] = forecasters[stock].predict(X[:size])
    y_train[stock] = y[:size]

    X_test[stock] = forecasters[stock].predict(X[size:])
    y_test[stock] = y[size:]

# Wrap each array in a Series so that stocks with different sample counts are
# aligned on position and padded with NaN; a dict of raw arrays of unequal
# length would make the DataFrame constructor raise.
pd.DataFrame({stock_id: pd.Series(preds) for stock_id, preds in X_train.items()}).head()

: 