# Setting up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ta
# from fastai import *
# from fastai.tabular import *
from sklearn.ensemble import RandomForestRegressor
from rolling import RollingWindowSplit
from sklearn.metrics import r2_score as r2d2
from joblib import dump, load
from datetime import datetime, timedelta

%matplotlib inline
sns.set(style = "whitegrid")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

In [2]:
# %%time
# path = 'D://Coding//XTX Forecasting Challenge//data-training.csv'
# df = pd.read_csv(path)

In [2]:
path = 'D://Coding//XTX Forecasting Challenge//data-training.file'
df = pd.read_feather(path, use_threads=8)
df = df.astype('float32')
df.fillna(0, inplace=True)

In [3]:
bidSizeList = ['bidSize' + str(i) for i in range(0,15)]
askSizeList = ['askSize' + str(i) for i in range(0,15)]
bidRateList = ['bidRate' + str(i) for i in range(0,15)]
askRateList = ['askRate' + str(i) for i in range(0,15)]

# Exploratory Data Analysis

In [5]:
# # Figuring out what [y] is
# # y(t) is midRate(t+87) - midRate(t), clipped to (-5.5)
# df['expectedY'] = df.midRate.diff(87).shift(-87).clip(-5,5)

# Feature engineering

### Basics

#### Cross-sectional features

In [56]:
# different from submission
def compute_cross_sectional(df):
#     df = pd.DataFrame([base_row])
#     df.columns = [*askRateList, *askSizeList, *bidRateList, *bidSizeList]

    # Cross-sectional features
    df['spread'] = df.askRate0 - df.bidRate0
    df['midRate'] = (df.askRate0 + df.bidRate0) / 2
    df['bidAskVol'] = df.askSize0 + df.bidSize0
    df['totalBidVol1'] = df.bidSize0 + df.bidSize1
    df['totalAskVol1'] = df.askSize0 + df.askSize1
    for i in range(2,15):
        df['totalBidVol' + str(i)] = df['totalBidVol' + str(i-1)] + df['bidSize' + str(i)]
        df['totalAskVol' + str(i)] = df['totalAskVol' + str(i-1)] + df['askSize' + str(i)]
    for i in range(1,15):
        df['bidAskRatio' + str(i)] = df['totalBidVol' + str(i)] / df['totalAskVol' + str(i)]
    df['totalAvailVol'] = df.totalBidVol14 + df.totalAskVol14
    df['vwaBid'] = np.einsum('ij,ji->i', df[bidRateList], df[bidSizeList].T) / df[bidSizeList].sum(axis=1)
    df['vwaAsk'] = np.einsum('ij,ji->i', df[askRateList], df[askSizeList].T) / df[askSizeList].sum(axis=1)
    df['vwaBidDMid'] = df.midRate - df.vwaBid
    df['vwaAskDMid'] = df.vwaAsk - df.midRate
    df['diff_vwaBidAskDMid'] = df.vwaAskDMid - df.vwaBidDMid
    return df
df = compute_cross_sectional(df)

# TA

#### Time series features

In [59]:
def add_time_features(df):
    b1, a1 = (df.bidRate0 < df.bidRate0.shift(1)), (df.askRate0 < df.askRate0.shift(1))
    b2, a2 = (df.bidRate0 == df.bidRate0.shift(1)), (df.askRate0 == df.askRate0.shift(1))
    valsB, valsA = [0, (df.bidSize0 - df.bidSize0.shift(1))], [0, (df.askSize0 - df.askSize0.shift(1))]
    defaultB, defaultA = df.bidSize0, df.askSize0
    df.fillna(0, inplace=True)
    df['deltaVBid'] = np.select([b1,b2], valsB, default=defaultB)
    df['deltaVAsk'] = np.select([a1,a2], valsA, default=defaultA)
    df['VOI'] = df.deltaVBid - df.deltaVAsk
    df['OIR'] = (df.bidSize0 - df.askSize0)/(df.bidSize0 + df.askSize0)
    return df
df = add_time_features(df)

In [8]:
# Requires a window of up to a 1000 past items

#### Manual time features — can consider adding more to the lags list

In [61]:
def add_manual_time_features(df):
    lags = [*np.arange(1,10), *np.arange(10,100,10), *np.arange(100,1000,100)]
    def addTimeFeatures(i):
        df['daskRate' + str(i)] = df.askRate0.diff(i)
        df['dbidRate' + str(i)] = df.bidRate0.diff(i)
    for i in lags:
        addTimeFeatures(i)
    df.fillna(0, inplace=True)
    return df
df = add_manual_time_features(df)

In [11]:
df.to_feather('intermediate.file')

In [4]:
df = pd.read_feather('intermediate.file')

#### Tick chart version with ffill

In [12]:
# midrate version
df['time'] = pd.date_range(start='1/1/1970', periods=2999999, freq='T')
df.set_index('time', inplace=True)
df_mid = df.midRate.resample('15Min').ohlc()
df_mid['vol'] = df.bidAskVol.resample('15Min').mean()

In [13]:
# takes 5 min
df_mid_ta = ta.add_all_ta_features(df_mid, "open", "high", "low", "close", "vol", fillna=True)

  dip[i] = 100 * (dip_mio[i]/trs[i])
  dip[i] = 100 * (dip_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  dx = 100 * np.abs((dip - din) / (dip + din))


In [63]:
# dump(df_mid_ta, 'df_mid_ta.joblib')
df_mid_ta = load('df_mid_ta.joblib')

In [48]:
X_pca[:1]

array([[ 8.8970764e+01,  1.8328373e+01, -1.9263544e+01, -1.5587357e+01,
        -4.7632145e+01,  5.3526089e+01,  1.2909327e+01, -1.8256685e+01,
         3.6211021e+00, -1.5878495e+01,  2.5628702e+02, -2.4331863e+01,
         4.8973980e+00,  1.8533689e-01,  2.0022793e+01,  3.3933430e+01,
         9.8881996e+01, -3.3170212e+01,  1.0228539e+02,  4.5066171e+00,
        -2.8165766e+01, -8.9614071e-02, -1.1391719e+01, -1.1329915e+01,
         1.7320482e+01, -1.0655308e+01, -5.4672961e+00,  2.3649971e+01,
        -2.2624979e+01, -2.9913643e+01, -5.1912155e+00,  3.9424671e+01,
        -1.1298305e+01, -4.6017570e+00, -2.1380127e+01,  1.8141096e+01,
        -5.3065324e+00,  1.2948828e+01, -1.2238501e+01,  6.9587070e-01,
        -4.0120945e+00,  5.1201946e-01,  1.0402673e+01, -1.2824577e+01,
         9.7495251e+00, -6.7554579e+00,  3.2993751e+00,  1.9961107e+01,
         3.2359062e+01, -7.7225599e+00]], dtype=float32)

In [15]:
# takes 30s
new_df = df.join(df_mid_ta).ffill()
new_df = new_df.astype('float32')

In [4]:
# dump(new_df, 'new_df.joblib')
new_df = load('new_df.joblib')

# Feature Selection

In [None]:
# figure out a way to validate the blend

In [23]:
new_df.drop([*askRateList, *askSizeList, *bidRateList, *bidSizeList], axis=1, inplace=True)

In [25]:
# takes 40s
X = new_df.drop('y', axis=1).values
y = new_df.y.values

# standardise
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [26]:
# pca takes 1 min
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
X_pca = pca.fit_transform(X_scaled)

In [27]:
dump(scaler, 'scaler_drop.joblib')
dump(pca, 'pca_drop.joblib')
# X_pca = load('X_pca.joblib')
# y = load('y.joblib')

['pca_drop.joblib']

In [None]:
# print(pca.explained_variance_ratio_)

# Cross-validation

# Lasso

In [28]:
rlcv = RollingWindowSplit(n_splits=5, compatible=True)

In [29]:
# takes at least 1 min on pca variables
from sklearn.linear_model import LassoLarsCV
lasso = LassoLarsCV(cv=rlcv, n_jobs=-1).fit(X_pca, y)

In [None]:
# actually the lasso above has seen the entire dataset....

In [30]:
dump(lasso, 'lassocv_drop.joblib')
# lasso = load('lassocv.joblib')

['lassocv_drop.joblib']

In [29]:
lasso

LassoLarsCV(copy_X=True,
            cv=RollingWindowSplit(compatible=True, max_train_size=None, n_splits=5),
            eps=2.220446049250313e-16, fit_intercept=True, max_iter=500,
            max_n_alphas=1000, n_jobs=-1, normalize=True, positive=False,
            precompute='auto', verbose=False)

In [31]:
def rlcvscore(model):
    cvtrain, cvvalid, cvvalidsig = [], [], []
    for inc, (train_index, valid_index) in enumerate(rlcv.split(X_pca), 1):
        x_train, x_valid = X_pca[train_index], X_pca[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        cvtrain.append(model.score(x_train, y_train))
        cvvalid.append(model.score(x_valid, y_valid))
        sigmoid = (1/(1+np.exp(-0.22*model.predict(x_valid)))-0.5)*20  
        cvvalidsig.append(r2d2(y_valid, sigmoid))
    print(f'{np.array(cvtrain).round(4)}')
    print(f'{np.array(cvvalid).round(4)}')
    print(f'{np.array(cvvalidsig).round(4)}')
    print(f'{np.mean(cvtrain):.4f}, {np.mean(cvvalid):.4f}, {np.mean(cvvalidsig):.4f}')

In [32]:
rlcvscore(lasso) # dropped

[0.0479 0.0557 0.0582 0.0557 0.0553]
[0.0557 0.0582 0.0557 0.0553 0.0569]
[0.0581 0.0609 0.058  0.0574 0.0585]
0.0546, 0.0564, 0.0586


In [26]:
rlcvscore(lasso) # undropped

[0.0304 0.0465 0.0495 0.05   0.0484]
[0.0465 0.0495 0.05   0.0484 0.047 ]
[0.049  0.0522 0.0526 0.0507 0.0485]
0.0449, 0.0483, 0.0506


In [None]:
# dump(lasso, f'lasso_rlcv_114ft_0.0175_0.0187.joblib')
# lasso = load('lasso_rlcv_114ft_0.0175_0.0187.joblib') 

# RF

In [None]:
rf_model = RandomForestRegressor(n_estimators=10, max_depth=6, min_samples_split=1000, min_samples_leaf=1000,
                                 max_features='auto', n_jobs=-1, random_state=41)

In [None]:
rf_model.fit(x_train, y_train);

In [None]:
rlcvscore(rf_model) # realistic cv

In [None]:
a = df.drop('y', axis=1).columns[indices]

In [None]:
# create X with important variables only
X = df.drop('y', axis=1)[a[:20]].values
y = df.y.values

In [None]:
rf_model = RandomForestRegressor(n_estimators=10, max_depth=2, min_samples_split=2, min_samples_leaf=5000,
                                 max_features='auto', n_jobs=-1, random_state=41)
rf_model.fit(x_train, y_train);

In [None]:
rlcvscore(rf_model) #n_est 10, depth 2, samples_split 2, samples_leaf 1000, 30 most importantvariables

In [None]:
importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
# Plot the feature importances of the forest
plt.figure(figsize=(15,8))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
# save model
dump(rf, 'model.joblib')

In [None]:
# load model
rf2 = load('model.joblib')

# Fast.ai

In [None]:
dep_var = 'y'
procs = [FillMissing, Normalize]

In [None]:
path = f'D:\Coding\XTX Forecasting Challenge'
data = TabularDataBunch.from_df(path = path, df = df[:int(5e5)], dep_var = 'y', procs=procs,
                                 valid_idx = list(range(int(4e5),int(5e5))))

In [None]:
data.show_batch(rows=10)

In [None]:
# data = (TabularList.from_df(df[:int(5e5)], cont_names=df.columns, procs=procs)
#                            .split_by_idx(list(range(int(0.8*5e5),int(5e5))))
#                            .label_from_df(cols=dep_var, label_cls=FloatList)
#                            .databunch())

In [None]:
learn = tabular_learner(data, layers=[500,200], metrics=r2_score, ps=[0.001,0.01], emb_drop=0.04)

In [None]:
learn.model

In [None]:
learn.lr_find(end_lr=1e1)

In [None]:
learn.recorder.plot()

In [None]:
# model above has already diverged, we will restart.

In [None]:
learn.fit_one_cycle(3, 1e-4, wd=0.1)

In [None]:
learn.recorder.plot_lr(show_moms=True)

In [None]:
learn.save('new_fastai')

In [None]:
learn.recorder.plot_losses()

In [None]:
learn.predict(df.iloc[int(8.1e5)])

In [None]:
df.y.iloc[int(8.1e5)]

In [None]:
preds = learn.get_preds()

# Submission testing

In [None]:
def get_next_data_as_numpy_array(iteration):
    return df.iloc[iteration][:60].values

In [None]:
def append_to_df(massive_df, row):
    try: row.index = [massive_df.index[-1] + timedelta(minutes=1)]
    except: row.index = [datetime(1970,1,1)]
    return massive_df.append(row)

In [None]:
def add_resample_features(massive_df, resampled_df):
    leftovers = len(massive_df) % 15
    a = pd.DataFrame()
    def pad_history():
        full_resampled = resampled_df.append(df_mid)
        a = pd.DataFrame([full_resampled.iloc[0] for j in range(30+1-len(full_resampled))])
        a = a.append(full_resampled)
        a.index = pd.date_range(start=df_mid.index[-1], periods=len(a), freq='-15Min').sort_values()
        df_mid_ta = ta.add_all_ta_features(a, "open", "high", "low", "close", "vol", fillna=True)
        return df_mid_ta
    if leftovers == 0:
        df_mid = massive_df.tail(15).midRate.resample('15Min').ohlc()
        df_mid['vol'] = massive_df.tail(15).bidAskVol.resample('15Min').mean()
        df_mid_ta = pad_history()
        resampled_df = resampled_df.append(df_mid_ta)
    else:
        df_mid = massive_df.tail(leftovers).midRate.resample('15Min').ohlc()
        df_mid['vol'] = massive_df.tail(leftovers).bidAskVol.resample('15Min').mean()
        df_mid_ta = pad_history()
    massive_df.update(df_mid_ta)
    massive_df = massive_df.ffill().astype('float32')
    return massive_df, resampled_df

In [None]:
massive_df, resampled_df = pd.DataFrame(), pd.DataFrame()

In [None]:
def compute_cross_sectional(base_row):
    df = pd.DataFrame([base_row])
    df.columns = [*askRateList, *askSizeList, *bidRateList, *bidSizeList]

    # Cross-sectional features
    df['spread'] = df.askRate0 - df.bidRate0
    df['midRate'] = (df.askRate0 + df.bidRate0) / 2
    df['bidAskVol'] = df.askSize0 + df.bidSize0
    df['totalBidVol1'] = df.bidSize0 + df.bidSize1
    df['totalAskVol1'] = df.askSize0 + df.askSize1
    for i in range(2,15):
        df['totalBidVol' + str(i)] = df['totalBidVol' + str(i-1)] + df['bidSize' + str(i)]
        df['totalAskVol' + str(i)] = df['totalAskVol' + str(i-1)] + df['askSize' + str(i)]
    for i in range(1,15):
        df['bidAskRatio' + str(i)] = df['totalBidVol' + str(i)] / df['totalAskVol' + str(i)]
    df['totalAvailVol'] = df.totalBidVol14 + df.totalAskVol14
    df['vwaBid'] = np.einsum('ij,ji->i', df[bidRateList], df[bidSizeList].T) / df[bidSizeList].sum(axis=1)
    df['vwaAsk'] = np.einsum('ij,ji->i', df[askRateList], df[askSizeList].T) / df[askSizeList].sum(axis=1)
    df['vwaBidDMid'] = df.midRate - df.vwaBid
    df['vwaAskDMid'] = df.vwaAsk - df.midRate
    df['diff_vwaBidAskDMid'] = df.vwaAskDMid - df.vwaBidDMid
    return df

In [None]:
for iteration in range(30):
    base_row = get_next_data_as_numpy_array(len(massive_df))
    row = compute_cross_sectional(base_row)
    massive_df = append_to_df(massive_df, row)
    massive_df, resampled_df = add_resample_features(massive_df, resampled_df)
    massive_df = add_time_features(massive_df)
    massive_df = add_manual_time_features(massive_df)

In [None]:
def add_time_features(df):
    b1, a1 = (df.bidRate0 < df.bidRate0.shift(1)), (df.askRate0 < df.askRate0.shift(1))
    b2, a2 = (df.bidRate0 == df.bidRate0.shift(1)), (df.askRate0 == df.askRate0.shift(1))
#     print(b1,a1,b2,a2)
    valsB, valsA = [0, (df.bidSize0 - df.bidSize0.shift(1))], [0, (df.askSize0 - df.askSize0.shift(1))]
    defaultB, defaultA = df.bidSize0, df.askSize0
    df.fillna(0, inplace=True)    
    df['deltaVBid'] = np.select([b1,b2], valsB, default=defaultB)
    df['deltaVAsk'] = np.select([a1,a2], valsA, default=defaultA)
    df['VOI'] = df.deltaVBid - df.deltaVAsk
    df['OIR'] = (df.bidSize0 - df.askSize0)/(df.bidSize0 + df.askSize0)
    return df

In [None]:
def add_manual_time_features(df):
    lags = [*np.arange(1,10), *np.arange(10,100,10), *np.arange(100,1000,100)]
    def addTimeFeatures(i):
        df['daskRate' + str(i)] = df.askRate0.diff(i)
        df['dbidRate' + str(i)] = df.bidRate0.diff(i)
    for i in lags:
        addTimeFeatures(i)
    df.fillna(0, inplace=True)
    return df

In [None]:
massive_df

In [None]:
massive_df = add_manual_time_features(massive_df)