# Setting up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ta
# from fastai import *
# from fastai.tabular import *
from sklearn.ensemble import RandomForestRegressor
from rolling import RollingWindowSplit
from sklearn.metrics import r2_score as r2d2
from joblib import dump, load
from datetime import datetime, timedelta

%matplotlib inline
sns.set(style = "whitegrid")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

In [2]:
# %%time
# path = 'D://Coding//XTX Forecasting Challenge//data-training.csv'
# df = pd.read_csv(path)

In [55]:
# Load the training table from its Feather cache.
# NOTE: absolute local path -- adjust for the machine this runs on.
path = 'D://Coding//XTX Forecasting Challenge//data-training.file'
# `use_threads` is an on/off switch rather than a thread count, so pass True
# instead of 8. The frame is then cast to float32 and missing values become 0.
df = pd.read_feather(path, use_threads=True).astype('float32').fillna(0)

In [4]:
# Column names with suffixes 0-14 for each side of the book: sizes and rates.
bidSizeList = [f'bidSize{level}' for level in range(15)]
askSizeList = [f'askSize{level}' for level in range(15)]
bidRateList = [f'bidRate{level}' for level in range(15)]
askRateList = [f'askRate{level}' for level in range(15)]

# Exploratory Data Analysis

In [5]:
# # Figuring out what [y] is
# # y(t) is midRate(t+87) - midRate(t), clipped to [-5, 5]
# df['expectedY'] = df.midRate.diff(87).shift(-87).clip(-5,5)

# Feature engineering

### Basics

#### Cross-sectional features

In [56]:
# different from submission
def compute_cross_sectional(df):
    """Add per-row order-book features to ``df`` and return it.

    Columns are written onto the frame that is passed in (no copy is made)
    and that same object is returned. The rate/size columns are looked up via
    the notebook-level ``askRateList`` / ``askSizeList`` / ``bidRateList`` /
    ``bidSizeList`` name lists.

    Columns added, in this order:
      spread, midRate, bidAskVol          -- from level 0 only
      totalBidVol1..14, totalAskVol1..14  -- running sum of sizes for levels 0..i
      bidAskRatio1..14                    -- totalBidVol_i / totalAskVol_i
      totalAvailVol                       -- totalBidVol14 + totalAskVol14
      vwaBid, vwaAsk                      -- sum(rate * size) / sum(size) per side
      vwaBidDMid, vwaAskDMid              -- distance of each average from midRate
      diff_vwaBidAskDMid                  -- vwaAskDMid - vwaBidDMid
    """
    best_ask, best_bid = df['askRate0'], df['bidRate0']
    df['spread'] = best_ask - best_bid
    df['midRate'] = (best_ask + best_bid) / 2
    df['bidAskVol'] = df['askSize0'] + df['bidSize0']

    # Running depth: each level adds its own size to the total of the levels above it.
    df['totalBidVol1'] = df['bidSize0'] + df['bidSize1']
    df['totalAskVol1'] = df['askSize0'] + df['askSize1']
    for level in range(2, 15):
        df[f'totalBidVol{level}'] = df[f'totalBidVol{level - 1}'] + df[f'bidSize{level}']
        df[f'totalAskVol{level}'] = df[f'totalAskVol{level - 1}'] + df[f'askSize{level}']

    for level in range(1, 15):
        df[f'bidAskRatio{level}'] = df[f'totalBidVol{level}'] / df[f'totalAskVol{level}']
    df['totalAvailVol'] = df['totalBidVol14'] + df['totalAskVol14']

    # einsum 'ij,ji->i' with the transposed sizes is the row-wise dot product rate . size.
    bid_weighted = np.einsum('ij,ji->i', df[bidRateList], df[bidSizeList].T)
    df['vwaBid'] = bid_weighted / df[bidSizeList].sum(axis=1)
    ask_weighted = np.einsum('ij,ji->i', df[askRateList], df[askSizeList].T)
    df['vwaAsk'] = ask_weighted / df[askSizeList].sum(axis=1)

    df['vwaBidDMid'] = df['midRate'] - df['vwaBid']
    df['vwaAskDMid'] = df['vwaAsk'] - df['midRate']
    df['diff_vwaBidAskDMid'] = df['vwaAskDMid'] - df['vwaBidDMid']
    return df
df = compute_cross_sectional(df)

# TA

#### Time series features

In [59]:
def add_time_features(df):
    """Add features that compare each row's level-0 quotes with the previous row.

    Adds ``deltaVBid``, ``deltaVAsk``, ``VOI`` (deltaVBid - deltaVAsk) and
    ``OIR`` ((bidSize0 - askSize0) / (bidSize0 + askSize0)). ``df`` is
    modified in place -- including a ``fillna(0)`` over the whole frame -- and
    the same object is returned.

    For each side the delta is:
      * 0                     when the level-0 rate is lower than in the previous row,
      * size - previous size  when the rate is unchanged,
      * the current size      otherwise (rate is higher, or there is no previous row).

    NOTE(review): the ask side applies the same "rate fell -> 0" rule as the
    bid side; volume-order-imbalance is usually defined with the ask rule
    mirrored. Confirm this is intended before relying on VOI.
    """
    prev_bid_rate = df['bidRate0'].shift(1)
    prev_ask_rate = df['askRate0'].shift(1)

    bid_conditions = [df['bidRate0'] < prev_bid_rate, df['bidRate0'] == prev_bid_rate]
    ask_conditions = [df['askRate0'] < prev_ask_rate, df['askRate0'] == prev_ask_rate]
    bid_choices = [0, df['bidSize0'] - df['bidSize0'].shift(1)]
    ask_choices = [0, df['askSize0'] - df['askSize0'].shift(1)]
    bid_default = df['bidSize0']
    ask_default = df['askSize0']

    # Comparisons against the NaN that shift() puts in row 0 are False, so the
    # first row falls through to the default (its current size).
    df.fillna(0, inplace=True)
    df['deltaVBid'] = np.select(bid_conditions, bid_choices, default=bid_default)
    df['deltaVAsk'] = np.select(ask_conditions, ask_choices, default=ask_default)
    df['VOI'] = df['deltaVBid'] - df['deltaVAsk']
    df['OIR'] = (df['bidSize0'] - df['askSize0']) / (df['bidSize0'] + df['askSize0'])
    return df
df = add_time_features(df)

In [8]:
# Requires a window of up to 1,000 past rows (the largest lag used below is 900)

#### Manual time features — can consider adding more to the lags list

In [61]:
def add_manual_time_features(df):
    """Add lagged differences of the level-0 ask and bid rates.

    For every lag in 1-9, 10-90 (step 10) and 100-900 (step 100) this adds
    ``daskRate<lag>`` and ``dbidRate<lag>``: the current rate minus the rate
    ``lag`` rows earlier. ``df`` is modified in place and returned; afterwards
    every NaN in the frame (not only in the new columns) is replaced by 0.
    """
    lags = np.concatenate([
        np.arange(1, 10),
        np.arange(10, 100, 10),
        np.arange(100, 1000, 100),
    ])
    for lag in lags:
        df[f'daskRate{lag}'] = df['askRate0'].diff(lag)
        df[f'dbidRate{lag}'] = df['bidRate0'].diff(lag)
    # Rows without enough history have no difference yet; they become 0 here.
    df.fillna(0, inplace=True)
    return df
df = add_manual_time_features(df)

In [11]:
df.to_feather('intermediate.file')

In [None]:
df = pd.read_feather('intermediate.file')

#### Tick chart version with ffill

In [12]:
# midrate version
# Give every row a synthetic one-minute timestamp so the rows can be resampled
# into bars. The number of periods follows the frame length rather than a
# hard-coded 2999999, so the cell also works on a subset or a different file.
df['time'] = pd.date_range(start='1/1/1970', periods=len(df), freq='min')
df.set_index('time', inplace=True)
# 15-minute open/high/low/close bars of midRate, plus the mean bidAskVol per bar.
df_mid = df.midRate.resample('15min').ohlc()
df_mid['vol'] = df.bidAskVol.resample('15min').mean()

In [13]:
# takes 5 min
df_mid_ta = ta.add_all_ta_features(df_mid, "open", "high", "low", "close", "vol", fillna=True)

  dip[i] = 100 * (dip_mio[i]/trs[i])
  dip[i] = 100 * (dip_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  din[i] = 100 * (din_mio[i]/trs[i])
  dx = 100 * np.abs((dip - din) / (dip + din))


In [63]:
# dump(df_mid_ta, 'df_mid_ta.joblib')
df_mid_ta = load('df_mid_ta.joblib')

In [48]:
X_pca[:1]

array([[ 8.8970764e+01,  1.8328373e+01, -1.9263544e+01, -1.5587357e+01,
        -4.7632145e+01,  5.3526089e+01,  1.2909327e+01, -1.8256685e+01,
         3.6211021e+00, -1.5878495e+01,  2.5628702e+02, -2.4331863e+01,
         4.8973980e+00,  1.8533689e-01,  2.0022793e+01,  3.3933430e+01,
         9.8881996e+01, -3.3170212e+01,  1.0228539e+02,  4.5066171e+00,
        -2.8165766e+01, -8.9614071e-02, -1.1391719e+01, -1.1329915e+01,
         1.7320482e+01, -1.0655308e+01, -5.4672961e+00,  2.3649971e+01,
        -2.2624979e+01, -2.9913643e+01, -5.1912155e+00,  3.9424671e+01,
        -1.1298305e+01, -4.6017570e+00, -2.1380127e+01,  1.8141096e+01,
        -5.3065324e+00,  1.2948828e+01, -1.2238501e+01,  6.9587070e-01,
        -4.0120945e+00,  5.1201946e-01,  1.0402673e+01, -1.2824577e+01,
         9.7495251e+00, -6.7554579e+00,  3.2993751e+00,  1.9961107e+01,
         3.2359062e+01, -7.7225599e+00]], dtype=float32)

In [15]:
# takes 30s
new_df = df.join(df_mid_ta).ffill()
new_df = new_df.astype('float32')

In [64]:
# dump(new_df, 'new_df.joblib')
new_df = load('new_df.joblib')

In [65]:
new_df[:1]

Unnamed: 0_level_0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,askRate10,askRate11,askRate12,askRate13,askRate14,askSize0,askSize1,askSize2,askSize3,askSize4,askSize5,askSize6,askSize7,askSize8,askSize9,askSize10,askSize11,askSize12,askSize13,askSize14,bidRate0,bidRate1,bidRate2,bidRate3,bidRate4,bidRate5,bidRate6,bidRate7,bidRate8,bidRate9,bidRate10,bidRate11,bidRate12,bidRate13,bidRate14,bidSize0,bidSize1,bidSize2,bidSize3,bidSize4,bidSize5,bidSize6,bidSize7,bidSize8,bidSize9,bidSize10,bidSize11,bidSize12,bidSize13,bidSize14,y,spread,midRate,bidAskVol,totalBidVol1,totalAskVol1,totalBidVol2,totalAskVol2,totalBidVol3,totalAskVol3,totalBidVol4,totalAskVol4,totalBidVol5,totalAskVol5,totalBidVol6,totalAskVol6,totalBidVol7,totalAskVol7,totalBidVol8,totalAskVol8,totalBidVol9,totalAskVol9,totalBidVol10,totalAskVol10,totalBidVol11,totalAskVol11,totalBidVol12,totalAskVol12,totalBidVol13,totalAskVol13,totalBidVol14,totalAskVol14,bidAskRatio1,bidAskRatio2,bidAskRatio3,bidAskRatio4,bidAskRatio5,bidAskRatio6,bidAskRatio7,bidAskRatio8,bidAskRatio9,bidAskRatio10,bidAskRatio11,bidAskRatio12,bidAskRatio13,bidAskRatio14,totalAvailVol,vwaBid,vwaAsk,vwaBidDMid,vwaAskDMid,diff_vwaBidAskDMid,deltaVBid,deltaVAsk,VOI,OIR,daskRate1,dbidRate1,daskRate2,dbidRate2,daskRate3,dbidRate3,daskRate4,dbidRate4,daskRate5,dbidRate5,daskRate6,dbidRate6,daskRate7,dbidRate7,daskRate8,dbidRate8,daskRate9,dbidRate9,daskRate10,dbidRate10,daskRate20,dbidRate20,daskRate30,dbidRate30,daskRate40,dbidRate40,daskRate50,dbidRate50,daskRate60,dbidRate60,daskRate70,dbidRate70,daskRate80,dbidRate80,daskRate90,dbidRate90,daskRate100,dbidRate100,daskRate200,dbidRate200,daskRate300,dbidRate300,daskRate400,dbidRate400,daskRate500,dbidRate500,daskRate600,dbidRate600,daskRate700,dbidRate700,daskRate800,dbidRate800,daskRate900,dbidRate900,open,high,low,close,vol,volume_adi,volume_obv,volume_cmf,volume_fi,volume_em,volume_vpt,volume_nvi,volatility_atr,volatility_bbh,volatility_bb
l,volatility_bbm,volatility_bbhi,volatility_bbli,volatility_kcc,volatility_kch,volatility_kcl,volatility_kchi,volatility_kcli,volatility_dch,volatility_dcl,volatility_dchi,volatility_dcli,trend_macd,trend_macd_signal,trend_macd_diff,trend_ema_fast,trend_ema_slow,trend_adx,trend_adx_pos,trend_adx_neg,trend_vortex_ind_pos,trend_vortex_ind_neg,trend_vortex_diff,trend_trix,trend_mass_index,trend_cci,trend_dpo,trend_kst,trend_kst_sig,trend_kst_diff,trend_ichimoku_a,trend_ichimoku_b,trend_visual_ichimoku_a,trend_visual_ichimoku_b,trend_aroon_up,trend_aroon_down,trend_aroon_ind,momentum_rsi,momentum_mfi,momentum_tsi,momentum_uo,momentum_stoch,momentum_stoch_signal,momentum_wr,momentum_ao,others_dr,others_dlr,others_cr
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1
1970-01-01,1619.5,1620.0,1621.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1615.0,1614.0,1613.0,1612.0,1611.0,1610.0,1607.0,1606.0,1605.0,1604.0,1603.0,1602.0,1601.5,1601.0,1600.0,7.0,10.0,1.0,10.0,20.0,3.0,20.0,27.0,11.0,14.0,35.0,10.0,1.0,10.0,13.0,-0.5,4.5,1617.25,8.0,17.0,11.0,18.0,35.0,28.0,35.0,48.0,35.0,51.0,35.0,71.0,35.0,98.0,35.0,109.0,35.0,123.0,35.0,158.0,35.0,168.0,35.0,169.0,35.0,179.0,35.0,192.0,35.0,1.545455,0.514286,0.8,1.371429,1.457143,2.028571,2.8,3.114286,3.514286,4.514286,4.8,4.828571,5.114286,5.485714,227.0,1606.132812,1620.671387,11.117188,3.421387,-7.695801,7.0,1.0,6.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1617.25,1617.25,1617.25,1617.25,8.0,-0.007724,0.0,0.0,0.0,0.0,-0.119901,1000.0,0.248556,1617.25,1617.25,1617.25,0.0,0.0,1617.25,1617.25,1617.25,0.0,0.0,1617.25,1617.25,0.0,0.0,0.0,0.0,0.0,1617.25,1617.25,0.0,0.0,0.0,1.0,1.0,0.0,-1.509016,0.0,0.0,24.60437,-14.98572,-14.98572,0.0,1617.25,1617.25,1642.025879,1642.023071,4.0,4.0,0.0,50.0,0.0,-100.0,0.0,50.0,50.0,-50.0,0.0,-1.498574,0.0,0.0


# Feature Selection

In [19]:
# takes 40s
# Split the joined frame into the target vector and the feature matrix.
y = new_df['y'].values
X = new_df.drop(columns='y').values

# standardise
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/validation split is made -- confirm that is acceptable for the CV below.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [42]:
new_df.columns

Index(['askRate0', 'askRate1', 'askRate2', 'askRate3', 'askRate4', 'askRate5',
       'askRate6', 'askRate7', 'askRate8', 'askRate9',
       ...
       'momentum_mfi', 'momentum_tsi', 'momentum_uo', 'momentum_stoch',
       'momentum_stoch_signal', 'momentum_wr', 'momentum_ao', 'others_dr',
       'others_dlr', 'others_cr'],
      dtype='object', length=233)

In [41]:
X[:1]

array([[ 1.61950000e+03,  1.62000000e+03,  1.62100000e+03,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  1.00000000e+01,  2.40000000e+01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.61500000e+03,  1.61400000e+03,  1.61300000e+03,
         1.61200000e+03,  1.61100000e+03,  1.61000000e+03,
         1.60700000e+03,  1.60600000e+03,  1.60500000e+03,
         1.60400000e+03,  1.60300000e+03,  1.60200000e+03,
         1.60150000e+03,  1.60100000e+03,  1.60000000e+03,
         7.00000000e+00,  1.00000000e+01,  1.00000000e+00,
         1.00000000e+01,  2.00000000e+01,  3.00000000e+0

In [39]:
X_scaled[:1]

array([[-8.51030171e-01, -8.32115710e-01, -8.00245523e-01,
        -5.77722549e+01, -5.65140533e+01, -5.55046234e+01,
        -5.45748291e+01, -5.37677002e+01, -5.30957947e+01,
        -5.24735527e+01, -5.16888542e+01, -5.09189415e+01,
        -5.02083778e+01, -4.95083427e+01, -4.90881271e+01,
        -4.98240352e-01, -1.77631497e-01,  3.48564297e-01,
        -6.76704407e-01, -6.37844384e-01, -6.31431758e-01,
        -5.51112056e-01, -4.97578442e-01, -4.66209471e-01,
        -5.00656724e-01, -4.90353018e-01, -5.02407551e-01,
        -5.23163736e-01, -5.55020273e-01, -5.55982649e-01,
        -9.93078232e-01, -1.01142764e+00, -1.02989149e+00,
        -1.04838479e+00, -1.06687796e+00, -1.08539236e+00,
        -1.17833591e+00, -1.19674587e+00, -1.21503139e+00,
        -1.23294032e+00, -1.25033188e+00, -1.26698756e+00,
        -1.26424944e+00, -1.26069248e+00, -1.27509272e+00,
         2.33293306e-02, -1.36479616e-01, -7.12609947e-01,
        -2.12449953e-01,  1.84370860e-01, -5.08140147e-0

In [20]:
# pca takes 1 min
from sklearn.decomposition import PCA
# Seed the decomposition: depending on the scikit-learn version and data shape,
# PCA may choose a randomized SVD solver, and an unseeded run would then not
# reproduce the same components. 41 matches the random_state used for the
# random forests in this notebook.
pca = PCA(n_components=50, random_state=41)
X_pca = pca.fit_transform(X_scaled)

In [35]:
X_pca[:1]

array([[ 8.8970764e+01,  1.8328373e+01, -1.9263544e+01, -1.5587357e+01,
        -4.7632145e+01,  5.3526089e+01,  1.2909327e+01, -1.8256685e+01,
         3.6211021e+00, -1.5878495e+01,  2.5628702e+02, -2.4331863e+01,
         4.8973980e+00,  1.8533689e-01,  2.0022793e+01,  3.3933430e+01,
         9.8881996e+01, -3.3170212e+01,  1.0228539e+02,  4.5066171e+00,
        -2.8165766e+01, -8.9614071e-02, -1.1391719e+01, -1.1329915e+01,
         1.7320482e+01, -1.0655308e+01, -5.4672961e+00,  2.3649971e+01,
        -2.2624979e+01, -2.9913643e+01, -5.1912155e+00,  3.9424671e+01,
        -1.1298305e+01, -4.6017570e+00, -2.1380127e+01,  1.8141096e+01,
        -5.3065324e+00,  1.2948828e+01, -1.2238501e+01,  6.9587070e-01,
        -4.0120945e+00,  5.1201946e-01,  1.0402673e+01, -1.2824577e+01,
         9.7495251e+00, -6.7554579e+00,  3.2993751e+00,  1.9961107e+01,
         3.2359062e+01, -7.7225599e+00]], dtype=float32)

In [27]:
dump(scaler, 'scaler.joblib')
dump(pca, 'pca.joblib')
# X_pca = load('X_pca.joblib')
# y = load('y.joblib')

['pca.joblib']

In [None]:
# print(pca.explained_variance_ratio_)

In [None]:
# drop original features, only use if not using pca
df.drop(df.columns[:60], axis=1, inplace=True)

# Cross-validation

# Lasso

In [22]:
rlcv = RollingWindowSplit(n_splits=5, compatible=True)

In [23]:
# takes at least 1 min on pca variables
from sklearn.linear_model import LassoLarsCV
lasso = LassoLarsCV(cv=rlcv, n_jobs=-1).fit(X_pca, y)

In [None]:
# actually the lasso above has seen the entire dataset....

In [28]:
# dump(lasso, 'lassocv.joblib')
lasso = load('lassocv.joblib')

In [29]:
lasso

LassoLarsCV(copy_X=True,
            cv=RollingWindowSplit(compatible=True, max_train_size=None, n_splits=5),
            eps=2.220446049250313e-16, fit_intercept=True, max_iter=500,
            max_n_alphas=1000, n_jobs=-1, normalize=True, positive=False,
            precompute='auto', verbose=False)

In [25]:
def rlcvscore(model):
    """Print per-fold scores of an already-fitted ``model`` on the rolling splits.

    For each (train, valid) index pair yielded by the notebook-level ``rlcv``
    splitter over ``X_pca``, this records ``model.score`` on the train slice,
    ``model.score`` on the validation slice, and ``r2d2`` of the validation
    targets against a sigmoid-squashed prediction. The model is only scored
    here; it is never refitted per fold.

    Prints three rounded per-fold arrays (train, valid, squashed valid)
    followed by one line with their means. Returns None.
    """
    train_scores, valid_scores, squashed_scores = [], [], []
    for train_idx, valid_idx in rlcv.split(X_pca):
        valid_features, valid_target = X_pca[valid_idx], y[valid_idx]
        train_scores.append(model.score(X_pca[train_idx], y[train_idx]))
        valid_scores.append(model.score(valid_features, valid_target))
        # Logistic squash of the raw prediction, recentred and scaled into (-10, 10).
        raw_pred = model.predict(valid_features)
        squashed = (1 / (1 + np.exp(-0.22 * raw_pred)) - 0.5) * 20
        squashed_scores.append(r2d2(valid_target, squashed))
    for fold_scores in (train_scores, valid_scores, squashed_scores):
        print(f'{np.array(fold_scores).round(4)}')
    print(f'{np.mean(train_scores):.4f}, {np.mean(valid_scores):.4f}, {np.mean(squashed_scores):.4f}')

In [26]:
rlcvscore(lasso) # has all features

[0.0304 0.0465 0.0495 0.05   0.0484]
[0.0465 0.0495 0.05   0.0484 0.047 ]
[0.049  0.0522 0.0526 0.0507 0.0485]
0.0449, 0.0483, 0.0506


In [None]:
# dump(lasso, f'lasso_rlcv_114ft_0.0175_0.0187.joblib')
# lasso = load('lasso_rlcv_114ft_0.0175_0.0187.joblib') 

# RF

In [None]:
# Small forest: 10 trees of depth <= 6 with at least 1000 samples per split and per leaf.
# `max_features=1.0` (consider every feature at each split) is what 'auto' meant
# for regressors; the 'auto' alias was deprecated and later removed from scikit-learn.
rf_model = RandomForestRegressor(n_estimators=10, max_depth=6, min_samples_split=1000, min_samples_leaf=1000,
                                 max_features=1.0, n_jobs=-1, random_state=41)

In [None]:
# NOTE(review): in the visible cells x_train / y_train are only assigned inside
# rlcvscore(), so on a fresh kernel this line raises NameError -- confirm which
# training split was meant here.
rf_model.fit(x_train, y_train);

In [None]:
rlcvscore(rf_model) # realistic cv

In [None]:
a = df.drop('y', axis=1).columns[indices]

In [None]:
# create X with important variables only
X = df.drop('y', axis=1)[a[:20]].values
y = df.y.values

In [None]:
# Very shallow forest: 10 trees of depth <= 2 with leaves of at least 5000 samples.
# `max_features=1.0` (consider every feature at each split) is what 'auto' meant
# for regressors; the 'auto' alias was deprecated and later removed from scikit-learn.
rf_model = RandomForestRegressor(n_estimators=10, max_depth=2, min_samples_split=2, min_samples_leaf=5000,
                                 max_features=1.0, n_jobs=-1, random_state=41)
# NOTE(review): in the visible cells x_train / y_train are only assigned inside
# rlcvscore(), so on a fresh kernel this raises NameError -- confirm the intended split.
rf_model.fit(x_train, y_train);

In [None]:
# NOTE(review): the label below says samples_leaf 1000 and 30 variables, but the
# cells above set min_samples_leaf=5000 and keep a[:20] -- confirm which run it describes.
rlcvscore(rf_model) # n_est 10, depth 2, samples_split 2, samples_leaf 1000, 30 most important variables

In [None]:
# Importance of each feature in the fitted forest, with the spread across its trees.
importances = rf_model.feature_importances_
per_tree_importances = np.array([tree.feature_importances_ for tree in rf_model.estimators_])
std = per_tree_importances.std(axis=0)
# Feature positions ordered from most to least important.
indices = importances.argsort()[::-1]

# Plot the feature importances of the forest
bar_positions = range(X.shape[1])
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title("Feature importances")
ax.bar(bar_positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(indices)
ax.set_xlim([-1, X.shape[1]])
plt.show()

In [None]:
# save model
# The fitted forest is bound to `rf_model`; no `rf` is defined in the visible cells.
dump(rf_model, 'model.joblib')

In [None]:
# load model
rf2 = load('model.joblib')

# Fast.ai

In [None]:
dep_var = 'y'
procs = [FillMissing, Normalize]

In [None]:
# Raw string instead of an f-string: there is nothing to interpolate, and '\C' /
# '\X' are not valid escape sequences. The resulting path text is unchanged.
path = r'D:\Coding\XTX Forecasting Challenge'
# First 500k rows only; rows 400k-500k of that slice are held out for validation.
data = TabularDataBunch.from_df(path = path, df = df[:int(5e5)], dep_var = 'y', procs=procs,
                                 valid_idx = list(range(int(4e5),int(5e5))))

In [None]:
data.show_batch(rows=10)

In [None]:
# data = (TabularList.from_df(df[:int(5e5)], cont_names=df.columns, procs=procs)
#                            .split_by_idx(list(range(int(0.8*5e5),int(5e5))))
#                            .label_from_df(cols=dep_var, label_cls=FloatList)
#                            .databunch())

In [None]:
learn = tabular_learner(data, layers=[500,200], metrics=r2_score, ps=[0.001,0.01], emb_drop=0.04)

In [None]:
learn.model

In [None]:
learn.lr_find(end_lr=1e1)

In [None]:
learn.recorder.plot()

In [None]:
# model above has already diverged, we will restart.

In [None]:
learn.fit_one_cycle(3, 1e-4, wd=0.1)

In [None]:
learn.recorder.plot_lr(show_moms=True)

In [None]:
learn.save('new_fastai')

In [None]:
learn.recorder.plot_losses()

In [None]:
learn.predict(df.iloc[int(8.1e5)])

In [None]:
df.y.iloc[int(8.1e5)]

In [None]:
preds = learn.get_preds()

# Submission testing

In [None]:
def get_next_data_as_numpy_array(iteration):
    """Return the first 60 values of row ``iteration`` of the notebook-level ``df``.

    The result is a NumPy array. Presumably these are the 60 raw rate/size
    columns -- this relies on them being the first 60 columns of ``df``; confirm
    the column order before use.
    """
    snapshot = df.iloc[iteration]
    return snapshot[:60].values

In [None]:
def append_to_df(massive_df, row):
    """Stamp ``row`` with the next one-minute timestamp and append it to ``massive_df``.

    Args:
        massive_df: history accumulated so far; may be empty.
        row: single-row DataFrame to add. Its index is overwritten in place.

    Returns:
        A new DataFrame holding ``massive_df`` followed by ``row``. The first
        row added to an empty history is stamped 1970-01-01 00:00; every later
        row gets the last timestamp plus one minute.
    """
    if len(massive_df) == 0:
        # Empty history: start the synthetic clock. This replaces a bare
        # `except:` that also hid every unrelated error.
        row.index = [datetime(1970, 1, 1)]
    else:
        row.index = [massive_df.index[-1] + timedelta(minutes=1)]
    # pd.concat is the supported replacement for DataFrame.append, which
    # current pandas no longer provides.
    return pd.concat([massive_df, row], sort=False)

In [None]:
def add_resample_features(massive_df, resampled_df):
    """Recompute 15-minute bar features for the newest rows of ``massive_df``.

    Args:
        massive_df: per-row history; its ``midRate`` and ``bidAskVol`` columns
            are resampled below, so it needs a datetime-like index.
        resampled_df: bar-level rows accumulated by earlier calls.

    Returns:
        ``(massive_df, resampled_df)``. ``massive_df`` is the input after
        ``update`` with the bar features, forward-fill and a cast to float32.
        ``resampled_df`` is extended only when ``len(massive_df)`` is a
        multiple of 15; otherwise it is returned unchanged.

    Note that ``massive_df.update`` also modifies the caller's frame in place.

    NOTE(review): ``DataFrame.update`` only overwrites labels that already
    exist in the frame it is called on, so indicator columns that are not yet
    in ``massive_df`` are presumably not added here -- confirm.
    """
    # Rows in the current, still incomplete group of 15 (0 means a group just completed).
    leftovers = len(massive_df) % 15
    # NOTE: this outer `a` is never read; pad_history binds its own local `a`.
    a = pd.DataFrame()
    def pad_history():
        # `df_mid` is read from the enclosing function; both branches below
        # assign it before calling pad_history().
        full_resampled = resampled_df.append(df_mid)
        # Repeat the oldest bar so the history has at least 31 rows
        # (the range is empty, so nothing is added, once it is already that long).
        a = pd.DataFrame([full_resampled.iloc[0] for j in range(30+1-len(full_resampled))])
        a = a.append(full_resampled)
        # Re-stamp the rows in 15-minute steps counting back from the newest bar,
        # then sort so the index is ascending.
        a.index = pd.date_range(start=df_mid.index[-1], periods=len(a), freq='-15Min').sort_values()
        df_mid_ta = ta.add_all_ta_features(a, "open", "high", "low", "close", "vol", fillna=True)
        return df_mid_ta
    if leftovers == 0:
        # A full group of 15 rows: build its bar from the last 15 rows.
        df_mid = massive_df.tail(15).midRate.resample('15Min').ohlc()
        df_mid['vol'] = massive_df.tail(15).bidAskVol.resample('15Min').mean()
        df_mid_ta = pad_history()
        # NOTE(review): this appends the whole padded frame returned by
        # pad_history(), not only the newest bar -- confirm that is intended.
        resampled_df = resampled_df.append(df_mid_ta)
    else:
        # Partial group: build a provisional bar from the rows seen so far in it.
        df_mid = massive_df.tail(leftovers).midRate.resample('15Min').ohlc()
        df_mid['vol'] = massive_df.tail(leftovers).bidAskVol.resample('15Min').mean()
        df_mid_ta = pad_history()
    massive_df.update(df_mid_ta)
    massive_df = massive_df.ffill().astype('float32')
    return massive_df, resampled_df

In [None]:
massive_df, resampled_df = pd.DataFrame(), pd.DataFrame()

In [None]:
def compute_cross_sectional(base_row):
    """Build a one-row frame from a raw 60-value row and add the cross-sectional features.

    This re-declares ``compute_cross_sectional`` with a different input than
    the DataFrame-based definition earlier in the notebook: ``base_row`` is a
    flat sequence whose values are labelled, in order, with ``askRateList``,
    ``askSizeList``, ``bidRateList`` and ``bidSizeList``. Running this cell
    shadows the earlier definition.

    Returns:
        A single-row DataFrame with the 60 raw columns followed by spread,
        midRate, bidAskVol, the cumulative totalBidVol/totalAskVol columns,
        their ratios, totalAvailVol and the size-weighted average rate
        features.
    """
    df = pd.DataFrame([base_row])
    df.columns = [*askRateList, *askSizeList, *bidRateList, *bidSizeList]

    # Cross-sectional features
    df['spread'] = df['askRate0'] - df['bidRate0']
    df['midRate'] = (df['askRate0'] + df['bidRate0']) / 2
    df['bidAskVol'] = df['askSize0'] + df['bidSize0']

    # Cumulative size per side; Bid is written before Ask at every level.
    sides = (('Bid', 'bid'), ('Ask', 'ask'))
    for label, prefix in sides:
        df[f'total{label}Vol1'] = df[f'{prefix}Size0'] + df[f'{prefix}Size1']
    for depth in range(2, 15):
        for label, prefix in sides:
            df[f'total{label}Vol{depth}'] = df[f'total{label}Vol{depth - 1}'] + df[f'{prefix}Size{depth}']
    for depth in range(1, 15):
        df[f'bidAskRatio{depth}'] = df[f'totalBidVol{depth}'] / df[f'totalAskVol{depth}']
    df['totalAvailVol'] = df['totalBidVol14'] + df['totalAskVol14']

    # Size-weighted average rate per side: row-wise dot(rate, size) / sum(size).
    for label, rate_cols, size_cols in (('Bid', bidRateList, bidSizeList),
                                        ('Ask', askRateList, askSizeList)):
        weighted_sum = np.einsum('ij,ji->i', df[rate_cols], df[size_cols].T)
        df[f'vwa{label}'] = weighted_sum / df[size_cols].sum(axis=1)

    df['vwaBidDMid'] = df['midRate'] - df['vwaBid']
    df['vwaAskDMid'] = df['vwaAsk'] - df['midRate']
    df['diff_vwaBidAskDMid'] = df['vwaAskDMid'] - df['vwaBidDMid']
    return df

In [None]:
# Feed 30 rows through the row-by-row pipeline, one row per pass.
# This needs the single-row compute_cross_sectional(base_row) defined in the cell
# above; the DataFrame-based definition earlier in the notebook takes a frame instead.
for iteration in range(30):
    # The row to fetch is chosen by how many rows are already stored, not by `iteration`.
    base_row = get_next_data_as_numpy_array(len(massive_df))
    row = compute_cross_sectional(base_row)
    massive_df = append_to_df(massive_df, row)
    massive_df, resampled_df = add_resample_features(massive_df, resampled_df)
    # Both calls below run over the whole accumulated frame on every pass.
    massive_df = add_time_features(massive_df)
    massive_df = add_manual_time_features(massive_df)

In [None]:
def add_time_features(df):
    """Add row-over-row order-flow features (deltaVBid, deltaVAsk, VOI, OIR).

    Same logic as the ``add_time_features`` defined in the feature-engineering
    section; it is declared again here for the submission test. ``df`` is
    modified in place (including a ``fillna(0)`` over the whole frame) and
    returned.

    NOTE(review): both sides use "rate lower than previous row -> 0"; the
    usual volume-order-imbalance definition mirrors that rule on the ask
    side. Confirm this is intended.
    """
    def _side_inputs(rate, size):
        # Conditions, matching choices and fallback for np.select on one side:
        # rate fell -> 0, rate unchanged -> change in size, otherwise -> size.
        previous_rate = rate.shift(1)
        conditions = [rate < previous_rate, rate == previous_rate]
        choices = [0, size - size.shift(1)]
        return conditions, choices, size

    bid_conditions, bid_choices, bid_default = _side_inputs(df.bidRate0, df.bidSize0)
    ask_conditions, ask_choices, ask_default = _side_inputs(df.askRate0, df.askSize0)

    df.fillna(0, inplace=True)
    df['deltaVBid'] = np.select(bid_conditions, bid_choices, default=bid_default)
    df['deltaVAsk'] = np.select(ask_conditions, ask_choices, default=ask_default)
    df['VOI'] = df.deltaVBid - df.deltaVAsk
    # Level-0 size imbalance, scaled by the total level-0 size.
    df['OIR'] = (df.bidSize0 - df.askSize0) / (df.bidSize0 + df.askSize0)
    return df

In [None]:
def add_manual_time_features(df):
    """Add ``daskRate<lag>`` / ``dbidRate<lag>`` lagged-difference columns.

    Same logic as the ``add_manual_time_features`` defined in the
    feature-engineering section; it is declared again here for the submission
    test. Lags are 1-9, 10-90 in steps of 10 and 100-900 in steps of 100.
    ``df`` is modified in place, every NaN in the frame is then replaced by 0,
    and the same object is returned.
    """
    lags = list(range(1, 10)) + list(range(10, 100, 10)) + list(range(100, 1000, 100))
    sources = (('dask', 'askRate0'), ('dbid', 'bidRate0'))
    for lag in lags:
        # Ask column first, then bid, for each lag.
        for prefix, source in sources:
            df[prefix + 'Rate' + str(lag)] = df[source].diff(lag)
    df.fillna(0, inplace=True)
    return df

In [None]:
massive_df

In [None]:
massive_df = add_manual_time_features(massive_df)