# Setting up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ta
# from fastai import *
# from fastai.tabular import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from rolling import RollingWindowSplit
from sklearn.metrics import r2_score as r2d2
from joblib import dump, load
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoLarsCV

%matplotlib inline
sns.set(style = "whitegrid")
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [2]:
# %%time
# path = 'D://Coding//XTX Forecasting Challenge//data-training.csv'
# df = pd.read_csv(path)

In [3]:
# Load the training set from its Feather snapshot, downcast every column to
# float32 and replace missing entries with 0.
path = 'D://Coding//XTX Forecasting Challenge//data-training.file'
df = (
    pd.read_feather(path, use_threads=8)
    .astype('float32')
    .fillna(0)
)

In [4]:
# Column names for the 15 order-book levels (0-14): rate and size on each side.
askRateList = [f'askRate{level}' for level in range(15)]
askSizeList = [f'askSize{level}' for level in range(15)]
bidRateList = [f'bidRate{level}' for level in range(15)]
bidSizeList = [f'bidSize{level}' for level in range(15)]

# Exploratory Data Analysis

In [None]:
# # Figuring out what [y] is
# # y(t) is midRate(t+87) - midRate(t), clipped to [-5, 5]
# df['expectedY'] = df.midRate.diff(87).shift(-87).clip(-5,5)

# Feature engineering

### Basics

#### Cross-sectional features

In [None]:
# different from submission
def compute_cross_sectional(df):
    """Add per-row order-book features to ``df`` in place and return it.

    Reads the level columns ``askRate0..14``, ``askSize0..14``, ``bidRate0..14``
    and ``bidSize0..14``; the full column lists come from the module-level
    ``askRateList``, ``askSizeList``, ``bidRateList`` and ``bidSizeList``.

    Columns added, in this order:
      * ``spread``, ``midRate``, ``bidAskVol`` - top-of-book spread, mid and size.
      * ``totalBidVol<k>`` / ``totalAskVol<k>`` (k = 1..14) - sizes summed over
        levels 0..k.
      * ``bidAskRatio<k>`` (k = 1..14) - ratio of the two cumulative sizes.
      * ``totalAvailVol`` - both sides summed over all 15 levels.
      * ``vwaBid`` / ``vwaAsk`` - size-weighted average rate per side.
      * ``vwaBidDMid`` / ``vwaAskDMid`` / ``diff_vwaBidAskDMid`` - distances of
        those averages from the mid, and their difference.
    """
    best_ask, best_bid = df['askRate0'], df['bidRate0']
    df['spread'] = best_ask - best_bid
    df['midRate'] = (best_ask + best_bid) / 2
    df['bidAskVol'] = df['askSize0'] + df['bidSize0']

    # Cumulative depth, built level by level from the previous running total.
    df['totalBidVol1'] = df['bidSize0'] + df['bidSize1']
    df['totalAskVol1'] = df['askSize0'] + df['askSize1']
    for level in range(2, 15):
        for side in ('Bid', 'Ask'):
            running_total = df[f'total{side}Vol{level - 1}']
            df[f'total{side}Vol{level}'] = running_total + df[f'{side.lower()}Size{level}']

    for level in range(1, 15):
        df[f'bidAskRatio{level}'] = df[f'totalBidVol{level}'] / df[f'totalAskVol{level}']
    df['totalAvailVol'] = df['totalBidVol14'] + df['totalAskVol14']

    # einsum('ij,ji->i', A, B.T) is the row-wise dot product of rates and sizes.
    df['vwaBid'] = np.einsum('ij,ji->i', df[bidRateList], df[bidSizeList].T) / df[bidSizeList].sum(axis=1)
    df['vwaAsk'] = np.einsum('ij,ji->i', df[askRateList], df[askSizeList].T) / df[askSizeList].sum(axis=1)

    df['vwaBidDMid'] = df['midRate'] - df['vwaBid']
    df['vwaAskDMid'] = df['vwaAsk'] - df['midRate']
    df['diff_vwaBidAskDMid'] = df['vwaAskDMid'] - df['vwaBidDMid']
    return df

# TA

#### Time series features

In [None]:
def add_time_features(df):
    """Add top-of-book order-flow features to ``df`` in place and return it.

    Columns added:
      * ``deltaVBid`` - 0 when the best bid fell, the change in ``bidSize0``
        when it is unchanged, otherwise (bid rose, or first row) ``bidSize0``.
      * ``deltaVAsk`` - 0 when the best ask rose, the change in ``askSize0``
        when it is unchanged, otherwise (ask fell, or first row) ``askSize0``.
      * ``VOI`` - volume order imbalance, ``deltaVBid - deltaVAsk``.
      * ``OIR`` - order imbalance ratio,
        ``(bidSize0 - askSize0) / (bidSize0 + askSize0)``.

    The ask side mirrors the bid side: a falling ask means fresh sell interest
    at a better price (counts in full), while a rising ask means the previous
    best level is gone (counts as 0). The previous version applied the bid-side
    comparison to the ask as well, which swapped those two cases.

    Side effect: every NaN already present in ``df`` is replaced by 0.
    """
    prev_bid, prev_ask = df.bidRate0.shift(1), df.askRate0.shift(1)
    bid_fell, bid_same = (df.bidRate0 < prev_bid), (df.bidRate0 == prev_bid)
    ask_rose, ask_same = (df.askRate0 > prev_ask), (df.askRate0 == prev_ask)
    valsB = [0, (df.bidSize0 - df.bidSize0.shift(1))]
    valsA = [0, (df.askSize0 - df.askSize0.shift(1))]
    # Comparisons against the shifted NaN in the first row are all False, so
    # that row falls through to the default (the full size).
    defaultB, defaultA = df.bidSize0, df.askSize0
    df.fillna(0, inplace=True)
    df['deltaVBid'] = np.select([bid_fell, bid_same], valsB, default=defaultB)
    df['deltaVAsk'] = np.select([ask_rose, ask_same], valsA, default=defaultA)
    df['VOI'] = df.deltaVBid - df.deltaVAsk
    df['OIR'] = (df.bidSize0 - df.askSize0)/(df.bidSize0 + df.askSize0)
    return df

#### Manual time features — can consider adding more to the lags list

In [None]:
def add_manual_time_features(df):
    """Add lagged differences of the best ask/bid rate to ``df`` in place.

    For every lag in 1..9, 10..90 (step 10) and 100..900 (step 100) the columns
    ``daskRate<lag>`` and ``dbidRate<lag>`` hold ``askRate0.diff(lag)`` and
    ``bidRate0.diff(lag)``. All NaNs in the frame (including the leading rows
    each diff cannot fill) are then replaced by 0. Returns the same frame.
    """
    lags = [*range(1, 10), *range(10, 100, 10), *range(100, 1000, 100)]
    for lag in lags:
        suffix = str(lag)
        df['daskRate' + suffix] = df['askRate0'].diff(lag)
        df['dbidRate' + suffix] = df['bidRate0'].diff(lag)
    df.fillna(0, inplace=True)
    return df

In [None]:
# Run the three feature builders in sequence; each one mutates and returns the frame.
df = (
    df.pipe(compute_cross_sectional)
      .pipe(add_time_features)
      .pipe(add_manual_time_features)
)

In [None]:
# Reload the engineered feature frame from the cached Feather file instead of recomputing it.
# The commented-out line below is the one-off write that produces that cache.
# df.to_feather('intermediate.file')
df = pd.read_feather('intermediate.file', use_threads=8)

#### Tick chart version with ffill

In [None]:
# midrate version
# Put the rows on a synthetic one-minute clock starting 1970-01-01 so pandas can
# resample them into 15-minute OHLC bars of the mid rate. The number of periods
# follows the frame's length (as create_limited_features_orig does) instead of
# a hard-coded row count, so the cell also works on a subset or a different file.
df['time'] = pd.date_range(start='1/1/1970', periods=len(df), freq='T')
df.set_index('time', inplace=True)
df_mid = df.midRate.resample('15Min').ohlc()
# Bar "volume" is the mean top-of-book size (askSize0 + bidSize0) within the bar.
df_mid['vol'] = df.bidAskVol.resample('15Min').mean()

In [None]:
# takes 5 min
# Compute the `ta` library's full indicator set on the 15-minute bars (df_mid), using its
# open/high/low/close/vol columns and letting the library fill missing values (fillna=True).
df_mid_ta = ta.add_all_ta_features(df_mid, "open", "high", "low", "close", "vol", fillna=True)

In [None]:
# Reload the cached indicator frame; the commented-out dump below is the one-off write that creates it.
# dump(df_mid_ta, 'df_mid_ta.joblib')
df_mid_ta = load('df_mid_ta.joblib')

In [None]:
# takes 30s
# Join the 15-minute indicators onto the per-minute frame by timestamp, forward-fill the gaps
# (ffill is applied to every column) and downcast the result to float32.
# NOTE(review): resample labels each bar by the start of its bucket by default, so rows inside a
# bucket receive statistics computed over the whole bucket - confirm this look-ahead is acceptable.
new_df = df.join(df_mid_ta).ffill().astype('float32')

In [None]:
# Reload the cached joined frame; the commented-out dump below is the one-off write that creates it.
# dump(new_df, 'new_df.joblib')
new_df = load('new_df.joblib')

# Cross-validation

In [5]:
def create_limited_features_orig(df):
    """Build the reduced feature set used by the limited lasso model.

    ``df`` is modified in place: the feature columns below are added and its
    index is replaced by a synthetic one-minute ``time`` index starting at
    1970-01-01. The returned object is a new float32 frame: ``df`` joined with
    two indicators computed on 15-minute mid-rate bars, forward-filled and then
    zero-filled.

    Columns added, in this order: ``midRate``, ``totalBidVol1``,
    ``totalAskVol1``, ``bidAskVol``, ``totalBidVol2..4`` / ``totalAskVol2..4``
    (interleaved), ``bidAskRatio4``, ``OIR``, ``daskRate6..10`` /
    ``dbidRate6..10`` (interleaved), ``volume_adi``, ``others_dlr``.
    """
    df['midRate'] = (df['askRate0'] + df['bidRate0']) / 2
    df['totalBidVol1'] = df['bidSize0'] + df['bidSize1']
    df['totalAskVol1'] = df['askSize0'] + df['askSize1']
    df['bidAskVol'] = df['askSize0'] + df['bidSize0']

    # Cumulative depth over levels 0..k, up to k = 4.
    for level in (2, 3, 4):
        df[f'totalBidVol{level}'] = df[f'totalBidVol{level - 1}'] + df[f'bidSize{level}']
        df[f'totalAskVol{level}'] = df[f'totalAskVol{level - 1}'] + df[f'askSize{level}']
    df['bidAskRatio4'] = df['totalBidVol4'] / df['totalAskVol4']
    df['OIR'] = (df['bidSize0'] - df['askSize0'])/(df['bidSize0'] + df['askSize0'])

    # Best ask/bid rate changes over 6..10 rows.
    for lag in range(6, 11):
        df[f'daskRate{lag}'] = df['askRate0'].diff(lag)
        df[f'dbidRate{lag}'] = df['bidRate0'].diff(lag)

    # One row per minute on a synthetic clock, so the mid rate can be resampled.
    df['time'] = pd.date_range(start='1/1/1970', periods=len(df), freq='T')
    df.set_index('time', inplace=True)
    bars = df['midRate'].resample('15Min').ohlc()
    bars['vol'] = df['bidAskVol'].resample('15Min').mean()
    bars['volume_adi'] = ta.volume.acc_dist_index(bars.high, bars.low, bars.close, bars.vol, fillna=True)
    bars['others_dlr'] = ta.others.daily_log_return(bars.close, fillna=True)

    # Bars only have a row every 15 minutes: forward-fill in between, then
    # zero-fill whatever is still missing (e.g. the leading rows of the diffs).
    joined = df.join(bars[['volume_adi', 'others_dlr']])
    joined = joined.ffill().astype('float32')
    joined.fillna(0, inplace=True)
    return joined

In [6]:
# Build the reduced feature frame. The call also adds columns to `df` and re-indexes it in place.
# The trailing `sparkle;` evaluates the frame with its display suppressed.
sparkle = create_limited_features_orig(df); sparkle;

# Limited sparkle

In [None]:
# can we predict volume_adi from the other features?

In [7]:
# Features: everything after the first 71 columns except the target itself.
X = sparkle.drop(columns=[*sparkle.columns[:71], 'volume_adi']).values
y = sparkle['volume_adi'].values

# standardise
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Fit a LassoLarsCV model for volume_adi, using the `rlcv` splitter for its internal cross-validation.
# NOTE: `rlcv` is only created further down (in the Lasso section), so that cell has to be run first.
lasso_adi = LassoLarsCV(cv=rlcv, n_jobs=-1).fit(X_scaled, y)

In [None]:
def rlcvscore_adi(model):
    """Print per-fold scores of an already-fitted ``model`` on the ``rlcv`` folds.

    Works on the notebook globals ``rlcv``, ``X_scaled`` and ``y``. The model
    is not refitted; for each fold three numbers are collected:
      * ``model.score`` on the training slice,
      * ``model.score`` on the validation slice,
      * ``r2d2`` of the validation predictions after squashing them through a
        scaled logistic curve, which confines them to (-10, 10).
    Prints the three per-fold arrays (rounded to 4 decimals) and then their means.

    The body is identical to ``rlcvscore`` defined later in this notebook.
    """
    train_scores, valid_scores, squashed_scores = [], [], []
    for train_index, valid_index in rlcv.split(X_scaled):
        x_valid, y_valid = X_scaled[valid_index], y[valid_index]
        train_scores.append(model.score(X_scaled[train_index], y[train_index]))
        valid_scores.append(model.score(x_valid, y_valid))
        squashed = (1/(1+np.exp(-0.22*model.predict(x_valid)))-0.5)*20
        squashed_scores.append(r2d2(y_valid, squashed))
    for fold_scores in (train_scores, valid_scores, squashed_scores):
        print(f'{np.array(fold_scores).round(4)}')
    print(f'{np.mean(train_scores):.4f}, {np.mean(valid_scores):.4f}, {np.mean(squashed_scores):.4f}')

In [None]:
# Print the per-fold scores of the volume_adi lasso.
rlcvscore_adi(lasso_adi)

In [None]:
# Inspect the fitted coefficients of the volume_adi lasso.
lasso_adi.coef_

In [None]:
# Manual linear prediction from the coefficients only.
# NOTE(review): this leaves out lasso_adi.intercept_, unlike lasso_adi.predict(X_scaled) - confirm that is intended.
preds_adi = X_scaled @ lasso_adi.coef_.T

In [None]:
# Overwrite the ta-computed volume_adi column in `sparkle` with its lasso approximation.
sparkle.volume_adi = preds_adi

In [None]:
# Scratch check on a copy of the first 45 rows.
# NOTE(review): `create_limited_features` is only defined in the "Submission testing" section and
# renames its input to exactly 60 columns - confirm `create_limited_features_orig` was not meant here.
sparkle_lim = create_limited_features(df.head(45).copy()); sparkle_lim

In [None]:
# Same scratch check on a copy of the first 60 rows.
# NOTE(review): `create_limited_features` is only defined in the "Submission testing" section and
# renames its input to exactly 60 columns - confirm `create_limited_features_orig` was not meant here.
sparkle_lim = create_limited_features(df.head(60).copy()); sparkle_lim

In [None]:
# IPython source lookup (`??`) for the accumulation/distribution index helper - interactive use only.
ta.volume.acc_dist_index??

In [None]:
# IPython source lookup (`??`) for the daily log return helper - interactive use only.
ta.others.daily_log_return??

# Lasso

In [8]:
# Splitter used by the CV calls in this notebook (5 splits); `RollingWindowSplit` comes from the local `rolling` module.
# NOTE(review): the volume_adi cells above already reference `rlcv`, so this cell has to be run before them.
rlcv = RollingWindowSplit(n_splits=5, compatible=True)

In [18]:
# List the feature columns that remain after dropping the first 71 columns.
sparkle.drop(sparkle.columns[:71], axis=1).columns

Index(['bidAskRatio4', 'OIR', 'daskRate6', 'dbidRate6', 'daskRate7',
       'dbidRate7', 'daskRate8', 'dbidRate8', 'daskRate9', 'dbidRate9',
       'daskRate10', 'dbidRate10', 'volume_adi', 'others_dlr'],
      dtype='object')

In [9]:
# takes 40s
# undropped
# Features: every column of `sparkle` after the first 71; target: the `y` column of `df`.
X = sparkle.drop(columns=sparkle.columns[:71]).values
y = df['y'].values

# standardise
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [17]:
# Persist the fitted scaler to disk.
dump(scaler, 'scaler_limited.joblib')

['scaler_limited.joblib']

In [10]:
# takes 11s on limited variables, 1 min on pca variables, 16m21s on 232 non-pca variables
# Fit LassoLarsCV on the standardized features, using the rlcv splitter for its internal cross-validation.
lasso = LassoLarsCV(cv=rlcv, n_jobs=-1).fit(X_scaled, y)

In [None]:
# NOTE: the lasso above was fit on the entire dataset (and so was the scaler), so the per-fold scores below are not out-of-sample.

In [11]:
def rlcvscore(model):
    """Print per-fold scores of an already-fitted ``model`` on the ``rlcv`` folds.

    Works on the notebook globals ``rlcv``, ``X_scaled`` and ``y``; the model
    is only scored, never refitted. Three lists are built, one entry per fold:
    ``model.score`` on the training slice, ``model.score`` on the validation
    slice, and ``r2d2`` of the validation predictions after a scaled logistic
    squashing that keeps them inside (-10, 10). Each list is printed rounded to
    4 decimals, followed by a line with the three means.
    """
    per_fold = {'train': [], 'valid': [], 'valid_sig': []}
    for train_index, valid_index in rlcv.split(X_scaled):
        x_train, y_train = X_scaled[train_index], y[train_index]
        x_valid, y_valid = X_scaled[valid_index], y[valid_index]
        per_fold['train'].append(model.score(x_train, y_train))
        per_fold['valid'].append(model.score(x_valid, y_valid))
        squashed = (1/(1+np.exp(-0.22*model.predict(x_valid)))-0.5)*20
        per_fold['valid_sig'].append(r2d2(y_valid, squashed))
    for scores in per_fold.values():
        print(f'{np.array(scores).round(4)}')
    cvtrain, cvvalid, cvvalidsig = per_fold.values()
    print(f'{np.mean(cvtrain):.4f}, {np.mean(cvvalid):.4f}, {np.mean(cvvalidsig):.4f}')

In [12]:
# Per-fold train / validation / squashed-validation scores of the lasso.
rlcvscore(lasso) # limited variables

[0.0604 0.061  0.0628 0.0589 0.0653]
[0.061  0.0628 0.0589 0.0653 0.066 ]
[0.0608 0.0626 0.058  0.0652 0.0653]
0.0617, 0.0628, 0.0624


In [14]:
# Replace the in-memory model with the copy saved on disk; the commented-out dump is the write that creates the file.
# dump(lasso, 'lasso_limited.joblib')
lasso = load('lasso_limited.joblib')

In [15]:
# Inspect the fitted coefficients of the lasso.
lasso.coef_

array([ 0.02572668,  0.05097824, -0.01739648, -0.01683773, -0.00606163,
       -0.00552953, -0.00564391, -0.00511773, -0.00655901, -0.00624305,
       -0.0354066 , -0.03479002,  0.03206284,  0.17627821])

In [16]:
# Manual linear prediction from the coefficients only, scored against y on the full dataset.
# NOTE(review): this leaves out lasso.intercept_, unlike lasso.predict(X_scaled) - confirm that is intended.
preds = X_scaled @ lasso.coef_.T
r2d2(y, preds)

0.06203365066030242

# RF

In [None]:
# Take the first rolling-window fold and print its index ranges and sizes.
# x_train / y_train from this fold are what the forest is fitted on further down.
# The slices are taken from X_scaled (not the raw X): rlcvscore evaluates models on
# X_scaled, so a model fitted on unscaled features would be scored on inputs from a
# different scale than it was trained on.
for inc, (train_index, valid_index) in enumerate(rlcv.split(X_scaled),1):
    x_train, x_valid = X_scaled[train_index], X_scaled[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    print("TRAIN:", (train_index[0], train_index[-1]),
          "VALID:", (valid_index[0], valid_index[-1]),
          "SIZES:", (len(x_train), len(x_valid)))
    break

In [None]:
# Search space for the random forest.
# max_depth starts at 1: a depth of 0 is rejected by the forest, so including it
# only produced failed fits during the randomized search.
# The two min_samples_* grids are 2, 20, 200, 2000, 20000.
params_grid = {'max_depth': np.arange(1, 10),
               'min_samples_split': 2*10**np.arange(5),
               'min_samples_leaf': 2*10**np.arange(5)
              }

In [None]:
# Base forest for the hyper-parameter search: 30 trees, 3 parallel jobs, fixed seed.
rf = RandomForestRegressor(n_estimators=30, n_jobs=3, random_state=41)

In [None]:
# Randomized search: 30 settings sampled from params_grid, cross-validated with the rlcv splitter.
# NOTE(review): both the search and the forest request n_jobs=3 (nested parallelism) - confirm this does not oversubscribe the CPU.
rf_random = RandomizedSearchCV(rf, params_grid, n_iter=30, n_jobs=3, cv=rlcv, random_state=41, verbose=1)

In [None]:
# Run the search on the standardized features.
rf_random.fit(X_scaled, y)

In [None]:
# Hand-picked forest: 30 shallow trees (depth 3) with very large leaves.
# max_features=1.0 considers every feature at each split, which is what 'auto' meant
# for regressors; the string form is deprecated and then removed in newer scikit-learn.
rf_model = RandomForestRegressor(n_estimators=30, max_depth=3, min_samples_split=10, min_samples_leaf=20000,
                                 max_features=1.0, n_jobs=-1, random_state=41)

In [None]:
# Fit on x_train / y_train, which are left over from the fold-inspection loop above
# (first fold only, because that loop breaks after one iteration).
rf_model.fit(x_train, y_train);

In [None]:
# Score the fitted forest fold by fold; the commented numbers are presumably the output of an earlier run.
# NOTE(review): the comment says "leaf 10000" while rf_model above is built with min_samples_leaf=20000 -
# confirm which setting produced these scores.
rlcvscore(rf_model) # realistic cv, max_depth 3, split 10, leaf 10000
# [0.0354 0.0389 0.0413 0.0334 0.041 ]
# [0.0389 0.0413 0.0334 0.041  0.0275]
# [0.0366 0.0392 0.0293 0.0369 0.0202]
# 0.0380, 0.0364, 0.0324

In [None]:
# for blending validation
def get_preds():
    """Per-fold lasso predictions on the full ("undropped") feature set.

    Takes every column of the global ``new_df`` except ``y``, applies the
    scaler and PCA stored in 'scaler.joblib' / 'pca.joblib', and predicts with
    the model stored in 'lassocv.joblib' on each ``rlcv`` fold.

    Returns three lists with one array per fold: training predictions,
    validation predictions, and validation predictions squashed through the
    scaled logistic curve (range (-10, 10)).
    """
    features = new_df.drop(columns='y').values
    scaled = load('scaler.joblib').transform(features)
    components = load('pca.joblib').transform(scaled)
    lasso = load('lassocv.joblib')

    train_preds, valid_preds, valid_sig_preds = [], [], []
    for train_index, valid_index in rlcv.split(components):
        x_train, x_valid = components[train_index], components[valid_index]
        train_preds.append(lasso.predict(x_train))
        valid_preds.append(lasso.predict(x_valid))
        valid_sig_preds.append((1/(1+np.exp(-0.22*lasso.predict(x_valid)))-0.5)*20)
    return train_preds, valid_preds, valid_sig_preds
    
def get_dropped_preds():
    """Per-fold lasso predictions on the "dropped" feature set, plus the targets.

    Removes the 60 raw order-book columns and ``y`` from the global ``new_df``,
    applies 'scaler_drop.joblib' and 'pca_drop.joblib', and predicts on each
    ``rlcv`` fold. The model is loaded from 'lassocv.joblib' (the undropped
    lasso) rather than from 'lassocv_drop.joblib'.

    Returns five lists with one array per fold: training predictions,
    validation predictions, squashed validation predictions (scaled logistic,
    range (-10, 10)), training targets and validation targets.
    """
    raw_book_and_target = [*askRateList, *askSizeList, *bidRateList, *bidSizeList, 'y']
    features = new_df.drop(columns=raw_book_and_target).values
    targets = new_df['y'].values
    scaled = load('scaler_drop.joblib').transform(features)
    components = load('pca_drop.joblib').transform(scaled)
    lasso_drop = load('lassocv.joblib') # instead use lasso

    y_trainer, y_valider = [], []
    train_preds, valid_preds, valid_sig_preds = [], [], []
    for train_index, valid_index in rlcv.split(components):
        x_train, x_valid = components[train_index], components[valid_index]
        train_preds.append(lasso_drop.predict(x_train))
        valid_preds.append(lasso_drop.predict(x_valid))
        valid_sig_preds.append((1/(1+np.exp(-0.22*lasso_drop.predict(x_valid)))-0.5)*20)
        y_trainer.append(targets[train_index])
        y_valider.append(targets[valid_index])
    return train_preds, valid_preds, valid_sig_preds, y_trainer, y_valider
    
def get_blended_scores():
    """Print per-fold scores of the equal-weight blend of the two lasso models.

    Reads the notebook globals produced by ``get_preds`` and
    ``get_dropped_preds`` (``lasso_pred_*``, ``lasso_dpred_*``, ``y_trainer``,
    ``y_valider``). For each of the first 5 folds the two prediction arrays
    are averaged and scored with ``r2d2`` on train, validation and squashed
    validation. Prints the three per-fold arrays (rounded to 4 decimals) and
    then their means.
    """
    def blend(dropped, undropped):
        # Simple average of the two models' predictions.
        return (np.array(dropped) + np.array(undropped))/2

    train_scores, valid_scores, valid_sig_scores = [], [], []
    for fold in range(5):
        train_scores.append(r2d2(y_trainer[fold], blend(lasso_dpred_train[fold], lasso_pred_train[fold])))
        valid_scores.append(r2d2(y_valider[fold], blend(lasso_dpred_valid[fold], lasso_pred_valid[fold])))
        valid_sig_scores.append(r2d2(y_valider[fold], blend(lasso_dpred_validsig[fold], lasso_pred_validsig[fold])))
    for fold_scores in (train_scores, valid_scores, valid_sig_scores):
        print(f'{np.array(fold_scores).round(4)}')
    print(f'{np.mean(train_scores):.4f}, {np.mean(valid_scores):.4f}, {np.mean(valid_sig_scores):.4f}')

In [None]:
# Per-fold predictions of the lasso on the full feature set; stored as globals for get_blended_scores.
lasso_pred_train, lasso_pred_valid, lasso_pred_validsig = get_preds()

In [None]:
# Per-fold predictions on the dropped feature set plus the fold targets; stored as globals for get_blended_scores.
lasso_dpred_train, lasso_dpred_valid, lasso_dpred_validsig, y_trainer, y_valider = get_dropped_preds()

In [None]:
# Print the scores of the averaged predictions; needs the globals assigned by the two cells above.
get_blended_scores()

# Fast.ai

In [None]:
# Target column and preprocessing steps for the fastai tabular pipeline.
# NOTE(review): FillMissing / Normalize are not imported anywhere in this notebook - presumably they
# come from the fastai star-imports that are commented out in the import cell; re-enable those first.
dep_var = 'y'
procs = [FillMissing, Normalize]

In [None]:
# Raw string instead of an f-string: nothing is interpolated, and the backslashes of the
# Windows path would otherwise be parsed as (invalid) escape sequences. The value is unchanged.
path = r'D:\Coding\XTX Forecasting Challenge'
# Build the data bunch from the first 500,000 rows, holding out rows 400,000-499,999 for validation.
data = TabularDataBunch.from_df(path = path, df = df[:int(5e5)], dep_var = 'y', procs=procs,
                                 valid_idx = list(range(int(4e5),int(5e5))))

In [None]:
# Preview 10 rows of a batch.
data.show_batch(rows=10)

In [None]:
# data = (TabularList.from_df(df[:int(5e5)], cont_names=df.columns, procs=procs)
#                            .split_by_idx(list(range(int(0.8*5e5),int(5e5))))
#                            .label_from_df(cols=dep_var, label_cls=FloatList)
#                            .databunch())

In [None]:
# Tabular learner with two hidden layers (500 and 200 units), dropout `ps` per layer and embedding dropout 0.04.
# NOTE(review): `r2_score` is not imported in this notebook (sklearn's is aliased as r2d2) - presumably it is
# fastai's metric from the commented-out star import.
learn = tabular_learner(data, layers=[500,200], metrics=r2_score, ps=[0.001,0.01], emb_drop=0.04)

In [None]:
# Display the network architecture.
learn.model

In [None]:
# Run the learning-rate finder, sweeping up to end_lr=10.
learn.lr_find(end_lr=1e1)

In [None]:
# Plot what the recorder captured during the lr_find run.
learn.recorder.plot()

In [None]:
# model above has already diverged, we will restart.

In [None]:
# Train with fit_one_cycle: 3 cycles/epochs, learning rate 1e-4, weight decay 0.1.
learn.fit_one_cycle(3, 1e-4, wd=0.1)

In [None]:
# Plot the learning-rate schedule, together with the momentum schedule (show_moms=True).
learn.recorder.plot_lr(show_moms=True)

In [None]:
# Save the learner under the name 'new_fastai'.
learn.save('new_fastai')

In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_losses()

In [None]:
# Predict a single row (position 810,000). The data bunch above was built from the first 500,000 rows,
# so this row is in neither its training nor its validation range.
learn.predict(df.iloc[int(8.1e5)])

In [None]:
# Actual target of the same row, for comparison with the prediction above.
df.y.iloc[int(8.1e5)]

In [None]:
# Collect the learner's predictions. Note this rebinds `preds`, which the Lasso section also assigns.
preds = learn.get_preds()

# Submission testing

In [None]:
def get_next_data_as_df(iteration):
    """Return row ``iteration`` of the global ``df`` as a one-row DataFrame.

    Only the first 60 values of the row are kept. The result has a default
    integer index and integer column labels (the original column names are
    not carried over).
    """
    first_sixty = df.iloc[iteration].iloc[:60]
    return pd.DataFrame([first_sixty.values])

In [None]:
def create_limited_features(df):
    """Name the raw order-book columns and add the depth features, in place.

    ``df`` must have exactly 60 columns; they are renamed to the ask rates,
    ask sizes, bid rates and bid sizes of levels 0..14, in that order (names
    from the module-level ``askRateList`` etc.).

    Columns then added, in this order: ``midRate``, ``totalBidVol1``,
    ``totalAskVol1``, ``bidAskVol``, ``totalBidVol2..4`` / ``totalAskVol2..4``
    (interleaved) and ``bidAskRatio4``. Returns the same frame.
    """
    df.columns = [*askRateList, *askSizeList, *bidRateList, *bidSizeList]
    df['midRate'] = (df['askRate0'] + df['bidRate0']) / 2
    df['totalBidVol1'] = df['bidSize0'] + df['bidSize1']
    df['totalAskVol1'] = df['askSize0'] + df['askSize1']
    df['bidAskVol'] = df['askSize0'] + df['bidSize0']
    # Cumulative depth over levels 0..k, up to k = 4.
    for level in (2, 3, 4):
        df[f'totalBidVol{level}'] = df[f'totalBidVol{level - 1}'] + df[f'bidSize{level}']
        df[f'totalAskVol{level}'] = df[f'totalAskVol{level - 1}'] + df[f'askSize{level}']
    df['bidAskRatio4'] = df['totalBidVol4'] / df['totalAskVol4']
    return df

In [None]:
def append_to_df(massive_df, row):
    """Append ``row`` to ``massive_df`` with the next one-minute timestamp.

    Parameters
    ----------
    massive_df : DataFrame accumulated so far (may be empty).
    row : one-row DataFrame; its index is overwritten in place.

    The new row is stamped one minute after the last index entry of
    ``massive_df``, or 1970-01-01 00:00 when ``massive_df`` has no rows yet.

    Returns a new DataFrame; ``massive_df`` itself is not modified.

    Uses ``pd.concat`` because ``DataFrame.append`` is deprecated and no longer
    exists in pandas 2.x.
    """
    if len(massive_df.index) == 0:
        row.index = [datetime(1970,1,1)]
    else:
        row.index = [massive_df.index[-1] + timedelta(minutes=1)]
    return pd.concat([massive_df, row], sort=False)

In [None]:
def add_time_features(df):
    """Add the top-of-book order imbalance ratio ``OIR`` to ``df`` in place.

    ``OIR = (bidSize0 - askSize0) / (bidSize0 + askSize0)``. Returns the same
    frame. This definition replaces the earlier ``add_time_features`` from the
    feature-engineering section when the cell is run.
    """
    bid_size, ask_size = df['bidSize0'], df['askSize0']
    imbalance = bid_size - ask_size
    depth = bid_size + ask_size
    df['OIR'] = imbalance / depth
    return df

In [None]:
def add_manual_time_features(df):
    """Add 6..10-row rate differences in place and return the last 15 rows.

    For each lag in 6..10, ``daskRate<lag>`` / ``dbidRate<lag>`` hold
    ``askRate0.diff(lag)`` / ``bidRate0.diff(lag)``. All NaNs in ``df`` are
    then replaced by 0 in place. The return value is the trailing 15-row slice
    of ``df`` (the whole frame when it has 15 rows or fewer). This definition
    replaces the earlier ``add_manual_time_features`` when the cell is run.
    """
    for lag in (6, 7, 8, 9, 10):
        suffix = str(lag)
        df['daskRate' + suffix] = df['askRate0'].diff(lag)
        df['dbidRate' + suffix] = df['bidRate0'].diff(lag)
    df.fillna(0, inplace=True) # necessary
    return df.iloc[-15:]

In [None]:
def add_resample_features(massive_df, resampled_df):
    """Attach the 15-minute ``volume_adi`` / ``others_dlr`` features to ``massive_df``.

    Parameters
    ----------
    massive_df : frame with a timestamp index and ``midRate`` / ``bidAskVol`` columns.
        Any existing ``volume_adi`` / ``others_dlr`` columns are dropped from it in place.
    resampled_df : history of previously completed 15-minute bars (may be empty).

    Returns
    -------
    (massive_df, resampled_df) : the frame re-joined with the two features, forward-filled
        and cast to float32, and the bar history (changed only when a bucket was completed).

    NOTE(review): relies on ``DataFrame.append``, which newer pandas releases no longer
    provide - confirm the pandas version this is meant to run on.
    """
    # Position of the latest row inside its 15-minute bucket, counted from 1;
    # 0 means the latest row is the last minute of a bucket (minute 14, 29, 44 or 59).
    leftovers = (massive_df.index[-1].to_pydatetime().minute+1) % 15
    def pad_history():
        # Reads `row_ohlcv` and `resampled_df` from the enclosing scope; `row_ohlcv` is
        # assigned in the if/else below before this helper is called.
        full_resampled = resampled_df.append(row_ohlcv, sort=False) # (1,5)
        # Repeat the oldest bar as many times as needed for `a` to end up with at least
        # 2 rows (the range is empty once full_resampled already has 2 or more rows).
        a = pd.DataFrame([full_resampled.iloc[0] for j in range(1+1-len(full_resampled))]) # (1,5)
        a = a.append(full_resampled, sort=False) # (2,5)
        # Re-stamp the padded history: 15-minute steps counted backwards from the
        # current bar's timestamp, then sorted into ascending order.
        a.index = pd.date_range(start=row_ohlcv.index[-1], periods=len(a), freq='-15Min').sort_values()
        # The indicators are computed on the padded history and assigned back by index.
        # NOTE(review): assumes full_resampled's index lines up with the tail of a's index - confirm.
        full_resampled['volume_adi'] = ta.volume.acc_dist_index(a.high, a.low, a.close, a.vol, fillna=True)
        full_resampled['others_dlr'] = ta.others.daily_log_return(a.close, fillna=True)
        return full_resampled
    if leftovers == 0:
        # Bucket complete: build its bar from the last 15 rows and store it in the history.
        row_ohlcv = massive_df.tail(15).midRate.resample('15Min').ohlc().tail(1)
        row_ohlcv['vol'] = massive_df.tail(15).bidAskVol.resample('15Min').mean().tail(1) # row_ohlcv.shape = (1,5)
        full_resampled = pad_history()
        # Only the two most recent bars are kept.
        resampled_df = resampled_df.append(full_resampled, sort=False).tail(2) # when iteration=15, leftovers=0, resampled_df=[]
    else:
        # Bucket still open: build a partial bar from the rows seen so far in this bucket;
        # the history is left untouched.
        row_ohlcv = massive_df.tail(leftovers).midRate.resample('15Min').ohlc().tail(1)
        row_ohlcv['vol'] = massive_df.tail(leftovers).bidAskVol.resample('15Min').mean().tail(1)
        full_resampled = pad_history()
    # Remove the features joined on the previous call; on the first call they do not
    # exist yet, which is the KeyError being ignored.
    try: massive_df.drop(['volume_adi', 'others_dlr'], axis=1, inplace=True)
    except KeyError: pass
    massive_df = massive_df.join(full_resampled[['volume_adi', 'others_dlr']])
    massive_df = massive_df.ffill().astype('float32')
    return massive_df, resampled_df

In [None]:
# Empty accumulators for the replay below: the growing feature frame and the 15-minute bar history.
massive_df = pd.DataFrame()
resampled_df = pd.DataFrame()

In [None]:
# Replay the first 32 rows one at a time (presumably mimicking how the submission receives data):
# raw row -> named columns + depth features -> append with a 1-minute timestamp -> OIR
# -> rate differences -> 15-minute bar features.
# Note: add_manual_time_features returns only the last 15 rows, so massive_df never holds
# more than 15 rows at the end of a pass.
for iteration in range(32):
    base_row = get_next_data_as_df(iteration)
    row = create_limited_features(base_row)
    massive_df = append_to_df(massive_df, row)
    massive_df = add_time_features(massive_df)
    massive_df = add_manual_time_features(massive_df)
    massive_df, resampled_df = add_resample_features(massive_df, resampled_df)

In [None]:
# why does 30 min here have no values?

In [None]:
# Inspect the frame accumulated by the replay loop.
massive_df

In [None]:
# Run the batch feature builder on a copy of the first 132 rows and show its last 15 rows,
# presumably as a reference to compare with the replayed massive_df.
sparkle_lim = create_limited_features_orig(df.head(132).copy()); sparkle_lim.tail(15)