# Setting up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ta
# from fastai import *
# from fastai.tabular import *
from tqdm import tqdm_notebook
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoLarsCV
from rolling import RollingWindowSplit
from sklearn.metrics import r2_score as r2d2
from joblib import dump, load
from datetime import datetime, timedelta

# Notebook-wide display configuration.
%matplotlib inline
# line_profiler provides the %lprun magic referenced (commented out) in the last cell.
%load_ext line_profiler
sns.set(style = "whitegrid")
# Raise the pandas display limits so the wide feature frames below are not truncated.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [2]:
# %%time
# path = 'D://Coding//XTX Forecasting Challenge//data-training.csv'
# df = pd.read_csv(path)

In [3]:
# Load the Feather copy of the training data (used instead of the CSV read that is
# commented out in the previous cell).
path = 'D://Coding//XTX Forecasting Challenge//data-training.file'
# use_threads is a boolean switch in pandas.read_feather, not a thread count.
df = pd.read_feather(path, use_threads=True)
df = df.astype('float32')
# Missing values are replaced by 0 before any feature is computed.
df.fillna(0, inplace=True)

In [4]:
# Column names of the 15 order-book levels (0-14), one list per side and field.
askRateList = [f'askRate{level}' for level in range(15)]
askSizeList = [f'askSize{level}' for level in range(15)]
bidRateList = [f'bidRate{level}' for level in range(15)]
bidSizeList = [f'bidSize{level}' for level in range(15)]

In [5]:
# The 27 engineered feature columns that the submission-testing loop selects, in
# this order, from the newest row before passing it to get_prediction().
# NOTE(review): create_limited_features_orig() appears to emit midRate/bidAskVol
# first, so the matrices built in score() may use a different column order than
# this list - confirm which order the saved scaler and lasso were fit with.
relevant = ['totalBidVol1', 'totalAskVol1', 'totalBidVol2', 'totalAskVol2',
           'totalBidVol3', 'totalAskVol3', 'totalBidVol4', 'totalAskVol4',
           'totalBidVol5', 'totalAskVol5', 'bidAskRatio4', 'bidAskRatio5', 'OIR',
           'daskRate6', 'dbidRate6', 'daskRate7', 'dbidRate7', 'daskRate8',
           'dbidRate8', 'daskRate9', 'dbidRate9', 'daskRate10', 'dbidRate10',
           'daskRate20', 'midRate', 'bidAskVol', 'others_dlr']

# Exploratory Data Analysis

In [None]:
# # Figuring out what [y] is
# # y(t) is midRate(t+87) - midRate(t), clipped to [-5, 5]
# df['expectedY'] = df.midRate.diff(87).shift(-87).clip(-5,5)

# Feature engineering

#### Cross-sectional features

In [None]:
# different from submission
def compute_cross_sectional(df):
    """Add per-row (cross-sectional) order-book features to ``df`` in place.

    Columns added, in order: ``spread``, ``midRate``, ``bidAskVol``, the
    cumulative depths ``totalBidVol1..14`` / ``totalAskVol1..14``, their ratios
    ``bidAskRatio1..14``, ``totalAvailVol``, the size-weighted average rates
    ``vwaBid`` / ``vwaAsk``, their distances from the mid rate, and the
    difference of those two distances.

    Reads the module-level column-name lists ``bidRateList``, ``bidSizeList``,
    ``askRateList`` and ``askSizeList``.

    Returns the same (mutated) DataFrame.
    """
    def size_weighted_rate(rate_cols, size_cols):
        # Row-wise dot product of rates and sizes, divided by the row's total size.
        weighted_sum = np.einsum('ij,ji->i', df[rate_cols], df[size_cols].T)
        return weighted_sum / df[size_cols].sum(axis=1)

    # Top-of-book features
    best_ask, best_bid = df.askRate0, df.bidRate0
    df['spread'] = best_ask - best_bid
    df['midRate'] = (best_ask + best_bid) / 2
    df['bidAskVol'] = df.askSize0 + df.bidSize0

    # Cumulative depth: sizes of level 0 up to and including level `depth`.
    df['totalBidVol1'] = df.bidSize0 + df.bidSize1
    df['totalAskVol1'] = df.askSize0 + df.askSize1
    for depth in range(2, 15):
        for side, size_prefix in (('Bid', 'bidSize'), ('Ask', 'askSize')):
            shallower = df[f'total{side}Vol{depth - 1}']
            df[f'total{side}Vol{depth}'] = shallower + df[f'{size_prefix}{depth}']

    # Bid/ask imbalance of the cumulative depth at every level.
    for depth in range(1, 15):
        df[f'bidAskRatio{depth}'] = df[f'totalBidVol{depth}'] / df[f'totalAskVol{depth}']
    df['totalAvailVol'] = df.totalBidVol14 + df.totalAskVol14

    # Size-weighted average rates and how far they sit from the mid rate.
    df['vwaBid'] = size_weighted_rate(bidRateList, bidSizeList)
    df['vwaAsk'] = size_weighted_rate(askRateList, askSizeList)
    df['vwaBidDMid'] = df.midRate - df.vwaBid
    df['vwaAskDMid'] = df.vwaAsk - df.midRate
    df['diff_vwaBidAskDMid'] = df.vwaAskDMid - df.vwaBidDMid
    return df

#### Time series features

In [None]:
def add_time_features(df):
    """Add order-flow imbalance features derived from consecutive rows (in place).

    Columns added:

    * ``deltaVBid`` - 0 if the best bid fell, the change in best-bid size if the
      best bid is unchanged, the full best-bid size if it rose.
    * ``deltaVAsk`` - the mirror image for the ask side: 0 if the best ask rose,
      the change in best-ask size if unchanged, the full best-ask size if it fell.
    * ``VOI`` - ``deltaVBid - deltaVAsk`` (volume order imbalance).
    * ``OIR`` - ``(bidSize0 - askSize0) / (bidSize0 + askSize0)``.

    The first row has no predecessor; every comparison with the missing previous
    value is False, so it takes the "full size" default on both sides.

    The ask-side conditions previously reused the bid-side direction (a falling
    ask gave 0), which inverted the ask contribution relative to the standard
    VOI definition.

    Returns the same (mutated) DataFrame.
    """
    prev_bid_rate, prev_ask_rate = df.bidRate0.shift(1), df.askRate0.shift(1)
    bid_conditions = [df.bidRate0 < prev_bid_rate, df.bidRate0 == prev_bid_rate]
    # Ask side is mirrored: a *rising* ask means the old liquidity was withdrawn.
    ask_conditions = [df.askRate0 > prev_ask_rate, df.askRate0 == prev_ask_rate]
    bid_choices = [0, df.bidSize0 - df.bidSize0.shift(1)]
    ask_choices = [0, df.askSize0 - df.askSize0.shift(1)]
    default_bid, default_ask = df.bidSize0, df.askSize0
    # Clear NaNs left in the frame by earlier feature steps before adding columns.
    df.fillna(0, inplace=True)
    df['deltaVBid'] = np.select(bid_conditions, bid_choices, default=default_bid)
    df['deltaVAsk'] = np.select(ask_conditions, ask_choices, default=default_ask)
    df['VOI'] = df.deltaVBid - df.deltaVAsk
    df['OIR'] = (df.bidSize0 - df.askSize0)/(df.bidSize0 + df.askSize0)
    return df

#### Manual time features — can consider adding more to the lags list

In [None]:
def add_manual_time_features(df):
    """Add lagged differences of the best ask and best bid rates to ``df`` in place.

    For every lag in 1-9, 10-90 (step 10) and 100-900 (step 100) the columns
    ``daskRate<lag>`` and ``dbidRate<lag>`` hold ``askRate0.diff(lag)`` and
    ``bidRate0.diff(lag)``. All NaNs in the frame (including the rows without
    enough history) are then replaced by 0.

    Returns the same (mutated) DataFrame.
    """
    horizons = [*range(1, 10), *range(10, 100, 10), *range(100, 1000, 100)]
    for horizon in horizons:
        for prefix, source in (('dask', 'askRate0'), ('dbid', 'bidRate0')):
            df[f'{prefix}Rate{horizon}'] = df[source].diff(horizon)
    df.fillna(0, inplace=True)
    return df

In [None]:
# Build the full feature set on df (each step mutates and returns the same frame):
# per-row book features first, then the features derived from consecutive rows.
df = compute_cross_sectional(df)
df = add_time_features(df)
df = add_manual_time_features(df)

In [None]:
# df.to_feather('intermediate.file')
# Reload the engineered frame from the cache written by the to_feather call above
# instead of recomputing it. use_threads is a boolean flag, not a thread count.
df = pd.read_feather('intermediate.file', use_threads=True)

#### Tick chart version with ffill

In [None]:
# midrate version
# Attach a synthetic one-row-per-minute timestamp index so the rows can be
# resampled into 15-minute OHLC bars. The number of periods is taken from df
# (it was hard-coded to 2999999) so the cell also works on a subset of the data.
df['time'] = pd.date_range(start='1/1/1970', periods=len(df), freq='T')
df.set_index('time', inplace=True)
df_mid = df.midRate.resample('15Min').ohlc()
# Mean top-of-book volume per bar stands in for the bar's traded volume.
df_mid['vol'] = df.bidAskVol.resample('15Min').mean()

In [None]:
# Compute the `ta` library's full indicator set on the 15-minute bars; the string
# arguments name the open/high/low/close/volume columns of df_mid.
df_mid_ta = ta.add_all_ta_features(df_mid, "open", "high", "low", "close", "vol", fillna=True)

In [None]:
# takes 30s
# Left-join the 15-minute indicators onto the per-row frame by timestamp, then
# forward-fill so the rows between two bar timestamps carry the earlier bar's values.
# NOTE(review): a bar's values are built from all rows of that bar, so rows early
# in a bar may see information from later rows - confirm this is acceptable.
new_df = df.join(df_mid_ta).ffill().astype('float32')

In [None]:
# dump(new_df, 'new_df.joblib')
# Reload the joined frame from the joblib cache written by the dump above.
# joblib.load unpickles the file, so only load caches you created yourself.
new_df = load('new_df.joblib')

# Cross-validation

In [None]:
# the leaderboard set should have 150k: they check running time of 10k in 1h and max 15h

In [7]:
# Hold out the last rows as the test set; everything before is train/validation.
def train_test_split(df, test_size=100):
    """Split ``df`` chronologically into a train/validation part and a test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in time order.
    test_size : int, default 100
        Number of trailing rows to hold out as the test set. If ``df`` has fewer
        rows than this, all rows go to the test set and the train set is empty.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Independent copies, so later in-place feature engineering on one part
        does not touch ``df`` or the other part.

    Raises
    ------
    ValueError
        If ``test_size`` is negative.
    """
    if test_size < 0:
        raise ValueError('test_size must be non-negative')
    # Computing the split position explicitly also handles test_size == 0,
    # where a plain df[-0:] slice would return every row.
    split_at = max(len(df) - test_size, 0)
    train_df = df.iloc[:split_at].copy()
    test_df = df.iloc[split_at:].copy()
    return train_df, test_df
# Split the currently loaded frame; use the commented line below instead to
# split the TA-augmented new_df.
train_df, test_df = train_test_split(df)
# train_df, test_df = train_test_split(new_df)

In [68]:
def create_limited_features_orig(df):
    """Batch construction of the limited feature set.

    Mutates ``df`` in place: adds the mid rate, top-of-book volume, cumulative
    depths for levels 1-5, depth ratios for levels 4-5, ``OIR``, the 6-10 step
    differences of the best ask/bid rates and the 20-step ask difference, fills
    NaNs with 0, and replaces the index with a synthetic one-minute timestamp
    index starting at 1970-01-01.

    The mid rate is then resampled into 15-minute OHLC bars, ``others_dlr`` is
    computed from the bar closes with ``ta.others.daily_log_return`` and joined
    back onto the rows (forward-filled).

    Returns
    -------
    (features, bars) : the joined float32 feature frame and the 15-minute bar
    frame including ``others_dlr``.
    """
    # Top of book
    df['midRate'] = (df.bidRate0 + df.askRate0) / 2 # necessary for ohlc
    df['bidAskVol'] = df.bidSize0 + df.askSize0 # necessary only for volume_adi

    # Cumulative depth: sizes of level 0 up to and including level `depth`.
    df['totalBidVol1'] = df.bidSize0 + df.bidSize1
    df['totalAskVol1'] = df.askSize0 + df.askSize1
    for depth in range(2, 6):
        for side, size_prefix in (('Bid', 'bidSize'), ('Ask', 'askSize')):
            shallower = df[f'total{side}Vol{depth - 1}']
            df[f'total{side}Vol{depth}'] = shallower + df[f'{size_prefix}{depth}']
    for depth in (4, 5):
        df[f'bidAskRatio{depth}'] = df[f'totalBidVol{depth}'] / df[f'totalAskVol{depth}']

    top_bid_size, top_ask_size = df.bidSize0, df.askSize0
    df['OIR'] = (top_bid_size - top_ask_size)/(top_bid_size + top_ask_size)

    # Lagged rate differences
    for lag in range(6, 11):
        df[f'daskRate{lag}'] = df.askRate0.diff(lag)
        df[f'dbidRate{lag}'] = df.bidRate0.diff(lag)
    df['daskRate20'] = df.askRate0.diff(20)
    df.fillna(0, inplace=True)

    # Synthetic minute timestamps make the rows resample-able into 15-minute bars.
    df['time'] = pd.date_range(start='1/1/1970', periods=len(df), freq='T')
    df.set_index('time', inplace=True)
    bars = df.midRate.resample('15Min').ohlc()
    bars['others_dlr'] = ta.others.daily_log_return(bars.close, fillna=True)

    # Bar values land on the bar's timestamp row and are forward-filled from there.
    features = df.join(bars['others_dlr']).ffill().astype('float32')
    features.fillna(0, inplace=True)
    return features, bars

In [9]:
# Rolling-window CV splitter from the local `rolling` module; it is handed to
# LassoLarsCV as `cv` when the lasso is fit.
rlcv = RollingWindowSplit(n_splits=5, compatible=True)

In [69]:
# Build the training features. Note that create_limited_features_orig also
# mutates train_df itself (new columns, time index).
limited_train, df_mid_train = create_limited_features_orig(train_df)

In [11]:
# undropped: the target is `y`; the features are whatever remains after dropping
# the first 61 columns of limited_train.
y_train = limited_train['y'].values
x_train = limited_train.drop(columns=limited_train.columns[:61]).values

# standardise: fit the scaler on the training features, then apply it to them
scaler = StandardScaler().fit(x_train)
x_scaled_train = scaler.transform(x_train)

In [None]:
# takes 11s on limited variables, 1 min on pca variables, 16m21s on 232 non-pca variables
# Fit LassoLarsCV on the standardised training features, using the rolling-window
# splitter for its internal cross-validation.
lasso = LassoLarsCV(cv=rlcv, n_jobs=-1).fit(x_scaled_train, y_train)

In [12]:
# dump(scaler, 'scaler_limited.joblib')
# dump(lasso, 'lasso_limited.joblib')
# Replace the in-memory scaler and model with the cached copies written by the dumps above.
# NOTE(review): x_scaled_train was produced by the scaler fitted in this session -
# confirm the cached scaler was fit on the same columns in the same order.
scaler = load('scaler_limited.joblib')
lasso = load('lasso_limited.joblib')

In [74]:
def score(model):
    """Print and return the train/test fit of ``model`` on the limited features.

    Builds the test features from the global ``test_df`` (which
    ``create_limited_features_orig`` mutates in place), scales them with the
    global ``scaler`` and compares the model's predictions with ``y``. The train
    score uses the global ``x_scaled_train`` / ``y_train``.

    The passed ``model`` is now the one that is evaluated; previously the
    argument was ignored and the global ``lasso`` was always scored.

    Prints ``"<train R^2>, <test R^2>"`` with four decimals.

    Returns
    -------
    (predictions, limited_test, df_mid_test) : the test predictions, the test
    feature frame and the 15-minute bar frame built for the test rows.
    """
    limited_test, df_mid_test = create_limited_features_orig(test_df)
    # Features are whatever remains after dropping the first 61 columns.
    x_test = limited_test.drop(limited_test.columns[:61], axis=1).values
    y_test = limited_test.y.values
    x_scaled_test = scaler.transform(x_test)
    predictions = model.predict(x_scaled_test)
    test_score = r2d2(y_test, predictions)
    train_score = model.score(x_scaled_train, y_train)
    print(f'{train_score:.4f}, {test_score:.4f}')
    return predictions, limited_test, df_mid_test

In [76]:
# Evaluate the lasso; score() prints "<train R^2>, <test R^2>".
predictions, limited_test, df_mid_test = score(lasso)

0.0431, -0.2441


In [None]:
def manual_score(model):
    """Print the test R^2 of ``model`` computed by hand from its linear coefficients.

    Uses the feature columns already present in the global ``test_df`` (every
    column after the first 61), so the features must have been created on
    ``test_df`` beforehand. The features are scaled with the global ``scaler``.

    The passed ``model`` is now the one that is evaluated (the argument used to
    be ignored in favour of the global ``lasso``), and the intercept is included
    so the manual predictions match ``model.predict``.
    """
    x_test = test_df[test_df.columns[61:]].values
    y_test = test_df.y.values
    x_scaled_test = scaler.transform(x_test)
    # A linear model predicts X @ coef_ + intercept_; dropping the intercept
    # shifts every prediction by a constant and distorts R^2.
    test_preds = x_scaled_test @ model.coef_ + getattr(model, 'intercept_', 0.0)
    print(f'{r2d2(y_test, test_preds):.4f}')

In [None]:
manual_score(lasso) # this is the score if i create the datasets first

In [None]:
# dump(lasso, 'lasso_limited.joblib')
# Switch to the scaler/model stored under the "*_full" names - presumably the ones
# fit on the full feature set; confirm before relying on the cells below.
scaler = load('scaler_full.joblib')
lasso = load('lasso_full.joblib')

In [None]:
# Non-zero lasso coefficients, i.e. the weights of the features the model kept.
lassocoef = lasso.coef_[np.nonzero(lasso.coef_)]
lassocoef

In [None]:
# Pad the non-zero coefficient vector with zeros: two are inserted before its
# second-to-last entry, then ten are prepended at the front.
coefs = np.insert(np.insert(lassocoef,-2,np.zeros(2)),0,np.zeros(10))

In [None]:
new_df.columns[61:][np.where(lasso.coef_)]

# Fast.ai

In [None]:
# Target column and preprocessing steps for the fastai tabular model.
# NOTE(review): FillMissing and Normalize look like fastai names, but both fastai
# imports in the first cell are commented out, so this cell would raise NameError
# on a fresh kernel.
dep_var = 'y'
procs = [FillMissing, Normalize]

In [None]:
# Raw string: the backslashes of a Windows path must not be read as escape
# sequences (the former f-prefix had no placeholders to interpolate).
path = r'D:\Coding\XTX Forecasting Challenge'
# Use the first 500k rows: rows 400k-500k are the validation indices, the rest train.
data = TabularDataBunch.from_df(path = path, df = df[:int(5e5)], dep_var = 'y', procs=procs,
                                valid_idx = list(range(int(4e5),int(5e5))))

In [None]:
data.show_batch(rows=10)

In [None]:
# data = (TabularList.from_df(df[:int(5e5)], cont_names=df.columns, procs=procs)
#                            .split_by_idx(list(range(int(0.8*5e5),int(5e5))))
#                            .label_from_df(cols=dep_var, label_cls=FloatList)
#                            .databunch())

In [None]:
# Tabular learner with two hidden layers (500 and 200 units) and light dropout.
# NOTE(review): r2_score is presumably fastai's metric from the commented-out star
# import; sklearn's r2_score is imported under the name r2d2 - confirm.
learn = tabular_learner(data, layers=[500,200], metrics=r2_score, ps=[0.001,0.01], emb_drop=0.04)

In [None]:
learn.model

In [None]:
learn.lr_find(end_lr=1e1)

In [None]:
learn.recorder.plot()

In [None]:
# model above has already diverged, we will restart.

In [None]:
learn.fit_one_cycle(3, 1e-4, wd=0.1)

In [None]:
learn.recorder.plot_lr(show_moms=True)

In [None]:
learn.save('new_fastai')

In [None]:
learn.recorder.plot_losses()

In [None]:
learn.predict(df.iloc[int(8.1e5)])

In [None]:
df.y.iloc[int(8.1e5)]

In [None]:
# Collect the learner's predictions (presumably on the validation set, the fastai
# default for get_preds - TODO confirm).
preds = learn.get_preds()

# Submission testing

In [15]:
def get_next_data_as_df(iteration):
    """Return one simulated incoming row as a single-row DataFrame.

    Takes row number ``iteration`` of the last 100 rows of the global ``df`` and
    keeps only its first 60 values; the resulting frame has default integer
    column labels.
    """
    holdout = df.tail(100)
    raw_values = holdout.iloc[iteration].iloc[:60].to_numpy()
    return pd.DataFrame([raw_values])
#     return pd.DataFrame([test_df.iloc[iteration][:60].values])

In [16]:
def create_limited_features(df):
    """Name the 60 raw columns of an incoming row and add its per-row features.

    The columns are labelled ask rates, ask sizes, bid rates, bid sizes (15 levels
    each, taken from the module-level name lists). Added in place: ``midRate``,
    ``bidAskVol``, cumulative depths ``totalBidVol1..5`` / ``totalAskVol1..5``
    and the depth ratios ``bidAskRatio4`` / ``bidAskRatio5``.

    Returns the same (mutated) DataFrame.
    """
    df.columns = askRateList + askSizeList + bidRateList + bidSizeList

    # Top of book
    df['midRate'] = (df.bidRate0 + df.askRate0) / 2 # necessary for ohlc
    df['bidAskVol'] = df.bidSize0 + df.askSize0 # necessary only for volume_adi

    # Cumulative depth: sizes of level 0 up to and including level `depth`.
    df['totalBidVol1'] = df.bidSize0 + df.bidSize1
    df['totalAskVol1'] = df.askSize0 + df.askSize1
    for depth in range(2, 6):
        for side, size_prefix in (('Bid', 'bidSize'), ('Ask', 'askSize')):
            shallower = df[f'total{side}Vol{depth - 1}']
            df[f'total{side}Vol{depth}'] = shallower + df[f'{size_prefix}{depth}']
    for depth in (4, 5):
        df[f'bidAskRatio{depth}'] = df[f'totalBidVol{depth}'] / df[f'totalAskVol{depth}']
    return df

In [17]:
def append_to_df(massive_df, row):
    """Return ``massive_df`` with the single-row frame ``row`` added at the end.

    ``row`` is stamped (its index is overwritten in place) with the next minute
    after the last timestamp in ``massive_df``, or with 1970-01-01 00:00 when
    ``massive_df`` has no rows yet. ``massive_df`` itself is not modified.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in pandas 2.0,
    and checks for the empty frame explicitly instead of catching ``IndexError``.
    """
    if len(massive_df.index) == 0:
        row.index = [datetime(1970,1,1)]
        if len(massive_df.columns) == 0:
            # Nothing to concatenate with; returning a copy also keeps the
            # datetime index from being influenced by the empty frame's index.
            return row.copy()
    else:
        row.index = [massive_df.index[-1] + timedelta(minutes=1)]
    return pd.concat([massive_df, row], sort=False)

In [51]:
def add_time_features(df):
    """Streaming version of the lagged features; returns only the last 25 rows.

    Recomputes on the whole frame passed in (in place): ``OIR`` from the
    top-of-book sizes, the 6-10 step differences of the best ask and bid rates,
    and the 20-step difference of the best ask rate. All NaNs in the frame are
    then replaced by 0, and the trailing 25 rows are returned.

    NOTE: this definition shadows the batch ``add_time_features`` defined in the
    feature-engineering section earlier in the notebook.
    """
    bid_size, ask_size = df.bidSize0, df.askSize0
    df['OIR'] = (bid_size - ask_size)/(bid_size + ask_size)
    for lag in range(6, 11):
        df[f'daskRate{lag}'] = df.askRate0.diff(lag)
        df[f'dbidRate{lag}'] = df.bidRate0.diff(lag)
    df['daskRate20'] = df.askRate0.diff(20)
    df.fillna(0, inplace=True)
    # 25 rows keep enough history for the longest (20-step) difference.
    return df.iloc[-25:]

In [19]:
def add_resample_features(massive_df, resampled_df):
    """Attach the 15-minute ``others_dlr`` feature to the streaming frame.

    Parameters: ``massive_df`` is the accumulated per-row frame with a timestamp
    index and a ``midRate`` column; ``resampled_df`` holds the most recent
    completed 15-minute bars (empty at the start).

    Returns ``(massive_df, resampled_df)``: the row frame with ``others_dlr``
    joined on the index and forward-filled (cast to float32), and the stored
    bars, which are only updated when the newest row completes a bar.

    NOTE(review): this relies on ``DataFrame.append``, which was removed in
    pandas 2.0; it needs ``pd.concat`` to run on newer pandas.
    """
    # (minute of the newest row + 1) mod 15: assuming one row per minute, this is
    # the number of rows in the still-open bar; 0 means the newest row closes a bar.
    leftovers = (massive_df.index[-1].to_pydatetime().minute+1) % 15
    def pad_history():
        """Return the stored bars plus the current bar, with ``others_dlr`` set.

        Reads ``resampled_df`` and ``row_ohlcv`` from the enclosing scope, so
        ``row_ohlcv`` must be assigned before this is called.
        """
        full_resampled = resampled_df.append(row_ohlcv, sort=False)
        # If fewer than two bars exist, prepend copies of the first one so that
        # two closes are available (the range is empty once two bars exist).
        a = pd.DataFrame([full_resampled.iloc[0] for j in range(1+1-len(full_resampled))])
        a = a.append(full_resampled, sort=False)
        # Re-stamp the padded bars in 15-minute steps ending at the current bar.
        a.index = pd.date_range(start=row_ohlcv.index[-1], periods=len(a), freq='-15Min').sort_values()
        # The assignment aligns on the index, so only timestamps that exist in
        # full_resampled receive a value.
        full_resampled['others_dlr'] = ta.others.daily_log_return(a.close, fillna=True)
        return full_resampled
    if leftovers == 0:
        # The newest row completes a bar: build it from the last 15 rows and store it.
        row_ohlcv = massive_df.tail(15).midRate.resample('15Min').ohlc().tail(1)
        full_resampled = pad_history()
        # full_resampled already contains the rows of resampled_df, so this append
        # duplicates them; tail(2) then keeps the two newest bars.
        resampled_df = resampled_df.append(full_resampled, sort=False).tail(2) # take last 2 only
    else:
        # Bar still open: use the partial bar built from the rows seen so far,
        # without storing it.
        row_ohlcv = massive_df.tail(leftovers).midRate.resample('15Min').ohlc().tail(1)
        full_resampled = pad_history()
    # Drop the previous iteration's column (if any) so the join below does not collide.
    try: massive_df.drop('others_dlr', axis=1, inplace=True)
    except KeyError: pass
    massive_df = massive_df.join(full_resampled['others_dlr'])
    # Values land on bar timestamps; forward-fill carries them to the later rows.
    massive_df = massive_df.ffill().astype('float32')
    return massive_df, resampled_df

In [20]:
def get_prediction(data):
    """Return the model's prediction for the first row of ``data``, clipped to [-5, 5].

    ``data`` is a DataFrame of feature columns; it is scaled with the global
    ``scaler`` and scored with the global ``lasso``.
    """
    scaled = scaler.transform(data.values)
    raw = lasso.predict(np.atleast_2d(scaled))
    return np.clip(raw, -5, 5)[0]

In [84]:
# Fresh state for the streaming simulation: the accumulated rows, the stored
# 15-minute bars, a debug log of every scored feature row, and the predictions.
massive_df = pd.DataFrame()
resampled_df = pd.DataFrame()
log_data = pd.DataFrame()
predictions = []

In [85]:
# Streaming simulation over the first 30 rows served by get_next_data_as_df:
# rows are fed through the feature pipeline one at a time and scored.
for iteration in tqdm_notebook(range(30)):    
    base_row = get_next_data_as_df(iteration)
    row = create_limited_features(base_row)
    massive_df = append_to_df(massive_df, row)
    # Recompute the lagged features on the accumulated history (the function
    # returns only the last 25 rows, which bounds the history kept).
    massive_df = add_time_features(massive_df)
    massive_df, resampled_df = add_resample_features(massive_df, resampled_df)
    log_data_row = pd.DataFrame([massive_df.iloc[-1]]) # for debug
    # Only the newest row, restricted to the `relevant` columns, is scored.
    data = pd.DataFrame([massive_df.iloc[-1][relevant]])
    prediction = get_prediction(data)
    predictions.append(prediction)
    log_data = log_data.append(log_data_row, sort=False)  # for debug

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [86]:
log_data

Unnamed: 0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,askRate10,askRate11,askRate12,askRate13,askRate14,askSize0,askSize1,askSize2,askSize3,askSize4,askSize5,askSize6,askSize7,askSize8,askSize9,askSize10,askSize11,askSize12,askSize13,askSize14,bidRate0,bidRate1,bidRate2,bidRate3,bidRate4,bidRate5,bidRate6,bidRate7,bidRate8,bidRate9,bidRate10,bidRate11,bidRate12,bidRate13,bidRate14,bidSize0,bidSize1,bidSize2,bidSize3,bidSize4,bidSize5,bidSize6,bidSize7,bidSize8,bidSize9,bidSize10,bidSize11,bidSize12,bidSize13,bidSize14,midRate,bidAskVol,totalBidVol1,totalAskVol1,totalBidVol2,totalAskVol2,totalBidVol3,totalAskVol3,totalBidVol4,totalAskVol4,totalBidVol5,totalAskVol5,bidAskRatio4,bidAskRatio5,OIR,daskRate6,dbidRate6,daskRate7,dbidRate7,daskRate8,dbidRate8,daskRate9,dbidRate9,daskRate10,dbidRate10,daskRate20,others_dlr
1970-01-01 00:00:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,5.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,15.0,8.0,24.0,17.0,33.0,33.0,42.0,43.0,50.0,52.0,58.0,0.86,0.896552,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:01:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,3.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,13.0,6.0,24.0,15.0,33.0,31.0,42.0,41.0,50.0,50.0,58.0,0.82,0.862069,-0.538462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:02:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,2.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,12.0,5.0,24.0,14.0,33.0,30.0,42.0,40.0,50.0,49.0,58.0,0.8,0.844828,-0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:03:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,5.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,15.0,8.0,24.0,17.0,33.0,33.0,42.0,43.0,50.0,52.0,58.0,0.86,0.896552,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:04:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,5.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,28.0,40.0,95.0,2.0,3.0,1584.25,15.0,8.0,24.0,17.0,33.0,33.0,42.0,43.0,50.0,52.0,58.0,0.86,0.896552,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:05:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,2.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,28.0,40.0,95.0,2.0,3.0,1584.25,12.0,5.0,24.0,14.0,33.0,30.0,42.0,40.0,50.0,49.0,58.0,0.8,0.844828,-0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:06:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,2.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,12.0,5.0,24.0,14.0,33.0,30.0,42.0,40.0,50.0,49.0,58.0,0.8,0.844828,-0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:07:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,12.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,2.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,2.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,14.0,5.0,26.0,14.0,35.0,30.0,44.0,40.0,52.0,49.0,60.0,0.769231,0.816667,-0.714286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:08:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,12.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,2.0,45.0,12.0,1.0,1.0,2.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,1576.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,26.0,1584.0,15.0,12.0,26.0,28.0,35.0,38.0,44.0,47.0,52.0,83.0,60.0,0.903846,1.383333,-0.6,0.0,-0.5,0.0,-0.5,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:09:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,12.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,2.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,5.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,17.0,8.0,26.0,17.0,35.0,33.0,44.0,43.0,52.0,52.0,60.0,0.826923,0.866667,-0.411765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
massive_df

Unnamed: 0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,askRate10,askRate11,askRate12,askRate13,askRate14,askSize0,askSize1,askSize2,askSize3,askSize4,askSize5,askSize6,askSize7,askSize8,askSize9,askSize10,askSize11,askSize12,askSize13,askSize14,bidRate0,bidRate1,bidRate2,bidRate3,bidRate4,bidRate5,bidRate6,bidRate7,bidRate8,bidRate9,bidRate10,bidRate11,bidRate12,bidRate13,bidRate14,bidSize0,bidSize1,bidSize2,bidSize3,bidSize4,bidSize5,bidSize6,bidSize7,bidSize8,bidSize9,bidSize10,bidSize11,bidSize12,bidSize13,bidSize14,midRate,bidAskVol,totalBidVol1,totalAskVol1,totalBidVol2,totalAskVol2,totalBidVol3,totalAskVol3,totalBidVol4,totalAskVol4,totalBidVol5,totalAskVol5,bidAskRatio4,bidAskRatio5,OIR,daskRate6,dbidRate6,daskRate7,dbidRate7,daskRate8,dbidRate8,daskRate9,dbidRate9,daskRate10,dbidRate10,daskRate20,others_dlr
1970-01-01 00:00:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,5.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,15.0,8.0,24.0,17.0,33.0,33.0,42.0,43.0,50.0,52.0,58.0,0.86,0.896552,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:01:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,3.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,13.0,6.0,24.0,15.0,33.0,31.0,42.0,41.0,50.0,50.0,58.0,0.82,0.862069,-0.538462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:02:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,2.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,12.0,5.0,24.0,14.0,33.0,30.0,42.0,40.0,50.0,49.0,58.0,0.8,0.844828,-0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:03:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,5.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,15.0,8.0,24.0,17.0,33.0,33.0,42.0,43.0,50.0,52.0,58.0,0.86,0.896552,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:04:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,5.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,28.0,40.0,95.0,2.0,3.0,1584.25,15.0,8.0,24.0,17.0,33.0,33.0,42.0,43.0,50.0,52.0,58.0,0.86,0.896552,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:05:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,2.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,28.0,40.0,95.0,2.0,3.0,1584.25,12.0,5.0,24.0,14.0,33.0,30.0,42.0,40.0,50.0,49.0,58.0,0.8,0.844828,-0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:06:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,10.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,4.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,2.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,12.0,5.0,24.0,14.0,33.0,30.0,42.0,40.0,50.0,49.0,58.0,0.8,0.844828,-0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:07:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,12.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,2.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,2.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,14.0,5.0,26.0,14.0,35.0,30.0,44.0,40.0,52.0,49.0,60.0,0.769231,0.816667,-0.714286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:08:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,12.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,2.0,45.0,12.0,1.0,1.0,2.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,1576.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,26.0,1584.0,15.0,12.0,26.0,28.0,35.0,38.0,44.0,47.0,52.0,83.0,60.0,0.903846,1.383333,-0.6,0.0,-0.5,0.0,-0.5,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.0
1970-01-01 00:09:00,1584.5,1585.0,1585.5,1586.0,1586.5,1587.0,1587.5,1588.0,1588.5,1589.0,1589.5,1590.0,1591.0,1591.5,1592.0,12.0,14.0,9.0,9.0,8.0,8.0,21.0,10.0,8.0,2.0,45.0,12.0,1.0,1.0,2.0,1584.0,1583.5,1583.0,1582.5,1582.0,1581.5,1581.0,1580.5,1580.0,1579.5,1579.0,1578.5,1578.0,1577.5,1577.0,5.0,3.0,9.0,16.0,10.0,9.0,36.0,20.0,63.0,9.0,26.0,40.0,95.0,2.0,3.0,1584.25,17.0,8.0,26.0,17.0,35.0,33.0,44.0,43.0,52.0,52.0,60.0,0.826923,0.866667,-0.411765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
def true_rlcvscore(test_df, length):
    """Score the streaming (row-by-row) pipeline on the first ``length`` rows.

    Each iteration fetches one raw row, builds its features exactly like the
    submission loop, and predicts from the newest row's ``relevant`` columns.
    The predictions are compared with ``test_df.y.head(length)`` using R^2,
    which is printed. Pass ``len(test_df)`` as ``length`` to score every row.

    NOTE(review): the input rows come from ``get_next_data_as_df`` (which reads
    the global ``df``), not from ``test_df``; ``test_df`` only supplies ``y``.

    The debug rows are collected in a list and concatenated once at the end,
    replacing a per-iteration ``DataFrame.append`` (quadratic, and removed in
    pandas 2.0).

    Returns
    -------
    (predictions, score, log_data) : the list of predictions, the R^2 score and
    a frame holding the newest feature row of every iteration (for debugging).
    """
    massive_df, resampled_df = pd.DataFrame(), pd.DataFrame()
    predictions = []
    log_rows = []  # for debug
    for iteration in tqdm_notebook(range(length)):
        base_row = get_next_data_as_df(iteration)
        row = create_limited_features(base_row)
        massive_df = append_to_df(massive_df, row)
        massive_df = add_time_features(massive_df)
        massive_df, resampled_df = add_resample_features(massive_df, resampled_df)
        newest = massive_df.iloc[-1]
        log_rows.append(pd.DataFrame([newest]))  # for debug
        data = pd.DataFrame([newest[relevant]])
        predictions.append(get_prediction(data))
    log_data = pd.concat(log_rows, sort=False) if log_rows else pd.DataFrame()
    # Named r2 so the module-level score() function is not shadowed in here.
    r2 = r2d2(test_df.y.head(length), predictions)
    print(r2)
    return predictions, r2, log_data

In [53]:
# Replay 100 rows through the streaming pipeline and print the resulting R^2.
test_predictions, test_score, log_data = true_rlcvscore(test_df, 100)

HBox(children=(IntProgress(value=0), HTML(value='')))


-0.08460555600257247


In [87]:
log_data[['midRate', 'others_dlr']]

Unnamed: 0,midRate,others_dlr
1970-01-01 00:00:00,1584.25,0.0
1970-01-01 00:01:00,1584.25,0.0
1970-01-01 00:02:00,1584.25,0.0
1970-01-01 00:03:00,1584.25,0.0
1970-01-01 00:04:00,1584.25,0.0
1970-01-01 00:05:00,1584.25,0.0
1970-01-01 00:06:00,1584.25,0.0
1970-01-01 00:07:00,1584.25,0.0
1970-01-01 00:08:00,1584.0,0.0
1970-01-01 00:09:00,1584.25,0.0


In [83]:
limited_test.drop('y', axis=1)[['midRate', 'others_dlr']]

Unnamed: 0_level_0,midRate,others_dlr
time,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-01-01 00:00:00,1584.25,0.0
1970-01-01 00:01:00,1584.25,0.0
1970-01-01 00:02:00,1584.25,0.0
1970-01-01 00:03:00,1584.25,0.0
1970-01-01 00:04:00,1584.25,0.0
1970-01-01 00:05:00,1584.25,0.0
1970-01-01 00:06:00,1584.25,0.0
1970-01-01 00:07:00,1584.25,0.0
1970-01-01 00:08:00,1584.0,0.0
1970-01-01 00:09:00,1584.25,0.0


In [55]:
log_data == limited_test.drop('y', axis=1)

Unnamed: 0,askRate0,askRate1,askRate2,askRate3,askRate4,askRate5,askRate6,askRate7,askRate8,askRate9,askRate10,askRate11,askRate12,askRate13,askRate14,askSize0,askSize1,askSize2,askSize3,askSize4,askSize5,askSize6,askSize7,askSize8,askSize9,askSize10,askSize11,askSize12,askSize13,askSize14,bidRate0,bidRate1,bidRate2,bidRate3,bidRate4,bidRate5,bidRate6,bidRate7,bidRate8,bidRate9,bidRate10,bidRate11,bidRate12,bidRate13,bidRate14,bidSize0,bidSize1,bidSize2,bidSize3,bidSize4,bidSize5,bidSize6,bidSize7,bidSize8,bidSize9,bidSize10,bidSize11,bidSize12,bidSize13,bidSize14,midRate,bidAskVol,totalBidVol1,totalAskVol1,totalBidVol2,totalAskVol2,totalBidVol3,totalAskVol3,totalBidVol4,totalAskVol4,totalBidVol5,totalAskVol5,bidAskRatio4,bidAskRatio5,OIR,daskRate6,dbidRate6,daskRate7,dbidRate7,daskRate8,dbidRate8,daskRate9,dbidRate9,daskRate10,dbidRate10,daskRate20,others_dlr
1970-01-01 00:00:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:01:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:02:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:03:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:04:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:05:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:06:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:07:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:08:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1970-01-01 00:09:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [None]:
# Why does log_data.others_dlr only change at row 22, whereas limited_test.others_dlr changes at row 18?
# Most probably because others_dlr uses the closing price, and the closing price probably drops to 1583.75 at row 22.

In [None]:
# consider changing the ohlc window length, increasing it will reduce the number of mistakes
# but increasing it also reduces the speed of reaction
# try with different values

In [77]:
df_mid_test

Unnamed: 0_level_0,open,high,low,close,others_dlr
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1970-01-01 00:00:00,1584.25,1584.25,1584.0,1584.25,0.0
1970-01-01 00:15:00,1584.25,1584.25,1583.75,1583.75,-0.031567
1970-01-01 00:30:00,1583.75,1583.75,1583.5,1583.5,-0.015783
1970-01-01 00:45:00,1583.5,1583.75,1583.25,1583.75,0.015783
1970-01-01 01:00:00,1583.75,1583.75,1583.25,1583.25,-0.031567
1970-01-01 01:15:00,1583.25,1583.25,1583.25,1583.25,0.0
1970-01-01 01:30:00,1583.25,1583.25,1583.25,1583.25,0.0


In [None]:
# %lprun -f true_rlcvscore test_predictions, test_score = true_rlcvscore(test_df, 100)