In [1]:
import numpy as np
import pandas as pd
from datetime import datetime




def compute_vwap(df):
    q = df['foreignNotional']
    p = df['price']
    vwap = np.sum(p * q) / np.sum(q)
    df['vwap'] = vwap
    return df

def ohlc(df):
    df['open'] = df.price.iloc[0]
    df['high'] = df.price.max()
    df['low'] = df.price.min()
    df['close'] = df.price.iloc[-1]
    return df[-1:]

In [2]:
data = pd.read_csv('data/20181204.csv')
data = data[data.symbol == 'XBTUSD']
paths = ['data/20181205.csv','data/20181206.csv','data/20181207.csv', 'data/20181208.csv', 'data/20181209.csv']
for path in paths:
    df = pd.read_csv(path)
    df = df[df.symbol == 'XBTUSD']
    data = data.append(df)


In [3]:
data['timestamp'] = data.timestamp.map(lambda t: datetime.strptime(t[:-3], "%Y-%m-%dD%H:%M:%S.%f")) # timestamp parsing
data.set_index('timestamp', inplace=True)
data.sort_index(inplace=True)

In [4]:
data.head()
data.tail()

Unnamed: 0_level_0,symbol,side,size,price,tickDirection,trdMatchID,grossValue,homeNotional,foreignNotional
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-12-09 23:59:58.267292,XBTUSD,Sell,2613,3526.5,ZeroMinusTick,0e94cd86-ef33-a5e9-5abd-b7f063d6db2e,74096841,0.740968,2613.0
2018-12-09 23:59:58.267292,XBTUSD,Sell,300,3526.5,ZeroMinusTick,f1449def-32ac-d3b1-8f8e-b526fbd58d14,8507100,0.085071,300.0
2018-12-09 23:59:58.267292,XBTUSD,Sell,4,3526.5,ZeroMinusTick,97a7abd3-f5e3-ba0c-9768-b14fcc37c746,113428,0.001134,4.0
2018-12-09 23:59:58.267292,XBTUSD,Sell,30,3526.5,ZeroMinusTick,7a549324-73d1-38d8-23fd-994376e6b943,850710,0.008507,30.0
2018-12-09 23:59:58.267292,XBTUSD,Sell,100,3526.5,ZeroMinusTick,668e03fd-2ca3-6e9c-dc2c-d79791596d8f,2835700,0.028357,100.0


In [5]:
data_cm_dollar = data.assign(cmDollar=data['foreignNotional'].cumsum())
total_dollars = data_cm_dollar.cmDollar.values[-1]
dollars_per_bar = 2e6

print('Total dollars:', total_dollars)
print('Dollars per bar:', dollars_per_bar)
data_dollar_grp = data_cm_dollar.assign(grpId=lambda row: row.cmDollar // dollars_per_bar)
print('Number of dollar bars:', len(data_dollar_grp.groupby(['grpId'])))
data_dollar_ohlc =  data_dollar_grp.groupby('grpId').apply(lambda x: ohlc(compute_vwap(x)))
data_dollar_ohlc.index = data_dollar_ohlc.index.droplevel()
data_dollar_ohlc = data_dollar_ohlc[~data_dollar_ohlc.index.duplicated(keep='first')]

Total dollars: 12306878424.0
Dollars per bar: 2000000.0
Number of dollar bars: 6149


In [43]:
def get_vol(prices, span=100, delta=pd.Timedelta(hours=1)):
    # 1. compute returns of the form p[t]/p[t-1] - 1
    # 1.1 find the timestamps of p[t-1] values
    df0 = prices.index.searchsorted(prices.index - delta)
    df0 = df0[df0 > 0]
    # 1.2 align timestamps of p[t-1] to timestamps of p[t]
    df0 = pd.Series(prices.index[df0-1],    
           index=prices.index[prices.shape[0]-df0.shape[0] : ])
    # 1.3 get values by timestamps, then compute returns
    df0 = prices.loc[df0.index] / prices.loc[df0.values].values - 1
    # 2. estimate rolling standard deviation
    df0 = df0.ewm(span=span).std()
    return df0

In [44]:
def get_horizons(prices, delta=pd.Timedelta(minutes=15)):
    t1 = prices.index.searchsorted(prices.index + delta)
    t1 = t1[t1 < prices.shape[0]]
    t1 = prices.index[t1]
    t1 = pd.Series(t1, index=prices.index[:t1.shape[0]])
    return t1

In [150]:
def get_touches(prices, events, factors=[1, 1]):
    '''
    events: pd dataframe with columns
    t1: timestamp of the next horizon
    threshold: unit height of top and bottom barriers
    side: the side of each bet
    factors: multipliers of the threshold to set the height of 
           top/bottom barriers
    '''
    #print(events)
    #print(prices)
    out = events[['t1']].copy(deep=True)
    if factors[0] > 0: thresh_uppr = factors[0] * events['threshold']
    else: thresh_uppr = pd.Series(index=events.index) # no uppr thresh
    if factors[1] > 0: thresh_lwr = -factors[1] * events['threshold']
    else: thresh_lwr = pd.Series(index=events.index)  # no lwr thresh
    c = 0
    for loc, t1 in events['t1'].iteritems():
        #print(loc, t1)
        df0=prices[loc:t1]                              # path prices
        
        #print(events['side'][c])
        df0=(df0 / prices[loc] - 1) * events.side[loc]  # path returns
        out.loc[loc, 'stop_loss'] = df0[df0 < thresh_lwr[loc]].index.min()  # earliest stop loss
        out.loc[loc, 'take_profit'] = df0[df0 > thresh_uppr[loc]].index.min() # earliest take profit
        #print('trying.... in touches')
        c+=1
    return out

In [151]:
def get_labels(touches):
    out = touches.copy(deep=True)
    # pandas df.min() ignores NaN values
    first_touch = touches[['stop_loss', 'take_profit']].min(axis=1)
    for loc, t in first_touch.iteritems():
        if pd.isnull(t):
            out.loc[loc, 'label'] = 0
        elif t == touches.loc[loc, 'stop_loss']: 
            out.loc[loc, 'label'] = -1
        else:
            out.loc[loc, 'label'] = 1
    return out

In [152]:
data_ohlc = data_dollar_ohlc
data_ohlc = data_ohlc.assign(threshold=get_vol(data_ohlc.close)).dropna()
data_ohlc = data_ohlc.assign(t1=get_horizons(data_ohlc)).dropna()
events = data_ohlc[['t1', 'threshold']] 
events = events.assign(side=pd.Series(1., events.index)) # long only
touches = get_touches(data_ohlc.close, events, [1,1])
print(touches.keys())
touches = get_labels(touches)
data_ohlc = data_ohlc.assign(label=touches.label)

Index(['t1', 'stop_loss', 'take_profit'], dtype='object')


In [153]:
data_ohlc

Unnamed: 0_level_0,symbol,side,size,price,tickDirection,trdMatchID,grossValue,homeNotional,foreignNotional,cmDollar,grpId,vwap,open,high,low,close,threshold,t1,label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-12-04 01:10:40.787689,XBTUSD,Buy,400,3852.0,ZeroPlusTick,8602980a-839a-4ce0-a11b-25c5c0d2b32a,10384400,0.103844,400.0,1.019487e+08,50.0,3851.204119,3850.0,3852.5,3849.5,3852.0,0.002980,2018-12-04 01:30:59.937654,0.0
2018-12-04 01:14:00.876062,XBTUSD,Buy,1000,3847.5,ZeroPlusTick,ed4c1400-2ded-d554-3081-689b24c0826d,25991000,0.259910,1000.0,1.039983e+08,51.0,3851.481761,3852.0,3855.0,3847.0,3847.5,0.002513,2018-12-04 01:30:59.937654,1.0
2018-12-04 01:16:27.850240,XBTUSD,Buy,12000,3857.0,PlusTick,5f14af67-4b84-5909-08df-fe1fd5067404,311124000,3.111240,12000.0,1.059980e+08,52.0,3853.966010,3847.0,3858.0,3847.0,3857.0,0.002189,2018-12-04 01:33:16.610192,0.0
2018-12-04 01:18:14.564663,XBTUSD,Sell,85,3859.0,ZeroMinusTick,ce3f66f6-0349-88be-76b4-c630edbcb1cc,2202605,0.022026,85.0,1.079761e+08,53.0,3860.547363,3857.0,3864.5,3856.5,3859.0,0.002303,2018-12-04 01:33:16.610192,0.0
2018-12-04 01:22:01.296072,XBTUSD,Buy,174,3860.0,ZeroPlusTick,a32eadf8-ce47-2057-2fd1-6748684c65d3,4507818,0.045078,174.0,1.099929e+08,54.0,3857.849017,3859.0,3860.0,3856.0,3860.0,0.002675,2018-12-04 01:39:40.601451,0.0
2018-12-04 01:24:06.969465,XBTUSD,Buy,12318,3859.5,ZeroPlusTick,1cb0ac69-81d7-481d-aece-24be9065484d,319159380,3.191594,12318.0,1.119761e+08,55.0,3861.923815,3860.0,3864.5,3859.0,3859.5,0.002634,2018-12-04 01:39:40.601451,0.0
2018-12-04 01:30:59.937654,XBTUSD,Buy,6143,3856.5,ZeroPlusTick,a38de0c1-76e0-adad-c68f-52f9864f9ac4,159287990,1.592880,6143.0,1.139989e+08,56.0,3858.643596,3859.5,3864.5,3856.0,3856.5,0.002590,2018-12-04 01:46:06.584535,1.0
2018-12-04 01:33:16.610192,XBTUSD,Buy,2000,3865.0,ZeroPlusTick,70128a6f-0875-38b3-90c7-e34024dd34b2,51746000,0.517460,2000.0,1.159951e+08,57.0,3860.431428,3856.5,3865.0,3856.0,3865.0,0.002670,2018-12-04 01:49:44.893421,-1.0
2018-12-04 01:33:42.951728,XBTUSD,Buy,20,3867.5,ZeroPlusTick,8e639be3-63f9-bc08-471c-81d5af84d626,517120,0.005171,20.0,1.179665e+08,58.0,3867.142491,3865.0,3869.0,3865.0,3867.5,0.002614,2018-12-04 01:49:44.893421,-1.0
2018-12-04 01:35:25.997166,XBTUSD,Sell,1800,3863.5,ZeroMinusTick,b6752a01-e999-e957-24ef-391f85bdf80f,46589400,0.465894,1800.0,1.199801e+08,59.0,3867.279771,3867.5,3869.5,3863.5,3863.5,0.002635,2018-12-04 01:53:36.904369,-1.0


In [154]:
from imblearn.over_sampling import SMOTE
X = data_ohlc[['open', 'close', 'high', 'low', 'vwap']].values
y = np.squeeze(data_ohlc[['label']].values)
X_train, y_train = X[:4500], y[:4500]
X_test, y_test = X[4500:], y[4500:]
sm = SMOTE()
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [155]:
set(y_train)

{-1.0, 0.0, 1.0}

In [156]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(X_train_res, y_train_res)
y_pred = clf.predict(X_test)



In [157]:
def true_binary_label(y_pred, y_test):
    bin_label = np.zeros_like(y_pred)
    for i in range(y_pred.shape[0]):
        if y_pred[i] != 0 and y_pred[i]*y_test[i] > 0:
            bin_label[i] = 1  # true positive
    return bin_label
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(true_binary_label(y_pred, y_test), y_pred != 0)

In [158]:
cm

array([[1001,  503],
       [   0,   76]])

In [160]:

y_train_pred = clf.predict(X_train) 

X_train_meta = np.hstack([y_train_pred[:, None], X_train])
X_test_meta = np.hstack([y_pred[:, None], X_test])

y_train_meta = true_binary_label(y_train_pred, y_train)

sm = SMOTE()
X_train_meta_res, y_train_meta_res = sm.fit_sample(X_train_meta, y_train_meta)
model_secondary = LogisticRegression().fit(X_train_meta_res, y_train_meta_res)
y_pred_meta = model_secondary.predict(X_test_meta)

cm= confusion_matrix(true_binary_label(y_pred, y_test), (y_pred * y_pred_meta) != 0)



In [161]:
cm

array([[1153,  351],
       [  32,   44]])

In [163]:
from sklearn.metrics import classification_report

print(classification_report(true_binary_label(y_pred, y_test),(y_pred * y_pred_meta) != 0))

              precision    recall  f1-score   support

         0.0       0.97      0.77      0.86      1504
         1.0       0.11      0.58      0.19        76

   micro avg       0.76      0.76      0.76      1580
   macro avg       0.54      0.67      0.52      1580
weighted avg       0.93      0.76      0.83      1580



In [166]:
for i in y_pred != 0:
    print(i)

False
False
False
True
False
False
True
True
True
False
True
True
True
True
False
True
True
True
False
True
False
True
False
False
True
True
False
False
True
True
False
True
True
True
True
False
False
False
True
False
False
False
True
True
False
True
True
False
False
False
False
True
False
True
False
False
False
False
True
False
True
True
True
True
True
False
True
True
True
False
True
False
True
False
True
False
False
False
False
True
True
False
False
False
False
False
False
False
False
False
False
True
False
False
False
True
False
True
False
False
False
False
False
True
True
True
False
False
False
False
False
False
True
False
True
True
True
True
True
True
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
False
False
False
False
False
False
False
True
True
False
False
False
False
True
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
False
False
False
True
True
True
False
False
True
False
True
False
False
False
Fals

In [167]:
0==False

True

In [168]:
1==False

False