In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_validate

from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Load the training and test sets.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')


def _position_qty_fill_values(frame):
    """Build per-column fill values for missing opened/closed position quantities.

    The replacement values are derived row by row from two other columns:
    opened = (transacted_qty + d_open_interest) / 2 and
    closed = (transacted_qty - d_open_interest) / 2.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame providing the ``transacted_qty`` and ``d_open_interest`` columns.

    Returns
    -------
    dict
        Column name -> Series of fill values, suitable for ``DataFrame.fillna``.
    """
    return {
        # The key must match the column name exactly: fillna silently ignores
        # keys that are not columns (the previous key had a trailing space, so
        # opened_position_qty was never filled).
        'opened_position_qty': (frame['transacted_qty'] + frame['d_open_interest']) / 2,
        'closed_position_qty': (frame['transacted_qty'] - frame['d_open_interest']) / 2,
    }


nan_values = _position_qty_fill_values(df_train)
nan_values_test = _position_qty_fill_values(df_test)

# Only the two columns named in the dicts are filled; NaNs elsewhere are left as they are.
df_train = df_train.fillna(value=nan_values)
df_test = df_test.fillna(value=nan_values_test)

In [3]:
# Preview the first five rows of the training frame after the NaN fill.
df_train.head(n=5)

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,51.5,51.5,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [91]:
# Order-book volume ratio: total bid volume over book levels 1-5 divided by the
# total ask volume over the same levels. This is a ratio, not a signed
# difference: values above 1 mean more volume on the bid side.
#
# NOTE(review): `df_ex` was not created by any cell in this notebook, so this
# cell raised NameError on a fresh kernel. It is now derived from `df_train`
# (the loaded frame that carries the `y` label used for training), which also
# makes the cell safe to re-run. Confirm that the missing cell did not filter
# or sample rows.
df_ex = df_train.assign(
    bid_ask_vol_imbalance=lambda x: (
        (x.bid1vol + x.bid2vol + x.bid3vol + x.bid4vol + x.bid5vol)
        / (x.ask1vol + x.ask2vol + x.ask3vol + x.ask4vol + x.ask5vol)
    )
)

In [7]:
# Signed transaction volume: the existing d_open_interest column is used as-is, so no new feature is derived here.

In [13]:
# Bid/ask price gap for book levels 1-5.
# Each feature is the bid price minus the ask price at the same level.
df_ex = df_ex.assign(**{
    'bid_ask_spread_l%d' % level: df_ex['bid%d' % level] - df_ex['ask%d' % level]
    for level in range(1, 6)
})

In [58]:
# diff_price: last traded price minus the mid price of the same row.
df_ex = df_ex.assign(diff_price=df_ex['last_price'] - df_ex['mid'])

In [12]:
# Normalised volume imbalance for book levels 1-5:
# (bid volume - ask volume) / (bid volume + ask volume) at each level.
df_ex = df_ex.assign(**{
    'imbalance_level%d' % level: (
        (df_ex['bid%dvol' % level] - df_ex['ask%dvol' % level])
        / (df_ex['bid%dvol' % level] + df_ex['ask%dvol' % level])
    )
    for level in range(1, 6)
})

In [132]:
# Feature columns fed to the model. The order defines the column order of the
# feature matrices built from this list.
cols = [
    'transacted_qty',
    'd_open_interest',
    'bid1vol',
    'ask1vol',
    'bid2vol',
    'ask2vol',
    'diff_price',
    'bid_ask_spread_l2',
    'bid_ask_spread_l1',
    'imbalance_level1',
    'imbalance_level2',
]

In [133]:
# Training inputs as plain NumPy arrays: the selected feature columns
# (in `cols` order) and the `y` label column.
X_train = df_ex[cols].values
y_train = df_ex['y'].values

In [134]:
# Random forest: 5-fold cross-validation on the training arrays.

# Report label -> scikit-learn scorer name.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

clf_test = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=0,
)

# A list of scorer names is passed, so result keys are 'test_<scorer name>'.
results = cross_validate(
    clf_test,
    X_train,
    y_train,
    cv=5,
    scoring=list(scoring.values()),
    return_train_score=False,
)

print('\nK-fold cross-validation results: ')
model_name = clf_test.__class__.__name__
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    mean_score = fold_scores.mean()
    # The 'neg_log_loss' scorer yields the negated loss; flip the sign so the
    # report shows the loss itself.
    if scorer == 'neg_log_loss':
        mean_score = -mean_score
    print(model_name + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results: 
RandomForestClassifier average accuracy: 0.661 (+/-0.003)
RandomForestClassifier average log_loss: 0.613 (+/-0.003)
RandomForestClassifier average auc: 0.660 (+/-0.007)


In [105]:
# Build the same engineered features on the test frame as on the training frame.
# Every new column depends only on raw order-book columns, so each step can read
# from the incoming `df_test`. The steps are chained separately to keep the
# resulting column order: volume ratio, spreads 1-5, diff_price, imbalances 1-5.
df_test = (
    df_test
    # Total bid volume over levels 1-5 divided by total ask volume over levels 1-5.
    .assign(bid_ask_vol_imbalance=(
        (df_test['bid1vol'] + df_test['bid2vol'] + df_test['bid3vol']
         + df_test['bid4vol'] + df_test['bid5vol'])
        / (df_test['ask1vol'] + df_test['ask2vol'] + df_test['ask3vol']
           + df_test['ask4vol'] + df_test['ask5vol'])
    ))
    # Bid price minus ask price at each book level.
    .assign(**{
        'bid_ask_spread_l%d' % level: df_test['bid%d' % level] - df_test['ask%d' % level]
        for level in range(1, 6)
    })
    # Last traded price minus mid price.
    .assign(diff_price=df_test['last_price'] - df_test['mid'])
    # (bid volume - ask volume) / (bid volume + ask volume) at each book level.
    .assign(**{
        'imbalance_level%d' % level: (
            (df_test['bid%dvol' % level] - df_test['ask%dvol' % level])
            / (df_test['bid%dvol' % level] + df_test['ask%dvol' % level])
        )
        for level in range(1, 6)
    })
)

In [106]:
# Test feature matrix as a NumPy array, with columns in `cols` order.
X_test = df_test[cols].values

In [107]:
# Fit the cross-validated configuration on the full training arrays.
clf_test.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [81]:
# Shallower forest (max_depth=5) fitted on the full training arrays.
# NOTE(review): the prediction cell further down calls clf_test, not clf_final;
# confirm which model is meant to produce the submission.
clf_final = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    max_features='sqrt',
    random_state=0,
).fit(X_train, y_train)
clf_final

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [108]:
# Row identifiers taken from the test frame.
sample_id = df_test['id'].values
# Second column of predict_proba, i.e. the probability for clf_test.classes_[1]
# (presumably label 1 -- confirm).
test_pred = clf_test.predict_proba(X_test)[:, 1]

predictions = dict(id=sample_id, Predicted=test_pred)
df_output = pd.DataFrame(predictions, columns=['id', 'Predicted'])

# Display only: the re-indexed frame is not assigned, so df_output keeps `id`
# as a regular column, which the CSV export relies on.
df_output.set_index('id')

Unnamed: 0_level_0,Predicted
id,Unnamed: 1_level_1
592380,0.547844
592381,0.183566
592382,0.388425
592383,0.448332
592384,0.303461
...,...
784234,0.343298
784235,0.120547
784236,0.488931
784237,0.369744


In [109]:
# Write the submission file with the `id` and `Predicted` columns only.
# index=False is the documented boolean form; the previous index=None only
# suppressed the row index because None happens to be falsy.
df_output.to_csv('prediction/betty_rf_f.csv', index=False)