In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_validate

In [86]:
# Load the raw training and test tables from disk.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Fill missing opened/closed position quantities with half the sum / half the
# difference of transacted_qty and d_open_interest.
# NOTE(review): the 'opened_position_qty ' key carries a trailing space --
# presumably it mirrors the CSV header exactly; confirm against data/train.csv.
traded_qty = df_train['transacted_qty']
open_interest_delta = df_train['d_open_interest']
nan_values = {
    'opened_position_qty ': (traded_qty + open_interest_delta) / 2,
    'closed_position_qty': (traded_qty - open_interest_delta) / 2,
}
df_train = df_train.fillna(value=nan_values)

#### Test 1: Drop all bid[3-5][vol], ask[3-5][vol], id, and mid. Keep opened_position_qty and closed_position_qty. 

In [43]:
# Experiment 1 feature frame: discard id, mid and order-book levels 3-5
# (both the price and the volume column of each level).
deep_levels = [side + str(level) + suffix
               for side in ('bid', 'ask')
               for level in (3, 4, 5)
               for suffix in ('', 'vol')]
df_ex_1 = df_train.drop(columns=['id', 'mid'] + deep_levels)

In [44]:
# Min-max rescale the price columns of df_ex_1 (column minimum -> 0, maximum -> 1).
cols_to_norm = ['last_price', 'bid1', 'bid2', 'ask1', 'ask2']
price_block = df_ex_1[cols_to_norm]
price_min = price_block.min()
price_span = price_block.max() - price_min
df_ex_1[cols_to_norm] = (price_block - price_min) / price_span

In [45]:
# Target vector and feature matrix for experiment 1.
y_train = np.asarray(df_ex_1['y'])
# Positional slice: the first 13 columns are used as features
# (presumably every column ahead of 'y' -- confirm against the CSV layout).
feature_frame = df_ex_1.iloc[:, :13]
X_train = np.asarray(feature_frame)

In [46]:
# Display label -> scikit-learn scorer name for every metric reported below.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Ten shallow (depth-2) trees with a fixed seed.
clf_test = RandomForestClassifier(n_estimators=10, max_depth=2,
                                  min_samples_split=2, max_features='sqrt',
                                  random_state=1)

# 5-fold cross-validation on the current X_train / y_train, held-out scores only.
results = cross_validate(clf_test, X_train, y_train, cv=5,
                         scoring=list(scoring.values()),
                         return_train_score=False)

print('\nK-fold cross-validation results: ')
model_name = clf_test.__class__.__name__
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    # 'neg_log_loss' scores are sign-flipped so they print as a positive loss.
    mean_score = -fold_scores.mean() if scorer == 'neg_log_loss' else fold_scores.mean()
    print(model_name + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results: 
RandomForestClassifier average accuracy: 0.643 (+/-0.000)
RandomForestClassifier average log_loss: 0.644 (+/-0.002)
RandomForestClassifier average auc: 0.598 (+/-0.015)


#### Test 2: Drop all bid[3-5][vol], ask[3-5][vol], id, mid, transacted_qty, d_open_interest

In [47]:
# Experiment 2 feature frame: discard id, mid, transacted_qty, d_open_interest
# and order-book levels 3-5 (both the price and the volume column of each level).
deep_levels = [side + str(level) + suffix
               for side in ('bid', 'ask')
               for level in (3, 4, 5)
               for suffix in ('', 'vol')]
df_ex_2 = df_train.drop(
    columns=['id', 'mid', 'transacted_qty', 'd_open_interest'] + deep_levels)

In [48]:
# Min-max rescale the price columns of df_ex_2 (column minimum -> 0, maximum -> 1).
cols_to_norm = ['last_price', 'bid1', 'bid2', 'ask1', 'ask2']
price_block = df_ex_2[cols_to_norm]
price_min = price_block.min()
price_span = price_block.max() - price_min
df_ex_2[cols_to_norm] = (price_block - price_min) / price_span

In [49]:
# Target vector and feature matrix for experiment 2.
y_train = np.asarray(df_ex_2['y'])
# Positional slice: the first 11 columns are used as features
# (presumably every column ahead of 'y' -- confirm against the CSV layout).
feature_frame = df_ex_2.iloc[:, :11]
X_train = np.asarray(feature_frame)

In [50]:
# Display label -> scikit-learn scorer name for every metric reported below.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Ten shallow (depth-2) trees with a fixed seed.
clf_test = RandomForestClassifier(n_estimators=10, max_depth=2,
                                  min_samples_split=2, max_features='sqrt',
                                  random_state=1)

# 5-fold cross-validation on the current X_train / y_train, held-out scores only.
results = cross_validate(clf_test, X_train, y_train, cv=5,
                         scoring=list(scoring.values()),
                         return_train_score=False)

print('\nK-fold cross-validation results: ')
model_name = clf_test.__class__.__name__
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    # 'neg_log_loss' scores are sign-flipped so they print as a positive loss.
    mean_score = -fold_scores.mean() if scorer == 'neg_log_loss' else fold_scores.mean()
    print(model_name + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results: 
RandomForestClassifier average accuracy: 0.643 (+/-0.000)
RandomForestClassifier average log_loss: 0.635 (+/-0.001)
RandomForestClassifier average auc: 0.626 (+/-0.006)


#### Test 3: Drop all bid[3-5][vol], ask[3-5][vol], id, mid, transacted_qty, d_open_interest, last_price

In [51]:
# Experiment 3 feature frame: discard id, mid, last_price, transacted_qty,
# d_open_interest and order-book levels 3-5 (price and volume columns).
deep_levels = [side + str(level) + suffix
               for side in ('bid', 'ask')
               for level in (3, 4, 5)
               for suffix in ('', 'vol')]
df_ex_3 = df_train.drop(
    columns=['id', 'mid', 'last_price', 'transacted_qty', 'd_open_interest'] + deep_levels)

In [52]:
# Min-max rescale the remaining price columns of df_ex_3
# (column minimum -> 0, maximum -> 1); last_price is not present in this experiment.
cols_to_norm = ['bid1', 'bid2', 'ask1', 'ask2']
price_block = df_ex_3[cols_to_norm]
price_min = price_block.min()
price_span = price_block.max() - price_min
df_ex_3[cols_to_norm] = (price_block - price_min) / price_span

In [53]:
# Target vector and feature matrix for experiment 3.
y_train = np.asarray(df_ex_3['y'])
# Positional slice: the first 10 columns are used as features
# (presumably every column ahead of 'y' -- confirm against the CSV layout).
feature_frame = df_ex_3.iloc[:, :10]
X_train = np.asarray(feature_frame)

In [54]:
# Display label -> scikit-learn scorer name for every metric reported below.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Ten shallow (depth-2) trees with a fixed seed.
clf_test = RandomForestClassifier(n_estimators=10, max_depth=2,
                                  min_samples_split=2, max_features='sqrt',
                                  random_state=1)

# 5-fold cross-validation on the current X_train / y_train, held-out scores only.
results = cross_validate(clf_test, X_train, y_train, cv=5,
                         scoring=list(scoring.values()),
                         return_train_score=False)

print('\nK-fold cross-validation results: ')
model_name = clf_test.__class__.__name__
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    # 'neg_log_loss' scores are sign-flipped so they print as a positive loss.
    mean_score = -fold_scores.mean() if scorer == 'neg_log_loss' else fold_scores.mean()
    print(model_name + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results: 
RandomForestClassifier average accuracy: 0.643 (+/-0.000)
RandomForestClassifier average log_loss: 0.636 (+/-0.001)
RandomForestClassifier average auc: 0.623 (+/-0.007)


#### Test 4: Drop all bid[3-5][vol], ask[3-5][vol], mid, transacted_qty, d_open_interest (id is kept)

In [56]:
# Experiment 4 feature frame: discard mid, transacted_qty, d_open_interest and
# order-book levels 3-5 (price and volume columns). 'id' is NOT dropped here.
deep_levels = [side + str(level) + suffix
               for side in ('bid', 'ask')
               for level in (3, 4, 5)
               for suffix in ('', 'vol')]
df_ex_4 = df_train.drop(
    columns=['mid', 'transacted_qty', 'd_open_interest'] + deep_levels)

In [57]:
# Min-max rescale the price columns of df_ex_4 (column minimum -> 0, maximum -> 1).
cols_to_norm = ['last_price', 'bid1', 'bid2', 'ask1', 'ask2']
price_block = df_ex_4[cols_to_norm]
price_min = price_block.min()
price_span = price_block.max() - price_min
df_ex_4[cols_to_norm] = (price_block - price_min) / price_span

In [62]:
# Target vector and feature matrix for experiment 4.
y_train = np.asarray(df_ex_4['y'])
# Positional slice: the first 12 columns are used as features. 'id' is not
# dropped from df_ex_4, so it is presumably among them -- confirm against the
# CSV column order.
feature_frame = df_ex_4.iloc[:, :12]
X_train = np.asarray(feature_frame)

In [63]:
# Conclusion from this run: do not include id as a feature.

# Display label -> scikit-learn scorer name for every metric reported below.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Ten shallow (depth-2) trees with a fixed seed.
clf_test = RandomForestClassifier(n_estimators=10, max_depth=2,
                                  min_samples_split=2, max_features='sqrt',
                                  random_state=1)

# 5-fold cross-validation on the current X_train / y_train, held-out scores only.
results = cross_validate(clf_test, X_train, y_train, cv=5,
                         scoring=list(scoring.values()),
                         return_train_score=False)

print('\nK-fold cross-validation results: ')
model_name = clf_test.__class__.__name__
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    # 'neg_log_loss' scores are sign-flipped so they print as a positive loss.
    mean_score = -fold_scores.mean() if scorer == 'neg_log_loss' else fold_scores.mean()
    print(model_name + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results: 
RandomForestClassifier average accuracy: 0.586 (+/-0.114)
RandomForestClassifier average log_loss: 0.664 (+/-0.054)
RandomForestClassifier average auc: 0.619 (+/-0.014)


#### Test 5: Drop all bid[4-5][vol], ask[4-5][vol], id, mid, transacted_qty, d_open_interest

In [64]:
df_ex_5 = df_train.drop(columns=['bid4vol', 'bid5vol', 
                                 'ask4vol', 'ask5vol', 
                                 'mid', 'id'
                                 'transacted_qty', 'd_open_interest', 
                                 'ask4', 'ask5', 
                                 'bid4', 'bid5'])

In [65]:
cols_to_norm = ['last_price', 'bid1', 'bid2', 'bid3', 'ask1', 'ask2', 'ask3']
df_ex_5[cols_to_norm] = df_ex_5[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [70]:
X_train = np.asarray(df_ex_5.iloc[:, 0:16])
y_train = np.asarray(df_ex_5['y'])

In [71]:
# Conclusion from this run: keep only bid/ask book levels up to 2.

# Display label -> scikit-learn scorer name for every metric reported below.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Ten shallow (depth-2) trees with a fixed seed.
clf_test = RandomForestClassifier(n_estimators=10, max_depth=2,
                                  min_samples_split=2, max_features='sqrt',
                                  random_state=1)

# 5-fold cross-validation on the current X_train / y_train, held-out scores only.
results = cross_validate(clf_test, X_train, y_train, cv=5,
                         scoring=list(scoring.values()),
                         return_train_score=False)

print('\nK-fold cross-validation results: ')
model_name = clf_test.__class__.__name__
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    # 'neg_log_loss' scores are sign-flipped so they print as a positive loss.
    mean_score = -fold_scores.mean() if scorer == 'neg_log_loss' else fold_scores.mean()
    print(model_name + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results: 
RandomForestClassifier average accuracy: 0.586 (+/-0.114)
RandomForestClassifier average log_loss: 0.671 (+/-0.070)
RandomForestClassifier average auc: 0.603 (+/-0.048)


In [87]:
df_train_final = df_train.drop(columns=['id', 
                                 'bid3vol', 'bid4vol', 'bid5vol', 
                                 'ask3vol', 'ask4vol', 'ask5vol', 
                                 'mid', 
                                 'transacted_qty', 'd_open_interest', 
                                 'ask3', 'ask4', 'ask5', 
                                 'bid3', 'bid4', 'bid5'])

In [88]:
X_train_final = np.asarray(df_train_final.iloc[:, 0:11])
y_train_final = np.asarray(df_train_final['y'])

In [94]:
nan_values = {'opened_position_qty ': (df_test['transacted_qty'] + df_test['d_open_interest']) / 2 , 
              'closed_position_qty': (df_test['transacted_qty'] - df_test['d_open_interest']) / 2}
df_test_final = df_test.fillna(value=nan_values)

df_test_final = df_test_final.drop(columns=['id', 
                                 'bid3vol', 'bid4vol', 'bid5vol', 
                                 'ask3vol', 'ask4vol', 'ask5vol', 
                                 'mid', 
                                 'transacted_qty', 'd_open_interest', 
                                 'ask3', 'ask4', 'ask5', 
                                 'bid3', 'bid4', 'bid5'])

In [95]:
cols_to_norm = ['last_price', 'bid1', 'bid2', 'ask1', 'ask2']
df_test_final[cols_to_norm] = df_test_final[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [96]:
X_test = np.asarray(df_test_final.iloc[:, 0:11])

In [97]:
# Final model: same hyper-parameters as the cross-validated experiments,
# fitted on the full training matrix.
final_rf_params = dict(n_estimators=10, max_depth=2, min_samples_split=2,
                       max_features='sqrt', random_state=1)
clf_final = RandomForestClassifier(**final_rf_params)
clf_final.fit(X_train_final, y_train_final)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [99]:
# Build the submission table: one row per test id with its predicted probability.
sample_id = np.asarray(df_test['id'])
# Column 1 of predict_proba -- presumably the probability of the positive
# class (y == 1); verify against clf_final.classes_.
class_probabilities = clf_final.predict_proba(X_test)
test_pred = class_probabilities[:, 1]
predictions = {'id': sample_id, 'Predicted': test_pred}
df_output = pd.DataFrame(predictions, columns=['id', 'Predicted'])
# Displayed with id as the index for readability only; df_output itself is unchanged.
df_output.set_index('id')

Unnamed: 0_level_0,Predicted
id,Unnamed: 1_level_1
592380,0.458917
592381,0.281462
592382,0.341447
592383,0.455875
592384,0.289943
...,...
784234,0.363996
784235,0.368065
784236,0.363996
784237,0.368065


In [100]:
# Write the submission file; the row index is not written out.
submission_path = 'prediction/betty_rf_2.csv'
df_output.to_csv(submission_path, index=None)