In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_validate

### RandomForest baseline, tested with different feature subsets. Price-related columns are min-max normalized unless a test states otherwise.

In [14]:
# Load the training and test splits from the local data/ directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

#### TEST 1: Drop all bid[3-5][vol], ask[3-5][vol], and mid. Keep opened_position_qty and closed_position_qty; fill NaNs with mean value of column. 

In [15]:
# Experiment 1 frame: df_train without 'id', 'mid', and the level 3-5
# bid/ask prices and volumes.
ex1_dropped = [
    'id', 'mid',
    'bid3', 'bid4', 'bid5',
    'ask3', 'ask4', 'ask5',
    'bid3vol', 'bid4vol', 'bid5vol',
    'ask3vol', 'ask4vol', 'ask5vol',
]
df_ex_1 = df_train.drop(columns=ex1_dropped)

In [24]:
# Impute missing values in the two position-quantity columns with that
# column's mean; NaNs in any other column are left untouched.
# The trailing space in 'opened_position_qty ' is part of the column name used here.
qty_cols = ['opened_position_qty ', 'closed_position_qty']
nan_values = {qty_col: df_ex_1[qty_col].mean() for qty_col in qty_cols}
df_ex_1 = df_ex_1.fillna(value=nan_values)

In [27]:
# Min-max scale each remaining price column independently:
# (value - column min) / (column max - column min).
cols_to_norm = ['last_price', 'bid1', 'bid2', 'ask1', 'ask2']
price_block = df_ex_1[cols_to_norm]
df_ex_1[cols_to_norm] = (price_block - price_block.min()) / (price_block.max() - price_block.min())

In [74]:
# Features = every column of df_ex_1 except the target 'y'. Selecting by label
# (instead of the former positional slice of the first 13 columns) keeps 'y'
# out of X_train even if the frame's column count or order changes.
# NOTE(review): this matches the old iloc[:, 0:13] only if 'y' is the last of
# 14 columns — confirm against the CSV header.
X_train = df_ex_1.drop(columns=['y']).to_numpy()
y_train = df_ex_1['y'].to_numpy()

In [75]:
# Observation: disappointing scores.

# Display label -> scikit-learn scorer name.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Small, shallow forest (10 trees, depth 2) with a fixed seed.
clf_test = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    min_samples_split=2,
    max_features='sqrt',
    random_state=1,
)

# 5-fold cross-validation; train-fold scores are not collected.
results = cross_validate(
    clf_test, X_train, y_train,
    cv=5,
    scoring=list(scoring.values()),
    return_train_score=False,
)

# NOTE(review): the header below says "(entropy)", but no `criterion` is passed
# to RandomForestClassifier above, so the library default is used — confirm the label.
print('\nK-fold cross-validation results (entropy): ')
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    mean_score = fold_scores.mean()
    if scorer == 'neg_log_loss':
        # The scorer is the negated log loss; flip the sign back for display.
        mean_score = -mean_score
    print(type(clf_test).__name__ + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results (entropy): 
RandomForestClassifier average accuracy: 0.643 (+/-0.000)
RandomForestClassifier average log_loss: 0.644 (+/-0.002)
RandomForestClassifier average auc: 0.598 (+/-0.015)


#### TEST 2: Drop NOTHING. Fill NaNs with mean value of column. 

In [76]:
# Experiment 2 frame: all of df_train, with missing values in the two
# position-quantity columns replaced by that column's mean.
# The trailing space in 'opened_position_qty ' is part of the column name used here.
qty_cols = ['opened_position_qty ', 'closed_position_qty']
nan_values = {qty_col: df_train[qty_col].mean() for qty_col in qty_cols}
df_ex_2 = df_train.fillna(value=nan_values)

In [77]:
# Min-max scale every price column independently:
# (value - column min) / (column max - column min).
cols_to_norm = [
    'last_price', 'mid',
    'bid1', 'bid2', 'bid3', 'bid4', 'bid5',
    'ask1', 'ask2', 'ask3', 'ask4', 'ask5',
]
price_block = df_ex_2[cols_to_norm]
df_ex_2[cols_to_norm] = (price_block - price_block.min()) / (price_block.max() - price_block.min())

In [81]:
# Features = every column of df_ex_2 except the target 'y'. Selecting by label
# (instead of the former positional slice of the first 27 columns) keeps 'y'
# out of X_train even if the frame's column count or order changes.
# NOTE(review): this matches the old iloc[:, 0:27] only if 'y' is the last of
# 28 columns — confirm against the CSV header.
# NOTE(review): 'id' is not dropped from this frame, so it is passed to the
# model as a feature — confirm that is intended.
X_train = df_ex_2.drop(columns=['y']).to_numpy()
y_train = df_ex_2['y'].to_numpy()

In [82]:
# Observation: also disappointing.

# Display label -> scikit-learn scorer name.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Small, shallow forest (10 trees, depth 2) with a fixed seed.
clf_test = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    min_samples_split=2,
    max_features='sqrt',
    random_state=1,
)

# 5-fold cross-validation; train-fold scores are not collected.
results = cross_validate(
    clf_test, X_train, y_train,
    cv=5,
    scoring=list(scoring.values()),
    return_train_score=False,
)

# NOTE(review): the header below says "(entropy)", but no `criterion` is passed
# to RandomForestClassifier above, so the library default is used — confirm the label.
print('\nK-fold cross-validation results (entropy): ')
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    mean_score = fold_scores.mean()
    if scorer == 'neg_log_loss':
        # The scorer is the negated log loss; flip the sign back for display.
        mean_score = -mean_score
    print(type(clf_test).__name__ + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results (entropy): 
RandomForestClassifier average accuracy: 0.597 (+/-0.092)
RandomForestClassifier average log_loss: 0.667 (+/-0.040)
RandomForestClassifier average auc: 0.575 (+/-0.026)


#### TEST 3: Drop NOTHING. Fill NaNs with median value of column. 

In [95]:
# Experiment 3 frame: all of df_train, with missing values in the two
# position-quantity columns replaced by that column's median.
# The trailing space in 'opened_position_qty ' is part of the column name used here.
qty_cols = ['opened_position_qty ', 'closed_position_qty']
nan_values = {qty_col: df_train[qty_col].median() for qty_col in qty_cols}
df_ex_3 = df_train.fillna(value=nan_values)

In [96]:
# Min-max scale every price column independently:
# (value - column min) / (column max - column min).
cols_to_norm = [
    'last_price', 'mid',
    'bid1', 'bid2', 'bid3', 'bid4', 'bid5',
    'ask1', 'ask2', 'ask3', 'ask4', 'ask5',
]
price_block = df_ex_3[cols_to_norm]
df_ex_3[cols_to_norm] = (price_block - price_block.min()) / (price_block.max() - price_block.min())

In [97]:
# Features = every column of df_ex_3 except the target 'y'. Selecting by label
# (instead of the former positional slice of the first 27 columns) keeps 'y'
# out of X_train even if the frame's column count or order changes.
# NOTE(review): this matches the old iloc[:, 0:27] only if 'y' is the last of
# 28 columns — confirm against the CSV header.
# NOTE(review): 'id' is not dropped from this frame, so it is passed to the
# model as a feature — confirm that is intended.
X_train = df_ex_3.drop(columns=['y']).to_numpy()
y_train = df_ex_3['y'].to_numpy()

In [98]:
# Observation: same scores as with mean imputation, which suggests the actual
# opened/closed quantities may be very far from the values we fill in.

# Display label -> scikit-learn scorer name.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Small, shallow forest (10 trees, depth 2) with a fixed seed.
clf_test = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    min_samples_split=2,
    max_features='sqrt',
    random_state=1,
)

# 5-fold cross-validation; train-fold scores are not collected.
results = cross_validate(
    clf_test, X_train, y_train,
    cv=5,
    scoring=list(scoring.values()),
    return_train_score=False,
)

# NOTE(review): the header below says "(entropy)", but no `criterion` is passed
# to RandomForestClassifier above, so the library default is used — confirm the label.
print('\nK-fold cross-validation results (entropy): ')
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    mean_score = fold_scores.mean()
    if scorer == 'neg_log_loss':
        # The scorer is the negated log loss; flip the sign back for display.
        mean_score = -mean_score
    print(type(clf_test).__name__ + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results (entropy): 
RandomForestClassifier average accuracy: 0.597 (+/-0.092)
RandomForestClassifier average log_loss: 0.667 (+/-0.040)
RandomForestClassifier average auc: 0.575 (+/-0.026)


#### TEST 4: Drop NOTHING. Fill NaNs with median value of column. Don't normalize. 

In [92]:
# Experiment 4 frame: all of df_train, with missing values in the two
# position-quantity columns replaced by that column's median. No scaling is applied.
# The trailing space in 'opened_position_qty ' is part of the column name used here.
qty_cols = ['opened_position_qty ', 'closed_position_qty']
nan_values = {qty_col: df_train[qty_col].median() for qty_col in qty_cols}
df_ex_4 = df_train.fillna(value=nan_values)

In [93]:
# Features = every column of df_ex_4 except the target 'y'. Selecting by label
# (instead of the former positional slice of the first 27 columns) keeps 'y'
# out of X_train even if the frame's column count or order changes.
# NOTE(review): this matches the old iloc[:, 0:27] only if 'y' is the last of
# 28 columns — confirm against the CSV header.
# NOTE(review): 'id' is not dropped from this frame, so it is passed to the
# model as a feature — confirm that is intended.
X_train = df_ex_4.drop(columns=['y']).to_numpy()
y_train = df_ex_4['y'].to_numpy()

In [94]:
# Observation: normalization doesn't make a difference to the scores.

# Display label -> scikit-learn scorer name.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Small, shallow forest (10 trees, depth 2) with a fixed seed.
clf_test = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    min_samples_split=2,
    max_features='sqrt',
    random_state=1,
)

# 5-fold cross-validation; train-fold scores are not collected.
results = cross_validate(
    clf_test, X_train, y_train,
    cv=5,
    scoring=list(scoring.values()),
    return_train_score=False,
)

# NOTE(review): the header below says "(entropy)", but no `criterion` is passed
# to RandomForestClassifier above, so the library default is used — confirm the label.
print('\nK-fold cross-validation results (entropy): ')
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    mean_score = fold_scores.mean()
    if scorer == 'neg_log_loss':
        # The scorer is the negated log loss; flip the sign back for display.
        mean_score = -mean_score
    print(type(clf_test).__name__ + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results (entropy): 
RandomForestClassifier average accuracy: 0.597 (+/-0.092)
RandomForestClassifier average log_loss: 0.667 (+/-0.040)
RandomForestClassifier average auc: 0.575 (+/-0.026)


#### TEST 5: Drop all bid[1-5][vol], ask[1-5][vol]. Don't keep opened_position_qty and closed_position_qty. Min-max normalize last_price and mid. Uses 8-fold CV (the other tests use 5). 

In [53]:
# Experiment 5 frame: df_train without 'id', the two position-quantity columns,
# and every bid/ask price and volume level (1-5).
# The trailing space in 'opened_position_qty ' is part of the column name used here.
ex5_dropped = (
    ['id', 'opened_position_qty ', 'closed_position_qty']
    + ['bid1', 'bid2', 'bid3', 'bid4', 'bid5']
    + ['ask1', 'ask2', 'ask3', 'ask4', 'ask5']
    + ['bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol']
    + ['ask1vol', 'ask2vol', 'ask3vol', 'ask4vol', 'ask5vol']
)
df_ex_5 = df_train.drop(columns=ex5_dropped)

In [86]:
# Min-max scale the two remaining price columns independently:
# (value - column min) / (column max - column min).
cols_to_norm = ['last_price', 'mid']
price_block = df_ex_5[cols_to_norm]
df_ex_5[cols_to_norm] = (price_block - price_block.min()) / (price_block.max() - price_block.min())

In [89]:
# Features = every column of df_ex_5 except the target 'y'. Selecting by label
# (instead of the former positional slice of the first 4 columns) keeps 'y'
# out of X_train even if the frame's column count or order changes.
# NOTE(review): this matches the old iloc[:, 0:4] only if 'y' is the last of
# 5 columns — confirm against the CSV header.
X_train = df_ex_5.drop(columns=['y']).to_numpy()
y_train = df_ex_5['y'].to_numpy()

In [91]:
# Observation: this is not good either.

# Display label -> scikit-learn scorer name.
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

# Small, shallow forest (10 trees, depth 2) with a fixed seed.
clf_test = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    min_samples_split=2,
    max_features='sqrt',
    random_state=1,
)

# 8-fold cross-validation here; train-fold scores are not collected.
results = cross_validate(
    clf_test, X_train, y_train,
    cv=8,
    scoring=list(scoring.values()),
    return_train_score=False,
)

# NOTE(review): the header below says "(entropy)", but no `criterion` is passed
# to RandomForestClassifier above, so the library default is used — confirm the label.
print('\nK-fold cross-validation results (entropy): ')
for label, scorer in scoring.items():
    fold_scores = results['test_%s' % scorer]
    mean_score = fold_scores.mean()
    if scorer == 'neg_log_loss':
        # The scorer is the negated log loss; flip the sign back for display.
        mean_score = -mean_score
    print(type(clf_test).__name__ + " average %s: %.3f (+/-%.3f)" % (label, mean_score, fold_scores.std()))


K-fold cross-validation results (entropy): 
RandomForestClassifier average accuracy: 0.638 (+/-0.013)
RandomForestClassifier average log_loss: 0.656 (+/-0.014)
RandomForestClassifier average auc: 0.518 (+/-0.027)
