In [1]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (11,6)
import pandas as pd
pd.set_option('display.max_columns', 50)

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [2]:
# Load the data set into `df` from a CSV located relative to the working directory.
# Later cells read its 'ordersuccess' column as the response variable.
df = pd.read_csv('data.csv')
# Optional quick-look summaries, left disabled:
# display(df.describe())
# display(df.head())

### Class Proportions (Response = 'ordersuccess')

In [3]:
# Partition the rows by the value of the response column.
success = df.loc[df['ordersuccess'].eq(1)]
fail = df.loc[df['ordersuccess'].eq(0)]

# Row counts per class, plus the overall row count (reused by the train/test split).
success_num, fail_num = success.shape[0], fail.shape[0]
total = df.shape[0]

# Proportions are taken over the rows labelled 0 or 1 rather than over `total`.
labelled_num = success_num + fail_num

print('Number of order successes: ', success_num)
print('Proportion of order successes: ', success_num / labelled_num)
print('Number of order failures: ', fail_num)
print('Proportion of order failures: ', fail_num / labelled_num)

Number of order successes:  28190
Proportion of order successes:  0.05021044213331149
Number of order failures:  533247
Proportion of order failures:  0.9497895578666885


## Preprocessing

In [4]:
# Columns treated as continuous features; they are carried over as-is.
cont_features = df[[
    'size', 'minexecqty', 'limitprice', 'prevailbid', 'prevailask',
    'prevailbidsize', 'prevailasksize', 'dispatcherrebalance', 'lotsize',
    'averagespread', 'misavgbidsize1min', 'misavgasksize1min',
    'misavgspread1min', 'misoddlotvolume1min', 'misadfvolume1min',
    'misvolume1min',
]]

# Categorical columns are one-hot encoded; drop_first omits the first level of each
# column so the indicator columns are not perfectly collinear.
dummies = pd.get_dummies(
    df[['side', 'venue', 'venuetype', 'securitycategory', 'sector', 'mktcap']].astype('category'),
    drop_first=True,
)

features = pd.concat([cont_features, dummies], axis='columns')

response = df['ordersuccess']

# NOTE: `df` is rebound to the model-ready frame (features + response). The raw
# categorical columns are no longer present afterwards, so this cell has to be run
# against the frame as loaded from disk.
df = pd.concat([features, response], axis='columns')

## Train / Test Split

In [5]:
# Positional split (no shuffling): the first ~70% of rows form the training set and
# the remaining rows form the test set.
# A single cut point guarantees the two parts are disjoint and together cover every
# row; rounding 70% and 30% independently can overlap or drop a row for some lengths.
split_at = round(len(df) * .70)
train = df.iloc[:split_at]
test = df.iloc[split_at:]

# Per-class views of each split, used for downsampling and inspection.
train_success = train[train['ordersuccess']==1]
train_fail = train[train['ordersuccess']==0]

test_success = test[test['ordersuccess']==1]
test_fail = test[test['ordersuccess']==0]

## Model Iterations

In [6]:
def train_model(train_fail, train_success, random_state=None):
    """Fit a random forest on a class-balanced subsample of the training data.

    The failure rows are downsampled (without replacement) to the number of
    success rows, combined with all success rows, shuffled, and used to fit a
    ``RandomForestClassifier`` with default hyper-parameters.

    Parameters
    ----------
    train_fail : pandas.DataFrame
        Training rows of the failure class; must contain an 'ordersuccess' column.
    train_success : pandas.DataFrame
        Training rows of the success class; must contain an 'ordersuccess' column.
    random_state : int or None, optional
        Seed forwarded to both sampling steps and to the classifier. The default
        ``None`` keeps the original unseeded behaviour; pass an integer to make
        the fitted model reproducible.

    Returns
    -------
    RandomForestClassifier
        The fitted model.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``train_success`` has more rows than
        ``train_fail`` (sampling is done without replacement).
    """
    # Draw as many failures as there are successes so both classes are equally represented.
    downsampled_fails = train_fail.sample(n=len(train_success), random_state=random_state)

    # Recombine and shuffle so the two classes are interleaved.
    train_downsampled = (
        pd.concat([downsampled_fails, train_success], axis=0)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # Split into predictors and response.
    train_features = train_downsampled.drop(columns='ordersuccess')
    train_response = train_downsampled['ordersuccess']

    # Train the model.
    RF = RandomForestClassifier(random_state=random_state)
    RF.fit(train_features, train_response)
    return RF


# Predictor matrices for both splits. Both indexes are reset so that the prediction
# columns built below (which carry the same fresh 0..n-1 index) line up row-for-row
# when concatenated column-wise.
train_features = train.loc[:, train.columns != 'ordersuccess'].reset_index(drop=True)
test_features = test.loc[:, test.columns != 'ordersuccess'].reset_index(drop=True)

# Train several models, each on a freshly drawn balanced subsample, and collect their
# hard predictions on the full train and test predictors as extra feature columns.
train_predictions = {}
test_predictions = {}
for i in range(0, 5):
    model = train_model(train_fail, train_success)
    column_name = "Model %s" % i
    train_predictions[column_name] = model.predict(train_features)
    test_predictions[column_name] = model.predict(test_features)

# Build each prediction frame once instead of growing it with concat inside the loop.
new_features_train = pd.DataFrame(train_predictions, index=train_features.index)
new_features_test = pd.DataFrame(test_predictions, index=test_features.index)

train_features = pd.concat([train_features, new_features_train], axis=1)
test_features = pd.concat([test_features, new_features_test], axis=1)

display(train_features.head())
display(test_features.head())

Unnamed: 0,size,minexecqty,limitprice,prevailbid,prevailask,prevailbidsize,prevailasksize,dispatcherrebalance,lotsize,averagespread,misavgbidsize1min,misavgasksize1min,misavgspread1min,misoddlotvolume1min,misadfvolume1min,misvolume1min,side_Sell,venue_BAML,venue_BARX,venue_BATS,venue_BATY,venue_BIDS,venue_BLKX,venue_CAES,venue_CDED,...,securitycategory_MISC,securitycategory_OTHER_DEP_RCPT,securitycategory_PREFERRED,securitycategory_REIT,securitycategory_RIGHT,securitycategory_UNIT,securitycategory_WARRANT,sector_Communications,sector_Consumer Cyclical,sector_Consumer Non-cyclical,sector_Diversified,sector_Energy,sector_Financial,sector_Government,sector_Industrial,sector_Technology,sector_Test,sector_Utilities,mktcap_MID,mktcap_SMALL,Model 0,Model 1,Model 2,Model 3,Model 4
0,600,0,5.78,5.78,5.82,200,200,0,100,0.033333,176,316,0.045597,373,1227,2100,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
1,500,0,5.78,5.78,5.82,200,200,0,100,0.033333,176,316,0.045597,373,1227,2100,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0
2,100,0,22.2,22.2,22.23,500,1700,0,100,0.037368,631,1767,0.02837,0,0,200,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,500,0,5.78,5.78,5.82,200,200,0,100,0.033333,176,316,0.045597,373,1227,2100,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,200,0,22.2,22.2,22.23,500,1700,0,100,0.037368,631,1767,0.02837,0,0,200,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


Unnamed: 0,size,minexecqty,limitprice,prevailbid,prevailask,prevailbidsize,prevailasksize,dispatcherrebalance,lotsize,averagespread,misavgbidsize1min,misavgasksize1min,misavgspread1min,misoddlotvolume1min,misadfvolume1min,misvolume1min,side_Sell,venue_BAML,venue_BARX,venue_BATS,venue_BATY,venue_BIDS,venue_BLKX,venue_CAES,venue_CDED,...,securitycategory_MISC,securitycategory_OTHER_DEP_RCPT,securitycategory_PREFERRED,securitycategory_REIT,securitycategory_RIGHT,securitycategory_UNIT,securitycategory_WARRANT,sector_Communications,sector_Consumer Cyclical,sector_Consumer Non-cyclical,sector_Diversified,sector_Energy,sector_Financial,sector_Government,sector_Industrial,sector_Technology,sector_Test,sector_Utilities,mktcap_MID,mktcap_SMALL,Model 0,Model 1,Model 2,Model 3,Model 4
0,300,100,25.77,25.77,25.81,900,1600,0,100,0.035556,672,1590,0.032889,0,0,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,400,100,25.77,25.77,25.81,900,1600,0,100,0.035556,672,1590,0.032889,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,300,100,25.77,25.77,25.81,900,1600,0,100,0.035556,672,1590,0.032889,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,400,100,25.77,25.77,25.81,900,1600,0,100,0.035556,672,1590,0.032889,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,300,100,25.77,25.77,25.81,900,1600,0,100,0.035556,672,1590,0.032889,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


## Model Evaluation Functions

In [7]:
def cm2df(cm, labels):
    """Convert a confusion-matrix array into a labelled pandas DataFrame.

    Parameters
    ----------
    cm : 2-D array supporting ``cm[i, j]`` indexing (e.g. a numpy array)
        Confusion matrix; entry ``[i, j]`` is placed at row ``labels[i]``,
        column ``labels[j]``. Only the top-left ``len(labels)`` square is read.
    labels : sequence
        Class labels used for both the row index and the column names.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``labels`` with columns ``labels`` (in that order).

    Raises
    ------
    IndexError
        If ``cm`` is smaller than ``len(labels)`` along either axis.
    """
    labels = list(labels)
    positions = range(len(labels))
    # Build the frame in one constructor call; DataFrame.append (used previously)
    # no longer exists in pandas 2.x and grew the frame one row at a time.
    cells = [[cm[i, j] for j in positions] for i in positions]
    return pd.DataFrame(cells, index=labels, columns=labels)

def model_eval(y_test, y_pred):
    """Display a labelled confusion matrix and print a classification report.

    Parameters
    ----------
    y_test : array-like
        True class labels (0 = order fail, 1 = order success).
    y_pred : array-like
        Predicted class labels, same length as ``y_test``.

    The class labels are passed explicitly to scikit-learn so the confusion
    matrix is always 2x2 and the report always has both rows, even when one of
    the classes does not occur in ``y_test`` / ``y_pred``.
    """
    class_labels = [0, 1]
    class_names = ['Order Fail', 'Order Success']

    conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
    cm_as_df = cm2df(conf_mat, class_labels)
    # Swap the numeric labels for readable names on both axes.
    new_names = dict(zip(class_labels, class_names))
    cm_as_df = cm_as_df.rename(index=new_names,
                               columns=new_names)
    print('\n')
    print('CONFUSION MATRIX (predicted along top, actual along side): ')
    display(cm_as_df)

    print('\n')
    print(classification_report(y_test, y_pred, labels=class_labels,
                                target_names=class_names))
    
def cross_validation(model, features, response, num_folds):
    """Cross-validate ``model`` and print macro precision, recall and F1 per fold.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator passed to ``cross_validate``.
    features : array-like
        Predictor matrix.
    response : array-like
        Target vector.
    num_folds : int
        Number of folds, forwarded as ``cv`` and reported in the printed headings.

    Prints the array of per-fold test scores for each macro-averaged metric.
    Returns nothing.
    """
    metrics = ['precision_macro', 'recall_macro', "f1_macro"]
    cv = cross_validate(model, features, response, scoring=metrics, cv=num_folds)
    # Report the fold count actually used instead of a hard-coded "5".
    print("Test data set average precision across %s folds:" % (num_folds,))
    print(cv['test_precision_macro'])
    print("\nTest data set average recall across %s folds:" % (num_folds,))
    print(cv['test_recall_macro'])
    print("\nTest data set average fscore across %s folds:" % (num_folds,))
    print(cv['test_f1_macro'])
    
def plot_roc_curve(model, X_test, y_test):
    """Print the ROC AUC of ``model`` on the given data and plot its ROC curve.

    ``model`` must expose ``predict_proba``. Column 1 of its output is used as the
    score (assumes the second class of the model is the positive label).
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    print('AUC: %.3f' % roc_auc_score(y_test, positive_scores))

    # Points of the ROC curve; the thresholds themselves are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)

    # Dashed diagonal = no-skill reference, drawn first; the model's curve goes on top.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot(false_pos_rate, true_pos_rate, marker='.')

    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

## Random Forest

In [8]:

# model_eval(test_response, y_pred)

# plot_roc_curve(RF, test_features, test_response)

# cross_validation(RF, features, response, 5)

# print(sorted(zip(map(lambda x: round(x, 4), RF.feature_importances_), train_features.columns), 
#              reverse=True))