In [2]:
#general
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (9,4)
import pandas as pd
pd.set_option('display.max_columns', 50)
import random

#preprocessing
from datetime import datetime, timedelta
import category_encoders as ce

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#model validation / evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

## Model Evaluation Functions

In [3]:
def cm2df(cm, labels):
    """Convert a confusion matrix into a pandas DataFrame with class labels.

    Args:
        cm: 2-D array indexable as ``cm[i, j]`` (for example the numpy array
            returned by ``confusion_matrix``).
        labels: class labels, in the same order as the rows/columns of ``cm``.

    Returns:
        DataFrame whose index and columns are both ``labels`` and whose
        ``(i, j)`` cell holds ``cm[i, j]``.
    """
    labels = list(labels)
    n_labels = len(labels)
    # Build the frame in a single constructor call: DataFrame.append no longer
    # exists in pandas 2.x, and appending row by row copies the frame each time.
    rows = [[cm[i, j] for j in range(n_labels)] for i in range(n_labels)]
    return pd.DataFrame(rows, index=labels, columns=labels)

def model_eval(y_test, y_pred):
    """Print a labelled confusion matrix and a classification report.

    Class 0 is reported as 'Order Fail' and class 1 as 'Order Success'.

    Args:
        y_test: true binary labels.
        y_pred: predicted binary labels, aligned with ``y_test``.
    """
    class_labels = [0, 1]
    class_names = ['Order Fail', 'Order Success']

    # Pin the label set explicitly. Without it, confusion_matrix only covers
    # the classes that actually occur, so an all-one-class prediction yields a
    # 1x1 matrix that cannot be indexed with two labels.
    conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)
    cm_as_df = cm2df(conf_mat, class_labels)
    new_names = dict(zip(class_labels, class_names))
    cm_as_df = cm_as_df.rename(index=new_names,
                               columns=new_names)
    print('\n')
    print('CONFUSION MATRIX (predicted along top, actual along side): ')
    display(cm_as_df)

    print('\n')
    # Same reason for labels= here: target_names must line up with the labels.
    print(classification_report(y_test, y_pred, labels=class_labels,
                                target_names=class_names))
    
def cross_validation(model, features, response, num_folds):
    """Cross-validate ``model`` and print macro precision, recall and F1 per fold.

    Args:
        model: estimator passed to ``cross_validate``.
        features: feature matrix.
        response: target vector.
        num_folds: value forwarded to ``cross_validate`` as ``cv``.
    """
    metrics = ['precision_macro', 'recall_macro', "f1_macro"]
    cv = cross_validate(model, features, response, scoring=metrics, cv=num_folds)
    # Report the fold count that was actually requested rather than a fixed "5".
    print("Test data set average precision across {} folds:".format(num_folds))
    print(cv['test_precision_macro'])
    print("\nTest data set average recall across {} folds:".format(num_folds))
    print(cv['test_recall_macro'])
    print("\nTest data set average fscore across {} folds:".format(num_folds))
    print(cv['test_f1_macro'])
    
def plot_roc_curve(model, X_test, y_test):
    """Print the ROC AUC of a fitted binary classifier and plot its ROC curve.

    Args:
        model: fitted classifier exposing ``predict_proba`` or, failing that,
            ``decision_function``.
        X_test: features to score.
        y_test: true binary labels for ``X_test``.
    """
    # Prefer the positive-class probability; fall back to the decision scores
    # for models that do not provide probabilities.
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)

    auc = roc_auc_score(y_test, scores)
    print('AUC: %.3f' % auc)

    # calculate roc curve (thresholds are not needed for the plot)
    fpr, tpr, _thresholds = roc_curve(y_test, scores)
    # plot no skill curve
    plt.plot([0, 1], [0, 1], linestyle='--')
    # plot the roc curve for the model
    plt.plot(fpr, tpr, marker='.')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()

In [4]:
# Load the raw data set from data.csv (path is relative to the working directory).
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index - consider whether it should be read with index_col=0.
df = pd.read_csv('data.csv')
# display(df.describe())
# Preview the first rows of the raw frame.
display(df.head())

Unnamed: 0,symbol,tradedate,starttime,side,size,strategyphase,venue,venuetype,ordertype,peginstruction,tif,minexecqty,limitprice,prevailbid,prevailask,prevailbidsize,prevailasksize,dispatcherrebalance,lotsize,securitycategory,sector,mktcap,adv20d,averagespread,misbin,misavgbidsize1min,misavgasksize1min,misavgspread1min,misoddlotvolume1min,misadfvolume1min,misvolume1min,lastordersuccess,lastorderfullsuccess,lastorderduration,lastfillduration,ordersuccess
0,C02112214394819,20180914,10:22:03.668,Sell,600,PingDarkTouch,BARX,Dark,LIMIT,NONE,IOC,0,5.78,5.78,5.82,200,200,0,100,COMMON,Energy,SMALL,0.0,0.033333,10:22:00,176,316,0.045597,373,1227,2100,0.0,0.0,862392.0,,0
1,C02112214394819,20180914,10:22:03.668,Sell,500,PingDarkTouch,CICX,Dark,LIMIT,NONE,IOC,0,5.78,5.78,5.82,200,200,0,100,COMMON,Energy,SMALL,0.0,0.033333,10:22:00,176,316,0.045597,373,1227,2100,0.0,0.0,862392.0,,0
2,C102221371819,20180914,10:07:16.012,Sell,100,PingDarkPxImp,UBSA,Dark,LIMIT,MID,IOC,0,22.2,22.2,22.23,500,1700,0,100,FUND,,SMALL,0.0,0.037368,10:07:00,631,1767,0.02837,0,0,200,0.0,0.0,361595.0,,0
3,C02112214394819,20180914,10:22:03.668,Sell,500,PingDarkTouch,JPMX,Dark,LIMIT,NONE,IOC,0,5.78,5.78,5.82,200,200,0,100,COMMON,Energy,SMALL,0.0,0.033333,10:22:00,176,316,0.045597,373,1227,2100,0.0,0.0,862392.0,,0
4,C102221371819,20180914,10:07:16.013,Sell,200,PingDarkTouch,JSES,Single-Dealer,LIMIT,NONE,IOC,0,22.2,22.2,22.23,500,1700,0,100,FUND,,SMALL,0.0,0.037368,10:07:00,631,1767,0.02837,0,0,200,,,,,0


### Class Proportions (Response = 'ordersuccess')

In [5]:
# Split the rows by outcome to measure how imbalanced the response is.
outcome = df['ordersuccess']
success = df[outcome == 1]
fail = df[outcome == 0]

success_num, fail_num = len(success), len(fail)
total = len(df)
# Denominator for the proportions: rows labelled either 0 or 1.
labelled_num = success_num + fail_num

print('Number of order successes: ', success_num)
print('Proportion of order successes: ', success_num / labelled_num)
print('Number of order failures: ', fail_num)
print('Proportion of order failures: ', fail_num / labelled_num)

Number of order successes:  28190
Proportion of order successes:  0.05021044213331149
Number of order failures:  533247
Proportion of order failures:  0.9497895578666885


## Preprocessing

In [6]:
def getTimeMS(t):
    """Convert an 'HH:MM:SS' or 'HH:MM:SS.fff' string to milliseconds since midnight.

    Args:
        t: time-of-day string; the part after the '.' (if any) is the
            fractional second, with any number of digits.

    Returns:
        Milliseconds since 00:00:00 as a float.

    Raises:
        ValueError: if the time or the fractional part cannot be parsed.
    """
    timestamp, _, fraction = t.partition('.')
    time_obj = datetime.strptime(timestamp, '%H:%M:%S')
    # strptime fills in 1900-01-01 for the missing date, so subtracting that
    # date at midnight leaves just the time of day.
    epoch = datetime(1900, 1, 1, 0, 0, 0, 0)
    milliseconds = (time_obj - epoch) // timedelta(milliseconds=1)
    # The digits after the '.' are a fraction of a second: '668' means 0.668 s,
    # i.e. 668 ms. (Multiplying the digits by 1000 would add 668000 ms.)
    if fraction:
        fraction_ms = int(fraction) * 1000 / 10 ** len(fraction)
    else:
        fraction_ms = 0.0
    return milliseconds + fraction_ms

# Order start time as a numeric feature.
df['time_ms'] = df['starttime'].apply(getTimeMS)


#continuous features
cont_features = df.loc[:,['time_ms','size','minexecqty','limitprice','prevailbid','prevailask','prevailbidsize','prevailasksize',
                          'dispatcherrebalance','lotsize','averagespread','misavgbidsize1min','misavgasksize1min', 
                          'misavgspread1min', 'misoddlotvolume1min', 'misadfvolume1min','misvolume1min']]

#one-hot encoded features
dummies = pd.get_dummies(df.loc[:,['side','venuetype','securitycategory', 'peginstruction',
                                   'sector','mktcap']].astype('category'),drop_first=True)

#binary encoded features
# `.values` replaces Series.as_matrix(), which newer pandas releases no longer
# provide. The encoded columns are renamed by position (symbol_0, symbol_1, ...)
# rather than through a fixed-size lookup, so no column is left with its
# generic name if the number of distinct symbols or venues changes.
enc = ce.BinaryEncoder()
binary_sym = enc.fit_transform(df.loc[:,'symbol'].values)
binary_sym.columns = ['symbol_%d' % bit for bit in range(binary_sym.shape[1])]

binary_venue = enc.fit_transform(df.loc[:,'venue'].values)
binary_venue.columns = ['venue_%d' % bit for bit in range(binary_venue.shape[1])]


# Missing 20-day ADV values are treated as zero.
adv20d = df.loc[:,'adv20d'].fillna(0)


features = pd.concat([adv20d, cont_features, dummies, binary_sym, binary_venue], axis=1)
response = df.loc[:,'ordersuccess']

# NOTE: this rebinds `df` to the model-ready frame (features + response), so
# the raw columns such as 'starttime' are gone and this cell cannot be re-run
# without re-loading the data first.
df = pd.concat([features,response],axis=1)



## Train / Test Split

In [9]:
# Shuffle once with a fixed seed so the split is reproducible across runs.
SPLIT_SEED = 0
TRAIN_FRACTION = .70
df = df.sample(frac=1, random_state=SPLIT_SEED)

# Cut at a single boundary. Rounding the 70% and 30% sizes independently can
# make the two slices overlap by a row (or skip one) for some row counts,
# which would leak a test row into the training set.
n_train = round(len(df) * TRAIN_FRACTION)
train = df.iloc[:n_train].reset_index(drop=True)
test = df.iloc[n_train:].reset_index(drop=True)

train_success = train[train['ordersuccess']==1]
train_success = train_success.reset_index(drop=True)
# train_fail deliberately keeps train's index labels: they are used later to
# draw random subsets of failures with .loc.
train_fail = train[train['ordersuccess']==0]

test_success = test[test['ordersuccess']==1]
test_fail = test[test['ordersuccess']==0]

## Model Iterations

In [38]:
def train_model(train_fail, train_success, sample, random_state=None):
    """Fit a random forest on ``train_success`` plus a subset of ``train_fail``.

    Args:
        train_fail: frame from which the rows labelled ``sample`` are taken.
        train_success: frame that is used in full.
        sample: index labels of the ``train_fail`` rows to include.
        random_state: optional seed forwarded to the shuffle and to the
            forest. The default ``None`` leaves both unseeded.

    Returns:
        The fitted ``RandomForestClassifier``. Both frames must contain an
        'ordersuccess' column, which is used as the response; every other
        column is used as a feature.
    """
    #downsampled_fails = train_fail.sample(n=len(train_success))
    downsampled_fails = train_fail.loc[sample,:]
    #recombine and shuffle data
    train_downsampled = pd.concat([downsampled_fails, train_success],axis=0)
    train_downsampled = train_downsampled.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    #split into x and y
    train_features = train_downsampled.loc[:,train_downsampled.columns != 'ordersuccess']
    train_response = train_downsampled.loc[:,'ordersuccess']
    
    #train model
    #tune parameters
    RF = RandomForestClassifier(random_state=random_state)
    RF.fit(train_features, train_response)
    return RF


train_features = train.loc[:,train.columns != 'ordersuccess']
test_features = test.loc[:,test.columns != 'ordersuccess']
test_features = test_features.reset_index(drop=True)
# Number of original feature columns; the per-model prediction columns are
# appended after these, so this is where the prediction block starts.
n_base_features = train_features.shape[1]

# Each iteration trains a forest on all successes plus an equally sized random
# draw of failures, then stores its predictions on the full train and test
# sets as a new column. The loop stops once the draws have covered the target
# share of the failures, or when the iteration counter reaches its cap.
FAIL_COVERAGE_TARGET = .8
MAX_ITERATIONS = 50

# Predictions are collected per model and turned into frames once, after the
# loop, instead of concatenating a growing frame on every iteration.
train_predictions = {}
test_predictions = {}
indices_used = set()
total_indices_train_fails = train_fail.index.tolist()
successes = len(train_success)
iterations = 1
while len(indices_used) < FAIL_COVERAGE_TARGET*len(train_fail) and iterations < MAX_ITERATIONS:
    # Name the column after the current iteration so every model gets its own
    # column (the loop has no variable `i` to name it from).
    model_name = "Model %s" % iterations
    iterations+=1
    sample = random.sample(total_indices_train_fails, successes)
    indices_used.update(sample)
    model = train_model(train_fail, train_success, sample)
    train_predictions[model_name] = model.predict(train_features)
    test_predictions[model_name] = model.predict(test_features)

# NOTE: the counter starts at 1, so this prints one more than the number of
# models trained (and the cap allows at most MAX_ITERATIONS - 1 models).
print(iterations)
new_features_train = pd.DataFrame(train_predictions)
new_features_test = pd.DataFrame(test_predictions)
train_features = pd.concat([train_features, new_features_train], axis=1)
test_features = pd.concat([test_features, new_features_test], axis=1)

train_response = train.loc[:,'ordersuccess']
test_response = test.loc[:,'ordersuccess']
test_response = test_response.reset_index(drop=True)

#dataframes to store only the model prediction columns
train_model_pred = train_features.iloc[:,n_base_features:]
test_model_pred = test_features.iloc[:,n_base_features:]



KeyboardInterrupt: 

## Random Forest Feature Selection

In [27]:
# Fit a forest on the model-prediction columns only, to rank them by importance.
RF = RandomForestClassifier()
RF.fit(train_model_pred, train_response)

#select top n features
# Pair each rounded importance with its column name and keep the ten largest;
# ties on the rounded importance fall back to the column name.
scored_columns = [
    (round(importance, 4), column)
    for importance, column in zip(RF.feature_importances_, train_model_pred.columns)
]
important_features = sorted(scored_columns, reverse=True)[:10]
print(important_features)

NameError: name 'train_model_pred' is not defined

## Logistic Regression

In [None]:
# Fit logistic regression on the model-prediction columns (fit returns the estimator).
logreg = LogisticRegression().fit(train_model_pred, train_response)

# Score the held-out set: confusion matrix / report first, then the ROC curve.
y_pred = logreg.predict(test_model_pred)
model_eval(test_response, y_pred)
plot_roc_curve(logreg, test_model_pred, test_response)

## Linear SVC

In [None]:
clf = svm.LinearSVC().fit(train_model_pred, train_response) #linear kernel function
y_pred = clf.predict(test_model_pred)

model_eval(test_response, y_pred)

# ROC for the SVC itself (passing `logreg` here would just re-plot the logistic
# regression curve). LinearSVC exposes decision_function rather than
# predict_proba, so the curve is computed from those scores.
svc_scores = clf.decision_function(test_model_pred)
print('AUC: %.3f' % roc_auc_score(test_response, svc_scores))

svc_fpr, svc_tpr, svc_thresholds = roc_curve(test_response, svc_scores)
# plot no skill curve
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(svc_fpr, svc_tpr, marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

## Linear SVC with Feature Selection

In [None]:
# important_features holds (importance, column name) pairs; keep only the names.
names = [column for _importance, column in important_features]

# Refit the linear SVC on the selected columns only and evaluate on the test set.
selected_train = train_model_pred.loc[:, names]
selected_test = test_model_pred.loc[:, names]
clf = svm.LinearSVC().fit(selected_train, train_response) #linear kernel function
y_pred = clf.predict(selected_test)
model_eval(test_response, y_pred)

## Neural Net (sklearn)

In [None]:
# Hyper-parameters of the multilayer perceptron, kept in one place.
mlp_params = dict(
    activation='relu',
    alpha=1e-05,
    batch_size='auto',
    beta_1=0.95,
    beta_2=0.9995,
    early_stopping=False,
    epsilon=1e-05,
    hidden_layer_sizes=(100, 100),
    learning_rate='constant',
    learning_rate_init=0.01,
    max_iter=3000,
    momentum=0.9,
    nesterovs_momentum=True,
    power_t=0.5,
    random_state=0,
    shuffle=True,
    solver='adam',
    tol=0.001,
    validation_fraction=0.1,
    verbose=True,
    warm_start=False,
)
clfANN = MLPClassifier(**mlp_params)

# Train on the model-prediction columns and predict the held-out set.
clfANN.fit(train_model_pred, train_response)
y_pred = clfANN.predict(test_model_pred)
# Alternative: threshold clfANN.predict_proba(test_model_pred)[:,1] (e.g. >= 0.1)
# to adjust the classification threshold instead of using predict().

model_eval(test_response, y_pred)
plot_roc_curve(clfANN, test_model_pred, test_response)
# cross_validation(clfANN, features, response, 5)