In [32]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, accuracy_score, roc_curve, roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV

In [4]:
df = pd.read_csv('Messages_allDay_AMZN_withY.csv')

In [5]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Time_stamp,Type,OrderID,V,Dir,Time,V_ask_1,V_bid_1,V_ask_2,...,mid-price 2_,bid-ask spread 3_,mid-price 3_,bid-ask spread 4_,mid-price 4_,bid-ask spread 5_,mid-price 5_,Mid_price,MidPrice_Moves,SpdCros_Moves
0,0,34200.18961,1,11885113,21,1,30:00.2,100,21,100,...,223.585,0.93,223.535,1.21,223.645,1.4,223.7,223.88,2,2
1,1,34200.18961,1,3911376,20,-1,30:00.2,100,21,20,...,223.57,0.92,223.53,0.96,223.52,1.25,223.625,223.88,2,2
2,2,34200.18961,1,11534792,100,1,30:00.2,100,21,20,...,223.855,0.81,223.585,0.93,223.535,1.21,223.645,223.88,2,2
3,3,34200.18961,1,1365373,13,-1,30:00.2,100,21,20,...,223.855,0.81,223.585,0.93,223.535,1.21,223.645,223.88,2,2
4,4,34200.18961,1,11474176,2,1,30:00.2,100,21,20,...,223.855,0.34,223.82,0.82,223.59,1.18,223.66,223.88,2,2


In [6]:
df.shape

(153490, 59)

In [7]:
df.columns

Index(['Unnamed: 0', 'Time_stamp', 'Type', 'OrderID', 'V', 'Dir', 'Time',
       'V_ask_1', 'V_bid_1', 'V_ask_2', 'V_bid_2', 'V_ask_3', 'V_bid_3',
       'V_ask_4', 'V_bid_4', 'V_ask_5', 'V_bid_5', 'Label', 'Level',
       'Mean_V_ask', 'Mean_V_bid', 'V_accu', 'P_', 'P_ask_1_', 'P_bid_1_',
       'P_ask_2_', 'P_bid_2_', 'P_ask_3_', 'P_bid_3_', 'P_ask_4_', 'P_bid_4_',
       'P_ask_5_', 'P_bid_5_', 'bid-ask spread 1_', 'mid-price 1_',
       'd_P_ask_51_', 'd_P_bid_51_', 'd_P_ask_21_', 'd_P_bid_21_',
       'd_P_ask_32_', 'd_P_bid_32_', 'd_P_ask_43_', 'd_P_bid_43_',
       'd_P_ask_54_', 'd_P_bid_54_', 'Mean_P_ask_', 'Mean_P_bid_', 'P_accu_',
       'bid-ask spread 2_', 'mid-price 2_', 'bid-ask spread 3_',
       'mid-price 3_', 'bid-ask spread 4_', 'mid-price 4_',
       'bid-ask spread 5_', 'mid-price 5_', 'Mid_price', 'MidPrice_Moves',
       'SpdCros_Moves'],
      dtype='object')

In [8]:
# Extract observations between 10:30~10:55
df_simplify = df[(df['Time_stamp']>=37800)&(df['Time_stamp']<=39300)]

In [9]:
df_simplify.shape

(12274, 59)

In [10]:
# Original training set: 10:30~10:50
df_train = df_simplify[(df_simplify['Time_stamp']>=37800)&(df_simplify['Time_stamp']<=39000)]

In [11]:
df_train.shape

(8749, 59)

In [12]:
# Orginal testing set: first 10 rows after 10:50
df_test = df_simplify.iloc[8749:8799]

In [13]:
df_test.shape

(50, 59)

In [14]:
# Extract observations in the training set where y=0/1/2
df_train_y_0 = df_train[df_train['MidPrice_Moves'] == 0]
df_train_y_1 = df_train[df_train['MidPrice_Moves'] == 1]
df_train_y_2 = df_train[df_train['MidPrice_Moves'] == 2]

In [15]:
df_train_y_0.shape

(778, 59)

In [16]:
df_train_y_1.shape

(776, 59)

In [17]:
df_train_y_2.shape

(7195, 59)

In [18]:
# Keep y=0:y=1:y=2 = 1:1:2
df_train_y_2_new = df_train_y_2.sample(777*2)

In [19]:
df_train_y_2_new.shape

(1554, 59)

In [20]:
# Concat the dataframe to make a new training set
df_train_new = pd.concat([df_train_y_0,df_train_y_1,df_train_y_2_new])

In [21]:
df_train_new.shape

(3108, 59)

In [22]:
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [23]:
# Extract x and y variable in the training set and testing set
x_train = df_train_new.drop(['Unnamed: 0','Time_stamp','OrderID','Time','Label','MidPrice_Moves','SpdCros_Moves'],axis = 1)
y_train = df_train_new['MidPrice_Moves']
x_test = df_test.drop(['Unnamed: 0','Time_stamp','Time','OrderID','Label','MidPrice_Moves','SpdCros_Moves'],axis = 1)
y_test = df_test['MidPrice_Moves']

In [24]:
x_train.shape

(3108, 52)

In [25]:
y_train.shape

(3108,)

In [26]:
x_test.shape

(50, 52)

In [27]:
y_test.shape

(50,)

In [28]:
# Train Logistic Regression Model
LRM = linear_model.LogisticRegression(C=0.1, penalty='l2')
LRM.fit(x_train, y_train)

# Report training accuracy, testing accuracy
training_accuracy=LRM.score(x_train,y_train)
print('The training accuracy is:', training_accuracy)
testing_accuracy=LRM.score(x_test,y_test)
print('The testing accuracy is', testing_accuracy)

# Report testing set confusion matrix
y_true = y_test
y_pred = LRM.predict(x_test)
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred)) 
print("The testing set confusion matrix is:")
ConfusionMatrix

The training accuracy is: 0.706885456885
The testing accuracy is 0.82
The testing set confusion matrix is:


Unnamed: 0,0,1,2
0,1,1,0
1,1,7,0
2,1,6,33


In [62]:
def print_eval(model, x, y, threshold=None):
    """Prints some evaluation metrics"""
    if threshold is not None:
        y_pred = predict_threshold(model, x, threshold)
    else:
        y_pred = model.predict(x)
    print('{:15} {:.5f}'.format('Accuracy:', accuracy_score(y, y_pred)))
    print('{:15} {:.5f}'.format('Precision:', precision_score(y, y_pred,average='micro')))
    print('{:15} {:.5f}'.format('Recall:', recall_score(y, y_pred,average='micro')))
    # print('{:15} {:.5f}'.format('AUC:', roc_auc_score(y, y_pred)))
    print('{:15} {:.5f}'.format('F1 score:', f1_score(y, y_pred,average='micro')))    
    print('')
    # print_confusions(model, x, y, y_pred)
    
def predict_threshold(model, x, threshold):
    return (model.predict_proba(x)[:,1] > threshold).astype(int) 

def print_confusions(model, x, y, y_pred):
    conf_train = pd.DataFrame(data=confusion_matrix(y, y_pred), 
                          columns=[0, 1],
                          index=[0, 1]).iloc[::-1,::-1].T

    print('Confusion (rows: pred, cols: actual)\n{}'.format(conf_train))
    
# print recall curves vs threshold and precision

def plot_precision_recall_curve(model, x, y):
    precision, recall, thresholds = precision_recall_curve(y_true = y, probas_pred = model.predict_proba(x)[:,1])
    fpr, tpr, threshold = roc_curve(y_test, model.predict_proba(x)[:,1])
    
    # ROC curve
    fig, ax = plt.subplots(1,2,figsize = (10,5))
    ax[0].plot(fpr, tpr)
    ax[0].set_xlabel('FPR')
    ax[0].set_ylabel('TPR')
    ax[0].set_title('ROC Curve')
    
    # recall vs precision curve
    ax[1].plot(precision, recall)
    ax[1].set_xlabel('precision')
    ax[1].set_ylabel('recall')
    ax[1].set_title('Recall vs Precision')

    return precision, recall, thresholds

In [64]:
print_eval(model=LRM, x=x_test,y=y_test)

Accuracy:       0.82000
Precision:      0.82000
Recall:         0.82000
F1 score:       0.82000



In [65]:
# Report training set confusion matrix
y_true = y_train
y_pred = LRM.predict(x_train)
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred)) 
ConfusionMatrix

Unnamed: 0,0,1,2
0,479,285,14
1,316,452,8
2,136,152,1266


In [66]:
# Train SVM Model
from sklearn.svm import SVC

SVM = SVC()
SVM.fit(x_train, y_train)

training_accuracy_SVM=SVM.score(x_train,y_train)
print('The training accuracy of SVM model is:', training_accuracy_SVM)

test_accuracy_SVM=SVM.score(x_test,y_test)
print('The test accuracy of SVM model is', test_accuracy_SVM)

The training accuracy of SVM model is: 0.987773487773
The test accuracy of SVM model is 0.8


In [67]:
# Report testing set confusion matrix
y_true = y_test
y_pred = SVM.predict(x_test)
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred)) 
print('The testing set confusion matrix is:')
ConfusionMatrix

The testing set confusion matrix is:


Unnamed: 0,0,1,2
0,0,0,2
1,0,0,8
2,0,0,40


In [68]:
# Report trainig set confusion matrix
y_true = y_train
y_pred = SVM.predict(x_train)
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred)) 
print('The training set confusion matrix is:')
ConfusionMatrix

The training set confusion matrix is:


Unnamed: 0,0,1,2
0,764,0,14
1,1,758,17
2,2,4,1548
