In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, accuracy_score, roc_curve, roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the AMZN message file. Later cells filter on its 'Time_stamp' column and
# use 'MidPrice_Moves' as the prediction target.
df = pd.read_csv('Messages_allDay_AMZN_withY.csv')

In [3]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,...,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5,Mid_price,MidPrice_Moves,SpdCros_Moves
0,0,34200.18961,1,11885113,21,2238100,1,30:00.2,2239500,100,...,2235850,9300,2235350,12100,2236450,14000,2237000,2238800.0,2,2
1,1,34200.18961,1,3911376,20,2239600,-1,30:00.2,2239500,100,...,2235700,9200,2235300,9600,2235200,12500,2236250,2238800.0,2,2
2,2,34200.18961,1,11534792,100,2237500,1,30:00.2,2239500,100,...,2238550,8100,2235850,9300,2235350,12100,2236450,2238800.0,2,2
3,3,34200.18961,1,1365373,13,2240000,-1,30:00.2,2239500,100,...,2238550,8100,2235850,9300,2235350,12100,2236450,2238800.0,2,2
4,4,34200.18961,1,11474176,2,2236500,1,30:00.2,2239500,100,...,2238550,3400,2238200,8200,2235900,11800,2236600,2238800.0,2,2


In [4]:
df.shape

(153490, 59)

In [5]:
df.columns

Index(['Unnamed: 0', 'Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time',
       'P_ask_1', 'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2',
       'P_bid_2', 'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3',
       'P_ask_4', 'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5',
       'P_bid_5', 'V_bid_5', 'Label', 'Level', 'bid-ask spread 1',
       'mid-price 1', 'd_P_ask_51', 'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21',
       'd_P_ask_32', 'd_P_bid_32', 'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54',
       'd_P_bid_54', 'Mean_P_ask', 'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid',
       'P_accu', 'V_accu', 'bid-ask spread 2', 'mid-price 2',
       'bid-ask spread 3', 'mid-price 3', 'bid-ask spread 4', 'mid-price 4',
       'bid-ask spread 5', 'mid-price 5', 'Mid_price', 'MidPrice_Moves',
       'SpdCros_Moves'],
      dtype='object')

### Split training set and testing set

In [6]:
# Keep only rows whose Time_stamp lies in [37800, 39300], i.e. 10:30~10:55
# (both ends inclusive)
in_window = df['Time_stamp'].between(37800, 39300)
df_simplify = df[in_window]

In [7]:
df_simplify.shape

(12274, 59)

In [8]:
# Original training set: rows with Time_stamp in [37800, 39000], i.e. 10:30~10:50
# (both ends inclusive)
in_train_window = df_simplify['Time_stamp'].between(37800, 39000)
df_train = df_simplify[in_train_window]

In [9]:
df_train.shape

(8749, 59)

In [10]:
# Original testing set: first 50 rows after 10:50.
# Select by time instead of a hard-coded row offset (previously iloc[8749:8799],
# where 8749 was the training row count), so the test rows can never overlap the
# training window if the data or the window changes.
# NOTE(review): this matches the old slice as long as df_simplify is ordered by
# Time_stamp -- confirm for this file.
df_test = df_simplify[df_simplify['Time_stamp'] > 39000].head(50)

In [11]:
df_test.shape

(50, 59)

In [12]:
# Split the training set by target value: one frame each for y=0, y=1 and y=2
df_train_y_0, df_train_y_1, df_train_y_2 = (
    df_train[df_train['MidPrice_Moves'] == label] for label in (0, 1, 2)
)

In [13]:
df_train_y_0.shape

(778, 59)

In [14]:
df_train_y_1.shape

(776, 59)

In [15]:
df_train_y_2.shape

(7195, 59)

In [16]:
# Keep y=0:y=1:y=2 at roughly 1:1:2 by down-sampling the y=2 rows to the combined
# size of the other two classes (778 + 776 = 1554 here, the same count as 777*2).
# A fixed random_state makes the sample -- and every metric computed from it
# further down -- reproducible across kernel restarts.
n_majority = len(df_train_y_0) + len(df_train_y_1)
df_train_y_2_new = df_train_y_2.sample(n=n_majority, random_state=42)

In [17]:
df_train_y_2_new.shape

(1554, 59)

In [18]:
# Stack the y=0 and y=1 frames with the down-sampled y=2 frame to form the
# re-balanced training set
balanced_parts = [df_train_y_0, df_train_y_1, df_train_y_2_new]
df_train_new = pd.concat(balanced_parts)

In [19]:
df_train_new.shape

(3108, 59)

In [20]:
# Columns excluded from the feature matrix (shared by the train and test splits)
non_feature_cols = ['Unnamed: 0', 'Time_stamp', 'Time', 'OrderID', 'Label',
                    'MidPrice_Moves', 'SpdCros_Moves']
# Column used as the prediction target
label_col = 'MidPrice_Moves'

# Build the x (features) and y (target) pair for each split
x_train = df_train_new.drop(non_feature_cols, axis=1)
y_train = df_train_new[label_col]
x_test = df_test.drop(non_feature_cols, axis=1)
y_test = df_test[label_col]

In [21]:
x_train.shape

(3108, 52)

In [22]:
y_train.shape

(3108,)

In [23]:
x_test.shape

(50, 52)

In [24]:
y_test.shape

(50,)

### Try logistic regression model

In [37]:
def print_eval(model, x, y):
    """Print accuracy and weighted precision / recall / F1 of ``model`` on ``(x, y)``.

    ``model.predict(x)`` is called once and the predictions are scored against
    ``y``. Each metric is printed on its own line as a left-aligned label
    followed by the value with five decimals. Nothing is returned.
    """
    predictions = model.predict(x)
    weighted = {'average': 'weighted'}
    # (label, metric function, extra keyword arguments), in print order
    report = (
        ('Accuracy:', accuracy_score, {}),
        ('Precision:', precision_score, weighted),
        ('Recall:', recall_score, weighted),
        ('F1 score:', f1_score, weighted),
    )
    for label, metric, kwargs in report:
        print('{:15} {:.5f}'.format(label, metric(y, predictions, **kwargs)))

In [47]:
# Train Logistic Regression Model (L2 penalty, C=0.1)
LRM = linear_model.LogisticRegression(C=0.1, penalty='l2')
LRM.fit(x_train, y_train)

# Report training accuracy, testing accuracy
training_accuracy = LRM.score(x_train, y_train)
print('The training accuracy is:', training_accuracy)
testing_accuracy = LRM.score(x_test, y_test)
print('The testing accuracy is', testing_accuracy)

# Report testing set confusion matrix.
# labels is passed explicitly so the matrix is always 3x3 in the order 0, 1, 2:
# without it, a class that appears in neither y_true nor y_pred of this small
# test set would shrink the matrix and the fixed 3-name index/columns below
# would raise a shape-mismatch error.
y_true = y_test
y_pred = LRM.predict(x_test)
class_labels = [0, 1, 2]
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=class_labels),
                               columns=['Predicted {}'.format(c) for c in class_labels],
                               index=['Actual {}'.format(c) for c in class_labels])
print("The testing set confusion matrix is:")
ConfusionMatrix

The training accuracy is: 0.586229086229
The testing accuracy is 0.82
The testing set confusion matrix is:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,0,1,1
Actual 1,0,1,7
Actual 2,0,0,40


In [39]:
# Report testing set performance
print_eval(model=LRM, x=x_test,y=y_test)

Accuracy:       0.82000
Precision:      0.74667
Recall:         0.82000
F1 score:       0.75927


In [51]:
# Report training set confusion matrix for the logistic regression model.
# labels is passed explicitly so rows/columns are always in the order 0, 1, 2 and
# the matrix is always 3x3, matching the fixed index/column names below.
y_true = y_train
y_pred = LRM.predict(x_train)
class_labels = [0, 1, 2]
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=class_labels),
                               columns=['Predicted {}'.format(c) for c in class_labels],
                               index=['Actual {}'.format(c) for c in class_labels])
ConfusionMatrix

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,200,114,464
Actual 1,160,149,467
Actual 2,54,27,1473


In [52]:
# Report training set performance
print_eval(model=LRM, x=x_train,y=y_train)

Accuracy:       0.58623
Precision:      0.55558
Recall:         0.58623
F1 score:       0.52596


### Try SVM Model

In [53]:
# Fit a support vector classifier with scikit-learn's default settings
# NOTE(review): the recorded outputs show ~0.999 training accuracy while every
# test row is predicted as class 2; the features are not scaled before fitting,
# so standardising them may be worth trying.
from sklearn.svm import SVC

SVM = SVC().fit(x_train, y_train)

# Accuracy on the re-balanced training set
training_accuracy_SVM = SVM.score(x_train, y_train)
print('The training accuracy of SVM model is:', training_accuracy_SVM)

# Accuracy on the held-out test rows
test_accuracy_SVM = SVM.score(x_test, y_test)
print('The test accuracy of SVM model is', test_accuracy_SVM)

The training accuracy of SVM model is: 0.999356499356
The test accuracy of SVM model is 0.8


In [54]:
# Report testing set confusion matrix for the SVM model.
# labels is passed explicitly so the matrix is always 3x3 in the order 0, 1, 2:
# without it, a class that appears in neither y_true nor y_pred of this small
# test set would shrink the matrix and the fixed 3-name index/columns below
# would raise a shape-mismatch error.
y_true = y_test
y_pred = SVM.predict(x_test)
class_labels = [0, 1, 2]
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=class_labels),
                               columns=['Predicted {}'.format(c) for c in class_labels],
                               index=['Actual {}'.format(c) for c in class_labels])
print('The testing set confusion matrix is:')
ConfusionMatrix

The testing set confusion matrix is:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,0,0,2
Actual 1,0,0,8
Actual 2,0,0,40


In [44]:
# Report testing set performance
print_eval(model=SVM, x=x_test,y=y_test)

Accuracy:       0.80000
Precision:      0.64000
Recall:         0.80000
F1 score:       0.71111


In [55]:
# Report training set confusion matrix for the SVM model.
# labels is passed explicitly so rows/columns are always in the order 0, 1, 2 and
# the matrix is always 3x3, matching the fixed index/column names below.
y_true = y_train
y_pred = SVM.predict(x_train)
class_labels = [0, 1, 2]
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=class_labels),
                               columns=['Predicted {}'.format(c) for c in class_labels],
                               index=['Actual {}'.format(c) for c in class_labels])
print('The training set confusion matrix is:')
ConfusionMatrix

The training set confusion matrix is:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,776,0,2
Actual 1,0,776,0
Actual 2,0,0,1554


In [46]:
# Report training set performance
print_eval(model=SVM, x=x_train,y=y_train)

Accuracy:       0.99936
Precision:      0.99936
Recall:         0.99936
F1 score:       0.99936


### Try Random Forest Model

In [56]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest model with 1000 trees.
# random_state is fixed so the bootstrap samples and feature subsets -- and hence
# the reported accuracies and confusion matrices -- are reproducible across runs.
random_forest = RandomForestClassifier(n_estimators=1000, random_state=42)
## Train the random forest model using the training set
random_forest.fit(x_train, y_train)
## Report training accuracy
training_accuracy_random_forest = random_forest.score(x_train, y_train)
print('The training accuracy of random_forest model is:', training_accuracy_random_forest)
## Report test accuracy
test_accuracy_random_forest = random_forest.score(x_test, y_test)
print('The test accuracy of random_forest model is', test_accuracy_random_forest)

The training accuracy of random_forest model is: 1.0
The test accuracy of random_forest model is 0.88


In [57]:
# Report testing set confusion matrix for the random forest model.
# labels is passed explicitly so the matrix is always 3x3 in the order 0, 1, 2:
# without it, a class that appears in neither y_true nor y_pred of this small
# test set would shrink the matrix and the fixed 3-name index/columns below
# would raise a shape-mismatch error.
y_true = y_test
y_pred = random_forest.predict(x_test)
class_labels = [0, 1, 2]
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=class_labels),
                               columns=['Predicted {}'.format(c) for c in class_labels],
                               index=['Actual {}'.format(c) for c in class_labels])
print('The testing set confusion matrix is:')
ConfusionMatrix

The testing set confusion matrix is:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,2,0,0
Actual 1,0,8,0
Actual 2,1,5,34


In [58]:
# Report testing set performance
print_eval(model=random_forest, x=x_test,y=y_test)

Accuracy:       0.88000
Precision:      0.92513
Recall:         0.88000
F1 score:       0.88904


In [59]:
# Report training set confusion matrix for the random forest model.
# labels is passed explicitly so rows/columns are always in the order 0, 1, 2 and
# the matrix is always 3x3, matching the fixed index/column names below.
y_true = y_train
y_pred = random_forest.predict(x_train)
class_labels = [0, 1, 2]
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=class_labels),
                               columns=['Predicted {}'.format(c) for c in class_labels],
                               index=['Actual {}'.format(c) for c in class_labels])
print('The training set confusion matrix is:')
ConfusionMatrix

The training set confusion matrix is:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,778,0,0
Actual 1,0,776,0
Actual 2,0,0,1554


In [60]:
# Report training set performance
print_eval(model=random_forest, x=x_train,y=y_train)

Accuracy:       1.00000
Precision:      1.00000
Recall:         1.00000
F1 score:       1.00000


### Try Boosting Model

In [61]:
import xgboost as xgb
# Build an XGBoost classifier (n_estimators=1000) and fit it on the training set
# in a single chained call; fit() hands back the fitted estimator.
xgboost = xgb.XGBClassifier(n_estimators=1000).fit(x_train, y_train)

# Accuracy on the training set
training_accuracy_xgboost = xgboost.score(x_train, y_train)
print('The training accuracy of xgboost model is:', training_accuracy_xgboost)

# Accuracy on the test set
test_accuracy_xgboost = xgboost.score(x_test, y_test)
print('The test accuracy of xgboost model is', test_accuracy_xgboost)

The training accuracy of xgboost model is: 1.0
The test accuracy of xgboost model is 0.88


In [62]:
# Report testing set confusion matrix for the xgboost model.
# labels is passed explicitly so the matrix is always 3x3 in the order 0, 1, 2:
# without it, a class that appears in neither y_true nor y_pred of this small
# test set would shrink the matrix and the fixed 3-name index/columns below
# would raise a shape-mismatch error.
y_true = y_test
y_pred = xgboost.predict(x_test)
class_labels = [0, 1, 2]
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=class_labels),
                               columns=['Predicted {}'.format(c) for c in class_labels],
                               index=['Actual {}'.format(c) for c in class_labels])
print('The testing set confusion matrix is:')
ConfusionMatrix

The testing set confusion matrix is:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,2,0,0
Actual 1,0,8,0
Actual 2,2,4,34


In [63]:
# Report testing set performance
print_eval(model=xgboost, x=x_test,y=y_test)

Accuracy:       0.88000
Precision:      0.92667
Recall:         0.88000
F1 score:       0.88980


In [64]:
# Report training set confusion matrix for the xgboost model.
# labels is passed explicitly so rows/columns are always in the order 0, 1, 2 and
# the matrix is always 3x3, matching the fixed index/column names below.
y_true = y_train
y_pred = xgboost.predict(x_train)
class_labels = [0, 1, 2]
ConfusionMatrix = pd.DataFrame(confusion_matrix(y_true, y_pred, labels=class_labels),
                               columns=['Predicted {}'.format(c) for c in class_labels],
                               index=['Actual {}'.format(c) for c in class_labels])
print('The training set confusion matrix is:')
ConfusionMatrix

The training set confusion matrix is:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,778,0,0
Actual 1,0,776,0
Actual 2,0,0,1554


In [65]:
# Report training set performance
print_eval(model=xgboost, x=x_train,y=y_train)

Accuracy:       1.00000
Precision:      1.00000
Recall:         1.00000
F1 score:       1.00000
