In [72]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan
# loading machine learning required packages
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import time
# for multi-class logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.learning_curve import validation_curve

In [50]:
# Load the message table from a CSV file; the path is relative, so it is
# resolved against the notebook's working directory.
DATA_FILE = "Messages_withY_AMZN.csv"
df = pd.read_csv(DATA_FILE)

In [51]:
# Show the column labels of the loaded frame to see which fields are available.
df.columns

Index(['Unnamed: 0', 'Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time',
       'P_ask_1', 'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2',
       'P_bid_2', 'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3',
       'P_ask_4', 'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5',
       'P_bid_5', 'V_bid_5', 'bid-ask spread 1', 'mid-price 1', 'd_P_ask_51',
       'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21', 'd_P_ask_32', 'd_P_bid_32',
       'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54', 'd_P_bid_54', 'Mean_P_ask',
       'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid', 'P_accu', 'V_accu',
       'bid-ask spread 2', 'mid-price 2', 'bid-ask spread 3', 'mid-price 3',
       'bid-ask spread 4', 'mid-price 4', 'bid-ask spread 5', 'mid-price 5',
       'Mid_price', 'MidPrice_Moves', 'SpdCros_Moves'],
      dtype='object')

In [52]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,...,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5,Mid_price,MidPrice_Moves,SpdCros_Moves
0,0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460000,2239500,100,...,2235300.0,9600,2235200.0,12500,2236250.0,17800,2235100.0,2235650.0,2,2
1,1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608000,2239500,100,...,2235850.0,9300,2235350.0,12100,2236450.0,14000,2237000.0,2238800.0,0,2
2,2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608000,2239500,100,...,2235700.0,9200,2235300.0,9600,2235200.0,12500,2236250.0,2238800.0,2,2
3,3,34200.189608,1,11534792,100,2237500,1,2012-06-21 09:30:00.189608000,2239500,100,...,2238550.0,8100,2235850.0,9300,2235350.0,12100,2236450.0,2238800.0,2,2
4,4,34200.189608,1,1365373,13,2240000,-1,2012-06-21 09:30:00.189608000,2239500,100,...,2238550.0,8100,2235850.0,9300,2235350.0,12100,2236450.0,2238800.0,2,2


### Experiment with Logistic Regression

In [82]:
# Training rows: every message time-stamped at or before 36000 + 60*30 seconds.
train_rows = df['Time_stamp'] <= 36000+60*30
# Feature columns: everything except the time/index columns and the two
# `*_Moves` columns (one of which is used as the label).
feature_cols = df.columns.difference(['Time','Time_stamp', 'Unnamed: 0','MidPrice_Moves','SpdCros_Moves'])
Xtrain = np.array(df.loc[train_rows, feature_cols])

In [83]:
# Labels for the training rows (Time_stamp <= 36000 + 60*30 seconds).
# Selecting the single column by label (not by a one-element list) yields a
# 1-D array, which is the shape scikit-learn estimators expect for `y`; a
# (n, 1) column vector only works because it is implicitly flattened.
ytrain = np.array(df.loc[df['Time_stamp'] <= 36000+60*30, 'MidPrice_Moves'])

In [84]:
# Baseline model: logistic regression with the library's default settings,
# fitted on the training features and labels. `fit` returns the estimator
# itself, so `lr` is the fitted model.
baseline_model = LogisticRegression()
lr = baseline_model.fit(Xtrain, ytrain)

In [85]:
## In-sample predictions: apply the fitted model to the rows it was trained on
yhat = lr.predict(Xtrain)

In [86]:
## Fraction of training rows whose predicted label matches the true label
train_accuracy = accuracy_score(ytrain, yhat)
print("training set accuracy is {0:.2f}.".format(train_accuracy))

training set accuracy is 0.81.


In [87]:
# Hold-out rows: strictly after the training cut-off (36000 + 60*30 seconds)
# and strictly before 36600 + 60*30 seconds.
after_cutoff = df['Time_stamp'] > 36000+60*30
before_end = df['Time_stamp'] < 36600+60*30
# Same feature columns as the training matrix.
test_feature_cols = df.columns.difference(['Time','Time_stamp', 'Unnamed: 0','MidPrice_Moves','SpdCros_Moves'])
Xtest = np.array(df.loc[after_cutoff & before_end, test_feature_cols])

In [88]:
# Predict on the hold-out window and collect the matching true labels.
yhat_test = lr.predict(Xtest)
# Same row filter as the one used to build Xtest, so labels and predictions align.
test_rows = (df['Time_stamp'] > 36000+60*30) & (df['Time_stamp'] < 36600+60*30)
# Keep the labels 1-D so they have the same shape as `yhat_test`; a (n, 1)
# column vector would broadcast to (n, n) in any elementwise comparison.
ytest = np.array(df.loc[test_rows, 'MidPrice_Moves'])
# Sanity check: both arrays must have the same number of rows.
print(len(ytest),len(yhat_test))

4961 4961


In [90]:
# Fraction of hold-out rows whose predicted label matches the true label
test_accuracy = accuracy_score(ytest, yhat_test)
print("testing set accuracy is {0:.2f}.".format(test_accuracy))

testing set accuracy is 0.83.


### Tuning the regularization parameter C

In [91]:
# Candidate values for the `C` argument of LogisticRegression.
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100, 200, 1000]

# Results table: one row per candidate C. The 'Accuracy' column starts out
# empty and is filled in by position by the tuning loop.
acc_table_columns = ['C_parameter', 'Accuracy']
midPriceMoves_acc_table = pd.DataFrame(columns=acc_table_columns)
midPriceMoves_acc_table['C_parameter'] = C_param_range

In [92]:
# Fit one L2-penalised model per candidate C and record its hold-out accuracy.
# NOTE(review): the saved output shows the same accuracy for every C; this may
# mean the predictions do not change with C at all -- worth confirming.
for row_pos, C_value in enumerate(C_param_range):
    # train on the training window
    lr = LogisticRegression(penalty='l2', C=C_value, random_state=0)
    lr.fit(Xtrain, ytrain)
    # score on the hold-out window
    y_pred = lr.predict(Xtest)
    # column position 1 of the results table is 'Accuracy'
    midPriceMoves_acc_table.iloc[row_pos, 1] = accuracy_score(ytest, y_pred)

In [93]:
# Display the hold-out accuracy recorded for each candidate C.
midPriceMoves_acc_table

Unnamed: 0,C_parameter,Accuracy
0,0.001,0.834106
1,0.01,0.834106
2,0.1,0.834106
3,1.0,0.834106
4,10.0,0.834106
5,100.0,0.834106
6,200.0,0.834106
7,1000.0,0.834106


### Testing. Putting things into a loop

In [108]:
def PredictY(tradingDay_df,trainingTime_mins,predictionTime_mins,target_Y,C_param):
    """Walk-forward evaluation of an L1-penalised logistic regression.

    The first cut-off is ``trainingTime_mins`` minutes after the first time
    stamp. At each cut-off a model is fitted on *all* rows from the start of
    the frame up to and including the cut-off (an expanding window), scored on
    those same rows, and then scored on the rows that fall strictly between
    the cut-off and ``predictionTime_mins`` minutes later. The cut-off then
    advances by ``trainingTime_mins`` minutes. Iteration stops as soon as a
    full prediction window no longer fits before the last time stamp.

    Parameters
    ----------
    tradingDay_df : pandas.DataFrame
        Must contain a 'Time_stamp' column (in seconds) and the ``target_Y``
        column. The first and last rows are taken as the start and end of
        the period, so rows are assumed to be in chronological order.
        The frame is only read, never modified.
    trainingTime_mins : int or float
        Step, in minutes, by which the training cut-off advances. Must be > 0.
    predictionTime_mins : int or float
        Length, in minutes, of the window scored after each cut-off. Must be > 0.
    target_Y : str
        Name of the label column.
    C_param : float
        Value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    (train_accs, test_accs) : tuple of two lists of str
        One entry per cut-off: the training and prediction-window accuracy,
        each formatted with ``"%.2f"``. Both lists are empty when the frame
        has no rows or is too short for a single training + prediction window.

    Raises
    ------
    ValueError
        If ``trainingTime_mins`` or ``predictionTime_mins`` is not positive
        (a non-positive step would never advance the cut-off).
    """
    if trainingTime_mins <= 0 or predictionTime_mins <= 0:
        raise ValueError("trainingTime_mins and predictionTime_mins must be positive")

    train_accs, test_accs = [], []
    df = tradingDay_df
    if len(df) == 0:
        # nothing to train on; avoids indexing the first/last row of an empty column
        return train_accs, test_accs

    time_stamps = df['Time_stamp']
    st_time, ed_time = int(time_stamps.iloc[0]), int(time_stamps.iloc[-1])  # in seconds
    train_secs = trainingTime_mins * 60
    pred_secs = predictionTime_mins * 60

    # Everything except time/index columns and label columns is a feature.
    # The requested target is always excluded so it can never leak into X.
    feature_cols = df.columns.difference(
        ['Time', 'Time_stamp', 'Unnamed: 0', 'MidPrice_Moves', 'SpdCros_Moves', target_Y])

    ## ed is the (inclusive) end of the current training window
    ed = st_time + train_secs
    # continue while the prediction window still ends within the data
    while ed + pred_secs <= ed_time:
        train_rows = time_stamps <= ed
        test_rows = (time_stamps > ed) & (time_stamps < ed + pred_secs)

        Xtrain = np.array(df.loc[train_rows, feature_cols])
        # 1-D labels: the shape scikit-learn expects for y
        ytrain = np.array(df.loc[train_rows, target_Y])

        # The solver is named explicitly because only some solvers support the
        # 'l1' penalty and the library's default solver has changed over time.
        lr = LogisticRegression(penalty='l1', solver='liblinear', C=C_param, random_state=0)
        lr.fit(Xtrain, ytrain)

        # in-sample accuracy
        train_accs.append("%.2f" % accuracy_score(ytrain, lr.predict(Xtrain)))

        # accuracy on the window that follows the cut-off
        Xtest = np.array(df.loc[test_rows, feature_cols])
        ytest = np.array(df.loc[test_rows, target_Y])
        test_accs.append("%.2f" % accuracy_score(ytest, lr.predict(Xtest)))

        ### move the cut-off forward by one training step
        ed += train_secs
    return train_accs, test_accs

In [None]:
# Single walk-forward run: 30-minute training step, 10-minute prediction
# window, mid-price move as the label, C = 0.1.
walk_forward_args = dict(
    tradingDay_df=df,
    trainingTime_mins=30,
    predictionTime_mins=10,
    target_Y='MidPrice_Moves',
    C_param=0.1,
)
train_accs, test_accs = PredictY(**walk_forward_args)

In [106]:
# Repeat the walk-forward run for every candidate C and keep only the
# prediction-window accuracies (the second element returned by PredictY).
All_test_accs = [
    PredictY(tradingDay_df=df, trainingTime_mins=30, predictionTime_mins=10,
             target_Y='MidPrice_Moves', C_param=C_value)[1]
    for C_value in C_param_range
]

In [107]:
# One printed line per candidate C: the list of per-window test accuracies.
for window_accs in All_test_accs:
    print(window_accs)

['0.83', '0.83', '0.88', '0.87', '0.81', '0.84', '0.76', '0.82', '0.85', '0.86', '0.79', '0.81']
['0.83', '0.83', '0.88', '0.87', '0.81', '0.84', '0.76', '0.82', '0.85', '0.86', '0.79', '0.81']
['0.83', '0.83', '0.88', '0.87', '0.81', '0.84', '0.76', '0.82', '0.85', '0.86', '0.79', '0.81']
['0.83', '0.83', '0.88', '0.87', '0.81', '0.84', '0.76', '0.82', '0.85', '0.86', '0.79', '0.81']
['0.83', '0.83', '0.88', '0.87', '0.81', '0.84', '0.76', '0.82', '0.85', '0.86', '0.79', '0.81']
['0.83', '0.83', '0.88', '0.87', '0.81', '0.84', '0.76', '0.82', '0.85', '0.86', '0.79', '0.81']
['0.83', '0.83', '0.88', '0.87', '0.81', '0.84', '0.76', '0.82', '0.85', '0.86', '0.79', '0.81']
['0.83', '0.83', '0.88', '0.87', '0.81', '0.84', '0.76', '0.82', '0.85', '0.86', '0.79', '0.81']
