In [1]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan
# loading machine learning required packages
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import time
# for multi-class logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.learning_curve import validation_curve
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold



In [2]:
df = pd.read_csv("Messages_withY_AMZN.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time',
       'P_ask_1', 'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2',
       'P_bid_2', 'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3',
       'P_ask_4', 'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5',
       'P_bid_5', 'V_bid_5', 'bid-ask spread 1', 'mid-price 1', 'd_P_ask_51',
       'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21', 'd_P_ask_32', 'd_P_bid_32',
       'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54', 'd_P_bid_54', 'Mean_P_ask',
       'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid', 'P_accu', 'V_accu',
       'bid-ask spread 2', 'mid-price 2', 'bid-ask spread 3', 'mid-price 3',
       'bid-ask spread 4', 'mid-price 4', 'bid-ask spread 5', 'mid-price 5',
       'Mid_price', 'MidPrice_Moves', 'SpdCros_Moves'],
      dtype='object')

In [4]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,...,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5,Mid_price,MidPrice_Moves,SpdCros_Moves
0,0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460000,2239500,100,...,2235300.0,9600,2235200.0,12500,2236250.0,17800,2235100.0,2235650.0,2,2
1,1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608000,2239500,100,...,2235850.0,9300,2235350.0,12100,2236450.0,14000,2237000.0,2238800.0,0,2
2,2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608000,2239500,100,...,2235700.0,9200,2235300.0,9600,2235200.0,12500,2236250.0,2238800.0,2,2
3,3,34200.189608,1,11534792,100,2237500,1,2012-06-21 09:30:00.189608000,2239500,100,...,2238550.0,8100,2235850.0,9300,2235350.0,12100,2236450.0,2238800.0,2,2
4,4,34200.189608,1,1365373,13,2240000,-1,2012-06-21 09:30:00.189608000,2239500,100,...,2238550.0,8100,2235850.0,9300,2235350.0,12100,2236450.0,2238800.0,2,2


### Experience with Logistic Regression

In [20]:
Xtrain = np.array(df[(df['Time_stamp']<=36000+60*30)]\
                  [df.columns.difference(['Time','Time_stamp', 'Unnamed: 0','MidPrice_Moves','SpdCros_Moves'])])
ytrain = np.array(df[(df['Time_stamp']<=36000+60*30)][['MidPrice_Moves']])
lr = LogisticRegression().fit(Xtrain, ytrain)
## on training data
yhat = lr.predict(Xtrain)
## calculate accuracy
print("training set accuracy is {0:.2f}.".format(accuracy_score(ytrain, yhat)))
Xtest = np.array(df[(df['Time_stamp']>36000+60*30) & (df['Time_stamp']<36600+60*30)]\
                 [df.columns.difference(['Time','Time_stamp', 'Unnamed: 0','MidPrice_Moves','SpdCros_Moves'])])
# on test set
yhat_test = lr.predict(Xtest)
ytest = np.array(df[(df['Time_stamp']>36000+60*30)&(df['Time_stamp']<36600+60*30)][['MidPrice_Moves']])
#print(len(ytest),len(yhat_test))
# on testing data
print("testing set accuracy is {0:.2f}.".format(accuracy_score(ytest, yhat_test)))

training set accuracy is 0.81.
testing set accuracy is 0.83.


### Tuning... (without K-fold)

In [47]:
C_param_range = [1e-5,1e-4,0.001,0.01,0.1,1,10,100,1000,1e4,1e5]

In [48]:
midPriceMoves_acc_table = pd.DataFrame(columns = ['C_parameter','Accuracy'])
midPriceMoves_acc_table['C_parameter'] = C_param_range
#plt.figure(figsize=(10, 10))

In [49]:
j = 0 
for i in C_param_range:
    # apply logistic regression model to traing data
    lr = LogisticRegression(penalty='l2',C=i,random_state=0)
    lr.fit(Xtrain,ytrain)
    # predicting using the model
    y_pred = lr.predict(Xtest)
    # saving accuracy score in table
    midPriceMoves_acc_table.iloc[j,1] = accuracy_score(ytest,y_pred)
    j += 1

In [73]:
midPriceMoves_acc_table

Unnamed: 0,C_parameter,Accuracy
0,1e-05,0.834106
1,0.0001,0.834106
2,0.001,0.834106
3,0.01,0.834106
4,0.1,0.834106
5,1.0,0.834106
6,10.0,0.834106
7,100.0,0.834106
8,1000.0,0.834106
9,10000.0,0.834106


### L2 Regularization, validation and test set split

In [69]:
# create 10 folds for cross-validation analysis
skf_l2 = StratifiedKFold(n_splits=10,random_state=100,shuffle=True)
midPriceMoves_acc_table_2 = pd.DataFrame(columns = ['C_parameter','Accuracy'])
midPriceMoves_acc_table_2['C_parameter'] = C_param_range

In [72]:
j = 0
for i in C_param_range:
    ###
    acc_sum = 0
    # apply logistic regression model to traing data
    lr = LogisticRegression(penalty='l2',C=i,random_state=100)
    # create training fold and validation fold
    for fold_train_index, fold_validation_index in skf_l2.split(Xtrain,ytrain):
        x_fold_train, x_fold_validation = Xtrain[fold_train_index], Xtrain[fold_validation_index]
        y_fold_train, y_fold_validation = ytrain[fold_train_index], ytrain[fold_validation_index]
        ### Covert array into dataframe
        x_fold_train_df = pd.DataFrame(x_fold_train)
        x_fold_validation_df = pd.DataFrame(x_fold_validation)
        y_fold_train_df = pd.DataFrame(y_fold_train)
        y_fold_validation_df = pd.DataFrame(y_fold_validation)
        ### train logistic regression model using training fold
        lr.fit(x_fold_train_df,y_fold_train_df)
        ### get the validation accuracy of the current lr model
        validation_accuracy_l2 = lr.score(x_fold_validation_df,y_fold_validation_df)
        ### saving it to the table
        acc_sum += validation_accuracy_l2
    # saving accuracy score in table
    midPriceMoves_acc_table_2.iloc[j,1] = acc_sum/10
    j += 1

In [74]:
midPriceMoves_acc_table_2

Unnamed: 0,C_parameter,Accuracy
0,1e-05,0.812753
1,0.0001,0.812753
2,0.001,0.812753
3,0.01,0.812753
4,0.1,0.812753
5,1.0,0.812753
6,10.0,0.812753
7,100.0,0.812753
8,1000.0,0.812753
9,10000.0,0.812753


### Testing. Putting things into a loop

In [5]:
def PredictY(tradingDay_df,trainingTime_mins,predictionTime_mins,target_Y,C_param):
    ## return two things:
    train_accs, test_accs = [], []
    # make a copy of the dataframe
    df = tradingDay_df.copy(deep=False)
    st_time,ed_time  = int(list(df['Time_stamp'])[0]), int(list(df['Time_stamp'])[-1])  # in seconds
    ## st and ed denote the start time and end time for each training session
    st, ed = st_time, st_time + trainingTime_mins * 60 
    # when last prediction time is not beyond the total trading time window
    while (ed + predictionTime_mins*60 <= ed_time):
        Xtrain = np.array(df[(df['Time_stamp']<=ed)]\
                  [df.columns.difference(['Time','Time_stamp', 'Unnamed: 0','MidPrice_Moves','SpdCros_Moves'])])
        ytrain = np.array(df[(df['Time_stamp']<=ed)][[target_Y]])
        # train a model
        lr = LogisticRegression(penalty='l2',C=C_param,random_state=0)
        lr.fit(Xtrain,ytrain)
        # on training data
        yhat = lr.predict(Xtrain)
        train_accs.append("%.2f" % accuracy_score(ytrain,yhat))
        # move on to test set
        Xtest = np.array(df[(df['Time_stamp']>ed) & (df['Time_stamp']<ed + predictionTime_mins*60)]\
                 [df.columns.difference(['Time','Time_stamp', 'Unnamed: 0','MidPrice_Moves','SpdCros_Moves'])])
        # on test set
        yhat_test = lr.predict(Xtest)
        ytest = np.array(df[(df['Time_stamp']>ed)&(df['Time_stamp']<ed + predictionTime_mins*60)][[target_Y]])
        test_accs.append("%.2f" % accuracy_score(ytest,yhat_test))
        ### moving the training session forward
        st += trainingTime_mins * 60
        ed += trainingTime_mins * 60
    ### changing the type
    for index, item in enumerate(test_accs):
        test_accs[index] = float(item)
    for index, item in enumerate(train_accs):
        train_accs[index] = float(item)
    return test_accs

In [6]:
predictionTimes = [1,5,10,20,30]
for times in predictionTimes:
    result = PredictY(tradingDay_df=df,trainingTime_mins=30,predictionTime_mins=times,
                         target_Y='MidPrice_Moves',C_param=0.1)
    print("for prediction time {}, average accuracy is {}".format(times, np.round(sum(result)/len(result),3)))

for prediction time 1, average accuracy is 0.795
for prediction time 5, average accuracy is 0.824
for prediction time 10, average accuracy is 0.829
for prediction time 20, average accuracy is 0.828
for prediction time 30, average accuracy is 0.826


In [10]:
trainingTimes = [10,20,30]
for times in trainingTimes:
    result = PredictY(tradingDay_df=df,trainingTime_mins=times,predictionTime_mins=10,
                         target_Y='MidPrice_Moves',C_param=0.1)
    print("for training time {}, average accuracy is {}".format(times, np.round(sum(result)/len(result),3)))

for training time 10, average accuracy is 0.825
for training time 20, average accuracy is 0.829
for training time 30, average accuracy is 0.829


In [11]:
### bigger loop; takes freakng long time
for t1 in trainingTimes:
    for t2 in predictionTimes:
        result = PredictY(tradingDay_df=df,trainingTime_mins=t1,predictionTime_mins=t2,
                         target_Y='MidPrice_Moves',C_param=0.1)
        print("for training time {} mins and prediction time {} mins, average accuracy is {}"
              .format(t1, t2, np.round(sum(result)/len(result),3)))

for training time 10 mins and prediction time 1 mins, average accuracy is 0.806
for training time 10 mins and prediction time 5 mins, average accuracy is 0.823
for training time 10 mins and prediction time 10 mins, average accuracy is 0.825
for training time 10 mins and prediction time 20 mins, average accuracy is 0.825
for training time 10 mins and prediction time 30 mins, average accuracy is 0.819
for training time 20 mins and prediction time 1 mins, average accuracy is 0.821
for training time 20 mins and prediction time 5 mins, average accuracy is 0.832
for training time 20 mins and prediction time 10 mins, average accuracy is 0.829
for training time 20 mins and prediction time 20 mins, average accuracy is 0.826
for training time 20 mins and prediction time 30 mins, average accuracy is 0.827
for training time 30 mins and prediction time 1 mins, average accuracy is 0.795
for training time 30 mins and prediction time 5 mins, average accuracy is 0.824
for training time 30 mins and pred

In [15]:
test_accs_20train_5predict = PredictY(df,20,5,target_Y='MidPrice_Moves',C_param=0.1)
#test_accs_20train_5predict

In [14]:
accs_table_20train_5predict = pd.DataFrame(columns = ['Start Time of training session','Accuracy'])
accs_table_20train_5predict['Accuracy'] = test_accs_20train_5predict
times = ['9:30','9:50','10:10','10:30','10:50','11:10','11:30','11:50',
         '12:10','12:30','12:50','1:10','1:30','1:50','2:10','2:30','2:50',
         '3:10','3:30']
accs_table_20train_5predict['Start Time of training session'] = times
accs_table_20train_5predict

Unnamed: 0,Start Time of training session,Accuracy
0,9:30,0.84
1,9:50,0.85
2,10:10,0.83
3,10:30,0.84
4,10:50,0.86
5,11:10,0.89
6,11:30,0.83
7,11:50,0.81
8,12:10,0.88
9,12:30,0.75


### Predicting values of mid-price 