In [33]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan
# loading machine learning required packages
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import time
# for multi-class logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.learning_curve import validation_curve
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

In [34]:
# Add two extra label columns: sign of the mid-price move and sign of the spread crossing
def AddMidPriceAndSpreadCrossing(df):
    """Return a copy of ``df`` with mid-price and spread-crossing label columns.

    Three columns are appended to the returned frame:

    * ``Mid_price``: ``(P_ask_1_ + P_bid_1_) / 2`` for each row.
    * ``MidPrice_Moves``: mid-price compared with the previous row
      (0: upward, 1: downward, 2: stationary).
    * ``SpdCros_Moves``: spread crossing compared with the previous row
      (0: upward, best bid above the previous best ask;
      1: downward, best ask below the previous best bid;
      2: no spread crossing).

    The first row has no predecessor and is labelled 2 in both label columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``P_ask_1_`` and ``P_bid_1_``.

    Returns
    -------
    pandas.DataFrame
        A shallow copy of ``df`` carrying the three extra columns; ``df``
        itself does not gain them. An empty input yields an empty frame
        that still has the three columns.

    Raises
    ------
    ValueError
        If a row cannot be assigned to any spread-crossing class, which
        only happens when a best bid/ask involved in the comparison is NaN.
    """
    df2 = df.copy(deep=False)
    best_ask = np.asarray(df2['P_ask_1_'], dtype=float)
    best_bid = np.asarray(df2['P_bid_1_'], dtype=float)
    n_rows = len(df2.index)

    mid = (best_ask + best_bid) / 2

    # Start with every row labelled 2, which is also the label of the first row.
    mid_moves = np.full(n_rows, 2, dtype=np.int64)
    spread_moves = np.full(n_rows, 2, dtype=np.int64)

    if n_rows > 1:
        # 1) mid-price movement: [0: upward; 1: downward; 2: stationary]
        mid_now, mid_prev = mid[1:], mid[:-1]
        mid_moves[1:] = np.select(
            [mid_now > mid_prev, mid_now < mid_prev], [0, 1], default=2)

        # 2) spread crossing: [0: upward; 1: downward; 2: no spread crossing]
        ask_now, ask_prev = best_ask[1:], best_ask[:-1]
        bid_now, bid_prev = best_bid[1:], best_bid[:-1]
        crossed_up = bid_now > ask_prev
        crossed_down = ask_now < bid_prev
        no_cross = (ask_now >= bid_prev) & (bid_now <= ask_prev)

        # The three conditions are exhaustive for real numbers; a row that
        # matches none of them involves NaN and cannot be labelled.
        unclassified = ~(crossed_up | crossed_down | no_cross)
        if unclassified.any():
            bad_rows = (np.flatnonzero(unclassified) + 1).tolist()
            raise ValueError(
                'Cannot classify spread crossing at row position(s) %s; '
                'check P_ask_1_ / P_bid_1_ for missing values.' % bad_rows[:10])

        # np.select picks the first matching condition, so an upward
        # crossing takes precedence over a downward one.
        spread_moves[1:] = np.select([crossed_up, crossed_down], [0, 1], default=2)

    df2['Mid_price'] = mid
    df2['MidPrice_Moves'] = mid_moves
    df2['SpdCros_Moves'] = spread_moves

    return df2

In [35]:
# Read the CSV at file_path and discard its 'Unnamed: 0' column
# (presumably a row index saved by an earlier to_csv call - verify).
file_path = "Messages_allDay_AAPL.csv"
df = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])

In [36]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_accu,V_accu,bid-ask spread 2,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5
0,34200.004241,1,16113575,18,5853300,1,2012-06-21 09:30:00.004241000,5859400,200,5853300,...,61500,683,6800,5856400.0,10000,5856000.0,18800,5859500.0,19800,5859600.0
1,34200.004261,1,16113584,18,5853200,1,2012-06-21 09:30:00.004261000,5859400,200,5853300,...,58000,670,6600,5856500.0,8000,5857000.0,17900,5859950.0,19400,5859800.0
2,34200.004447,1,16113594,18,5853100,1,2012-06-21 09:30:00.004447000,5859400,200,5853300,...,55000,741,6600,5856500.0,7900,5857050.0,15900,5860950.0,18500,5860250.0
3,34200.025552,1,16120456,18,5859100,-1,2012-06-21 09:30:00.025552000,5859100,18,5853300,...,44600,709,6200,5856300.0,6700,5856450.0,8000,5857000.0,17900,5859950.0
4,34200.02558,1,16120480,18,5859200,-1,2012-06-21 09:30:00.025579000,5859100,18,5853300,...,34900,427,6000,5856200.0,6300,5856250.0,6800,5856400.0,10000,5856000.0


In [37]:
# List the column labels of the loaded frame (after 'Unnamed: 0' was dropped).
df.columns

Index(['Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time', 'P_ask_1',
       'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2', 'P_bid_2',
       'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3', 'P_ask_4',
       'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5', 'P_bid_5',
       'V_bid_5', 'Label', 'Level', 'bid-ask spread 1', 'mid-price 1',
       'd_P_ask_51', 'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21', 'd_P_ask_32',
       'd_P_bid_32', 'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54', 'd_P_bid_54',
       'Mean_P_ask', 'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid', 'P_accu',
       'V_accu', 'bid-ask spread 2', 'mid-price 2', 'bid-ask spread 3',
       'mid-price 3', 'bid-ask spread 4', 'mid-price 4', 'bid-ask spread 5',
       'mid-price 5'],
      dtype='object')

In [38]:
### Convert back to dollars, for all price-related columns
OldCols = ['P','P_ask_1','P_bid_1','P_ask_2','P_bid_2','P_ask_3','P_bid_3','P_ask_4','P_bid_4','P_ask_5','P_bid_5',
        'bid-ask spread 1','mid-price 1','d_P_ask_51', 'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21', 'd_P_ask_32',
       'd_P_bid_32', 'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54', 'd_P_bid_54',
       'Mean_P_ask', 'Mean_P_bid','P_accu','bid-ask spread 2','mid-price 2','bid-ask spread 3','mid-price 3',
       'bid-ask spread 4','mid-price 4','bid-ask spread 5','mid-price 5']

# Raw price values are divided by this factor to obtain dollars.
PRICE_SCALE = 10000

# A column that exists neither in raw form nor in converted ('<name>_') form
# points at a schema problem, so fail loudly instead of skipping it.
missing = [name for name in OldCols
           if name not in df.columns and name + '_' not in df.columns]
if missing:
    raise KeyError('Price columns not found in df: %s' % missing)

# Only columns still present in raw form need converting; on a re-run of this
# cell the list is empty and the cell is a no-op instead of raising KeyError.
pending = [name for name in OldCols if name in df.columns]

for col in pending:
    # Vectorized division on the whole column replaces the per-row
    # iterrows() loop; the converted column is appended as '<name>_'.
    df[col + '_'] = (df[col] / PRICE_SCALE).astype(float)

# Drop the raw columns in one pass; the converted columns stay at the end of
# the frame in OldCols order.
df = df.drop(columns=pending)

In [39]:
# Build the labelled frame: a copy of df with the Mid_price, MidPrice_Moves
# and SpdCros_Moves columns added by AddMidPriceAndSpreadCrossing.
df_withY = AddMidPriceAndSpreadCrossing(df)

In [40]:
# List the column labels of the labelled frame: converted price columns carry
# a trailing '_' and the three new columns come last.
df_withY.columns

Index(['Time_stamp', 'Type', 'OrderID', 'V', 'Dir', 'Time', 'V_ask_1',
       'V_bid_1', 'V_ask_2', 'V_bid_2', 'V_ask_3', 'V_bid_3', 'V_ask_4',
       'V_bid_4', 'V_ask_5', 'V_bid_5', 'Label', 'Level', 'Mean_V_ask',
       'Mean_V_bid', 'V_accu', 'P_', 'P_ask_1_', 'P_bid_1_', 'P_ask_2_',
       'P_bid_2_', 'P_ask_3_', 'P_bid_3_', 'P_ask_4_', 'P_bid_4_', 'P_ask_5_',
       'P_bid_5_', 'bid-ask spread 1_', 'mid-price 1_', 'd_P_ask_51_',
       'd_P_bid_51_', 'd_P_ask_21_', 'd_P_bid_21_', 'd_P_ask_32_',
       'd_P_bid_32_', 'd_P_ask_43_', 'd_P_bid_43_', 'd_P_ask_54_',
       'd_P_bid_54_', 'Mean_P_ask_', 'Mean_P_bid_', 'P_accu_',
       'bid-ask spread 2_', 'mid-price 2_', 'bid-ask spread 3_',
       'mid-price 3_', 'bid-ask spread 4_', 'mid-price 4_',
       'bid-ask spread 5_', 'mid-price 5_', 'Mid_price', 'MidPrice_Moves',
       'SpdCros_Moves'],
      dtype='object')

In [41]:
from collections import Counter

# Count how often each label value occurs in the two target columns.
c1 = Counter(df_withY['MidPrice_Moves'].tolist())
c2 = Counter(df_withY['SpdCros_Moves'].tolist())

In [42]:
# Label counts for MidPrice_Moves (0: upward, 1: downward, 2: stationary).
c1

Counter({0: 32031, 1: 32319, 2: 225905})

In [43]:
# Label counts for SpdCros_Moves (0: upward, 1: downward, 2: no spread crossing).
c2

Counter({2: 290255})

In [44]:
# write to a new csv
import os

# splitext removes only the final extension, so dots elsewhere in the path
# (e.g. './data/file.csv') no longer truncate the output name.
newFile = os.path.splitext(file_path)[0] + '_withY.csv'
# to_csv keeps its default index=True, so the row index is written as an
# unnamed first column (it reads back as 'Unnamed: 0').
df_withY.to_csv(newFile)

In [45]:
# Preview the first rows of the labelled frame (head() defaults to five rows).
df_withY.head()

Unnamed: 0,Time_stamp,Type,OrderID,V,Dir,Time,V_ask_1,V_bid_1,V_ask_2,V_bid_2,...,mid-price 2_,bid-ask spread 3_,mid-price 3_,bid-ask spread 4_,mid-price 4_,bid-ask spread 5_,mid-price 5_,Mid_price,MidPrice_Moves,SpdCros_Moves
0,34200.004241,1,16113575,18,1,2012-06-21 09:30:00.004241000,200,18,200,150,...,585.64,1.0,585.6,1.88,585.95,1.98,585.96,585.635,2,2
1,34200.004261,1,16113584,18,1,2012-06-21 09:30:00.004261000,200,18,200,18,...,585.65,0.8,585.7,1.79,585.995,1.94,585.98,585.635,2,2
2,34200.004447,1,16113594,18,1,2012-06-21 09:30:00.004447000,200,18,200,18,...,585.65,0.79,585.705,1.59,586.095,1.85,586.025,585.635,2,2
3,34200.025552,1,16120456,18,-1,2012-06-21 09:30:00.025552000,18,18,200,18,...,585.63,0.67,585.645,0.8,585.7,1.79,585.995,585.62,1,2
4,34200.02558,1,16120480,18,-1,2012-06-21 09:30:00.025579000,18,18,18,18,...,585.62,0.63,585.625,0.68,585.64,1.0,585.6,585.62,2,2


In [46]:
# Column labels of the labelled frame again; no cell between the earlier
# df_withY.columns check and this one modifies the frame.
df_withY.columns

Index(['Time_stamp', 'Type', 'OrderID', 'V', 'Dir', 'Time', 'V_ask_1',
       'V_bid_1', 'V_ask_2', 'V_bid_2', 'V_ask_3', 'V_bid_3', 'V_ask_4',
       'V_bid_4', 'V_ask_5', 'V_bid_5', 'Label', 'Level', 'Mean_V_ask',
       'Mean_V_bid', 'V_accu', 'P_', 'P_ask_1_', 'P_bid_1_', 'P_ask_2_',
       'P_bid_2_', 'P_ask_3_', 'P_bid_3_', 'P_ask_4_', 'P_bid_4_', 'P_ask_5_',
       'P_bid_5_', 'bid-ask spread 1_', 'mid-price 1_', 'd_P_ask_51_',
       'd_P_bid_51_', 'd_P_ask_21_', 'd_P_bid_21_', 'd_P_ask_32_',
       'd_P_bid_32_', 'd_P_ask_43_', 'd_P_bid_43_', 'd_P_ask_54_',
       'd_P_bid_54_', 'Mean_P_ask_', 'Mean_P_bid_', 'P_accu_',
       'bid-ask spread 2_', 'mid-price 2_', 'bid-ask spread 3_',
       'mid-price 3_', 'bid-ask spread 4_', 'mid-price 4_',
       'bid-ask spread 5_', 'mid-price 5_', 'Mid_price', 'MidPrice_Moves',
       'SpdCros_Moves'],
      dtype='object')