In [120]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math

In [121]:
# Create a Combined Dataframe from message csv and orderbook csv
def Merge_MsgAndOrdb(msg_path,ordb_path):
    """Load a message CSV and its order book CSV into one combined DataFrame.

    The number of order book levels and the trading date are parsed from the
    file name of ``msg_path``, which is expected to look like
    ``<ticker>_<YYYY-MM-DD>_<start>_<end>_message_<level>.csv``.

    Parameters
    ----------
    msg_path : str
        Path to the header-less message CSV. Its six columns are named
        Time_stamp, Type, OrderID, Size, Price and Direction.
    ordb_path : str
        Path to the header-less order book CSV. Its columns are named
        'Ask Price i', 'Ask Size i', 'Bid Price i', 'Bid Size i' for
        i = 1..level.

    Returns
    -------
    pandas.DataFrame
        The message columns, a 'Time' column (Time_stamp interpreted as
        seconds counted from midnight of the parsed date) and the order book
        columns, concatenated side by side on the row index.

    Raises
    ------
    ValueError
        If the level or the date cannot be parsed from the file name.
    """
    # Function-scope imports keep this cell runnable on its own.
    import os
    import re

    # Parse only the file name: dots or underscores in a directory part
    # (e.g. "./my_data/...") must not leak into the parsed fields.
    stem = os.path.basename(msg_path).split(".")[0]

    # get the level of the LOBSTER dataset; take *all* trailing digits so a
    # level such as 10 is not read as 0
    level_match = re.search(r"(\d+)$", stem)
    if level_match is None:
        raise ValueError("Cannot parse the order book level from file name: %r" % msg_path)
    level = int(level_match.group(1))

    # get the date (second underscore-separated field of the file name)
    fields = stem.split("_")
    if len(fields) < 2:
        raise ValueError("Cannot parse the date from file name: %r" % msg_path)
    date = fields[1]

    # add header row for message csv
    msg_names = ["Time_stamp", "Type", "OrderID", "Size","Price","Direction"]
    df_msg = pd.read_csv(msg_path,names=msg_names)

    # add header row for orderbook csv
    default_ordb_names = ['Ask Price','Ask Size','Bid Price','Bid Size']
    ordb_names = [item + " " + str(i+1)
                  for i in range(level)
                  for item in default_ordb_names]
    df_ordb = pd.read_csv(ordb_path,names=ordb_names)

    # adding a meaningful time column
    df_msg['Time'] = pd.to_datetime(df_msg['Time_stamp'],unit="s",origin=pd.Timestamp(date))

    # combine two dataframes and return it
    df = pd.concat([df_msg,df_ordb],axis=1)
    return df

In [122]:
# Build the combined message + order book frame for the AMZN sample files,
# then preview its first five rows.
df = Merge_MsgAndOrdb(
    msg_path='AMZN_2012-06-21_34200000_57600000_message_5.csv',
    ordb_path='AMZN_2012-06-21_34200000_57600000_orderbook_5.csv',
)
df.head()

Unnamed: 0,Time_stamp,Type,OrderID,Size,Price,Direction,Time,Ask Price 1,Ask Size 1,Bid Price 1,...,Bid Price 3,Bid Size 3,Ask Price 4,Ask Size 4,Bid Price 4,Bid Size 4,Ask Price 5,Ask Size 5,Bid Price 5,Bid Size 5
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,2230400,100,2242500,100,2230000,10,2244000,547,2226200,100
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2230700,200,2242500,100,2230400,100,2244000,547,2230000,10
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2230700,200,2240000,220,2230400,100,2242500,100,2230000,10
3,34200.189608,1,11534792,100,2237500,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2231800,100,2240000,220,2230700,200,2242500,100,2230400,100
4,34200.189608,1,1365373,13,2240000,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2231800,100,2240000,233,2230700,200,2242500,100,2230400,100


In [123]:
# Distinct IDs of all orders that have at least one event of Type 2
cancelledOrderID = set(df.loc[df['Type'] == 2, 'OrderID'].tolist())

# Readable label for each event-type code found in the 'Type' column
lookup = {
    1: 'Submission',
    2: 'Cancellation',
    3: 'Deletion',
    4: 'Execution Visible',
    5: 'Execution Hidden',
    7: 'Trading Halt',
}
lookup[1]

'Submission'

In [124]:
def RenameColumn(df,state_type):
    """Append ``" " + state_type`` to the state-dependent column names of df.

    A column is renamed when its name contains "Bid" or "Ask", or when it is
    exactly 'Size', 'Price' or 'Time'. Every other column keeps its name.
    The frame is modified in place and nothing is returned.
    """
    exact_names = ('Size', 'Price', 'Time')

    def _is_state_column(name):
        # Book columns are matched by substring, message columns by exact name.
        return ("Bid" in name) or ("Ask" in name) or (name in exact_names)

    mapping = {
        name: (name + " " + state_type) if _is_state_column(name) else name
        for name in df.columns.tolist()
    }
    df.rename(columns=mapping, inplace=True)
    return

In [125]:
def GroupByOrderType(df,typeID):                        # for now, typeID should only be 2 or 3 or 4 or 5
    """Pair each event of type ``typeID`` with the submission of the same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined frame holding at least the 'Type' and 'OrderID' columns.
        It is not modified.
    typeID : int
        Event-type code to select; must be a key of the lookup table below.

    Returns
    -------
    pandas.DataFrame
        Outer merge on 'OrderID' of the submission rows (Type == 1) and the
        ``typeID`` rows of every order that has at least one ``typeID`` event.
        Columns renamed by RenameColumn carry the state label as a suffix
        (e.g. 'Price Submission', 'Price Cancellation'); the remaining shared
        columns receive the merge's default ``_x`` / ``_y`` suffixes. Because
        the merge is an outer join, an event whose submission row is absent
        from ``df`` is kept with missing values in the submission columns.

    Raises
    ------
    KeyError
        If ``typeID`` is not a key of the lookup table.
    """
    # lookup dictionary
    lookup = {1:'Submission', 2:'Cancellation', 3:'Deletion', 4:'Execution Visible',5:'Execution Hidden',7:'Trading Halt'}
    # Resolve both labels first so an unsupported typeID fails before any work.
    init_label = lookup[1]
    end_label = lookup[typeID]

    # getting all the orders of type typeID
    orderID_set = set(df[(df['Type']==typeID)].OrderID.tolist())
    df_subset = df.loc[df['OrderID'].isin(orderID_set)]

    # get initial states when orders of this type is submitted
    # (.copy(): RenameColumn renames in place, which must not act on a slice
    # of another frame -- that is the chained-assignment pattern pandas warns
    # about, and warnings are silenced globally in this notebook)
    df_init = df_subset[(df_subset['Type']==1)].copy()
    RenameColumn(df_init,state_type=init_label)

    # get the end states of type typeID
    df_end = df_subset[(df_subset['Type']==typeID)].copy()
    RenameColumn(df_end,state_type=end_label)

    # merge these two dataframes (a new name is used so the input parameter
    # is not rebound)
    df_merged = df_init.merge(df_end, left_on='OrderID', right_on='OrderID', how='outer')

    return df_merged

In [126]:
# Pair every Type-2 event with its submission row and count the merged rows
df_test = GroupByOrderType(df, typeID=2)
df_test.shape[0]

450

In [127]:
# Number of Type-2 rows in the combined frame, for comparison with the merged row count
df.loc[df['Type'] == 2].shape[0]

450

In [128]:
# Inspect the merged column names: renamed columns carry the state label,
# the remaining shared columns carry the merge suffixes _x / _y.
df_test.columns

Index(['Time_stamp_x', 'Type_x', 'OrderID', 'Size Submission',
       'Price Submission', 'Direction_x', 'Time Submission',
       'Ask Price 1 Submission', 'Ask Size 1 Submission',
       'Bid Price 1 Submission', 'Bid Size 1 Submission',
       'Ask Price 2 Submission', 'Ask Size 2 Submission',
       'Bid Price 2 Submission', 'Bid Size 2 Submission',
       'Ask Price 3 Submission', 'Ask Size 3 Submission',
       'Bid Price 3 Submission', 'Bid Size 3 Submission',
       'Ask Price 4 Submission', 'Ask Size 4 Submission',
       'Bid Price 4 Submission', 'Bid Size 4 Submission',
       'Ask Price 5 Submission', 'Ask Size 5 Submission',
       'Bid Price 5 Submission', 'Bid Size 5 Submission', 'Time_stamp_y',
       'Type_y', 'Size Cancellation', 'Price Cancellation', 'Direction_y',
       'Time Cancellation', 'Ask Price 1 Cancellation',
       'Ask Size 1 Cancellation', 'Bid Price 1 Cancellation',
       'Bid Size 1 Cancellation', 'Ask Price 2 Cancellation',
       'Ask Size 2 Cancel

In [129]:
# Preview the first five merged rows
df_test.head()

Unnamed: 0,Time_stamp_x,Type_x,OrderID,Size Submission,Price Submission,Direction_x,Time Submission,Ask Price 1 Submission,Ask Size 1 Submission,Bid Price 1 Submission,...,Bid Price 3 Cancellation,Bid Size 3 Cancellation,Ask Price 4 Cancellation,Ask Size 4 Cancellation,Bid Price 4 Cancellation,Bid Size 4 Cancellation,Ask Price 5 Cancellation,Ask Size 5 Cancellation,Bid Price 5 Cancellation,Bid Size 5 Cancellation
0,34400.045403,1.0,21748178,400.0,2239900.0,-1.0,2012-06-21 09:33:20.045403,2239900.0,400.0,2236000.0,...,2234900,61,2242000,200,2233900,200,2242500,50,2232500,10
1,34406.5441,1.0,21866417,400.0,2239600.0,-1.0,2012-06-21 09:33:26.544100,2239600.0,500.0,2236500.0,...,2236000,507,2242500,50,2235000,94,2242900,30,2234900,50
2,35153.192597,1.0,35085399,400.0,2237400.0,-1.0,2012-06-21 09:45:53.192597,2237100.0,1.0,2235100.0,...,2234700,100,2237500,100,2234100,100,2238700,29,2234000,100
3,35194.067921,1.0,35699321,400.0,2237300.0,-1.0,2012-06-21 09:46:34.067921,2237300.0,400.0,2235500.0,...,2234900,50,2238600,100,2234700,100,2238700,29,2234100,100
4,35414.485101,1.0,38755648,6.0,2239900.0,-1.0,2012-06-21 09:50:14.485101,2239900.0,6.0,2237600.0,...,2237400,113,2240300,13,2236900,100,2240400,15,2236600,975
