In [51]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan
import time

## Helper function

In [52]:
## FOR STATISTICAL ANALYSIS
# Create a Combined Dataframe from message csv and orderbook csv
def Merge_MsgAndOrdb(msg_path, ordb_path):
    t = time.time()
    print('running...')
    # get the level of the LOBSTER dataset
    level = int(msg_path.split(".")[0][-1])
    # get the date
    date = msg_path.split(".")[0].split("_")[1]
    # add header row for message csv
    msg_names = ["Time_stamp", "Type", "OrderID", "V", "P", "Dir"]  # V: size; P: price; Dir: direction
    df_msg = pd.read_csv(msg_path, names=msg_names)
    # add header row for orderbook csv
    default_ordb_names = ['P_ask', 'V_ask', 'P_bid', 'V_bid']
    ordb_names = []
    for i in range(level):
        for item in default_ordb_names:
            ordb_names.append(str(item) + str("_") + str(i + 1))
    df_ordb = pd.read_csv(ordb_path, names=ordb_names)
    # adding a meaningful time column
    df_msg['Time'] = pd.to_datetime(df_msg['Time_stamp'], unit="s", origin=pd.Timestamp(date))
    # combine two dataframes and return it
    df = pd.concat([df_msg, df_ordb], axis=1)
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

# Renaming columns as to strip out redundnat words
def RenameColumn(df,state_type):
    old_col = df.columns.tolist()
    new_col = []
    for col in old_col:
        if ( ("bid" in col) | ("ask" in col) ):
            new_col.append(col+str(" ")+state_type)
        elif ((col == 'V') | (col == 'P')):
            new_col.append(col+str(" ")+state_type)
        elif (col=='Time'):
            new_col.append(col+str(" ")+state_type)
        else:
            new_col.append(col)
    columns = dict(zip(old_col,new_col))
    df.rename(columns=columns,inplace=True)
    return

def GroupByOrderType(df,typeID):                        # for now, typeID should only be 2 or 3 or 4 or 5
    t = time.time()
    print('running...')
    # lookup dictionary
    lookup = {1:'Sub', 2:'Can', 3:'Del', 4:'ExVis',5:'ExHid',7:'Halt'}
    # getting all the orders of type typeID
    orderID_set = set(df[(df['Type']==typeID)].OrderID.tolist())
    df_subset = df.loc[df['OrderID'].isin(orderID_set)]
    
    # get initial states when orders of this type is submitted
    df_init = df_subset[(df_subset['Type']==1)]
    RenameColumn(df_init,state_type=lookup[1])
    
    # get the end states of type typeID
    df_end = df_subset[(df_subset['Type']==typeID)]
    RenameColumn(df_end,state_type=lookup[typeID])
    
    # merge these two dataframes
    df = df_init.merge(df_end, left_on='OrderID', right_on='OrderID', how='outer')
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

In [53]:
## FOR PAPER 462
# Compute Time-insensitive variables
def ComputeTimeInsenstiveSet(df_original, levels=5):
    df = df_original.copy(deep=False)
    t = time.time()
    print('running...')
    for i in range(1,levels+1):
        # compute bid-ask spreads and mid-prices (v2)
        df["bid-ask spread {}".format(i)] = df["P_ask_{}".format(i)] - df["P_bid_{}".format(i)]
        df["mid-price {}".format(i)] = (df["P_ask_{}".format(i)] + df["P_bid_{}".format(i)])/2
        # compute price differences (v3)
        df["d_P_ask_{}{}".format(levels,1)] = df["P_ask_{}".format(levels)] - df["P_ask_1"]
        df["d_P_bid_{}{}".format(levels,1)] = df["P_bid_{}".format(levels)] - df["P_bid_1"]
        for j in range(1,levels):
            df["d_P_ask_{}{}".format(j+1,j)] = np.abs(df["P_ask_{}".format(j+1)] - df["P_ask_{}".format(j)])
            df["d_P_bid_{}{}".format(j+1,j)] = np.abs(df["P_bid_{}".format(j+1)] - df["P_bid_{}".format(j)])
        # compute mean prices and volumes (v4)
        df["Mean_P_ask"] = df[["P_ask_{}".format(i) for i in range(1,levels+1)]].mean(axis=1)
        df["Mean_P_bid"] = df[["P_bid_{}".format(i) for i in range(1,levels+1)]].mean(axis=1)
        df["Mean_V_ask"] = df[["V_ask_{}".format(i) for i in range(1,levels+1)]].mean(axis=1)
        df["Mean_V_bid"] = df[["V_bid_{}".format(i) for i in range(1,levels+1)]].mean(axis=1)
        # compute accumulated differences (v5)
        df["P_accu"] = df[["P_ask_{}".format(i) for i in range(1,levels+1)]].sum(axis=1) \
            - df[["P_bid_{}".format(i) for i in range(1,levels+1)]].sum(axis=1)
        df["V_accu"] = df[["V_ask_{}".format(i) for i in range(1,levels+1)]].sum(axis=1) \
            - df[["V_bid_{}".format(i) for i in range(1,levels+1)]].sum(axis=1)
    # returning...
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

# Compute Time-sensitive variables
def ComputeTimeSensitiveSet(df_original,levels=5,dt_secs=1):
    df = df_original.copy(deep=False)
    t = time.time()
    print('running...')
    for index, row in df.iterrows():
        t = row['Time_stamp']
        temp = df.loc[(df['Time_stamp']>t-dt_secs)&(df['Time_stamp']<=t)] # for v6
        # Note: direction = 1 <--> buy/bid order;   direction = -1 <--> sell/ask order
        temp_la = df.loc[(df['Time_stamp']>t-dt_secs)&(df['Time_stamp']<=t)&(df['Dir']==-1)] # limit ask order
        temp_lb = df.loc[(df['Time_stamp']>t-dt_secs)&(df['Time_stamp']<=t)&(df['Dir']==1)]  # limit bid order
        temp_ca = df.loc[(df['Time_stamp']>t-dt_secs)&(df['Time_stamp']<=t)
                         &(df['Dir']==-1)&(df['Type']==2)] # cancelled limit ask order
        temp_cb = df.loc[(df['Time_stamp']>t-dt_secs)&(df['Time_stamp']<=t)
                         &(df['Dir']==1)&(df['Type']==2)]  # cancelled limit buy order
        for i in range(1,levels+1):
            # compute price and volume derivatives (v6)
            df.set_value(index,'der_P_ask_{}'.format(i),
                         (temp['P_ask_{}'.format(i)].iloc[-1]-temp['P_ask_{}'.format(i)].iloc[0])/dt_secs)
            df.set_value(index,'der_P_bid_{}'.format(i),
                         (temp['P_bid_{}'.format(i)].iloc[-1]-temp['P_bid_{}'.format(i)].iloc[0])/dt_secs)
            df.set_value(index,'der_V_ask_{}'.format(i),
                         (temp['V_ask_{}'.format(i)].iloc[-1]-temp['V_ask_{}'.format(i)].iloc[0])/dt_secs)
            df.set_value(index,'der_V_bid_{}'.format(i),
                         (temp['V_bid_{}'.format(i)].iloc[-1]-temp['V_bid_{}'.format(i)].iloc[0])/dt_secs)
            # compute average intensity of each type (v7)
            df.set_value(index,'lam_la',int(len(temp_la.index)/dt_secs))
            df.set_value(index,'lam_lb',int(len(temp_lb.index)/dt_secs))
            df.set_value(index,'lam_ca',int(len(temp_ca.index)/dt_secs))
            df.set_value(index,'lam_cb',int(len(temp_cb.index)/dt_secs))
            # compute relative intensity indicators (v8) ???
            # compute accelerations (market/limit)  (v9) ???
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

In [54]:
# Function of Adding the labels (lb, la, mb, ma) to classify all of the order
def AddLabels(df_original):
    df = df_original.copy(deep=False)
    t = time.time()
    print('running...')
    labels = []
    for index, row in df.iterrows():
        if (row['Type']==1):
            if (row['Dir']==-1):
                labels.append('la')
            elif (row['Dir']==1):
                labels.append('lb')
        elif (row['Type'] in [4,5]):  # -1: sell limit order; 1: buy limit order; 
            if (row['Dir']==-1):      # But execution of buy/sell is initiated by a market sell/buy trade                    
                labels.append('mb')
            elif (row['Dir']==1):
                labels.append('ma')
        elif (row['Type']==2):
            labels.append('pc')
        elif (row['Type']==3):
            labels.append('c')
        else:
            labels.append(None)
            print('error! index {} has unknown order type.'.format(index))
    ## 
    df['Label'] = labels
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

In [55]:
# Function of Classifying the level where action takes
def AddPriceLevels(df_original):
    df = df_original.copy(deep=False)  
    t = time.time()
    print('running...')
    if ('Label' not in df.columns):
        print("You haven't appended labels to messages.")
        return
    nlevels = 5
    levels = [None]
    for index, row in df.iterrows():   # 1: buy   -1: sell
        if (index > 0):
            bid_levels = np.array(df.iloc[index-1][["P_bid_{}".format(i) for i in range(1,nlevels+1)]])
            ask_levels = np.array(df.iloc[index-1][["P_ask_{}".format(i) for i in range(1,nlevels+1)]])
            price = row['P']
            # for label 'la', 'lb'
            if (row["Label"] in ['la','lb']):
                if (row['Dir']==-1):
                    if (price > max(ask_levels)):
                        i = "out-of-bounds"
                    else:
                        i = np.argmax(price<=ask_levels)
                    levels.append(int(i+1))
                elif (row['Dir']==1):
                    if (price < min(bid_levels)):
                        i = "out-of_bounds"
                    else:
                        i = np.argmax(price>=bid_levels)
                    levels.append(int(i+1))
            # for 'c', 'pc'
            elif (row["Label"] in ['c','pc']):
                if (row['Dir']==-1):
                    i = np.where(price==ask_levels)[0]
                    levels.append(int(i+1))
                elif (row['Dir']==1):
                    i = np.where(price==bid_levels)[0]
                    levels.append(int(i+1))
            # for 'mb', 'ma', always happen at the best level
            elif (row["Label"] in ['mb','ma']):
                levels.append(int(1))
            else:
                levels.append(None)     
    if (len(df.index)!=len(levels)):
        print('Error. Lengths don\'t match.')
        return
    df['Level'] = levels
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

### Processing starts from here

#### 1) MERGING MESSAGE FILE AND ORDERBOOK FILE

In [56]:
df = Merge_MsgAndOrdb(msg_path='AMZN_2012-06-21_34200000_57600000_message_5.csv',
                     ordb_path='AMZN_2012-06-21_34200000_57600000_orderbook_5.csv')
df.head(5)

running...
Completed. It takes about 0.02 minutes.


Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_bid_3,V_bid_3,P_ask_4,V_ask_4,P_bid_4,V_bid_4,P_ask_5,V_ask_5,P_bid_5,V_bid_5
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,2230400,100,2242500,100,2230000,10,2244000,547,2226200,100
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2230700,200,2242500,100,2230400,100,2244000,547,2230000,10
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2230700,200,2240000,220,2230400,100,2242500,100,2230000,10
3,34200.189608,1,11534792,100,2237500,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2231800,100,2240000,220,2230700,200,2242500,100,2230400,100
4,34200.189608,1,1365373,13,2240000,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2231800,100,2240000,233,2230700,200,2242500,100,2230400,100


#### 2) ADDING LABELS

In [57]:
df = AddLabels(df)

running...
Completed. It takes about 0.20 minutes.


In [58]:
df.columns

Index(['Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time', 'P_ask_1',
       'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2', 'P_bid_2',
       'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3', 'P_ask_4',
       'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5', 'P_bid_5',
       'V_bid_5', 'Label'],
      dtype='object')

#### 3) ADDING PRICE LEVEL INFORMATION

In [59]:
df = AddPriceLevels(df)

running...
Completed. It takes about 4.78 minutes.


In [60]:
print('after adding labels and price levels, dataset looks like: ')
df.head(5)

after adding labels and price levels, dataset looks like: 


Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_ask_4,V_ask_4,P_bid_4,V_bid_4,P_ask_5,V_ask_5,P_bid_5,V_bid_5,Label,Level
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,2242500,100,2230000,10,2244000,547,2226200,100,mb,
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2242500,100,2230400,100,2244000,547,2230000,10,lb,1.0
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2240000,220,2230400,100,2242500,100,2230000,10,la,2.0
3,34200.189608,1,11534792,100,2237500,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2240000,220,2230700,200,2242500,100,2230400,100,lb,2.0
4,34200.189608,1,1365373,13,2240000,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2240000,233,2230700,200,2242500,100,2230400,100,la,4.0


#### 4) GROUPING ALL MESSAGES BY ORDER IDS

In [92]:
def GetOrderHistory(df_original):
    df = df_original.copy(deep=False) 
    t = time.time()
    print('running...')
    table = {}
    for index, row in df.iterrows():
        if (row['OrderID']!=0):   # don't need records where orderID==0 
            d = {row['Time_stamp']: [row['Type'],row['Dir'],row['V'],row['P'],row['Label'],row['Level']]}
            if (row['OrderID'] not in table.keys()):
                li = []
                li.append(d)
                table[row['OrderID']] = li
            else:
                table[row['OrderID']].append(d)
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))        
    return table

In [95]:
OrderHistory = GetOrderHistory(df)

running...
Completed. It takes about 0.33 minutes.


### Testing

In [84]:
df_test = df[(df['OrderID']==1365373) | (df['OrderID']==11534792) | (df['OrderID']==11885113)]
df_test = df_test[['Time_stamp','Type','OrderID','V','P','Dir','Label','Level']]
df_test

Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Label,Level
1,34200.189608,1,11885113,21,2238100,1,lb,1.0
3,34200.189608,1,11534792,100,2237500,1,lb,2.0
4,34200.189608,1,1365373,13,2240000,-1,la,4.0
28,34200.190226,4,11885113,21,2238100,1,ma,1.0
29,34200.190226,4,11534792,26,2237500,1,ma,1.0
51,34200.403006,4,1365373,13,2240000,-1,mb,1.0
415,34230.0133,3,11534792,74,2237500,1,c,5.0


In [85]:
table = {}
for index, row in df_test.iterrows():
    d = {row['Time_stamp']: [row['Type'],row['Dir'],row['V'],row['P'],row['Label'],row['Level']]}
    if (row['OrderID'] not in table.keys()):
        li = []
        li.append(d)
        table[row['OrderID']] = li
    else: 
        table[row['OrderID']].append(d)

In [86]:
table

{1365373: [{34200.18960767: [1, -1, 13, 2240000, 'la', 4.0]},
  {34200.403006021: [4, -1, 13, 2240000, 'mb', 1.0]}],
 11534792: [{34200.18960767: [1, 1, 100, 2237500, 'lb', 2.0]},
  {34200.190226476: [4, 1, 26, 2237500, 'ma', 1.0]},
  {34230.013300014: [3, 1, 74, 2237500, 'c', 5.0]}],
 11885113: [{34200.18960767: [1, 1, 21, 2238100, 'lb', 1.0]},
  {34200.190226476: [4, 1, 21, 2238100, 'ma', 1.0]}]}

In [88]:
table.pop(1365373,None)
type(table)

dict