In [20]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan
import time
from collections import Counter

## Helper functions

In [21]:
## FOR STATISTICAL ANALYSIS
# Create a Combined Dataframe from message csv and orderbook csv
def Merge_MsgAndOrdb(msg_path, ordb_path):
    """Load a LOBSTER message csv and its order-book csv into one DataFrame.

    The number of book levels and the trading date are read from the message
    *file name*, which is expected to look like
    ``<TICKER>_<YYYY-MM-DD>_<start>_<end>_message_<level>.csv``.

    Parameters
    ----------
    msg_path : str
        Path of the (header-less) message csv.
    ordb_path : str
        Path of the (header-less) order-book csv with ``4 * level`` columns.

    Returns
    -------
    pandas.DataFrame
        Message columns (``Time_stamp, Type, OrderID, V, P, Dir``), a ``Time``
        datetime column, then ``P_ask_i, V_ask_i, P_bid_i, V_bid_i`` for each
        level ``i``; the two files are concatenated side by side on their
        row position.

    Raises
    ------
    ValueError
        If the level or the date cannot be parsed from the file name.
    """
    import os
    import re

    t = time.time()
    # Only look at the file name (without extension): dots or underscores in
    # directory names must not leak into the level/date parsing.
    stem = os.path.splitext(os.path.basename(msg_path))[0]

    # get the level of the LOBSTER dataset (all trailing digits, so that
    # e.g. "..._message_10" yields 10 rather than 0)
    level_match = re.search(r"(\d+)$", stem)
    if level_match is None:
        raise ValueError("Cannot parse the number of levels from '{}'.".format(msg_path))
    level = int(level_match.group(1))

    # get the date; fall back to the second "_"-separated token when no
    # YYYY-MM-DD pattern is present in the name
    date_match = re.search(r"\d{4}-\d{2}-\d{2}", stem)
    if date_match is not None:
        date = date_match.group(0)
    else:
        tokens = stem.split("_")
        if len(tokens) < 2:
            raise ValueError("Cannot parse the date from '{}'.".format(msg_path))
        date = tokens[1]

    # add header row for message csv
    msg_names = ["Time_stamp", "Type", "OrderID", "V", "P", "Dir"]  # V: size; P: price; Dir: direction
    df_msg = pd.read_csv(msg_path, names=msg_names)

    # add header row for orderbook csv
    default_ordb_names = ['P_ask', 'V_ask', 'P_bid', 'V_bid']
    ordb_names = ["{}_{}".format(item, i + 1)
                  for i in range(level)
                  for item in default_ordb_names]
    df_ordb = pd.read_csv(ordb_path, names=ordb_names)

    # adding a meaningful time column: Time_stamp is interpreted as seconds
    # counted from midnight of the trading date
    df_msg['Time'] = pd.to_datetime(df_msg['Time_stamp'], unit="s", origin=pd.Timestamp(date))

    # combine two dataframes and return it
    df = pd.concat([df_msg, df_ordb], axis=1)
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

# Rename the state-dependent columns by appending the order-state suffix (e.g. 'Sub', 'Can')
def RenameColumn(df,state_type):
    """Tag the state-dependent columns of ``df`` with ``state_type``, in place.

    A column is tagged (renamed to ``"<name> <state_type>"``) when its name
    contains "bid" or "ask", or when it is exactly 'V', 'P' or 'Time'.
    Every other column keeps its name. Nothing is returned; ``df`` itself
    is modified.
    """
    def _tagged(name):
        # order-book columns (any name mentioning a side of the book)
        touches_book = ("bid" in name) or ("ask" in name)
        if touches_book or name in ('V', 'P', 'Time'):
            return name + " " + state_type
        return name

    mapping = {name: _tagged(name) for name in df.columns.tolist()}
    df.rename(columns=mapping, inplace=True)
    return

def GroupByOrderType(df,typeID):                        # for now, typeID should only be 2 or 3 or 4 or 5
    """Pair each order's submission row with its rows of message type ``typeID``.

    Orders that have at least one message of type ``typeID`` are selected.
    Their submission rows (Type == 1) and their ``typeID`` rows are renamed
    with the matching state suffix via ``RenameColumn`` and then outer-merged
    on 'OrderID'.

    Returns the merged DataFrame and prints the elapsed time.
    """
    started = time.time()
    # message type -> suffix appended to the state-dependent columns
    suffix_by_type = {1:'Sub', 2:'Can', 3:'Del', 4:'ExVis',5:'ExHid',7:'Halt'}

    # every order that ever produced a message of the requested type
    has_target_type = (df['Type']==typeID)
    target_ids = set(df[has_target_type].OrderID.tolist())
    lifecycle = df.loc[df['OrderID'].isin(target_ids)]

    # state of the book/message when those orders were submitted
    submitted = lifecycle[(lifecycle['Type']==1)]
    RenameColumn(submitted,state_type=suffix_by_type[1])

    # state when the message of type typeID occurred
    finished = lifecycle[(lifecycle['Type']==typeID)]
    RenameColumn(finished,state_type=suffix_by_type[typeID])

    # one row per (submission, end-state) pair of the same order
    merged = submitted.merge(finished, left_on='OrderID', right_on='OrderID', how='outer')
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-started)/60))
    return merged

In [22]:
## FOR PAPER 462
# Compute Time-insensitive variables
def ComputeTimeInsenstiveSet(df_original, levels=5):
    """Append the time-insensitive order-book features (v2-v5) to a copy of the frame.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Must contain ``P_ask_i, P_bid_i, V_ask_i, V_bid_i`` for i = 1..levels.
    levels : int, default 5
        Number of book levels to use. With ``levels < 1`` nothing is added.

    Returns
    -------
    pandas.DataFrame
        A shallow copy of ``df_original`` with these columns appended, in
        this order:
        ``bid-ask spread 1``, ``mid-price 1`` (v2, level 1);
        ``d_P_ask_<levels>1``, ``d_P_bid_<levels>1`` and the adjacent-level
        differences ``d_P_ask_<j+1><j>``, ``d_P_bid_<j+1><j>`` (v3);
        ``Mean_P_ask``, ``Mean_P_bid``, ``Mean_V_ask``, ``Mean_V_bid`` (v4);
        ``P_accu``, ``V_accu`` (v5);
        then ``bid-ask spread i``, ``mid-price i`` for i = 2..levels (v2).

    Notes
    -----
    The v3-v5 features do not depend on the level index, so they are computed
    once instead of once per level; the column order above is the one the
    per-level loop used to produce.
    NOTE(review): ``d_P_bid_<levels>1`` is the signed difference
    ``P_bid_<levels> - P_bid_1`` while the adjacent-level differences are
    absolute values -- kept as-is, confirm this asymmetry is intended.
    """
    df = df_original.copy(deep=False)
    t = time.time()

    def _add_spread_and_mid(i):
        # bid-ask spread and mid-price of level i (v2)
        ask, bid = df["P_ask_{}".format(i)], df["P_bid_{}".format(i)]
        df["bid-ask spread {}".format(i)] = ask - bid
        df["mid-price {}".format(i)] = (ask + bid)/2

    if levels >= 1:
        ask_prices = ["P_ask_{}".format(k) for k in range(1,levels+1)]
        bid_prices = ["P_bid_{}".format(k) for k in range(1,levels+1)]
        ask_sizes = ["V_ask_{}".format(k) for k in range(1,levels+1)]
        bid_sizes = ["V_bid_{}".format(k) for k in range(1,levels+1)]

        _add_spread_and_mid(1)

        # compute price differences (v3): deepest level vs. best level first,
        # then the adjacent levels (for levels == 2 the adjacent difference
        # has the same name and overwrites the signed one, as before)
        df["d_P_ask_{}{}".format(levels,1)] = df["P_ask_{}".format(levels)] - df["P_ask_1"]
        df["d_P_bid_{}{}".format(levels,1)] = df["P_bid_{}".format(levels)] - df["P_bid_1"]
        for j in range(1,levels):
            df["d_P_ask_{}{}".format(j+1,j)] = np.abs(df["P_ask_{}".format(j+1)] - df["P_ask_{}".format(j)])
            df["d_P_bid_{}{}".format(j+1,j)] = np.abs(df["P_bid_{}".format(j+1)] - df["P_bid_{}".format(j)])

        # compute mean prices and volumes (v4)
        df["Mean_P_ask"] = df[ask_prices].mean(axis=1)
        df["Mean_P_bid"] = df[bid_prices].mean(axis=1)
        df["Mean_V_ask"] = df[ask_sizes].mean(axis=1)
        df["Mean_V_bid"] = df[bid_sizes].mean(axis=1)

        # compute accumulated differences (v5)
        df["P_accu"] = df[ask_prices].sum(axis=1) - df[bid_prices].sum(axis=1)
        df["V_accu"] = df[ask_sizes].sum(axis=1) - df[bid_sizes].sum(axis=1)

        # remaining levels of v2
        for i in range(2,levels+1):
            _add_spread_and_mid(i)

    # returning...
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

# Compute Time-sensitive variables
def ComputeTimeSensitiveSet(df_original,levels=5,dt_secs=1):
    """Append the time-sensitive features (v6, v7) to a copy of the frame.

    For every row with timestamp ``t`` the window is all rows with
    ``t - dt_secs < Time_stamp <= t`` (the row itself is always included).

    Parameters
    ----------
    df_original : pandas.DataFrame
        Needs ``Time_stamp``, ``Type``, ``Dir`` and ``P_ask_i, P_bid_i,
        V_ask_i, V_bid_i`` for i = 1..levels.
    levels : int, default 5
        Number of book levels for the derivative features.
    dt_secs : number, default 1
        Window length in seconds; must be positive.

    Returns
    -------
    pandas.DataFrame
        Shallow copy with, per level i, ``der_P_ask_i, der_P_bid_i,
        der_V_ask_i, der_V_bid_i`` = (last value in window - first value in
        window) / dt_secs, and the window counts ``lam_la`` (Dir == -1),
        ``lam_lb`` (Dir == 1), ``lam_ca`` (Dir == -1 and Type == 2),
        ``lam_cb`` (Dir == 1 and Type == 2), each as int(count / dt_secs).

    Raises
    ------
    ValueError
        If ``dt_secs`` is not positive.

    Notes
    -----
    NOTE(review): ``lam_la`` / ``lam_lb`` count *every* message on that side
    of the book, not only submissions (Type == 1) -- kept as-is, confirm.
    The features v8 (relative intensity) and v9 (accelerations) are not
    implemented.
    """
    if dt_secs <= 0:
        raise ValueError("dt_secs must be positive, got {}.".format(dt_secs))

    df = df_original.copy(deep=False)
    # wall-clock start; deliberately NOT named `t`, which is the row
    # timestamp inside the loop (reusing it corrupted the elapsed time)
    started = time.time()

    stamps = df['Time_stamp'].values
    # Note: direction = 1 <--> buy/bid order;   direction = -1 <--> sell/ask order
    is_ask = (df['Dir']==-1).values
    is_bid = (df['Dir']==1).values
    is_cancel = (df['Type']==2).values

    # Output columns in the order they used to be created: the four
    # derivatives of level 1, the four intensities, then the deeper levels.
    lam_names = ['lam_la', 'lam_lb', 'lam_ca', 'lam_cb']
    book_values = {}
    column_order = []
    for i in range(1,levels+1):
        for side in ('P_ask', 'P_bid', 'V_ask', 'V_bid'):
            source = '{}_{}'.format(side, i)
            target = 'der_{}'.format(source)
            book_values[target] = df[source].values
            column_order.append(target)
        if i == 1:
            column_order.extend(lam_names)

    # Collect per-row results in lists and assign whole columns afterwards,
    # instead of writing cell by cell into the frame being iterated
    # (DataFrame.set_value no longer exists in pandas >= 1.0).
    results = {name: [] for name in column_order}
    if column_order:
        for t in stamps:
            in_window = (stamps>t-dt_secs)&(stamps<=t)
            hits = np.flatnonzero(in_window)
            first, last = hits[0], hits[-1]
            # compute price and volume derivatives (v6)
            for target, values in book_values.items():
                results[target].append((values[last]-values[first])/dt_secs)
            # compute average intensity of each type (v7)
            results['lam_la'].append(int(np.count_nonzero(in_window&is_ask)/dt_secs))
            results['lam_lb'].append(int(np.count_nonzero(in_window&is_bid)/dt_secs))
            results['lam_ca'].append(int(np.count_nonzero(in_window&is_ask&is_cancel)/dt_secs))
            results['lam_cb'].append(int(np.count_nonzero(in_window&is_bid&is_cancel)/dt_secs))

    for name in column_order:
        df[name] = results[name]

    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-started)/60))
    return df

In [23]:
# Add a label (la, lb, ma, mb, pc, c) that classifies every order message
def AddLabels(df_original):
    """Return a copy of the frame with a 'Label' column classifying each message.

    Labels, derived from ``Type`` and ``Dir``:

    * Type 1, Dir -1 -> 'la' ; Type 1, Dir 1 -> 'lb'   (limit order submitted)
    * Type 4 or 5, Dir -1 -> 'mb' ; Dir 1 -> 'ma'      (execution: Dir is the
      side of the resting limit order, so the execution of a sell limit
      order is initiated by a market buy, and vice versa)
    * Type 2 -> 'pc' ; Type 3 -> 'c'
    * anything else -> None, and a message is printed for that index.

    A submission/execution row whose ``Dir`` is neither -1 nor 1 also gets
    None (and a printed message) so that the label column always has one
    entry per row.
    """
    df = df_original.copy(deep=False)
    t = time.time()

    order_type, direction = df['Type'], df['Dir']
    is_sell = (direction==-1).values
    is_buy = (direction==1).values
    is_submit = (order_type==1).values
    is_execute = order_type.isin([4,5]).values
    is_partial_cancel = (order_type==2).values
    is_delete = (order_type==3).values

    # object array pre-filled with None: rows no rule matches stay None
    labels = np.full(len(df.index), None, dtype=object)
    labels[is_submit & is_sell] = 'la'
    labels[is_submit & is_buy] = 'lb'
    labels[is_execute & is_sell] = 'mb'
    labels[is_execute & is_buy] = 'ma'
    labels[is_partial_cancel] = 'pc'
    labels[is_delete] = 'c'

    unknown_type = ~(is_submit | is_execute | is_partial_cancel | is_delete)
    for index in df.index[unknown_type]:
        print('error! index {} has unknown order type.'.format(index))
    unknown_direction = (is_submit | is_execute) & ~(is_sell | is_buy)
    for index in df.index[unknown_direction]:
        print('error! index {} has unknown direction.'.format(index))

    df['Label'] = labels
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

In [24]:
# Classify the order-book price level at which each action takes place
def AddPriceLevels(df_original, nlevels=5):
    """Return a copy of the frame with a 'Level' column: the book level each message acts on.

    The level is determined against the order book of the *previous* row
    (the book as it was before the message), so the first row never gets a
    level.

    * 'la' / 'lb' (submission): first ask level whose price is >= the order
      price (Dir == -1), or first bid level whose price is <= the order
      price (Dir == 1).
    * 'c' / 'pc' (cancellation): the level whose price equals the order
      price on the corresponding side (first one if several match).
    * 'mb' / 'ma' (execution): always level 1.

    Rows for which no level can be determined -- first row, price outside
    the ``nlevels`` visible levels, cancelled price not in the book, unknown
    label or direction -- get NaN.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Needs ``P``, ``Dir``, ``Label`` and ``P_bid_i`` / ``P_ask_i`` for
        i = 1..nlevels.
    nlevels : int, default 5
        Number of book levels available in the frame.

    Returns
    -------
    pandas.DataFrame or None
        The copy with the float 'Level' column, or None (after printing a
        message) when the 'Label' column is missing.
    """
    df = df_original.copy(deep=False)
    t = time.time()
    if ('Label' not in df.columns):
        print("You haven't appended labels to messages.")
        return

    def _first_level(mask):
        # 1-based position of the first matching level, None when nothing matches
        hits = np.flatnonzero(mask)
        return int(hits[0]) + 1 if hits.size else None

    # Pull everything out as arrays once and walk by position: the previous
    # row is then well defined whatever the frame's index looks like.
    bid_book = df[["P_bid_{}".format(i) for i in range(1,nlevels+1)]].values
    ask_book = df[["P_ask_{}".format(i) for i in range(1,nlevels+1)]].values
    prices = df['P'].values
    directions = df['Dir'].values
    labels = df['Label'].values

    levels = []
    for pos in range(len(df.index)):   # 1: buy   -1: sell
        level = None
        if pos > 0:
            label, side, price = labels[pos], directions[pos], prices[pos]
            prev_bids, prev_asks = bid_book[pos-1], ask_book[pos-1]
            # for label 'la', 'lb'
            if label in ('la','lb'):
                if side == -1:
                    level = _first_level(price <= prev_asks)
                elif side == 1:
                    level = _first_level(price >= prev_bids)
            # for 'c', 'pc'
            elif label in ('c','pc'):
                if side == -1:
                    level = _first_level(price == prev_asks)
                elif side == 1:
                    level = _first_level(price == prev_bids)
            # for 'mb', 'ma', always happen at the best level
            elif label in ('mb','ma'):
                level = 1
        levels.append(level)

    # exactly one entry per row by construction; None becomes NaN
    df['Level'] = np.array([np.nan if lv is None else lv for lv in levels], dtype=float)
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

In [25]:
def AddVolatilityAndNOB(df_original,dt_mins):
    """Append rolling volatility and number of submitted limit orders.

    For every row with timestamp ``t`` the window is all rows with
    ``t - dt_mins*60 < Time_stamp <= t``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Needs ``Time_stamp`` (seconds), ``Type`` and ``P``.
    dt_mins : number
        Window length in minutes.

    Returns
    -------
    pandas.DataFrame
        Shallow copy with ``Volts`` (population standard deviation, ddof=0,
        of the execution prices -- Type 4 or 5 -- in the window; NaN when the
        window holds no execution) and ``N_OrderSubmit`` (number of Type 1
        messages in the window).
    """
    df = df_original.copy(deep=False)
    t2 = time.time()
    dt = dt_mins * 60 # convert minutes to seconds
    norders, volts = [],[]
    n_rows = len(df.index)
    # progress is reported every ~10% of the rows; at least every row, so
    # frames with fewer than 10 rows do not trigger a modulo by zero
    steps = max(1, n_rows // 10)

    stamps = df['Time_stamp'].values
    is_submit = (df['Type']==1).values                        # type 1: limit order submitted
    is_execute = ((df['Type']==4)|(df['Type']==5)).values     # type 4 and 5: execution
    prices = df['P']
    for t in stamps:
        in_window = (stamps>t-dt)&(stamps<=t)
        # computing...
        norder = int(np.count_nonzero(in_window & is_submit))
        volt = np.std(prices[in_window & is_execute])
        # appending
        norders.append(norder)
        volts.append(volt)
        if (len(volts)% steps == 0):
            print("{0:.2f}%..".format(len(volts)*100/n_rows))
    df['Volts'] = volts
    df['N_OrderSubmit'] = norders
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t2)/60))
    return df

In [26]:
def AddMarketInformation(df_original,dt_mins):
    """Append the rolling share of submissions, cancellations and executions.

    For every row with timestamp ``t`` the window is all rows with
    ``t - dt_mins*60 < Time_stamp <= t``; each rate is the percentage
    (rounded to 2 decimals) of window rows carrying the given labels.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Needs ``Time_stamp`` (seconds) and ``Label``.
    dt_mins : number
        Window length in minutes.

    Returns
    -------
    pandas.DataFrame
        Shallow copy with ``Submit_rate`` ('la'/'lb'), ``Cancel_rate``
        ('c'/'pc') and ``Execute_rate`` ('ma'/'mb').
    """
    df = df_original.copy(deep=False)
    t2 = time.time()
    dt = dt_mins * 60 # convert minutes to seconds
    submit_rates, cancel_rates, execute_rates = [],[],[]
    n_rows = len(df.index)
    # progress is reported every ~10% of the rows; at least every row, so
    # frames with fewer than 10 rows do not trigger a modulo by zero
    steps = max(1, n_rows // 10)

    stamps = df['Time_stamp'].values
    is_submit = ((df['Label']=='la')|(df['Label']=='lb')).values   # newly-added orders
    is_cancel = ((df['Label']=='c')|(df['Label']=='pc')).values    # cancelled orders
    is_execute = ((df['Label']=='ma')|(df['Label']=='mb')).values  # executed orders/ market orders come
    for t in stamps:
        in_window = (stamps>t-dt)&(stamps<=t)
        # computing... (plain Python ints, so the divisions below behave
        # exactly like the len()-based ones they replace)
        total = int(np.count_nonzero(in_window))
        n_submit = int(np.count_nonzero(in_window & is_submit))
        n_cancel = int(np.count_nonzero(in_window & is_cancel))
        n_execute = int(np.count_nonzero(in_window & is_execute))
        # appending
        submit_rates.append(np.round(n_submit*100/total,2))
        cancel_rates.append(np.round(n_cancel*100/total,2))
        execute_rates.append(np.round(n_execute*100/total,2))

        if (len(submit_rates)% steps == 0):
            print("{0:.2f}%..".format(len(submit_rates)*100/n_rows))
    df['Submit_rate'] = submit_rates
    df['Cancel_rate'] = cancel_rates
    df['Execute_rate'] = execute_rates
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t2)/60))
    return df

### Processing starts from here

In [27]:
# 1) MERGING MESSAGE FILE AND ORDERBOOK FILE
# Both paths are relative to the notebook's working directory. The number of
# book levels (5) and the trading date (2012-06-21) are parsed from the
# message file name inside Merge_MsgAndOrdb.
df = Merge_MsgAndOrdb(msg_path='AMZN_2012-06-21_34200000_57600000_message_5.csv',
                     ordb_path='AMZN_2012-06-21_34200000_57600000_orderbook_5.csv')
#df.head(5)

Completed. It takes about 0.02 minutes.


In [28]:
# 2) ADDING LABELS
df = AddLabels(df)

Completed. It takes about 0.31 minutes.


In [29]:
#df.columns

In [30]:
# 3) ADDING PRICE LEVEL INFORMATION
df = AddPriceLevels(df)

Completed. It takes about 5.35 minutes.


In [31]:
print('after adding labels and price levels, dataset looks like: ')
df.head(3)

after adding labels and price levels, dataset looks like: 


Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_ask_4,V_ask_4,P_bid_4,V_bid_4,P_ask_5,V_ask_5,P_bid_5,V_bid_5,Label,Level
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,2242500,100,2230000,10,2244000,547,2226200,100,mb,
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2242500,100,2230400,100,2244000,547,2230000,10,lb,1.0
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2240000,220,2230400,100,2242500,100,2230000,10,la,2.0


In [32]:
# 4) ADDING TIME-INSENSITIVE FEATURES
df = ComputeTimeInsenstiveSet(df)

Completed. It takes about 0.02 minutes.


In [41]:
print('after appending time-insensitive information:')
df.head(3)

after appending time-insensitive information:


Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_accu,V_accu,bid-ask spread 2,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,56800,557,9200,2235300.0,9600,2235200.0,12500,2236250.0,17800,2235100.0
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,44900,636,8100,2235850.0,9300,2235350.0,12100,2236450.0,14000,2237000.0
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,40500,109,7800,2235700.0,9200,2235300.0,9600,2235200.0,12500,2236250.0


In [42]:
### ADDING FEATURES W.R.T DT= 10 MINS

In [43]:
# 5) ADD VOLATILITY AND NUMBER OF LIMIT ORDERS SUBMITTED
df_2 = AddVolatilityAndNOB(df_original=df,dt_mins=10)

10.00%..
20.00%..
30.00%..
40.00%..
50.00%..
60.00%..
70.00%..
80.00%..
90.00%..
100.00%..
Completed. It takes about 15.92 minutes.


In [44]:
print('after adding volatility and # of orders submitted: ')
df_2.head(3)

after adding volatility and # of orders submitted: 


Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,bid-ask spread 2,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5,Volts,N_OrderSubmit
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,9200,2235300.0,9600,2235200.0,12500,2236250.0,17800,2235100.0,0.0,0
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,8100,2235850.0,9300,2235350.0,12100,2236450.0,14000,2237000.0,0.0,27
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,7800,2235700.0,9200,2235300.0,9600,2235200.0,12500,2236250.0,0.0,27


In [45]:
# 6) ADD MARKET INFORMATION
df_2 = AddMarketInformation(df_original=df_2,dt_mins=10)

10.00%..
20.00%..
30.00%..
40.00%..
50.00%..
60.00%..
70.00%..
80.00%..
90.00%..
100.00%..
Completed. It takes about 33.90 minutes.


In [46]:
# 7) RESTRICT THE TIMEFRAME TO 10 A.M. - 4 P.M. (DROP THE FIRST 30 MINUTES)
# The first message is stamped just after the 09:30 open (Time_stamp ~ 34200 s),
# so "first timestamp + 30 minutes" puts the cutoff at approximately 10:00.
# The cutoff is read from `df`, the filter is applied to `df_2` (same rows as
# `df`, plus the rolling-window columns added in steps 5 and 6).
firstT = df['Time_stamp'].tolist()[0]
T = firstT + 60 * 30
df_noFirst30Mins = df_2[(df_2['Time_stamp'])>=T]
print('previous df length: {};\nlength after removing first 30 mins: {}'.format(len(df.index),len(df_noFirst30Mins.index)))

previous df length: 155935;
length after removing first 30 mins: 142246


#### 8) VERIFICATION

In [56]:
print(len(df_noFirst30Mins.columns))
df_noFirst30Mins.columns

60


Index(['Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time', 'P_ask_1',
       'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2', 'P_bid_2',
       'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3', 'P_ask_4',
       'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5', 'P_bid_5',
       'V_bid_5', 'Label', 'Level', 'bid-ask spread 1', 'mid-price 1',
       'd_P_ask_51', 'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21', 'd_P_ask_32',
       'd_P_bid_32', 'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54', 'd_P_bid_54',
       'Mean_P_ask', 'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid', 'P_accu',
       'V_accu', 'bid-ask spread 2', 'mid-price 2', 'bid-ask spread 3',
       'mid-price 3', 'bid-ask spread 4', 'mid-price 4', 'bid-ask spread 5',
       'mid-price 5', 'Volts', 'N_OrderSubmit', 'Submit_rate', 'Cancel_rate',
       'Execute_rate'],
      dtype='object')

In [57]:
print(len(df.columns))
df.columns

55


Index(['Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time', 'P_ask_1',
       'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2', 'P_bid_2',
       'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3', 'P_ask_4',
       'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5', 'P_bid_5',
       'V_bid_5', 'Label', 'Level', 'bid-ask spread 1', 'mid-price 1',
       'd_P_ask_51', 'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21', 'd_P_ask_32',
       'd_P_bid_32', 'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54', 'd_P_bid_54',
       'Mean_P_ask', 'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid', 'P_accu',
       'V_accu', 'bid-ask spread 2', 'mid-price 2', 'bid-ask spread 3',
       'mid-price 3', 'bid-ask spread 4', 'mid-price 4', 'bid-ask spread 5',
       'mid-price 5'],
      dtype='object')

In [48]:
df_noFirst30Mins.head(5)

Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5,Volts,N_OrderSubmit,Submit_rate,Cancel_rate,Execute_rate
13689,36000.040921,3,46494609,19,2243500,-1,2012-06-21 10:00:00.040920,2243500,100,2241700,...,2242650.0,3300,2242750.0,3500,2242750.0,2082.61304,2830,50.56,43.99,5.45
13690,36000.040981,1,46531222,19,2241500,1,2012-06-21 10:00:00.040981,2243500,100,2241700,...,2242650.0,3300,2242750.0,3500,2242750.0,2082.61304,2831,50.57,43.98,5.45
13691,36000.042313,3,46520399,100,2244400,-1,2012-06-21 10:00:00.042313,2243500,100,2241700,...,2242650.0,3300,2242750.0,3500,2242750.0,2082.61304,2831,50.56,43.99,5.45
13692,36000.214317,3,46531222,19,2241500,1,2012-06-21 10:00:00.214317,2243500,100,2241700,...,2242650.0,3300,2242750.0,3500,2242750.0,2082.61304,2831,50.55,44.0,5.45
13693,36000.214402,1,46542025,19,2243500,-1,2012-06-21 10:00:00.214402,2243500,119,2241700,...,2242650.0,3300,2242750.0,3500,2242750.0,2082.61304,2832,50.56,43.99,5.45


####  9) OUTPUT IT TO A CSV

In [58]:
# Frame restricted to ~10:00 onwards, with all 60 columns (including the
# rolling-window features Volts, N_OrderSubmit, Submit_rate, Cancel_rate,
# Execute_rate).
df_noFirst30Mins.to_csv("Messages_10to4.csv")
# NOTE(review): `df` only holds the 55 columns built up to step 4; the
# rolling-window features live on `df_2`. Confirm that exporting `df`
# (rather than `df_2`) as the all-day file is intended.
df.to_csv("Messages_allDay.csv")

## TESTING.... 