In [1]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan

In [2]:
## FOR STATISTICAL ANALYSIS
# Create a Combined Dataframe from message csv and orderbook csv
def Merge_MsgAndOrdb(msg_path, ordb_path):
    """Load a LOBSTER message file and its order-book file into one DataFrame.

    The book depth and the trading date are parsed from the message file
    *name*, which is expected to look like
    ``<ticker>_<date>_<start>_<end>_message_<level>.csv``.

    Parameters
    ----------
    msg_path : str
        Path to the message CSV (no header row).
    ordb_path : str
        Path to the order-book CSV (no header row).

    Returns
    -------
    pandas.DataFrame
        The message columns (``Time_stamp, Type, OrderID, V, P, Dir``), a
        ``Time`` datetime column, and then ``P_ask_i, V_ask_i, P_bid_i,
        V_bid_i`` for every level ``i``, concatenated side by side.

    Raises
    ------
    ValueError
        If the level or the date cannot be parsed from the file name.
    """
    import os  # local import so this definition does not depend on the import cell

    # Parse the bare file name only: directories or tickers containing "." or
    # "_" must not disturb the parsing, and the level may have several digits.
    stem = os.path.splitext(os.path.basename(msg_path))[0]
    fields = stem.split("_")
    try:
        level = int(fields[-1])
        date = fields[1]
    except (IndexError, ValueError):
        raise ValueError(
            "cannot parse level/date from message file name: {!r}".format(msg_path))

    # add header row for message csv
    msg_names = ["Time_stamp", "Type", "OrderID", "V", "P", "Dir"]  # V: size; P: price; Dir: direction
    df_msg = pd.read_csv(msg_path, names=msg_names)

    # add header row for orderbook csv: four columns per level, best level first
    default_ordb_names = ['P_ask', 'V_ask', 'P_bid', 'V_bid']
    ordb_names = ["{}_{}".format(item, i + 1)
                  for i in range(level)
                  for item in default_ordb_names]
    df_ordb = pd.read_csv(ordb_path, names=ordb_names)

    # adding a meaningful time column: Time_stamp is taken as seconds counted
    # from midnight of the date found in the file name
    df_msg['Time'] = pd.to_datetime(df_msg['Time_stamp'], unit="s", origin=pd.Timestamp(date))

    # combine two dataframes (row i of one file next to row i of the other)
    return pd.concat([df_msg, df_ordb], axis=1)

# Rename columns in place: append the state type to the bid/ask, V, P and Time column names
def RenameColumn(df,state_type):
    """Tag selected columns of ``df`` with ``state_type``, modifying ``df`` in place.

    A column is renamed to ``"<name> <state_type>"`` when its name contains
    "bid" or "ask", or is exactly 'V', 'P' or 'Time'. All other columns keep
    their names. Nothing is returned.
    """
    def _is_tagged(name):
        # order-book columns, or the message size / price / time columns
        return ("bid" in name) or ("ask" in name) or name in ('V', 'P', 'Time')

    mapping = {}
    for name in df.columns.tolist():
        mapping[name] = name + " " + state_type if _is_tagged(name) else name

    df.rename(columns=mapping, inplace=True)
    return None

In [3]:
## FOR PAPER 462
# Compute Time-insensitive variables
def ComputeTimeInsenstiveSet(df_original, levels=5):
    """Return a copy of the frame with the time-insensitive features (v2-v5) added.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Must contain ``P_ask_i, V_ask_i, P_bid_i, V_bid_i`` for ``i = 1..levels``.
        It is not modified; the new columns are added to a shallow copy.
    levels : int, default 5
        Number of order-book levels to use. With ``levels < 1`` the copy is
        returned without any new column.

    Returns
    -------
    pandas.DataFrame
        The copy with these columns appended:

        * v2: ``bid-ask spread i`` and ``mid-price i`` for every level;
        * v3: ``d_P_ask_<levels>1`` / ``d_P_bid_<levels>1`` (deepest level minus
          best level, signed) and ``d_P_ask_<j+1><j>`` / ``d_P_bid_<j+1><j>``
          (absolute difference between adjacent levels);
        * v4: ``Mean_P_ask, Mean_P_bid, Mean_V_ask, Mean_V_bid`` across levels;
        * v5: ``P_accu`` and ``V_accu`` (sum over ask levels minus sum over bid levels).
    """
    df = df_original.copy(deep=False)
    if levels < 1:
        return df

    def _add_spread_and_mid(i):
        # compute bid-ask spread and mid-price of one level (v2)
        ask = df["P_ask_{}".format(i)]
        bid = df["P_bid_{}".format(i)]
        df["bid-ask spread {}".format(i)] = ask - bid
        df["mid-price {}".format(i)] = (ask + bid)/2

    # The level-independent features (v3-v5) used to be recomputed on every
    # pass of the level loop. They are now computed once, in the position
    # they always had: right after the level-1 spread and mid-price.
    _add_spread_and_mid(1)

    # compute price differences (v3)
    # NOTE: for levels == 2 the two names below coincide with the adjacent-level
    # names, so the signed values are overwritten by the absolute ones.
    df["d_P_ask_{}{}".format(levels,1)] = df["P_ask_{}".format(levels)] - df["P_ask_1"]
    df["d_P_bid_{}{}".format(levels,1)] = df["P_bid_{}".format(levels)] - df["P_bid_1"]
    for j in range(1,levels):
        df["d_P_ask_{}{}".format(j+1,j)] = np.abs(df["P_ask_{}".format(j+1)] - df["P_ask_{}".format(j)])
        df["d_P_bid_{}{}".format(j+1,j)] = np.abs(df["P_bid_{}".format(j+1)] - df["P_bid_{}".format(j)])

    level_ids = range(1,levels+1)
    ask_prices = df[["P_ask_{}".format(k) for k in level_ids]]
    bid_prices = df[["P_bid_{}".format(k) for k in level_ids]]
    ask_sizes = df[["V_ask_{}".format(k) for k in level_ids]]
    bid_sizes = df[["V_bid_{}".format(k) for k in level_ids]]

    # compute mean prices and volumes (v4)
    df["Mean_P_ask"] = ask_prices.mean(axis=1)
    df["Mean_P_bid"] = bid_prices.mean(axis=1)
    df["Mean_V_ask"] = ask_sizes.mean(axis=1)
    df["Mean_V_bid"] = bid_sizes.mean(axis=1)

    # compute accumulated differences (v5)
    df["P_accu"] = ask_prices.sum(axis=1) - bid_prices.sum(axis=1)
    df["V_accu"] = ask_sizes.sum(axis=1) - bid_sizes.sum(axis=1)

    # remaining levels of v2
    for i in range(2,levels+1):
        _add_spread_and_mid(i)

    # returning...
    return df

# Compute Time-sensitive variables
def ComputeTimeSensitiveSet(df_original,levels=5,dt_secs=1):
    """Return a copy of the frame with the time-sensitive features (v6, v7) added.

    For each row with timestamp ``t`` the trailing window is the set of rows
    whose ``Time_stamp`` lies in ``(t - dt_secs, t]``, taken in frame order.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Needs ``Time_stamp``, ``Type``, ``Dir`` and ``P_ask_i, V_ask_i,
        P_bid_i, V_bid_i`` for ``i = 1..levels``. It is not modified.
    levels : int, default 5
        Number of order-book levels. With ``levels < 1`` (or an empty frame)
        the copy is returned without any new column.
    dt_secs : number, default 1
        Window length, in the unit of ``Time_stamp``. Must be positive.

    Returns
    -------
    pandas.DataFrame
        The copy with these float columns appended:

        * v6: ``der_P_ask_i, der_P_bid_i, der_V_ask_i, der_V_bid_i`` =
          (value on the last row of the window - value on its first row) / dt_secs;
        * v7: ``lam_la`` (rows with Dir == -1), ``lam_lb`` (Dir == 1),
          ``lam_ca`` (Dir == -1 and Type == 2), ``lam_cb`` (Dir == 1 and Type == 2),
          each the row count inside the window divided by ``dt_secs`` and
          truncated to a whole number.

        A row whose window is empty (e.g. a NaN timestamp) gets NaN
        derivatives and zero intensities.

    Raises
    ------
    ValueError
        If ``dt_secs`` is not positive.
    """
    df = df_original.copy(deep=False)
    if levels < 1 or len(df) == 0:
        return df
    if not dt_secs > 0:
        raise ValueError("dt_secs must be positive, got {!r}".format(dt_secs))

    stamps = np.asarray(df['Time_stamp'], dtype=float)
    side = np.asarray(df['Dir'])
    is_cancel = np.asarray(df['Type']) == 2
    # Note: direction = 1 <--> buy/bid order;   direction = -1 <--> sell/ask order
    events = [
        ('lam_la', side == -1),                # limit ask order
        ('lam_lb', side == 1),                 # limit bid order
        ('lam_ca', (side == -1) & is_cancel),  # cancelled limit ask order
        ('lam_cb', (side == 1) & is_cancel),   # cancelled limit buy order
    ]
    first, last, counts = _trailing_windows(stamps, dt_secs, [flag for _, flag in events])

    # Rows with an empty window are read at position 0 and masked to NaN below.
    window_ok = last >= first
    src = np.where(window_ok, first, 0)
    dst = np.where(window_ok, last, 0)

    # Whole columns are assigned positionally; DataFrame.set_value, used here
    # before, no longer exists in pandas >= 1.0.
    for i in range(1,levels+1):
        # compute price and volume derivatives (v6)
        for quantity in ('P_ask', 'P_bid', 'V_ask', 'V_bid'):
            values = np.asarray(df['{}_{}'.format(quantity, i)])
            rate = (values[dst] - values[src])/dt_secs
            df['der_{}_{}'.format(quantity, i)] = np.where(window_ok, rate, np.nan)
        if i == 1:
            # compute average intensity of each type (v7); level-independent,
            # written once, in the column position it always had
            for (name, _), count in zip(events, counts):
                df[name] = np.trunc(count/dt_secs)
        # compute relative intensity indicators (v8) ???
        # compute accelerations (market/limit)  (v9) ???

    return df


def _trailing_windows(stamps, dt_secs, flags):
    """Locate each row's trailing window and count the flagged rows inside it.

    Returns ``(first, last, counts)``: positional bounds (inclusive) of the
    window ``(t - dt_secs, t]`` for every row, and one count array per entry
    of ``flags``. An empty window is reported as ``last < first``.
    """
    n = len(stamps)
    if bool(np.all(stamps[1:] >= stamps[:-1])):
        # Sorted timestamps (and no NaN): binary search plus running totals.
        first = np.searchsorted(stamps, stamps - dt_secs, side='right')
        last = np.searchsorted(stamps, stamps, side='right') - 1
        counts = []
        for flag in flags:
            running = np.concatenate(([0], np.cumsum(flag)))
            counts.append(running[last + 1] - running[first])
        return first, last, counts

    # Unsorted timestamps: fall back to an explicit mask per row.
    first = np.zeros(n, dtype=int)
    last = np.full(n, -1, dtype=int)
    counts = [np.zeros(n, dtype=int) for _ in flags]
    for pos in range(n):
        inside = (stamps > stamps[pos] - dt_secs) & (stamps <= stamps[pos])
        hits = np.flatnonzero(inside)
        if hits.size == 0:
            continue
        first[pos], last[pos] = hits[0], hits[-1]
        for k, flag in enumerate(flags):
            counts[k][pos] = np.count_nonzero(inside & flag)
    return first, last, counts

In [4]:
# Merge the AMZN 2012-06-21 level-5 message and order-book files into one frame,
# then preview its first five rows.
df = Merge_MsgAndOrdb(
    'AMZN_2012-06-21_34200000_57600000_message_5.csv',
    'AMZN_2012-06-21_34200000_57600000_orderbook_5.csv',
)
df.head(5)

Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_bid_3,V_bid_3,P_ask_4,V_ask_4,P_bid_4,V_bid_4,P_ask_5,V_ask_5,P_bid_5,V_bid_5
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,2230400,100,2242500,100,2230000,10,2244000,547,2226200,100
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2230700,200,2242500,100,2230400,100,2244000,547,2230000,10
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2230700,200,2240000,220,2230400,100,2242500,100,2230000,10
3,34200.189608,1,11534792,100,2237500,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2231800,100,2240000,220,2230700,200,2242500,100,2230400,100
4,34200.189608,1,1365373,13,2240000,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2231800,100,2240000,233,2230700,200,2242500,100,2230400,100


In [5]:
# df_sens = ComputeTimeInsenstiveSet(df)
# df_sens.columns.values.tolist()

In [6]:
# df_sensInsens = ComputeTimeSensitiveSet(df)

In [7]:
# df_sensInsens.head(5)