In [2]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan
# loading machine learning required packages
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import time

In [3]:
## FOR STATISTICAL ANALYSIS
# Create a Combined Dataframe from message csv and orderbook csv
def Merge_MsgAndOrdb(msg_path, ordb_path):
    """Load a LOBSTER message file and its order-book file into one DataFrame.

    The book depth and the trading date are parsed from the *file name* of
    ``msg_path`` (directories are ignored), which is expected to look like
    ``<TICKER>_<YYYY-MM-DD>_<start>_<end>_message_<level>.csv``.

    Parameters
    ----------
    msg_path : str
        Path to the message CSV (read without a header row).
    ordb_path : str
        Path to the order-book CSV (read without a header row); it must have
        four columns per book level.

    Returns
    -------
    pandas.DataFrame
        The message columns (``Time_stamp, Type, OrderID, V, P, Dir``), a
        ``Time`` column (trading date + ``Time_stamp`` seconds) and the
        order-book columns ``P_ask_i, V_ask_i, P_bid_i, V_bid_i`` for each
        level ``i``, concatenated column-wise.

    Raises
    ------
    ValueError
        If the level or the date cannot be parsed from the file name.
    """
    import os  # local import: only needed for file-name parsing

    # Parse the file-name stem only, so dots/underscores in directory names
    # and multi-digit levels (e.g. "_message_10.csv") are handled correctly.
    stem = os.path.splitext(os.path.basename(msg_path))[0]
    tokens = stem.split("_")
    try:
        level = int(tokens[-1])
        date = pd.Timestamp(tokens[1])
    except (IndexError, ValueError):
        raise ValueError(
            "cannot parse level/date from message file name {!r}".format(msg_path))

    # add header row for message csv
    msg_names = ["Time_stamp", "Type", "OrderID", "V", "P", "Dir"]  # V: size; P: price; Dir: direction
    df_msg = pd.read_csv(msg_path, names=msg_names)

    # add header row for orderbook csv: four columns per level, level-major
    default_ordb_names = ['P_ask', 'V_ask', 'P_bid', 'V_bid']
    ordb_names = ["{}_{}".format(item, lvl)
                  for lvl in range(1, level + 1)
                  for item in default_ordb_names]
    df_ordb = pd.read_csv(ordb_path, names=ordb_names)

    # adding a meaningful time column: Time_stamp is added, in seconds, to the trading date
    df_msg['Time'] = pd.to_datetime(df_msg['Time_stamp'], unit="s", origin=date)

    # combine two dataframes (side by side) and return it
    return pd.concat([df_msg, df_ordb], axis=1)

# Rename state-dependent columns by appending the order-state label (e.g. "Sub", "Can")
def RenameColumn(df,state_type):
    """Suffix state-dependent column labels with ``" " + state_type``, in place.

    A label is suffixed when it contains ``"bid"`` or ``"ask"``, or when it is
    exactly ``'V'``, ``'P'`` or ``'Time'``; every other label is kept as is.
    The frame is modified in place and nothing is returned.
    """
    def _is_state_column(label):
        # order-book columns, plus the message size / price / time columns
        return ("bid" in label) or ("ask" in label) or label in ('V', 'P', 'Time')

    mapping = {}
    for label in df.columns.tolist():
        if _is_state_column(label):
            mapping[label] = label + " " + state_type
        else:
            mapping[label] = label

    df.rename(columns=mapping, inplace=True)
    return

def GroupByOrderType(df,typeID):                        # for now, typeID should only be 2 or 3 or 4 or 5
    """Pair each order's submission row with its rows of event type ``typeID``.

    Only orders that have at least one event of type ``typeID`` are kept.
    Their type-1 rows and their ``typeID`` rows are relabelled with
    ``RenameColumn`` (using the state names below) and outer-merged on
    ``OrderID``.

    Returns the merged DataFrame.
    """
    # event type code -> state label used as column suffix
    state_labels = {1:'Sub', 2:'Can', 3:'Del', 4:'ExVis',5:'ExHid',7:'Halt'}

    # every row belonging to an order that has an event of the requested type
    matching_ids = set(df.loc[df['Type'] == typeID, 'OrderID'].tolist())
    related = df.loc[df['OrderID'].isin(matching_ids)]

    # state at submission time (type 1)
    submitted = related[related['Type'] == 1]
    RenameColumn(submitted, state_type=state_labels[1])

    # state at the requested event
    terminal = related[related['Type'] == typeID]
    RenameColumn(terminal, state_type=state_labels[typeID])

    # outer merge keeps orders that have only one of the two states
    return submitted.merge(terminal, left_on='OrderID', right_on='OrderID', how='outer')

In [4]:
## FOR PAPER 462
# Compute Time-insensitive variables
def ComputeTimeInsenstiveSet(df_original, levels=5):
    """Return a copy of the frame with the time-insensitive features (v2-v5) added.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Must contain ``P_ask_i, P_bid_i, V_ask_i, V_bid_i`` for
        ``i = 1..levels``.
    levels : int, default 5
        Number of order-book levels to use. With ``levels < 1`` no column is
        added.

    Returns
    -------
    pandas.DataFrame
        A shallow copy of ``df_original`` with these columns appended:

        * v2: ``bid-ask spread i`` and ``mid-price i`` for every level;
        * v3: ``d_P_ask_<levels>1`` / ``d_P_bid_<levels>1`` (signed difference
          between the deepest and the first level) and the absolute
          differences ``d_P_ask_<j+1><j>`` / ``d_P_bid_<j+1><j>`` between
          adjacent levels. For ``levels == 2`` both share the name
          ``d_P_*_21`` and the absolute value wins;
        * v4: ``Mean_P_ask, Mean_P_bid, Mean_V_ask, Mean_V_bid``;
        * v5: ``P_accu`` and ``V_accu`` (sum over asks minus sum over bids).

    Notes
    -----
    The v3-v5 features do not depend on the individual level, so they are
    computed once instead of once per level. They are inserted right after
    the level-1 spread/mid-price so the resulting column order is the same
    as before.

    NOTE(review): the outer bid difference is ``P_bid_<levels> - P_bid_1``;
    confirm this sign is the intended one.
    """
    df = df_original.copy(deep=False)
    if levels < 1:
        # no level requested -> nothing to derive
        return df

    level_ids = range(1, levels + 1)
    ask_prices = ["P_ask_{}".format(k) for k in level_ids]
    bid_prices = ["P_bid_{}".format(k) for k in level_ids]
    ask_sizes = ["V_ask_{}".format(k) for k in level_ids]
    bid_sizes = ["V_bid_{}".format(k) for k in level_ids]

    def add_spread_and_mid(k):
        # compute bid-ask spread and mid-price of level k (v2)
        ask, bid = df[ask_prices[k - 1]], df[bid_prices[k - 1]]
        df["bid-ask spread {}".format(k)] = ask - bid
        df["mid-price {}".format(k)] = (ask + bid) / 2

    add_spread_and_mid(1)

    # compute price differences (v3): deepest vs. first level, then adjacent levels
    df["d_P_ask_{}{}".format(levels, 1)] = df[ask_prices[-1]] - df[ask_prices[0]]
    df["d_P_bid_{}{}".format(levels, 1)] = df[bid_prices[-1]] - df[bid_prices[0]]
    for j in range(1, levels):
        df["d_P_ask_{}{}".format(j + 1, j)] = np.abs(df[ask_prices[j]] - df[ask_prices[j - 1]])
        df["d_P_bid_{}{}".format(j + 1, j)] = np.abs(df[bid_prices[j]] - df[bid_prices[j - 1]])

    # compute mean prices and volumes (v4)
    df["Mean_P_ask"] = df[ask_prices].mean(axis=1)
    df["Mean_P_bid"] = df[bid_prices].mean(axis=1)
    df["Mean_V_ask"] = df[ask_sizes].mean(axis=1)
    df["Mean_V_bid"] = df[bid_sizes].mean(axis=1)

    # compute accumulated differences (v5)
    df["P_accu"] = df[ask_prices].sum(axis=1) - df[bid_prices].sum(axis=1)
    df["V_accu"] = df[ask_sizes].sum(axis=1) - df[bid_sizes].sum(axis=1)

    # remaining per-level spreads / mid-prices (v2)
    for k in range(2, levels + 1):
        add_spread_and_mid(k)

    return df

# Compute Time-sensitive variables
def ComputeTimeSensitiveSet(df_original,levels=5,dt_secs=1):
    """Return a copy of the frame with the time-sensitive features (v6, v7) added.

    For every row with time stamp ``t`` the look-back window is the set of
    rows whose ``Time_stamp`` lies in ``(t - dt_secs, t]``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Must contain ``Time_stamp``, ``Dir``, ``Type`` and the order-book
        columns ``P_ask_i, P_bid_i, V_ask_i, V_bid_i`` for ``i = 1..levels``.
    levels : int, default 5
        Number of order-book levels to differentiate.
    dt_secs : number, default 1
        Window length in seconds; must be positive.

    Returns
    -------
    pandas.DataFrame
        A shallow copy of ``df_original`` with, per row:

        * v6: ``der_<col>`` for every order-book column: (last value in the
          window - first value in the window) / ``dt_secs``;
        * v7: ``lam_la`` / ``lam_lb``: number of window rows with
          ``Dir == -1`` / ``Dir == 1``, divided by ``dt_secs`` and truncated
          to int; ``lam_ca`` / ``lam_cb``: the same, restricted to
          ``Type == 2``.

        No column is added when ``levels < 1`` or the frame is empty.

    Raises
    ------
    ValueError
        If ``dt_secs`` is not positive (the window would be empty).

    Notes
    -----
    ``DataFrame.set_value`` no longer exists in current pandas, so the values
    are collected per row and assigned as whole columns. The window mask is
    built once per row and reused for all features.

    NOTE(review): ``lam_la`` / ``lam_lb`` count every event with the given
    direction, not only submissions (``Type == 1``) — confirm that this is
    the intended definition of the limit-order intensities.
    """
    if dt_secs <= 0:
        raise ValueError("dt_secs must be positive, got {!r}".format(dt_secs))

    df = df_original.copy(deep=False)
    if levels < 1 or len(df.index) == 0:
        return df

    stamps = df['Time_stamp']
    # Note: direction = 1 <--> buy/bid order;   direction = -1 <--> sell/ask order
    is_ask = df['Dir'] == -1
    is_bid = df['Dir'] == 1
    is_cancel = df['Type'] == 2

    quantities = ('P_ask', 'P_bid', 'V_ask', 'V_bid')
    book_cols = ['{}_{}'.format(q, i) for i in range(1, levels + 1) for q in quantities]
    rate_masks = [('lam_la', is_ask),
                  ('lam_lb', is_bid),
                  ('lam_ca', is_ask & is_cancel),
                  ('lam_cb', is_bid & is_cancel)]

    derivatives = {col: [] for col in book_cols}
    rates = {name: [] for name, _ in rate_masks}

    for t in stamps:
        in_window = (stamps > t - dt_secs) & (stamps <= t)
        window = df.loc[in_window, book_cols]
        # compute price and volume derivatives (v6)
        for col in book_cols:
            values = window[col]
            derivatives[col].append((values.iloc[-1] - values.iloc[0]) / dt_secs)
        # compute average intensity of each type (v7)
        for name, mask in rate_masks:
            rates[name].append(int((in_window & mask).sum() / dt_secs))
        # compute relative intensity indicators (v8) ???
        # compute accelerations (market/limit)  (v9) ???

    # Assign whole columns; the intensities go right after the level-1
    # derivatives, which is where cell-by-cell insertion used to create them.
    for i in range(1, levels + 1):
        for q in quantities:
            col = '{}_{}'.format(q, i)
            df['der_' + col] = derivatives[col]
        if i == 1:
            for name, _ in rate_masks:
                df[name] = rates[name]

    return df

In [5]:
# Add three extra columns: the level-1 mid-price, the direction of its move, and the direction of any spread crossing
def AddMidPriceAndSpreadCrossing(df):
    """Return a copy of the frame with mid-price and movement-label columns added.

    Rows are compared with their predecessor *by position*, so the frame may
    carry any index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the best quotes ``P_ask_1`` and ``P_bid_1``.

    Returns
    -------
    pandas.DataFrame
        A shallow copy of ``df`` with three extra columns:

        * ``Mid_price``: ``(P_ask_1 + P_bid_1) / 2``;
        * ``MidPrice_Moves``: 0 if the mid-price is above the previous row's,
          1 if below, 2 otherwise;
        * ``SpdCros_Moves``: 0 if the best bid is above the previous row's
          best ask, 1 if the best ask is below the previous row's best bid,
          2 otherwise.

        The first row has no predecessor and is labelled 2 in both columns.
        Rows where a comparison involves NaN are labelled 2 as well.
    """
    df2 = df.copy(deep=False)

    # 1) add mid-price column first    # criterion: [0: upward;  1: downward;  2: stationary]
    df2['Mid_price'] = (df2['P_ask_1'] + df2['P_bid_1']) / 2

    # positional arrays: element k is row k regardless of the index labels
    mid = np.asarray(df2['Mid_price'], dtype=float)
    best_ask = np.asarray(df2['P_ask_1'], dtype=float)
    best_bid = np.asarray(df2['P_bid_1'], dtype=float)
    n_rows = len(df2.index)

    # decide if mid-price moves up or down, or stays stationary
    mid_moves = np.full(n_rows, 2, dtype=np.int64)
    mid_moves[1:] = np.where(mid[1:] > mid[:-1], 0,
                             np.where(mid[1:] < mid[:-1], 1, 2))
    df2['MidPrice_Moves'] = mid_moves

    # 2) decide spread crossing movements   # criterion: [0: upward;  1: downward;  2: no spread crossing]
    # the upward test takes precedence over the downward one
    crossings = np.full(n_rows, 2, dtype=np.int64)
    crossings[1:] = np.where(best_bid[1:] > best_ask[:-1], 0,
                             np.where(best_ask[1:] < best_bid[:-1], 1, 2))
    df2['SpdCros_Moves'] = crossings

    return df2

#### MERGING a LOB

In [6]:
# Load the AMZN 2012-06-21 message file and its 5-level order-book file into one frame.
# Both paths are relative to the notebook's working directory.
df = Merge_MsgAndOrdb(msg_path='AMZN_2012-06-21_34200000_57600000_message_5.csv',
                     ordb_path='AMZN_2012-06-21_34200000_57600000_orderbook_5.csv')
#df.head(5)

#### ADDING TIME-INSENSITIVE INFORMATION

In [7]:
# Derive the time-insensitive feature columns (default: 5 book levels) on a copy of the merged frame.
df_TimeInsens = ComputeTimeInsenstiveSet(df)

#### ADD MID-PRICE INFO AND SPREAD CROSSING INFO

In [8]:
# Add the Mid_price column and the two label columns (MidPrice_Moves, SpdCros_Moves).
df_withY = AddMidPriceAndSpreadCrossing(df_TimeInsens)

In [9]:
# Preview the first five rows of the labelled frame.
df_withY.head(5)

Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5,Mid_price,MidPrice_Moves,SpdCros_Moves
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,2235300.0,9600,2235200.0,12500,2236250.0,17800,2235100.0,2235650.0,2,2
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2235850.0,9300,2235350.0,12100,2236450.0,14000,2237000.0,2238800.0,0,2
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2235700.0,9200,2235300.0,9600,2235200.0,12500,2236250.0,2238800.0,2,2
3,34200.189608,1,11534792,100,2237500,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2238550.0,8100,2235850.0,9300,2235350.0,12100,2236450.0,2238800.0,2,2
4,34200.189608,1,1365373,13,2240000,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2238550.0,8100,2235850.0,9300,2235350.0,12100,2236450.0,2238800.0,2,2


In [10]:
# List every column of the labelled frame (raw columns, derived features, labels).
#len(df_withY.columns.tolist())
df_withY.columns

Index(['Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time', 'P_ask_1',
       'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2', 'P_bid_2',
       'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3', 'P_ask_4',
       'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5', 'P_bid_5',
       'V_bid_5', 'bid-ask spread 1', 'mid-price 1', 'd_P_ask_51',
       'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21', 'd_P_ask_32', 'd_P_bid_32',
       'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54', 'd_P_bid_54', 'Mean_P_ask',
       'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid', 'P_accu', 'V_accu',
       'bid-ask spread 2', 'mid-price 2', 'bid-ask spread 3', 'mid-price 3',
       'bid-ask spread 4', 'mid-price 4', 'bid-ask spread 5', 'mid-price 5',
       'Mid_price', 'MidPrice_Moves', 'SpdCros_Moves'],
      dtype='object')

### Test1: Predict Mid-Price Movement

In [11]:
## Count how often each MidPrice_Moves label occurs (0: upward, 1: downward, 2: stationary)
from collections import Counter
counts = Counter(df_withY["MidPrice_Moves"])
# most_common() lists (label, count) pairs, largest count first
print(counts.most_common())

[(2, 128378), (1, 13877), (0, 13680)]


In [12]:
# Time span covered by the data: first and last event time.
# .iloc reads the two values directly instead of turning the whole column into a list.
start = pd.to_datetime(df_withY['Time'].iloc[0])
end = pd.to_datetime(df_withY['Time'].iloc[-1])
timeT = end - start
totalMins = np.round(timeT / pd.Timedelta(minutes=1), 2)
print('total minutes for our T is {}'.format(totalMins))
# average number of rows per minute; the rounding precision now goes to np.round, not to format()
print('so for each minute, there are about {} features'.format(np.round(len(df_withY.index) / totalMins, 0)))

total minutes for our T is 390.0
so for each minute, there are about 400.0 features


In [None]:
# Feature columns fed to the classifier: the raw 5-level book plus the
# time-insensitive features, in the order they appear in df_withY.
basicAndTimeInsensitiveSet = [
    # raw order book, levels 1-5
    'P_ask_1', 'V_ask_1', 'P_bid_1', 'V_bid_1',
    'P_ask_2', 'V_ask_2', 'P_bid_2', 'V_bid_2',
    'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3',
    'P_ask_4', 'V_ask_4', 'P_bid_4', 'V_bid_4',
    'P_ask_5', 'V_ask_5', 'P_bid_5', 'V_bid_5',
    # level-1 spread / mid-price
    'bid-ask spread 1', 'mid-price 1',
    # price differences
    'd_P_ask_51', 'd_P_bid_51',
    'd_P_ask_21', 'd_P_bid_21',
    'd_P_ask_32', 'd_P_bid_32',
    'd_P_ask_43', 'd_P_bid_43',
    'd_P_ask_54', 'd_P_bid_54',
    # means and accumulated differences
    'Mean_P_ask', 'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid',
    'P_accu', 'V_accu',
    # spread / mid-price of levels 2-5
    'bid-ask spread 2', 'mid-price 2',
    'bid-ask spread 3', 'mid-price 3',
    'bid-ask spread 4', 'mid-price 4',
    'bid-ask spread 5', 'mid-price 5',
]

# Work on the first 200 rows only; the original note mentions predicting the
# next 5 (delta_t = 5), which this cell does not implement.
first200 = df_withY.iloc[:200]
x = first200.loc[:, basicAndTimeInsensitiveSet]
y = first200.loc[:, 'MidPrice_Moves']

In [None]:
start_time = time.time()

# shuffle the dataset
x, y = shuffle(x, y, random_state=0)
# Hold out 25% of the rows for testing; the remaining 75% are used for the grid search
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0)

# tuning set 1) Set the parameters by cross-validation
# parameters = [{'kernel': ['rbf'],
#                'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5],
#                 'C': [1, 10, 100, 1000]},
#               {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# tuning set 2) smaller grid
parameters = [{'kernel': ['rbf'],
               'gamma': [1e-4, 0.01, 0.1, 0.5],
                'C': [1, 10]},
              {'kernel': ['linear'], 'C': [1, 10]}]
print("# Tuning hyper-parameters")
print()
# 5-fold cross-validated grid search over the SVM hyper-parameters
clf = GridSearchCV(svm.SVC(decision_function_shape='ovr'), parameters, cv=5)
clf.fit(x_train, y_train)

# Use a separate name for the duration: rebinding `time` would replace the
# imported module with a float and break every later time.time() call,
# including a re-run of this cell.
elapsed_secs = time.time() - start_time
print("Hyperparameter tuning took {} seconds".format(elapsed_secs))

# Tuning hyper-parameters



In [None]:
# df_sens = ComputeTimeInsenstiveSet(df)
# df_sens.columns.values.tolist()

In [None]:
# df_sensInsens = ComputeTimeSensitiveSet(df)

In [None]:
# df_sensInsens.head(5)

In [None]:
# Prints a fixed string; does not use any notebook state.
print('hello world')