In [41]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan
import time
from collections import Counter

## Helper function

In [42]:
## SUBJECT TO CHANGE... COLUMNS ARE ADAPTIVE
# Function of getting history for all orders
def GetOrderHistory(df_original):
    df = df_original.copy(deep=False) 
    t = time.time()
    table = {}
    for index, row in df.iterrows():
        if (row['OrderID']!=0):   # don't need records where orderID==0; remove row['Type'], row['Dir'] is okay?
            d = {row['Time_stamp']: [ row['V'],row['P'],row['Label'],row['Level'],\
                                      row['Volts'],row['mid-price 1'],row['N_OrderSubmit'],row['bid-ask spread 1'],\
                                      row['Submit_rate'],row['Cancel_rate'],row['Execute_rate'] 
                                    ]}
            if (row['OrderID'] not in table.keys()):
                li = []
                li.append(d)
                table[row['OrderID']] = li
            else:
                table[row['OrderID']].append(d)
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))        
    return table

In [43]:
def ReturnTwoStates_orders(df_original,type_list,orderhistory):
    df = df_original.copy(deep=False)
    t = time.time()
    TwoStates = {}
    # only want records that are of those types, and OrderID != 0
    df = df.loc[df['Type'].isin(type_list)]
    Ids = set(df['OrderID'].tolist())
    Ids.discard(0)
    # look through all order ids in the dictionary
    for k, v in orderhistory.items():
        if k in Ids:
            temp_v = orderhistory[k]
            # avoid cases where there is only one event happened to a single order
            if (len(temp_v)>=2):
                # get first event and the very last event
                st_ed = temp_v[::len(temp_v)-1]
                TwoStates[k] = st_ed
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))  
    return TwoStates

In [44]:
def HistoryToDF_2(history_dict):
    t = time.time()
    colnames = ['OrderID','Time','V','P','Label','Level',\
                'Volts','mid-price 1','N_OrderSubmit','Spread','Submit_rate','Cancel_rate','Execute_rate']
    twoStatesName = ['_st','_ed']
    cols = [colnames[0]]
    for i in range(len(twoStatesName)):
        for j in range(1,len(colnames)):
            cols.append(colnames[j]+twoStatesName[i])
    df = pd.DataFrame(columns=cols)
    # traverse through every single orderID
    ct = 0 
    totalItems = len(history_dict.keys())
    steps = int(totalItems/10)
    for k, v in history_dict.items():
        new_row = []
        new_row.append(k)
        new_row.append(list(v[0].keys())[0])
        new_row.extend(list(v[0].values())[0])
        new_row.append(list(v[-1].keys())[0])
        new_row.extend(list(v[-1].values())[0])
        # appending to the dataframe
        df.loc[ct] = new_row
        ct += 1
        if (ct%steps==0):
            print("{0:.2f}%...".format(ct*100/totalItems))
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

In [45]:
def HistoryToCSV(history_dict,file_name):
    t = time.time()
    import csv
    colnames = ['OrderID','Time','V','P','Label','Level',\
                'Volts','mid-price 1','N_OrderSubmit','Spread','Submit_rate','Cancel_rate','Execute_rate']
    twoStatesName = ['_st','_ed']
    header = [colnames[0]]
    for i in range(len(twoStatesName)):
        for j in range(1,len(colnames)):
            header.append(colnames[j]+twoStatesName[i])
    # initialize a writer
    with open(file_name,'w',newline="") as f:
        writer = csv.writer(f)
        # write heeder row first
        #writer.writerows(header)
        writer.writerows([header])
        # traverse through every single orderID
        ct = 0 
        totalItems = len(history_dict.keys())
        steps = int(totalItems/10)
        for k, v in history_dict.items():
            new_row =[]
            new_row.append(k)
            new_row.append(list(v[0].keys())[0])
            new_row.extend(list(v[0].values())[0])
            new_row.append(list(v[-1].keys())[0])
            new_row.extend(list(v[-1].values())[0])
            # write the new row to csv
            writer.writerows([new_row])
            ct += 1
            if (ct%steps==0):
                print("{0:.2f}%".format(ct*100/totalItems))
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return 

### Processing starts from here

## READ THE BIG CSV, CONTINUE...

In [46]:
#df = pd.read_csv("Messages_10amTo4pm_GOOG.csv")
#df = pd.read_csv("Messages_10amTo4pm_AMZN.csv")
df = pd.read_csv("Messages_10amTo4pm_AAPL.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,...,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5,Volts,N_OrderSubmit,Submit_rate,Cancel_rate,Execute_rate
0,30619,36000.037423,3,46527525,17,5861500,-1,2012-06-21 10:00:00.037423000,5861300,18,...,5860150.0,4000,5860200.0,4900,5860150.0,4204.432889,4517,48.49,42.8,8.72
1,30620,36000.039994,3,46494513,20,5861400,-1,2012-06-21 10:00:00.039994000,5861300,18,...,5860150.0,4000,5860200.0,4900,5860150.0,4204.432889,4517,48.48,42.8,8.72
2,30621,36000.040067,1,46531040,20,5858100,1,2012-06-21 10:00:00.040067000,5861300,18,...,5860150.0,4000,5860200.0,4500,5860350.0,4204.432889,4518,48.49,42.8,8.71
3,30622,36000.063812,1,46533017,8,5858800,1,2012-06-21 10:00:00.063811999,5861300,18,...,5860350.0,3800,5860300.0,4400,5860400.0,4204.432889,4519,48.49,42.79,8.71
4,30623,36000.095645,1,46534282,18,5861000,-1,2012-06-21 10:00:00.095645000,5861000,18,...,5860100.0,3500,5860150.0,4000,5860200.0,4204.432889,4520,48.5,42.79,8.71


In [47]:
print('total column number: ',len(df.columns))
df.columns

total column number:  61


Index(['Unnamed: 0', 'Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time',
       'P_ask_1', 'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2',
       'P_bid_2', 'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3',
       'P_ask_4', 'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5',
       'P_bid_5', 'V_bid_5', 'Label', 'Level', 'bid-ask spread 1',
       'mid-price 1', 'd_P_ask_51', 'd_P_bid_51', 'd_P_ask_21', 'd_P_bid_21',
       'd_P_ask_32', 'd_P_bid_32', 'd_P_ask_43', 'd_P_bid_43', 'd_P_ask_54',
       'd_P_bid_54', 'Mean_P_ask', 'Mean_P_bid', 'Mean_V_ask', 'Mean_V_bid',
       'P_accu', 'V_accu', 'bid-ask spread 2', 'mid-price 2',
       'bid-ask spread 3', 'mid-price 3', 'bid-ask spread 4', 'mid-price 4',
       'bid-ask spread 5', 'mid-price 5', 'Volts', 'N_OrderSubmit',
       'Submit_rate', 'Cancel_rate', 'Execute_rate'],
      dtype='object')

#### 1) GROUPING ALL MESSAGES BY ORDER IDS

In [48]:
OrderHistory = GetOrderHistory(df)

Completed. It takes about 0.67 minutes.


In [49]:
# take a look at first 1 item of OrderHistory
from itertools import islice
def take(n, iterable):
    "Return first n items of the iterable as a dictionary"
    return dict(islice(iterable, n))
n_items = take(1, OrderHistory.items())
n_items   
# OrderID: [{Time_stamp:[V,P,Label,level,Volts,mid-price,N_OrderSubmit,bid-ask spread 1,Submit_rate,Cancel_rate,Execute_rate]}]

{46527525: [{36000.037423252004: [17,
    5861500,
    'c',
    3.0,
    4204.432889258471,
    5860150.0,
    4517,
    2300,
    48.49,
    42.8,
    8.72]}]}

In [50]:
his_list = []
ct_items = 0
for k,v in OrderHistory.items():
    ct_items += 1
    his_list.append(len(v))

In [51]:
counts_allHistories = Counter(his_list)
counts_allHistories

Counter({1: 25091,
         2: 109112,
         3: 4455,
         4: 609,
         5: 169,
         6: 53,
         7: 22,
         8: 11,
         9: 6,
         10: 1,
         11: 1,
         12: 3,
         13: 3,
         14: 1,
         15: 2,
         18: 1,
         26: 1})

In [52]:
print("total number of orders is",int(ct_items))    
print("orders with 2 events is about {0:.2f}%.".format(57673*100/78376))

total number of orders is 139541
orders with 2 events is about 73.59%.


In [53]:
ct = 0
for k, v in OrderHistory.items():
    ct += len(v)
print('average number of event for an order is {0:.3f}'.format(ct/len(OrderHistory.keys())))

average number of event for an order is 1.869


#### 2) GETTING START AND END STATES FOR SELECTED ORDER TYPES, STORE IN DICTIONARIES

In [54]:
OE_history = ReturnTwoStates_orders(df_original=df,type_list=[4,5],orderhistory=OrderHistory)

Completed. It takes about 0.00 minutes.


In [55]:
OC_history = ReturnTwoStates_orders(df_original=df,type_list=[2,3],orderhistory=OrderHistory)

Completed. It takes about 0.00 minutes.


In [56]:
print("total number of execution orders and cancellation orders are {}, {} respectively."
      .format(len(OE_history.keys()),len(OC_history.keys())))

total number of execution orders and cancellation orders are 15095, 100183 respectively.


#### deprecated: too slow. FORMAT ORDER HISTORIES INTO DATAFRAMES

In [57]:
# OE_df = HistoryToDF_2(OE_history)
# OC_df = HistoryToDF_2(OC_history)
# 4) TO CSV
# OC_df.to_csv("OC_twoStates.csv")
# OE_df.to_csv("OE_twoStates.csv")

#### 3) DIRECTLY WRITE OC AND OE HISTORY INTO CSV

In [58]:
#HistoryToCSV(OE_history,file_name="Orders_execute_GOOG.csv")
#HistoryToCSV(OE_history,file_name="Orders_execute_AMZN.csv")
HistoryToCSV(OE_history,file_name="Orders_execute_AAPL.csv")

10.00%
19.99%
29.99%
39.99%
49.98%
59.98%
69.98%
79.97%
89.97%
99.97%
Completed. It takes about 0.00 minutes.


In [59]:
#HistoryToCSV(OC_history,file_name="Orders_cancel_GOOG.csv")
#HistoryToCSV(OC_history,file_name="Orders_cancel_AMZN.csv")
HistoryToCSV(OC_history,file_name="Orders_cancel_AAPL.csv")

10.00%
20.00%
30.00%
40.00%
50.00%
60.00%
70.00%
80.00%
90.00%
100.00%
Completed. It takes about 0.03 minutes.


#### 4) CLEANING START LEVEL AND END LEVEL

In [60]:
### do it manually in excel bro

## TESTING.... 