In [1]:
import warnings
warnings.filterwarnings('ignore') # Filter out warnings
import numpy as np
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from scipy import stats
import math
from math import isnan
import time
from collections import Counter

## Helper functions

In [2]:
## FOR STATISTICAL ANALYSIS
# Create a Combined Dataframe from message csv and orderbook csv
def Merge_MsgAndOrdb(msg_path, ordb_path):
    """Read a message CSV and an order book CSV and join them column-wise.

    The book depth and the trading date are taken from the message *file name*
    (not the full path), which is expected to look like
    ``<ticker>_<YYYY-MM-DD>_<start>_<end>_message_<level>.csv``: the second
    underscore-separated field is the date and the trailing digits before the
    first "." are the number of levels.

    Parameters
    ----------
    msg_path : str
        Path of the message CSV; it is read without a header row and its
        columns are named Time_stamp, Type, OrderID, V, P, Dir.
    ordb_path : str
        Path of the order book CSV; it is read without a header row and its
        columns are named P_ask_i, V_ask_i, P_bid_i, V_bid_i for each level i.

    Returns
    -------
    pandas.DataFrame
        Message columns, a ``Time`` column (date + ``Time_stamp`` seconds),
        then the order book columns, concatenated along axis 1.

    Raises
    ------
    ValueError
        If level or date cannot be parsed from the message file name.
    """
    # Local imports keep this cell runnable on its own.
    import os
    import re

    t = time.time()
    # Work on the bare file name so "." or "_" in directory names cannot
    # corrupt the parsing (e.g. "./data_v1/AMZN_..._message_5.csv").
    stem = os.path.basename(msg_path).split(".")[0]
    level_match = re.search(r"(\d+)$", stem)
    name_parts = stem.split("_")
    if level_match is None or len(name_parts) < 2:
        raise ValueError(
            "Cannot parse level/date from message file name: {!r}".format(msg_path))
    # All trailing digits are used, so depths of 10 or more are read correctly.
    level = int(level_match.group(1))
    date = name_parts[1]
    # add header row for message csv
    msg_names = ["Time_stamp", "Type", "OrderID", "V", "P", "Dir"]  # V: size; P: price; Dir: direction
    df_msg = pd.read_csv(msg_path, names=msg_names)
    # add header row for orderbook csv
    ordb_names = ["{}_{}".format(item, i + 1)
                  for i in range(level)
                  for item in ('P_ask', 'V_ask', 'P_bid', 'V_bid')]
    df_ordb = pd.read_csv(ordb_path, names=ordb_names)
    # adding a meaningful time column: Time_stamp is treated as seconds after `date`
    df_msg['Time'] = pd.to_datetime(df_msg['Time_stamp'], unit="s", origin=pd.Timestamp(date))
    # combine two dataframes and return it (rows are matched by position/index)
    df = pd.concat([df_msg, df_ordb], axis=1)
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

# Rename columns by appending the state type to the price/volume/time columns
def RenameColumn(df,state_type):
    """Append ``" " + state_type`` to selected column names of ``df``, in place.

    A column is renamed when its name contains "bid" or "ask", or when it is
    exactly 'V', 'P' or 'Time'. All other columns keep their name.
    Nothing is returned; ``df`` itself is modified.
    """
    def _tagged(name):
        touches_book = ("bid" in name) or ("ask" in name)
        if touches_book or name in ('V', 'P', 'Time'):
            return name + " " + state_type
        return name

    mapping = {name: _tagged(name) for name in df.columns.tolist()}
    df.rename(columns=mapping, inplace=True)
    return

def GroupByOrderType(df,typeID):
    """Pair each order's submission row(s) with its row(s) of type ``typeID``.

    Orders are selected when at least one of their messages has
    ``Type == typeID``. Their Type-1 rows (suffix 'Sub') are outer-merged on
    'OrderID' with their ``typeID`` rows (suffix taken from the lookup below).

    For now, typeID should only be 2 or 3 or 4 or 5; a value missing from the
    lookup raises KeyError.
    """
    started = time.time()
    # message type -> suffix used for the renamed columns
    state_names = {1:'Sub', 2:'Can', 3:'Del', 4:'ExVis',5:'ExHid',7:'Halt'}

    # every order id that has at least one message of the requested type
    wanted_ids = set(df.loc[df['Type'] == typeID, 'OrderID'].tolist())
    related = df.loc[df['OrderID'].isin(wanted_ids)]

    # initial state: the submission message(s) of those orders
    submissions = related[related['Type'] == 1]
    RenameColumn(submissions, state_type=state_names[1])

    # end state: the message(s) of the requested type
    outcomes = related[related['Type'] == typeID]
    RenameColumn(outcomes, state_type=state_names[typeID])

    merged = submissions.merge(outcomes, left_on='OrderID', right_on='OrderID', how='outer')
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-started)/60))
    return merged

In [3]:
## FOR PAPER 462
# Compute Time-insensitive variables
def ComputeTimeInsenstiveSet(df_original, levels=5):
    """Add per-snapshot (time-insensitive) order book features.

    Works on a shallow copy of ``df_original``; the input frame does not gain
    any columns. Requires columns P_ask_i, P_bid_i, V_ask_i, V_bid_i for
    i = 1..levels.

    Added columns, in this order:
      * "bid-ask spread 1", "mid-price 1"                          (v2, level 1)
      * "d_P_ask_<levels>1", "d_P_bid_<levels>1": deepest minus best price,
        then "d_P_ask_<j+1><j>", "d_P_bid_<j+1><j>": absolute difference of
        adjacent levels                                             (v3)
      * "Mean_P_ask", "Mean_P_bid", "Mean_V_ask", "Mean_V_bid"      (v4)
      * "P_accu", "V_accu": sum over ask levels minus sum over bid levels (v5)
      * "bid-ask spread i", "mid-price i" for i = 2..levels         (v2)

    With ``levels < 1`` no column is added.

    NOTE(review): "d_P_bid_<levels>1" is signed (deepest bid minus best bid)
    while the adjacent-level differences are absolute values - confirm the
    sign convention against the reference paper.
    """
    df = df_original.copy(deep=False)
    t = time.time()
    level_ids = range(1, levels + 1)
    ask_p = ["P_ask_{}".format(k) for k in level_ids]
    bid_p = ["P_bid_{}".format(k) for k in level_ids]
    ask_v = ["V_ask_{}".format(k) for k in level_ids]
    bid_v = ["V_bid_{}".format(k) for k in level_ids]
    for i in level_ids:
        # compute bid-ask spreads and mid-prices (v2)
        df["bid-ask spread {}".format(i)] = df[ask_p[i - 1]] - df[bid_p[i - 1]]
        df["mid-price {}".format(i)] = (df[ask_p[i - 1]] + df[bid_p[i - 1]])/2
        if i > 1:
            continue
        # Everything below does not depend on i. It used to be recomputed on
        # every pass of this loop; it is now done once, right after the
        # level-1 columns, so the resulting column order is unchanged.
        # compute price differences (v3)
        df["d_P_ask_{}{}".format(levels,1)] = df[ask_p[-1]] - df[ask_p[0]]
        df["d_P_bid_{}{}".format(levels,1)] = df[bid_p[-1]] - df[bid_p[0]]
        for j in range(1,levels):
            # for levels == 2 these overwrite the two signed columns above
            df["d_P_ask_{}{}".format(j+1,j)] = np.abs(df[ask_p[j]] - df[ask_p[j - 1]])
            df["d_P_bid_{}{}".format(j+1,j)] = np.abs(df[bid_p[j]] - df[bid_p[j - 1]])
        # compute mean prices and volumes (v4)
        df["Mean_P_ask"] = df[ask_p].mean(axis=1)
        df["Mean_P_bid"] = df[bid_p].mean(axis=1)
        df["Mean_V_ask"] = df[ask_v].mean(axis=1)
        df["Mean_V_bid"] = df[bid_v].mean(axis=1)
        # compute accumulated differences (v5)
        df["P_accu"] = df[ask_p].sum(axis=1) - df[bid_p].sum(axis=1)
        df["V_accu"] = df[ask_v].sum(axis=1) - df[bid_v].sum(axis=1)
    # returning...
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

# Compute Time-sensitive variables
def ComputeTimeSensitiveSet(df_original,levels=5,dt_secs=1):
    """Add trailing-window (time-sensitive) features to every message row.

    For each row with timestamp t, the window is all rows whose 'Time_stamp'
    lies in (t - dt_secs, t].

    Added columns (on a shallow copy; the input frame gains no columns):
      * der_P_ask_i, der_P_bid_i, der_V_ask_i, der_V_bid_i for i = 1..levels:
        (value at the last window row - value at the first window row) / dt_secs   (v6)
      * lam_la, lam_lb: number of window rows with Dir == -1 / Dir == 1,
        divided by dt_secs and truncated to int                                    (v7)
      * lam_ca, lam_cb: same, restricted to Type == 2                              (v7)

    With ``levels < 1`` no column is added. Rows whose window is empty
    (e.g. a NaN timestamp) get NaN derivatives and zero intensities.

    NOTE(review): lam_la / lam_lb count every message on that side, not only
    Type == 1 submissions, although they are described as limit-order
    intensities - confirm which is intended.

    Raises
    ------
    ValueError
        If ``dt_secs`` is not positive.
    """
    if dt_secs <= 0:
        raise ValueError("dt_secs must be positive, got {!r}".format(dt_secs))
    df = df_original.copy(deep=False)
    # Separate name for the wall-clock start; `t` below is the row timestamp.
    started = time.time()
    sides = ('P_ask', 'P_bid', 'V_ask', 'V_bid')
    source_cols = ['{}_{}'.format(side, i) for i in range(1, levels + 1) for side in sides]
    n_rows = len(df)
    stamps = df['Time_stamp'].values
    book = df[source_cols].values
    # Note: direction = 1 <--> buy/bid order;   direction = -1 <--> sell/ask order
    is_ask = (df['Dir'] == -1).values
    is_bid = (df['Dir'] == 1).values
    is_cancel = (df['Type'] == 2).values
    intensity_masks = (
        ('lam_la', is_ask),               # ask-side messages
        ('lam_lb', is_bid),               # bid-side messages
        ('lam_ca', is_ask & is_cancel),   # cancelled limit ask order
        ('lam_cb', is_bid & is_cancel),   # cancelled limit buy order
    )
    derivatives = np.full((n_rows, len(source_cols)), np.nan)
    intensities = np.zeros((n_rows, len(intensity_masks)), dtype=int)
    for pos in range(n_rows):
        t = stamps[pos]
        in_window = (stamps > t - dt_secs) & (stamps <= t)
        hits = np.flatnonzero(in_window)
        if hits.size == 0:
            continue
        # compute price and volume derivatives (v6)
        derivatives[pos] = (book[hits[-1]] - book[hits[0]]) / dt_secs
        # compute average intensity of each type (v7)
        for k, (_, mask) in enumerate(intensity_masks):
            intensities[pos, k] = int(np.count_nonzero(in_window & mask) / dt_secs)
        # compute relative intensity indicators (v8) ???
        # compute accelerations (market/limit)  (v9) ???
    # Columns are attached once at the end, in the order the row-by-row
    # version created them: level-1 derivatives, intensities, deeper levels.
    for level in range(1, levels + 1):
        for s, side in enumerate(sides):
            df['der_{}_{}'.format(side, level)] = derivatives[:, (level - 1) * len(sides) + s]
        if level == 1:
            for k, (name, _) in enumerate(intensity_masks):
                df[name] = intensities[:, k]
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-started)/60))
    return df

In [4]:
# Add a label (la, lb, ma, mb, pc, c) that classifies every order message
def AddLabels(df_original):
    """Return a shallow copy of the frame with a 'Label' column added.

    Labels are derived from 'Type' and 'Dir':
      * Type 1:       'la' if Dir == -1, 'lb' if Dir == 1
      * Type 4 or 5:  'mb' if Dir == -1, 'ma' if Dir == 1
        (-1: sell limit order; 1: buy limit order; the execution of a resting
        buy/sell order is initiated by a market sell/buy trade)
      * Type 2:       'pc'
      * Type 3:       'c'
      * anything else, or Type 1/4/5 with Dir not in {-1, 1}: None, and an
        error line is printed for that index.

    Rows with a valid type but an invalid direction previously produced no
    label at all, which made the final column assignment fail on a length
    mismatch; they are now labelled None.
    """
    df = df_original.copy(deep=False)
    t = time.time()
    is_submit = (df['Type'] == 1).values
    is_exec = df['Type'].isin([4, 5]).values
    is_partial_cancel = (df['Type'] == 2).values
    is_delete = (df['Type'] == 3).values
    is_sell = (df['Dir'] == -1).values
    is_buy = (df['Dir'] == 1).values

    labels = np.full(len(df), None, dtype=object)
    labels[is_submit & is_sell] = 'la'
    labels[is_submit & is_buy] = 'lb'
    labels[is_exec & is_sell] = 'mb'
    labels[is_exec & is_buy] = 'ma'
    labels[is_partial_cancel] = 'pc'
    labels[is_delete] = 'c'

    known_type = is_submit | is_exec | is_partial_cancel | is_delete
    for index in df.index[~known_type]:
        print('error! index {} has unknown order type.'.format(index))
    bad_direction = (is_submit | is_exec) & ~(is_sell | is_buy)
    for index in df.index[bad_direction]:
        print('error! index {} has unknown order direction.'.format(index))
    ##
    df['Label'] = labels
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

In [5]:
# Classify the price level at which each action takes place
def _price_level(label, direction, price, bids, asks):
    """Return the 1-based book level an event acts on, or None if undetermined.

    ``bids`` / ``asks`` are the price ladders (best level first) that the
    event is compared against.
    """
    # for 'mb', 'ma', always happen at the best level
    if label in ('mb', 'ma'):
        return 1
    if direction == -1:        # -1: sell
        ladder = asks
    elif direction == 1:       #  1: buy
        ladder = bids
    else:
        return None
    if label in ('la', 'lb'):
        # first level whose price is at least as aggressive as the new order
        reached = (price <= ladder) if direction == -1 else (price >= ladder)
        if not reached.any():
            return None        # priced beyond the deepest visible level
        return int(np.argmax(reached)) + 1
    if label in ('c', 'pc'):
        matches = np.flatnonzero(ladder == price)
        if matches.size == 0:
            return None        # price not among the visible levels
        return int(matches[0]) + 1
    return None


def AddPriceLevels(df_original, nlevels=5):
    """Return a shallow copy of the frame with a 'Level' column added.

    Each row's price 'P' is compared with the book prices stored on the
    *previous* row (by position), so the first row always gets None.
    (Presumably each row's book columns describe the state after that row's
    event - TODO confirm.)

    Level rules:
      * 'la'/'lb': first level i with P <= P_ask_i (sell) / P >= P_bid_i (buy)
      * 'c'/'pc':  first level whose price equals P on the order's side
      * 'mb'/'ma': always 1
      * None when the label is unknown, the direction is not -1/1, the order
        is priced beyond the deepest level, or a cancel price is not found.
        These last cases previously raised TypeError.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Needs 'Label', 'P', 'Dir' and P_bid_i / P_ask_i for i = 1..nlevels.
    nlevels : int, default 5
        Number of book levels to inspect.

    Returns
    -------
    pandas.DataFrame or None
        None (after printing a message) when 'Label' is missing.
    """
    df = df_original.copy(deep=False)
    t = time.time()
    if ('Label' not in df.columns):
        print("You haven't appended labels to messages.")
        return
    # Pull everything into arrays once; rows are addressed by position, so
    # the result does not depend on the frame's index labels.
    bid_book = df[["P_bid_{}".format(i) for i in range(1, nlevels + 1)]].values
    ask_book = df[["P_ask_{}".format(i) for i in range(1, nlevels + 1)]].values
    prices = df['P'].values
    directions = df['Dir'].values
    labels = df['Label'].values
    n_rows = len(df)
    levels = [None] * min(n_rows, 1)   # no previous book for the first row
    for pos in range(1, n_rows):
        levels.append(_price_level(labels[pos], directions[pos], prices[pos],
                                   bid_book[pos - 1], ask_book[pos - 1]))
    df['Level'] = levels
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

In [6]:
# Collect the event history of every order, keyed by order id
def GetOrderHistory(df_original):
    """Group message rows by order id.

    Returns a dict mapping each non-zero 'OrderID' to a list (in row order) of
    single-entry dicts ``{Time_stamp: [Type, Dir, V, P, Label, Level]}``.
    Rows with OrderID == 0 are skipped.
    """
    df = df_original.copy(deep=False)
    t = time.time()
    state_fields = ('Type', 'Dir', 'V', 'P', 'Label', 'Level')
    table = {}
    for _, event in df.iterrows():
        order_id = event['OrderID']
        # don't need records where orderID==0; remove 'Type' and 'Dir' is okay?
        if order_id == 0:
            continue
        snapshot = {event['Time_stamp']: [event[field] for field in state_fields]}
        table.setdefault(order_id, []).append(snapshot)
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return table

In [7]:
def ReturnTwoStates_orders(df_original,type_list,orderhistory):
    """Keep the first and last event of every order touched by ``type_list``.

    An order qualifies when at least one of its rows in ``df_original`` has a
    'Type' in ``type_list``, its id is not 0, and its history in
    ``orderhistory`` holds two or more events. The returned dict maps the
    order id to ``[first_event, last_event]``; the last event is the order's
    final recorded event, whatever its type.
    """
    df = df_original.copy(deep=False)
    t = time.time()
    # only want records that are of those types, and OrderID != 0
    selected_ids = set(df.loc[df['Type'].isin(type_list), 'OrderID'].tolist())
    selected_ids.discard(0)
    # orders with a single event have no distinct start/end state and are dropped
    TwoStates = {
        order_id: [events[0], events[-1]]
        for order_id, events in orderhistory.items()
        if order_id in selected_ids and len(events) >= 2
    }
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return TwoStates

In [8]:
def HistoryToDF(history):
    """Flatten ``{order_id: [event, ..., event]}`` into one row per order.

    Each event is a single-entry dict ``{time: [Type, Dir, V, P, Label, Level]}``.
    Only the first event (``*_st`` columns) and the last event (``*_ed``
    columns) are used; Type and Dir are not exported.
    """
    t = time.time()
    state_fields = ['V', 'P', 'Label', 'Level']
    columns = (['OrderID', 'Time_st'] + [name + '_st' for name in state_fields]
               + ['Time_ed'] + [name + '_ed' for name in state_fields])
    # One list per column, filled while walking the orders and assigned in one
    # go afterwards (the original notes say appending row by row via df.loc
    # reads more easily but takes longer).
    collected = {name: [] for name in columns}
    # traverse through every single orderID
    for order_id, events in history.items():
        collected['OrderID'].append(order_id)
        for suffix, event in (('_st', events[0]), ('_ed', events[-1])):
            stamp, state = list(event.items())[0]
            collected['Time' + suffix].append(stamp)
            # state[0] is Type and state[1] is Dir; V, P, Label, Level follow
            for offset, name in enumerate(state_fields, start=2):
                collected[name + suffix].append(state[offset])
    # append them to our dataframe
    df = pd.DataFrame(columns=columns)
    for name in columns:
        df[name] = collected[name]
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t)/60))
    return df

### Processing starts from here

#### 1) MERGING MESSAGE FILE AND ORDERBOOK FILE

In [9]:
# Step 1: read the AMZN 2012-06-21 message and order book CSVs and merge them.
# Merge_MsgAndOrdb parses the date and the book depth (here 5) from the message file name.
df = Merge_MsgAndOrdb(msg_path='AMZN_2012-06-21_34200000_57600000_message_5.csv',
                     ordb_path='AMZN_2012-06-21_34200000_57600000_orderbook_5.csv')
#df.head(5)

Completed. It takes about 0.04 minutes.


#### 2) ADDING LABELS

In [10]:
# Step 2: attach the 'Label' column. `df` is rebound to the returned shallow copy,
# so re-running this cell relabels the already-labelled frame.
df = AddLabels(df)

Completed. It takes about 0.31 minutes.


In [11]:
#df.columns

#### 3) ADDING PRICE LEVEL INFORMATION

In [12]:
# Step 3: attach the 'Level' column; needs the 'Label' column from the previous step.
# The recorded run of this cell took about 6 minutes.
df = AddPriceLevels(df)

Completed. It takes about 6.08 minutes.


In [13]:
print('after adding labels and price levels, dataset looks like: ')
df.head(3)

after adding labels and price levels, dataset looks like: 


Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_ask_4,V_ask_4,P_bid_4,V_bid_4,P_ask_5,V_ask_5,P_bid_5,V_bid_5,Label,Level
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,2242500,100,2230000,10,2244000,547,2226200,100,mb,
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2242500,100,2230400,100,2244000,547,2230000,10,lb,1.0
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,2240000,220,2230400,100,2242500,100,2230000,10,la,2.0


#### 4) ADD VOLATILITY AND NUMBER OF LIMIT ORDERS SUBMITTED

In [23]:
def AddVolatilityAndNOB(df_original,dt_mins):
    """Add trailing-window volatility and submission counts to every row.

    For each row with timestamp t, the window is every row whose 'Time_stamp'
    lies in (t - 60*dt_mins, t]. Two columns are added to a shallow copy:

      * 'Volts': np.std of 'P' over the window's execution rows (Type 4 or 5)
      * 'N_OrderSubmit': number of window rows with Type == 1

    Progress is printed every 12000 processed rows. The scan is quadratic in
    the number of rows.
    """
    df = df_original.copy(deep=False)
    t2 = time.time()
    dt = dt_mins * 60 # convert minutes to seconds
    total_rows = len(df.index)
    norders, volts = [], []
    for t in df['Time_stamp']:
        in_window = (df['Time_stamp'] > t - dt) & (df['Time_stamp'] <= t)
        subset = df[in_window]
        submitted = subset[subset['Type'] == 1]          # type 1: limit order submitted
        executed = subset[subset['Type'].isin([4, 5])]   # type 4 and 5: execution
        norders.append(len(submitted.index))
        volts.append(np.std(executed['P']))

        processed = len(volts)
        if processed % 12000 == 0:
            print("{0:.2f}%..".format(processed*100/total_rows))
    df['Volts'] = volts
    df['N_OrderSubmit'] = norders
    print("Completed. It takes about {0:.2f} minutes.".format((time.time()-t2)/60))
    return df

In [24]:
#volts, norders = AddVolatilityAndNOB(df_original=df,dt_mins=10)

In [25]:
# Step 4: 10-minute trailing window. This is the slowest step of the pipeline:
# the recorded run of this cell took about 17 minutes.
df = AddVolatilityAndNOB(df_original=df,dt_mins=10)

7.70%..
15.39%..
23.09%..
30.78%..
38.48%..
46.17%..
53.87%..
61.56%..
69.26%..
76.96%..
84.65%..
92.35%..
Completed. It takes about 16.94 minutes.


In [34]:
print('after adding volatility and # of orders submitted: ')
df.head(3)

after adding volatility and # of orders submitted: 


Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_accu,V_accu,bid-ask spread 2,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,56800,557,9200,2235300.0,9600,2235200.0,12500,2236250.0,17800,2235100.0
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,44900,636,8100,2235850.0,9300,2235350.0,12100,2236450.0,14000,2237000.0
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,40500,109,7800,2235700.0,9200,2235300.0,9600,2235200.0,12500,2236250.0
3,34200.189608,1,11534792,100,2237500,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,33000,19,2100,2238550.0,8100,2235850.0,9300,2235350.0,12100,2236450.0
4,34200.189608,1,1365373,13,2240000,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,33000,32,2100,2238550.0,8100,2235850.0,9300,2235350.0,12100,2236450.0


#### 5) ADDING TIME-INSENSITIVE FEATURES

In [28]:
# Step 5: add the spread / mid-price / price-difference / mean / accumulated features
# for the default 5 book levels.
df = ComputeTimeInsenstiveSet(df)

Completed. It takes about 0.02 minutes.


In [30]:
print('after appending time-insensitive information:')
df.head(3)

after appending time-insensitive information:


Unnamed: 0,Time_stamp,Type,OrderID,V,P,Dir,Time,P_ask_1,V_ask_1,P_bid_1,...,P_accu,V_accu,bid-ask spread 2,mid-price 2,bid-ask spread 3,mid-price 3,bid-ask spread 4,mid-price 4,bid-ask spread 5,mid-price 5
0,34200.01746,5,0,1,2238200,-1,2012-06-21 09:30:00.017460,2239500,100,2231800,...,56800,557,9200,2235300.0,9600,2235200.0,12500,2236250.0,17800,2235100.0
1,34200.189608,1,11885113,21,2238100,1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,44900,636,8100,2235850.0,9300,2235350.0,12100,2236450.0,14000,2237000.0
2,34200.189608,1,3911376,20,2239600,-1,2012-06-21 09:30:00.189608,2239500,100,2238100,...,40500,109,7800,2235700.0,9200,2235300.0,9600,2235200.0,12500,2236250.0


In [31]:
df.columns

Index(['Time_stamp', 'Type', 'OrderID', 'V', 'P', 'Dir', 'Time', 'P_ask_1',
       'V_ask_1', 'P_bid_1', 'V_bid_1', 'P_ask_2', 'V_ask_2', 'P_bid_2',
       'V_bid_2', 'P_ask_3', 'V_ask_3', 'P_bid_3', 'V_bid_3', 'P_ask_4',
       'V_ask_4', 'P_bid_4', 'V_bid_4', 'P_ask_5', 'V_ask_5', 'P_bid_5',
       'V_bid_5', 'Label', 'Level', 'Volts', 'N_OrderSubmit',
       'bid-ask spread 1', 'mid-price 1', 'd_P_ask_51', 'd_P_bid_51',
       'd_P_ask_21', 'd_P_bid_21', 'd_P_ask_32', 'd_P_bid_32', 'd_P_ask_43',
       'd_P_bid_43', 'd_P_ask_54', 'd_P_bid_54', 'Mean_P_ask', 'Mean_P_bid',
       'Mean_V_ask', 'Mean_V_bid', 'P_accu', 'V_accu', 'bid-ask spread 2',
       'mid-price 2', 'bid-ask spread 3', 'mid-price 3', 'bid-ask spread 4',
       'mid-price 4', 'bid-ask spread 5', 'mid-price 5'],
      dtype='object')

####  6) OUTPUT IT TO A CSV

In [41]:
# Persist the enriched frame. With to_csv defaults the row index is written too,
# as an unnamed first column.
df.to_csv("MsgAndOrders_withNewColumns.csv")

#### 7) GROUPING ALL MESSAGES BY ORDER IDS

In [99]:
# Build {order id: [events...]}; requires the 'Label' and 'Level' columns added above.
OrderHistory = GetOrderHistory(df)

running...
Completed. It takes about 0.28 minutes.


In [246]:
# Number of recorded events per order, in OrderHistory iteration order.
his_list = [len(events) for events in OrderHistory.values()]
ct_items = len(his_list)
# Share of orders with exactly two events, computed from the data rather than
# from counts copied out of an earlier run.
n_two_events = his_list.count(2)
pct_two_events = n_two_events*100/ct_items if ct_items else float('nan')

print("total number of orders is",int(ct_items))
print("orders with 2 events is about {0:.2f}%.".format(pct_two_events))

total number of orders is 85402
orders with 2 events is about 74.60%.


In [243]:
counts_allHistories = Counter(his_list)
counts_allHistories

Counter({1: 19853,
         2: 63712,
         3: 1477,
         4: 214,
         5: 83,
         6: 26,
         7: 13,
         8: 6,
         9: 4,
         10: 3,
         11: 4,
         13: 1,
         14: 1,
         15: 3,
         16: 1,
         18: 1})

In [238]:
# Mean number of recorded events per order id.
ct = sum(len(events) for events in OrderHistory.values())
print('average number of event for an order is {0:.3f}'.format(ct/len(OrderHistory.keys())))

average number of event for an order is 1.797


#### 8) GETTING START AND END STATES FOR SELECTED ORDER TYPES, STORE IN DICTIONARIES

In [184]:
# Orders with at least one execution message (types 4 and 5). The stored end state is
# the order's last event of any type, so it is not always an execution
# (the Label_ed counts further down include 'c').
OE_history = ReturnTwoStates_orders(df_original=df,type_list=[4,5],orderhistory=OrderHistory)

running...
Completed. It takes about 0.00 minutes.


In [185]:
# Orders with at least one message of type 2 or 3 (labelled 'pc' and 'c' by AddLabels).
OC_history = ReturnTwoStates_orders(df_original=df,type_list=[2,3],orderhistory=OrderHistory)

running...
Completed. It takes about 0.01 minutes.


In [187]:
#OC_history

#### 9) FORMAT ORDER HISTORIES INTO DATAFRAMES

In [221]:
# One row per order: start (*_st) and end (*_ed) state of the type-2/3 orders.
OC_df = HistoryToDF(OC_history)

running now....
Completed. It takes about 0.01 minutes.


In [222]:
# One row per order: start (*_st) and end (*_ed) state of the type-4/5 orders.
OE_df = HistoryToDF(OE_history)

running now....
Completed. It takes about 0.00 minutes.


In [241]:
OE_df.head(5)

Unnamed: 0,OrderID,Time_st,V_st,P_st,Label_st,Level_st,Time_ed,V_ed,P_ed,Label_ed,Level_ed
0,11885113,34200.189608,21,2238100,lb,1.0,34200.190226,21,2238100,ma,1.0
1,3911376,34200.189608,20,2239600,la,2.0,34200.391413,20,2239600,mb,1.0
2,11534792,34200.189608,100,2237500,lb,2.0,34230.0133,74,2237500,c,5.0
3,1365373,34200.189608,13,2240000,la,4.0,34200.403006,13,2240000,mb,1.0
4,11474176,34200.189608,2,2236500,lb,3.0,34309.903928,2,2236500,ma,1.0


In [247]:
OC_df.head(5)

Unnamed: 0,OrderID,Time_st,V_st,P_st,Label_st,Level_st,Time_ed,V_ed,P_ed,Label_ed,Level_ed
0,11534792,34200.189608,100,2237500,lb,2.0,34230.0133,74,2237500,c,5.0
1,4632045,34200.189608,100,2235000,lb,5.0,34318.955212,100,2235000,c,1.0
2,16206910,34200.39304,286,2236200,lb,3.0,34200.510747,286,2236200,c,3.0
3,16229815,34200.510832,286,2236300,lb,3.0,34200.516096,286,2236300,c,3.0
4,16230863,34200.516304,286,2236900,lb,2.0,34200.542125,286,2236900,c,2.0


In [225]:
counts_OC = Counter(OC_df['Label_ed'])
counts_OC

Counter({'c': 59500, 'pc': 25})

In [226]:
counts_OE = Counter(OE_df['Label_ed'])
counts_OE

Counter({'c': 461, 'ma': 2530, 'mb': 3494})

In [249]:
# OrderHistory

## TESTING.... 

In [196]:
# Scratch check: rows can be appended to an empty frame one at a time via .loc[label].
# This is the row-by-row approach that HistoryToDF notes as easier to read but slower.
d = pd.DataFrame(columns=['Name','Age'])
ind = 0
d.loc[ind] = ['Aaron',23]
ind += 1
d.loc[ind] = ['Ben',26]
d

Unnamed: 0,Name,Age
0,Aaron,23
1,Ben,26


In [202]:
# Scratch check of the flattening used for order histories: each order maps to a
# list of single-entry dicts {time: [Type, Dir, V, P, Label, Level]}.
test_dict = {23405153: [{34507.149459957: [1, 1, 200, 2237400, 'lb', 3.0]},
  {34510.033355939: [3, 1, 200, 2237400, 'c', 2.0]}],
 23405159: [{34507.149553915: [1, -1, 2, 2240900, 'la', 1.0]},
  {34507.149742697: [3, -1, 2, 2240900, 'c', 1.0]}]}

for k, v in test_dict.items():
    first_time, first_state = list(v[0].items())[0]
    last_time, last_state = list(v[-1].items())[0]

    # show the order id, then its first and last event
    for shown in (k, first_time, first_state, last_time, last_state, '\n'):
        print(shown)

    # flat record: id, start time, start state, end time, end state
    l = [k, first_time] + list(first_state) + [last_time] + list(last_state)

23405153
34507.149459957
[1, 1, 200, 2237400, 'lb', 3.0]
34510.033355939
[3, 1, 200, 2237400, 'c', 2.0]


23405159
34507.149553915
[1, -1, 2, 2240900, 'la', 1.0]
34507.149742697
[3, -1, 2, 2240900, 'c', 1.0]




### Testing

In [131]:
# Scratch check of the slice used in ReturnTwoStates_orders: a step of len(L)-1 picks
# the first and the last element. For a one-element list the step would be 0, which
# Python rejects with ValueError; ReturnTwoStates_orders only slices lists of length >= 2.
L = [1,900]
M = L[::len(L)-1]
M

[1, 900]