In [1]:
import pandas as pd
import numpy as np
import math
from scipy.optimize import minimize
pd.options.mode.chained_assignment = None

In [2]:
# read data from WRDS
df = pd.read_csv('data.csv', low_memory=False)

# filter for targeted exchange
target_exchanges = ['T', 'P', 'Z'] # nasdaq, nyse-arca, or bats
data = df[df['EX'].isin(target_exchanges)]

0.7322183887796165

In [3]:
# Filter and get rid of negative and zero bid-ask spread
data['spread'] = data['ASK'] - data['BID']
filtered_data = data[data['spread'] > 0]

In [4]:
# parse for each security
jpm = data[data['SYM_ROOT'] == 'JPM']
xlf = data[data['SYM_ROOT'] == 'XLF']
qqqq = data[data['SYM_ROOT'] == 'QQQQ']
aapl = data[data['SYM_ROOT'] == 'AAPL']

In [None]:
# Convert time and date into datetime
data['DATE'] = data['DATE'].astype(str)
data['TIME_M'] = data['TIME_M'].astype(str)
data['TIMESTAMP'] = pd.to_datetime(data['DATE'] + ' ' + data['TIME_M'])

data['midquote'] = (data['BID'] + data['ASK']) / 2
data['spread'] = data['ASK'] - data['BID']
data['bsize_asize'] = data['BIDSIZ'] + data['ASKSIZ']
time_diff = (data['TIMESTAMP'].max() - data['TIMESTAMP'].min()).total_seconds()

summary_stats = data.groupby(['SYM_ROOT', 'EX']).agg(
    num_quotes=('midquote', 'size'),
    avg_spread=('spread', 'mean'),
    avg_bsize_asize=('bsize_asize', 'mean'),
    avg_price=('midquote', 'mean')
).reset_index()

# edit the results to look exactly like the table in the paper
summary_stats['quotes_per_sec'] = (summary_stats['num_quotes'] / ((6 * 60 * 60) * 5)).astype(int)
summary_stats['num_quotes'] = round(summary_stats['num_quotes'] / 1e6, 1)
summary_stats['avg_spread'] = round(summary_stats['avg_spread'], 3)
summary_stats['avg_bsize_asize'] = summary_stats['avg_bsize_asize'].apply(
    lambda x: int(x) if x > 10 else round(x, 1)
)
summary_stats['avg_price'] = round(summary_stats['avg_price'], 2)

summary_stats.rename(columns={
    'SYM_ROOT': 'Ticker',
    'EX': 'Exchange',
    'num_quotes': 'Num Quotes',
    'avg_spread': 'avg(spread)',
    'avg_bsize_asize': 'avg(bsize + asize)',
    'avg_price': 'avg(price)'
}, inplace=True)

exchange_mapping = {
    'T': 'NASDAQ',
    'P': 'NYSE',
    'Z': 'BATS'
}
summary_stats['Exchange'] = summary_stats['Exchange'].replace(exchange_mapping)
exchange_order = ['NASDAQ', 'NYSE', 'BATS']
summary_stats['Exchange'] = pd.Categorical(summary_stats['Exchange'], categories=exchange_order, ordered=True)

ticker_order = ['XLF', 'QQQQ', 'JPM', 'AAPL']
summary_stats['Ticker'] = pd.Categorical(summary_stats['Ticker'], categories=ticker_order, ordered=True)

summary_stats = summary_stats[['Ticker', 'Exchange', 'Num Quotes', 'quotes_per_sec', 'avg(spread)', 'avg(bsize + asize)', 'avg(price)']]

summary_stats = summary_stats.sort_values(by=['Ticker', 'Exchange'], ascending=[True, True])

In [None]:
summary_stats

(c) Table 3 - Empirical vs. Model probabilities for the probability of an
upward move on Nasdaq (T) for all 4 securities.

In [4]:
from scipy.optimize import minimize

In [11]:
def calc_empirical_prob(data, ticker, exchange, spread=None):
    # Filter data for the given ticker, exchange, and spread range
    if spread is None:
        print(data.head())
        td = data[(data['SYM_ROOT'] == ticker) & (data['EX'] == exchange) & (data['spread'] > 0)].copy()
    else:
        td = data[(data['SYM_ROOT'] == ticker) & (data['EX'] == exchange) 
                  & (data['spread'] > 0) & (np.round(data['spread'].values, 2) == spread)].copy()
    
    # Calculate midquote
    td['MIDQUOTE'] = (td['BID'] + td['ASK']) / 2
    
    # Calculate direction based on MID price changes
    td['Direction'] = [0] + list(np.sign(np.diff(td['MIDQUOTE'].values)))
    td['Direction'] = td['Direction'].replace(to_replace=0, method='bfill')

    # Above is from vinicio ( do not touch )

    # Vinicio told me the above should be enough logic to send it to the minimizer.. I feel like
    # I am missing something
    
    # Sort BIDSIZ and ASKSIZ
    sorted_bidsiz = np.sort(td['BIDSIZ'].values)
    sorted_asksiz = np.sort(td['ASKSIZ'].values)
    
    # Number of buckets
    num_buckets = 10
    
    # Calculate bucket boundaries using ceiling of the length and bucket position (from assignment description)
    bid_bucket_pos = np.ceil(len(sorted_bidsiz) / num_buckets * np.arange(1, num_buckets + 1)).astype(int)
    ask_bucket_pos = np.ceil(len(sorted_asksiz) / num_buckets * np.arange(1, num_buckets + 1)).astype(int)
    
    # Create boundaries for bid and ask size buckets
    bid_boundaries = sorted_bidsiz[bid_bucket_pos - 1]  # Subtract 1 for zero-indexing
    ask_boundaries = sorted_asksiz[ask_bucket_pos - 1]
    
    # Assign bid and ask size buckets based on the created boundaries
    td['BIDSIZ_bucket'] = pd.cut(td['BIDSIZ'], bins=[-np.inf] + list(bid_boundaries) + [np.inf], labels=False, duplicates="drop")
    td['ASKSIZ_bucket'] = pd.cut(td['ASKSIZ'], bins=[-np.inf] + list(ask_boundaries) + [np.inf], labels=False, duplicates="drop")

    # Return the deciles and directions for future calculations
    return td['BIDSIZ_bucket'], td['ASKSIZ_bucket'], td['Direction'], bid_boundaries, ask_boundaries

# Model probability function using deciles
def model_prob(i, j, H):
    return (j + H) / (j + i + 2 * H)

def objective_function(H, bid_decile, ask_decile, directions):
    total_error = 0
    num_buckets = len(bid_decile)

    # Convert deciles to numpy arrays for index-based access
    bid_decile = bid_decile.to_numpy()
    ask_decile = ask_decile.to_numpy()

    # Iterate through the bid and ask decile buckets
    for i in range(num_buckets):
        for j in range(num_buckets):
            # Get model probability using the current H
            model_p = model_prob(bid_decile[i], ask_decile[j], H)
            
            # Retrieve the actual direction (up/down move) for comparison
            actual_direction = directions[(bid_decile == i) & (ask_decile == j)]
            
            actual_prob = (actual_direction == 1).mean()  # Calculate empirical probability of upward moves
            
            # Add weighted squared error to the total error
            total_error += ((actual_prob - model_p) ** 2) * len(actual_direction)  # Weighted error
    
    return total_error

# Function to estimate H using deciles and directions
def estimate_H(bid_decile, ask_decile, directions):
    initial_guess = [0.5]  # Starting guess for H

    # Minimize the objective function to estimate H
    result = minimize(objective_function, initial_guess, args=(bid_decile, ask_decile, directions))

    # Retrieve the optimal value for H
    H_optimal = result.x[0]
    print(f"Estimated H: {H_optimal}")
    
    return H_optimal

# Example usage:
bid_decile, ask_decile, directions, bid_boundaries, ask_boundaries = calc_empirical_prob(data, 'AAPL', 'T')
H_optimal = estimate_H(bid_decile, ask_decile, directions)


       DATE        TIME_M EX SYM_ROOT SYM_SUFFIX     BID  BIDSIZ     ASK  \
1  20100104  10:00:00.030  T     AAPL        NaN  213.25       2  213.36   
4  20100104  10:00:00.133  T     AAPL        NaN  213.25       3  213.36   
5  20100104  10:00:00.137  T     AAPL        NaN  213.25       3  213.36   
7  20100104  10:00:00.333  Z     AAPL        NaN  213.23       1  213.37   
8  20100104  10:00:00.337  T     AAPL        NaN  213.25       3  213.38   

   ASKSIZ  spread  
1       2    0.11  
4       2    0.11  
5       1    0.11  
7       2    0.14  
8       1    0.13  


  td['Direction'] = td['Direction'].replace(to_replace=0, method='bfill')


KeyboardInterrupt: 

In [62]:
spread=None
ticker='XLF'
exchange='T'
if spread is None:
    print(data.head())
    td = data[(data['SYM_ROOT'] == ticker) & (data['EX'] == exchange) & (data['spread'] > 0)].copy()
else:
    td = data[(data['SYM_ROOT'] == ticker) & (data['EX'] == exchange) 
              & (data['spread'] > 0) & (np.round(data['spread'].values, 2) == spread)].copy()

# Calculate midquote
td['MIDQUOTE'] = (td['BID'] + td['ASK']) / 2

# Calculate direction based on MID price changes
td['Direction'] = [0]+list(np.sign(np.diff(td['MIDQUOTE'].values)))
td['Direction'] = td['Direction'].replace(to_replace=0, method='bfill')

       DATE        TIME_M EX SYM_ROOT SYM_SUFFIX     BID  BIDSIZ     ASK  \
1  20100104  10:00:00.030  T     AAPL        NaN  213.25       2  213.36   
4  20100104  10:00:00.133  T     AAPL        NaN  213.25       3  213.36   
5  20100104  10:00:00.137  T     AAPL        NaN  213.25       3  213.36   
7  20100104  10:00:00.333  Z     AAPL        NaN  213.23       1  213.37   
8  20100104  10:00:00.337  T     AAPL        NaN  213.25       3  213.38   

   ASKSIZ  spread  
1       2    0.11  
4       2    0.11  
5       1    0.11  
7       2    0.14  
8       1    0.13  


  td['Direction'] = td['Direction'].replace(to_replace=0, method='bfill')


In [63]:
td['BIDSIZ'].quantile(np.arange(0.1,1,0.1)).values

array([1436., 2277., 3351., 4505., 5237., 5623., 5985., 6427., 6995.])

In [68]:
# Create boundaries for bid and ask size buckets
bid_boundaries = td['BIDSIZ'].quantile(np.arange(0.1,1,0.1)).values
ask_boundaries = td['ASKSIZ'].quantile(np.arange(0.1,1,0.1)).values

# Assign bid and ask size buckets based on the created boundaries
td['BIDSIZ_bucket'] = pd.cut(td['BIDSIZ'], bins=[-np.inf] + list(bid_boundaries) + [np.inf], labels=False, duplicates="drop")
td['ASKSIZ_bucket'] = pd.cut(td['ASKSIZ'], bins=[-np.inf] + list(ask_boundaries) + [np.inf], labels=False, duplicates="drop")

In [69]:
def model_prob(i, j, H):
    return (i + H) / (j + i + 2 * H)

def objective_function(H, bid_decile, ask_decile, directions):
    total_error = 0
    num_buckets = len(bid_decile.unique())

    # Convert deciles to numpy arrays for index-based access
    # bid_decile = bid_decile.to_numpy()
    # ask_decile = ask_decile.to_numpy()

    # Iterate through the bid and ask decile buckets
    for i in range(9):
        for j in range(9):
            # Get model probability using the current H
            model_p = model_prob((i+1)/10, (j+1)/10, H)
            
            # Retrieve the actual direction (up/down move) for comparison
            actual_direction = directions[(bid_decile == i) & (ask_decile == j)]
            
            actual_prob = (actual_direction == 1).mean()  # Calculate empirical probability of upward moves
            
            # Add weighted squared error to the total error
            total_error += ((actual_prob - model_p) ** 2) * len(actual_direction)  # Weighted error
    
    return total_error

# Function to estimate H using deciles and directions
def estimate_H(bid_decile, ask_decile, directions):
    initial_guess = 0.2 # Starting guess for H

    # Minimize the objective function to estimate H
    result = minimize(objective_function, initial_guess, args=(bid_decile, ask_decile, directions),method='Nelder-Mead')

    # Retrieve the optimal value for H
    H_optimal = result.x[0]
    print(f"Estimated H: {H_optimal}\n Success: {result.success}\n Message: {result.message}")
    
    return H_optimal

In [70]:
H=estimate_H(td['BIDSIZ_bucket'], td['ASKSIZ_bucket'], td['Direction'])

Estimated H: 0.19453125000000002
 Success: True
 Message: Optimization terminated successfully.


In [71]:
td['BIDSIZ_bucket'].unique()

array([1, 2, 0, 3, 4, 5, 6, 7, 8, 9])

In [72]:
emp_prob=[]
mp=[]
directions=td['Direction']
err=0
H=0.140078125
for i in range(10):
    emp_row=[]
    model_row=[]
    for j in range(10):
        actual_direction = directions[(td['BIDSIZ_bucket'] == i) & (td['ASKSIZ_bucket'] == j)]
        actual_prob = (actual_direction == 1).mean()
        emp_row.append(actual_prob)

        model_p=model_prob(i,j,H)
        model_row.append(model_p)
        err += ((actual_prob - model_p) ** 2) * len(actual_direction)
    emp_prob.append(emp_row)
    mp.append(model_row)
print(err)

19474.813876581284


In [73]:
emp_prob=pd.DataFrame(emp_prob,columns=np.arange(0.1,1.1,0.1),index=np.arange(0.1,1.1,0.1))
emp_prob

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
0.1,0.633028,0.591463,0.153005,0.284708,0.563458,0.369123,0.304791,0.206323,0.268373,0.058412
0.2,0.688119,0.321429,0.391026,0.914831,0.306097,0.415805,0.360162,0.321287,0.267038,0.217802
0.3,0.665904,0.7306,0.943888,0.78534,0.383063,0.448708,0.458631,0.471003,0.463641,0.404683
0.4,0.931987,0.818571,0.546036,0.622512,0.463393,0.52077,0.481531,0.515766,0.519464,0.395913
0.5,0.655863,0.5625,0.568316,0.645227,0.56377,0.553932,0.627019,0.555358,0.561178,0.552814
0.6,0.790613,0.335667,0.692726,0.586228,0.550035,0.537295,0.506215,0.499361,0.585101,0.493275
0.7,0.885961,0.75906,0.594806,0.535667,0.61074,0.6157,0.58961,0.640075,0.554499,0.592266
0.8,0.911064,0.78715,0.720217,0.702227,0.666815,0.601297,0.560365,0.655738,0.669456,0.534236
0.9,0.985747,0.913703,0.813786,0.722206,0.630752,0.694698,0.689531,0.742564,0.81,0.648453
1.0,0.973693,0.939529,0.877714,0.757392,0.724177,0.633198,0.730873,0.70822,0.726387,0.601389


In [74]:
mp=pd.DataFrame(mp,columns=np.arange(0.1,1.1,0.1),index=np.arange(0.1,1.1,0.1))
mp

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0
0.1,0.5,0.109423,0.061434,0.042705,0.032727,0.026529,0.022305,0.019241,0.016917,0.015094
0.2,0.890577,0.5,0.347568,0.266364,0.215917,0.181537,0.156601,0.137688,0.122851,0.110901
0.3,0.938566,0.652432,0.5,0.405306,0.340768,0.29396,0.258459,0.230608,0.208176,0.189721
0.4,0.957295,0.733636,0.594694,0.5,0.43132,0.379229,0.338365,0.30545,0.278372,0.255703
0.5,0.967273,0.784083,0.659232,0.56868,0.5,0.446122,0.402725,0.367023,0.337136,0.311749
0.6,0.973471,0.818463,0.70604,0.620771,0.553878,0.5,0.455674,0.418568,0.38705,0.359946
0.7,0.977695,0.843399,0.741541,0.661635,0.597275,0.544326,0.5,0.46235,0.429973,0.401833
0.8,0.980759,0.862312,0.769392,0.69455,0.632977,0.581432,0.53765,0.5,0.467278,0.438576
0.9,0.983083,0.877149,0.791824,0.721628,0.662864,0.61295,0.570027,0.532722,0.5,0.471065
1.0,0.984906,0.889099,0.810279,0.744297,0.688251,0.640054,0.598167,0.561424,0.528935,0.5


In [22]:
bid_boundaries

array([1436., 2277., 3351., 4505., 5237., 5623., 5985., 6427., 6995.])

In [45]:
td.loc[td['BIDSIZ_bucket']>1,'BIDSIZ_bucket']+=1

In [46]:
td.loc[td['BIDSIZ_bucket']>3,'BIDSIZ_bucket']+=1

In [23]:
td['BIDSIZ_bucket'].unique()

array([1, 2, 0, 3, 4, 5, 6, 7, 8, 9])

In [24]:
ask_boundaries

array([1112., 1808., 2508., 3445., 4541., 5279., 5770., 6125., 6669.])

In [49]:
td.loc[td['ASKSIZ_bucket']>1,'BIDSIZ_bucket']+=1
td.loc[td['ASKSIZ_bucket']>3,'BIDSIZ_bucket']+=1

In [57]:
actual_direction = directions[(bid_decile == 0) & (ask_decile == 0)]
actual_direction

9          1.0
49         1.0
53        -1.0
54         1.0
55        -1.0
          ... 
3182640   -1.0
3182647   -1.0
3182649   -1.0
3182678   -1.0
3182696   -1.0
Name: Direction, Length: 39794, dtype: float64

In [60]:
(actual_direction == 1).mean()

0.502965271146404

In [None]:
# split up data into exchanges
data_nasdaq = jpm[jpm['EX'] == 'T']
data_nyse = jpm[jpm['EX'] == 'P']
data_arca = jpm[jpm['EX'] == 'Z']

In [None]:
data_nasdaq = xlf[xlf['EX'] == 'T']
generate_empirical_probabilties(data_nasdaq)

In [None]:
generate_empirical_probabilties(qqqq)

Bid on x-axis and ask on y-axis