In [1]:
import pandas as pd
from statsmodels.tsa.stattools import acf, pacf
import statsmodels.api as sm
import numpy as np

df = pd.read_csv('xnas-itch-nvidia-cake.csv')

In [2]:
# Split up and prepare dataset

# this is returning nothing
def prepare_bid_ask_data(df):
    """ Helper function to create bid_price, ask_price, bid_depth, and ask_depth columns
        that will be used for later summary statistic calculations."""
    
    # Create bid and ask price columns based on the 'side' column
    df['bid_price'] = df['price'].where(df['side'] == 'B', None)
    df['ask_price'] = df['price'].where(df['side'] == 'A', None)
        
    # Forward fill the missing bid and ask prices and depths
    df['bid_price'] = df['bid_price'].ffill()
    df['ask_price'] = df['ask_price'].ffill()
    
    return df

nvda_df = prepare_bid_ask_data(df[df['symbol'] == 'NVDA'].copy())
cake_df = prepare_bid_ask_data(df[df['symbol'] == 'CAKE'].copy())


In [3]:
def calc_summary_statistics(df):
    ''' Driver function for (a) - (k)'''
    
    # ask vinicio
    def calculate_5s_price_impact(df):
        ''' Helper function that contains logic to calculate 5 second price impact.'''

        # Calculate the midpoint price
        df['midpoint'] = (df['bid_price'] + df['ask_price']) / 2

        # Lag midpoint by 5 seconds
        df['midpoint_lag'] = df['midpoint'].shift(5)

        # Calculate the midpoint return
        df['midpoint_return'] = df['midpoint'] - df['midpoint_lag']

        # Create a trade sign column based on the 'side' (-1 for 'Ask', +1 for 'Bid')
        df['trade_sign'] = df['side'].apply(lambda x: 1 if x == 'Bid' else -1)

        # Remove NaN values that result from the shift
        clean = nvda_df.dropna(subset=['midpoint_return', 'trade_sign'])

        # Perform the regression of 5-second midpoint return on trade sign
        price_impact_model = sm.OLS(pd.to_numeric(clean['midpoint_return']), clean['trade_sign']).fit()

        # Display the summary of the regression model
        return price_impact_model.summary()

    
    
    def calculate_depth_at_twice_avg_spread(df):
        """ Calculate depth at twice the day's average spread per minute. """

        # Group by minute
        df['minute'] = pd.to_datetime(df['ts_event']).dt.floor('T')

        # Calculate the spread for each event
        df['spread'] = df.apply(lambda row: row['ask_price'] - row['bid_price'] if pd.notna(row['ask_price']) and pd.notna(row['bid_price']) else None, axis=1)

        # Calculate the daily average spread
        daily_avg_spread = df['spread'].mean()

        # Threshold is twice the daily average spread
        threshold = 2 * daily_avg_spread

        # Group by minute and calculate the depth based on the condition
        result = df.groupby('minute').apply(lambda group: calculate_depth_for_group(group, threshold))

        return result

    
    
    def calculate_depth_for_group(group, threshold):
        """ Calculate depth for a specific group given the threshold. """

        # Calculate the best bid and ask prices in the minute
        best_bid_price = group[group['side'] == 'Bid']['price'].max()
        best_ask_price = group[group['side'] == 'Ask']['price'].min()

        # Calculate the spread for the minute
        minute_spread = best_ask_price - best_bid_price if pd.notna(best_bid_price) and pd.notna(best_ask_price) else None

        if pd.notna(minute_spread) and minute_spread > threshold:
            # If the spread is greater than the threshold, set the depth to zero
            return pd.Series({'depth_at_twice_avg_spread': 0})
        else:
            # Otherwise, sum the bid and ask depths at the best prices
            best_bid_depth = group[(group['side'] == 'Bid') & (group['price'] == best_bid_price)]['size'].sum()
            best_ask_depth = group[(group['side'] == 'Ask') & (group['price'] == best_ask_price)]['size'].sum()
            depth_at_twice_avg_spread = best_bid_depth + best_ask_depth
            return pd.Series({'depth_at_twice_avg_spread': depth_at_twice_avg_spread})

        
    
    def calculate_bbo_spread_and_depth_per_minute(df):
        ''' Calculate the best BBO spread and depth per minute from the provided DataFrame. '''

        # Group by minute
        df['minute'] = pd.to_datetime(df['ts_event']).dt.floor('T')

        # Initialize lists to store results
        minutes = []
        best_bbo_spreads = []
        best_bbo_depths = []

        # Initialize variables to keep track of the last available bid and ask prices
        last_bid_price = None
        last_ask_price = None
        last_bid_depth = 0
        last_ask_depth = 0

        # Group by each minute
        grouped = df.groupby('minute')

        for minute, group in grouped:
            # Identify the best bid and ask prices in the minute
            best_bid_price = group[group['side'] == 'B']['price'].max() if not group[group['side'] == 'B'].empty else last_bid_price
            best_ask_price = group[group['side'] == 'A']['price'].min() if not group[group['side'] == 'A'].empty else last_ask_price

            # If bid or ask is missing, use the last available values
            if pd.isna(best_bid_price):
                best_bid_price = last_bid_price
            if pd.isna(best_ask_price):
                best_ask_price = last_ask_price

            # Calculate the BBO spread
            if pd.notna(best_bid_price) and pd.notna(best_ask_price):
                bbo_spread = abs(best_ask_price - best_bid_price)
            else:
                bbo_spread = None

            # Calculate the BBO depth
            best_bid_depth = group[(group['side'] == 'B') & (group['price'] == best_bid_price)]['size'].sum() if not group[group['side'] == 'B'].empty else last_bid_depth
            best_ask_depth = group[(group['side'] == 'A') & (group['price'] == best_ask_price)]['size'].sum() if not group[group['side'] == 'A'].empty else last_ask_depth
            bbo_depth = best_bid_depth + best_ask_depth

            # Update the last bid and ask prices and depths
            last_bid_price = best_bid_price
            last_ask_price = best_ask_price
            last_bid_depth = best_bid_depth
            last_ask_depth = best_ask_depth

            # Store the results
            minutes.append(minute)
            best_bbo_spreads.append(bbo_spread)
            best_bbo_depths.append(bbo_depth)

        # Create a DataFrame for the results
        best_bbo_spread = dict(zip(minutes, best_bbo_spreads))
        best_bbo_depth = dict(zip(minutes, best_bbo_depths))

        return best_bbo_spread, best_bbo_depth
    
    

    def calculate_5s_price_impact(df):
        ''' Helper function that contains logic to calculate 5 second price impact.'''
        
        # Calculate the midpoint price
        df['midpoint'] = (df['bid_price'] + df['ask_price']) / 2

        # Lag midpoint by 5 seconds
        df['midpoint_lag'] = df['midpoint'].shift(5)

        # Calculate the midpoint return
        df['midpoint_return'] = df['midpoint'] - df['midpoint_lag']

        # Create a trade sign column based on the 'side' (-1 for 'Ask', +1 for 'Bid')
        df['trade_sign'] = df['side'].apply(lambda x: 1 if x == 'Bid' else -1)

        # Remove NaN values that result from the shift
        clean = nvda_df.dropna(subset=['midpoint_return', 'trade_sign'])

        # Perform the regression of 5-second midpoint return on trade sign
        price_impact_model = sm.OLS(pd.to_numeric(clean['midpoint_return']), clean['trade_sign']).fit()

        # Display the summary of the regression model
        return price_impact_model.summary()
    
    
    
    def calculate_midquote_transaction(df):
        ''' Helper function to calculate one-second and one-minute midquote and transaction price series.'''

        # Ensure ts_event is datetime and set it as the index
        df['ts_event'] = pd.to_datetime(df['ts_event'])
        df.set_index('ts_event', inplace=True)

        # Calculate the midpoint price
        df['midpoint'] = (df['bid_price'] + df['ask_price']) / 2

        # Resample the data to one-second and one-minute intervals, taking the last value in each interval
        midquote_1sec = df['midpoint'].resample('1S').last()
        midquote_1min = df['midpoint'].resample('1T').last()

        # For transaction prices
        transaction_1sec = df['price'].resample('1S').last()
        transaction_1min = df['price'].resample('1T').last()
        
        
        # Drop the first NaN value
        midquote_1sec.dropna(inplace=True)
        midquote_1min.dropna(inplace=True)
        transaction_1sec.dropna(inplace=True)
        transaction_1min.dropna(inplace=True)


        return midquote_1sec, midquote_1min, transaction_1sec, transaction_1min

    
    
    def calculate_log_returns(midquote_1_min, transaction_1min):
        ''' Helper function to calculate log returns.'''
        
        midquote_log_return_1min = np.log(midquote_1min) - np.log(midquote_1min.shift(1))
        transaction_log_return_1min = np.log(transaction_1min) - np.log(transaction_1min.shift(1))

        # Drop NaN values resulting from the shift
        midquote_log_return_1min = midquote_log_return_1min.dropna()
        transaction_log_return_1min = transaction_log_return_1min.dropna()

        # Display the first few log-returns
        return midquote_log_return_1min, transaction_log_return_1min
    

    # initialize results table
    results = {}
    results_1s = {}
    
    # get minutes for by-minute fcns
    num_minutes = pd.to_datetime(df['ts_event']).dt.floor('T').nunique()
    
    # (a) dollar trading volume per minute
    dollar_trading_per_min = ((df['price'] * df['size']).sum()) / num_minutes
    print('Dollar trading per minute: ', dollar_trading_per_min )

    # (b) number of trades and number of orders (orders only for NASDAQ ITCH) per minute
    number_trades_per_min = df.shape[0] / num_minutes
    print('Number trades per minute: ', number_trades_per_min)

    number_orders_per_min = df['sequence'].nunique() / num_minutes
    print('Number orders per minute: ', number_orders_per_min)
    
    # (c) open, close, high and low prices
    opn = df.iloc[0]['price']

    close = df.iloc[-1]['price']

    high = df['price'].max()

    low = df['price'].min()
    
    print('(A) - (C)')
    print('Open: ', opn)
    print('Close: ', close)
    print('High: ', high)
    print('Low : ', low)
    
    # (d) VWAP per minute
    results['vwap'] = df.groupby(pd.to_datetime(nvda_df['ts_event']).dt.floor('T')).apply(
        lambda x: (x['price'] * x['size']).sum() / x['size'].sum())
    
    # (e) BBO spread and depth per minute
    best_bbo_spread, best_bbo_depth = calculate_bbo_spread_and_depth_per_minute(df)
    results['best_bbo_spread'] = pd.Series(best_bbo_spread)
    results['best_bbo_depth'] = pd.Series(best_bbo_depth)
            
    # (f) depth at twice that day’s average spread (it will be zero when the spread is greater than twice its
    # average) per minute
    
    # this might be wrong
    
    depth_twice_avg = calculate_depth_at_twice_avg_spread(df)
    results['depth_twice_avg'] = depth_twice_avg['depth_at_twice_avg_spread']
    
    # (g) 5-second price impact (regress 5-second midpoint quote return on current trade sign, -1 or +1) per minute
    price_impact_5s = calculate_5s_price_impact(df)
    results['5s_price_impact'] = price_impact_5s
    
    # (h) calculate one second and one minute midquote and transaction price series (take the last quote price
    # and transaction price in each interval)
    midquote_1sec, midquote_1min, transaction_1sec, transaction_1min = calculate_midquote_transaction(df)
    results_1s['midquote_1sec'] = midquote_1sec
    results_1s['transaction_1sec'] = transaction_1sec
    results['midquote_1min'] = midquote_1min
    results['transaction_1min'] = transaction_1min
    
    # (i) Compute the difference (log-returns), log(pt) - log(pt−1), for both midquote prices and transaction
    # prices. These are the midquote return and transaction return series.
    midquote_log_return_1min, transaction_log_return_1min = calculate_log_returns(midquote_1min, transaction_1min)
    results['midquote_log_return_1min'] = midquote_log_return_1min
    results['transaction_log_return_1min'] = transaction_log_return_1min
    
    # (j) Compute the realized variance for both midquote returns and transaction returns
    
    midquote_realized_variance = (midquote_log_return_1min ** 2).sum()
    results['midquote_realized_variance'] = (midquote_log_return_1min ** 2).sum()
    
    # (k) Compute the auto-correlation for both midquote returns and transaction returns using either the Box-
    # Pierce test (function Box.test from package stats) or ACF (function pacf)
    
    midquote_pacf = pacf(midquote_log_return_1min)

    # Auto-correlation for transaction returns
    transaction_pacf = pacf(transaction_log_return_1min)
    
    print('\n\n(K) Autocorrelation for midquote and transaction returns')
    
    print('Midquote return autocorrelations')
    print(midquote_pacf)
    
    print('\n\nTransaction return autocorrelations')
    print(transaction_pacf)

    return pd.DataFrame.from_dict(results), pd.DataFrame.from_dict(results_1s)


In [4]:
min_stats, sec_stats = calc_summary_statistics(nvda_df)

Dollar trading per minute:  7655964.519903844
Number trades per minute:  518.8966346153846
Number orders per minute:  518.8966346153846
(A) - (C)
Open:  128.63
Close:  125.61
High:  130.85
Low :  124.43


(K) Autocorrelation for midquote and transaction returns
Midquote return autocorrelations
[ 1.00000000e+00  7.22534526e-02 -1.34167359e-01 -3.08815198e-02
  1.91689224e-02  4.16881282e-02  1.83715931e-01 -6.09642770e-02
 -5.65628874e-02 -2.67192962e-02  1.50537850e-02  1.65506315e-01
  5.72440645e-02 -6.04378300e-02 -1.18278737e-03 -2.71744472e-03
 -8.66985917e-02 -7.47156444e-02 -2.13133570e-02  5.84451671e-02
  2.72945722e-02 -7.43170483e-04  2.02785407e-02  6.97074097e-03
  2.33588517e-02  3.17885161e-02  4.84009096e-02]


Transaction return autocorrelations
[ 1.00000000e+00  6.00847530e-02 -1.28197090e-01 -3.04166863e-02
  1.97797621e-02  5.29437961e-02  1.77711799e-01 -6.05048789e-02
 -5.51132786e-02 -3.36847427e-02  1.71885951e-02  1.49026548e-01
  7.80841901e-02 -7.29162560e-02

In [5]:
min_stats.head()

Unnamed: 0,vwap,best_bbo_spread,best_bbo_depth,depth_twice_avg,5s_price_impact,midquote_1min,transaction_1min,midquote_log_return_1min,transaction_log_return_1min,midquote_realized_variance
2024-08-22 09:00:00+00:00,128.61518,0.02,155,0,OLS Regression Res...,128.625,128.61,,,0.000539
2024-08-22 09:01:00+00:00,128.61352,0.01,93,0,OLS Regression Res...,128.62,128.62,-3.9e-05,7.8e-05,0.000539
2024-08-22 09:02:00+00:00,128.619093,0.03,101,0,OLS Regression Res...,128.635,128.61,0.000117,-7.8e-05,0.000539
2024-08-22 09:03:00+00:00,128.605455,0.05,450,0,OLS Regression Res...,128.625,128.6,-7.8e-05,-7.8e-05,0.000539
2024-08-22 09:04:00+00:00,128.642832,0.02,528,0,OLS Regression Res...,128.64,128.65,0.000117,0.000389,0.000539


In [6]:
sec_stats.head()

Unnamed: 0_level_0,midquote_1sec,transaction_1sec
ts_event,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-08-22 09:00:11+00:00,,128.63
2024-08-22 09:00:12+00:00,128.62,128.61
2024-08-22 09:00:19+00:00,128.625,128.62
2024-08-22 09:00:25+00:00,128.625,128.62
2024-08-22 09:00:26+00:00,128.625,128.62
