In [1]:
import pandas as pd

# Read the raw CSV into the frame every later cell works from.
# NOTE(review): the file name suggests a Nasdaq ITCH extract for NVDA and CAKE - confirm provenance.
df = pd.read_csv(filepath_or_buffer='xnas-itch-nvidia-cake.csv')

In [2]:
# Split up and prepare dataset

def prepare_bid_ask_data(df):
    """Add forward-filled ``bid_price``, ``ask_price``, ``bid_depth`` and ``ask_depth`` columns.

    For each row, the row's ``price`` / ``depth`` is copied into the bid columns
    when ``side == 'Bid'`` and into the ask columns when ``side == 'Ask'``; every
    other row is left missing and then forward-filled from the most recent row
    that had a value. Rows before the first bid (or ask) stay NaN.

    The forward fill runs over the frame in its current row order and is not
    aware of any grouping column, so pass a frame that holds a single
    instrument if values must not carry over between instruments.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``side``, ``price`` and ``depth`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the four columns are added to it
        (or overwritten) in place.
    """
    is_bid = df['side'] == 'Bid'
    is_ask = df['side'] == 'Ask'

    # Series.where keeps values where the mask is True and inserts NaN elsewhere,
    # replacing four row-by-row Python lambdas with vectorised operations.
    df['bid_price'] = df['price'].where(is_bid).ffill()
    df['ask_price'] = df['price'].where(is_ask).ffill()
    df['bid_depth'] = df['depth'].where(is_bid).ffill()
    df['ask_depth'] = df['depth'].where(is_ask).ffill()

    return df

# Derive the quote columns on the combined frame first.
# NOTE(review): this fill runs over all symbols in row order, so values on `df`
# can carry over from one symbol to the next; the per-symbol frames below recompute them.
df = prepare_bid_ask_data(df)

# One independent copy per ticker, each with its own forward-filled quote columns.
nvda_df = prepare_bid_ask_data(df.loc[df['symbol'].eq('NVDA')].copy())
cake_df = prepare_bid_ask_data(df.loc[df['symbol'].eq('CAKE')].copy())


# a. dollar trading volume per minute
# b. number of trades and number of orders (orders only for NASDAQ ITCH) per minute
# c. open, close, high and low prices
# d. VWAP per minute

In [3]:
# Calculate statistics for NVDA
# NOTE(review): every row of nvda_df feeds the totals below; if the file also holds
# non-trade messages, "dollar volume" and "trades" are overstated - confirm against the schema.
nvda_notional = nvda_df['price'] * nvda_df['size']
nvda_dollar_volume = nvda_notional.sum()

# Row count of the frame (one per record, whatever the record type).
nvda_trades = nvda_df.shape[0]

# Distinct values of the `sequence` column.
nvda_orders = nvda_df['sequence'].nunique()

# First / last row in the frame's current order - assumes rows are time-sorted (TODO confirm).
# Indexing the column first avoids materialising a whole mixed-dtype row for one value.
nvda_open = nvda_df['price'].iloc[0]
nvda_close = nvda_df['price'].iloc[-1]

nvda_high = nvda_df['price'].max()
nvda_low = nvda_df['price'].min()

# Per-minute VWAP = sum(price * size) / sum(size) within each minute bucket.
# 'min' replaces the deprecated 'T' offset alias; two grouped sums replace a
# Python-level apply over every group and give the same values.
nvda_minute = pd.to_datetime(nvda_df['ts_event']).dt.floor('min')
nvda_vwap = (
    nvda_notional.groupby(nvda_minute).sum()
    / nvda_df['size'].groupby(nvda_minute).sum()
)


In [4]:
# Calculate statistics for CAKE
# NOTE(review): every row of cake_df feeds the totals below; if the file also holds
# non-trade messages, "dollar volume" and "trades" are overstated - confirm against the schema.
cake_notional = cake_df['price'] * cake_df['size']
cake_dollar_volume = cake_notional.sum()

# Row count of the frame (one per record, whatever the record type).
cake_trades = cake_df.shape[0]

# Distinct values of the `sequence` column.
cake_orders = cake_df['sequence'].nunique()

# First / last row in the frame's current order - assumes rows are time-sorted (TODO confirm).
# Indexing the column first avoids materialising a whole mixed-dtype row for one value.
cake_open = cake_df['price'].iloc[0]
cake_close = cake_df['price'].iloc[-1]

cake_high = cake_df['price'].max()
cake_low = cake_df['price'].min()

# Per-minute VWAP = sum(price * size) / sum(size) within each minute bucket.
# 'min' replaces the deprecated 'T' offset alias; two grouped sums replace a
# Python-level apply over every group and give the same values.
cake_minute = pd.to_datetime(cake_df['ts_event']).dt.floor('min')
cake_vwap = (
    cake_notional.groupby(cake_minute).sum()
    / cake_df['size'].groupby(cake_minute).sum()
)


# e. BBO spread and depth per minute

In [5]:

# BBO spread and depth, computed per symbol.
#
# The quote columns on the combined `df` were forward-filled across all symbols
# in row order, so a CAKE row there can carry an NVDA bid/ask (and vice versa).
# The per-symbol frames `nvda_df` / `cake_df` were filled independently, so the
# spread and depth are derived from those instead.

def _bbo_quotes(symbol_df):
    """Return row-level ``bbo_spread`` and ``bbo_depth`` for a single-symbol frame.

    ``bbo_spread`` is ``ask_price - bid_price`` and ``bbo_depth`` is
    ``bid_depth + ask_depth``; the result keeps ``symbol_df``'s index.
    """
    return pd.DataFrame({
        'bbo_spread': symbol_df['ask_price'] - symbol_df['bid_price'],
        'bbo_depth': symbol_df['bid_depth'] + symbol_df['ask_depth'],
    })


def _per_minute_mean(quotes, timestamps):
    """Average each column of ``quotes`` within minute buckets of ``timestamps``."""
    # 'min' replaces the deprecated 'T' offset alias.
    return quotes.groupby(pd.to_datetime(timestamps).dt.floor('min')).mean()


nvda_bbo = _bbo_quotes(nvda_df)
cake_bbo = _bbo_quotes(cake_df)

# Keep the row-level columns available on `df`, now holding per-symbol values.
# Rows belonging to any other symbol are left as NaN.
bbo_rows = pd.concat([nvda_bbo, cake_bbo]).reindex(df.index)
df['bbo_spread'] = bbo_rows['bbo_spread']
df['bbo_depth'] = bbo_rows['bbo_depth']

# Group by minute and calculate the mean BBO Spread and BBO Depth for each minute
bbo_stats_nvda = _per_minute_mean(nvda_bbo, nvda_df['ts_event'])
bbo_stats_cake = _per_minute_mean(cake_bbo, cake_df['ts_event'])


# g. 5-Second Price Impact

In [None]:
# TODO: add the remaining statistics.

In [6]:
# Create the Summary DataFrame
# Row labels, in the same order as the values collected for each ticker below.
summary_index = [
    'Dollar Volume',
    'Number of Trades',
    'Number of Orders',
    'Open',
    'Close',
    'High',
    'Low',
]

# One column of whole-sample figures per ticker.
summary_columns = {
    'NVIDIA': [
        nvda_dollar_volume,
        nvda_trades,
        nvda_orders,
        nvda_open,
        nvda_close,
        nvda_high,
        nvda_low,
    ],
    'CAKE': [
        cake_dollar_volume,
        cake_trades,
        cake_orders,
        cake_open,
        cake_close,
        cake_high,
        cake_low,
    ],
}

summary_statistics = pd.DataFrame(data=summary_columns, index=summary_index)

# Leave the frame as the cell's last expression so the notebook renders it.
summary_statistics



Unnamed: 0,NVIDIA,CAKE
Dollar Volume,3184881000.0,1441245.96
Number of Trades,215861.0,771.0
Number of Orders,215861.0,771.0
Open,128.63,38.76
Close,125.61,38.7
High,130.85,39.09
Low,124.43,38.63
