In [1]:
import pandas as pd
import numpy as np


In [2]:
# MBP Schema
mbp_csv = pd.read_csv('xnas-itch-20240822.mbp-1.csv', parse_dates=['ts_recv', 'ts_event'])

nvda_mbp = mbp_csv[mbp_csv['symbol'] == 'NVDA'].copy()
cake_mbp = mbp_csv[mbp_csv['symbol'] == 'CAKE'].copy()

In [3]:
# get tick-by-tick log middle price

# Calculate the log-arithmetic middle price
nvda_mbp['log_arith_middle_price'] = (np.log(nvda_mbp['bid_px_00']) + np.log(nvda_mbp['ask_px_00'])) / 2
cake_mbp['log_arith_middle_price'] = (np.log(cake_mbp['bid_px_00']) + np.log(cake_mbp['ask_px_00'])) / 2


In [8]:
# set parameters

# total number of tick observations
# for ten minute intervals, n = 60 * 10
n = 60 * 10
# number of subsamples per ten minute interval (chosen at will)
K = 20
# total sampling length - number minutesm - total trading day
T = 8 * 60

In [9]:
# [X, X]^K_T
def compute_subsampled_variance(log_returns, K, n):
    subsampled_variance = 0
    
    log_returns = list(log_returns)
    
    for i in range(n - K + 1):
        pt1 = log_returns[i + K]
        pt2 = log_returns[i]
        subsampled_variance += (pt1 - pt2)**2
        
    subsampled_variance /= K
    return subsampled_variance


# [X, X]^{All}_T
def compute_full_sample_variance(log_returns, n):
    full_sample_variance = 0
    log_returns = list(log_returns)

    for i in range(n - 1):
        
        pt1 = log_returns[i+1]
        pt2 = log_returns[i]
        
        full_sample_variance += (pt1 - pt2) ** 2
        
    return full_sample_variance

full_sample_variance = compute_full_sample_variance(nvda_mbp['log_arith_middle_price'], n)


# z
z = (n - K + 1) / K


# TSRV^{(10m)}_t
def compute_TSRV(subsampled_variance, full_sample_variance, z, n):
    tsrv = (1 - z / n) ** -1 * (subsampled_variance - (z / n) * full_sample_variance)
    return tsrv


def compute_timed_tsrvs(df, K, n, interval):
    
    tsrvs = []
    z = (n - K + 1) / K
    
    if interval == 10:
        resample = '10T'
    elif interval == 30:
        resample = '30T'
    else:
        resample = '60T'
        
    df_resampled = df.set_index('ts_event').resample(resample)
    
    for time_interval, data in df_resampled:

        if len(data) < n:
            continue
        
        log_returns_series = data['log_arith_middle_price'].values[:n]
        
        # Compute TSRV for the n-minute interval
        subsampled_variance = compute_subsampled_variance(log_returns_series, n, K)
        full_sample_variance = compute_full_sample_variance(log_returns_series, n)
        tsrv = compute_TSRV(subsampled_variance, full_sample_variance, z, n)

        tsrvs.append(tsrv)
        
    return tsrvs


nvda_tsrvs = compute_timed_tsrvs(nvda_mbp, K, n, 10)

print("10 min TSRV for NVDA: ", np.mean(nvda_tsrvs))

cake_tsrvs = compute_timed_tsrvs(cake_mbp, K, n, 10)

print("10 min TSRV for CAKE: ", np.mean(cake_tsrvs))



10 min TSRV for NVDA:  -5.59341605974833e-08
10 min TSRV for CAKE:  -2.0155719661495228e-07


In [None]:
# reset n since 30m

#nvda_tsrvs = compute_timed_tsrvs(nvda_mbp, K, n, 30)

#print("10 min TSRV for NVDA: ", np.mean(nvda_tsrvs))

#cake_tsrvs = compute_timed_tsrvs(cake_mbp, K, n, 30)

#print("10 min TSRV for CAKE: ", np.mean(cake_tsrvs))

