In [3]:
import pandas as pd
import numpy as np


In [4]:
# MBP Schema
mbp_csv = pd.read_csv('xnas-itch-20240822.mbp-1.csv', parse_dates=['ts_recv', 'ts_event'])

nvda_mbp = mbp_csv[mbp_csv['symbol'] == 'NVDA'].copy()
cake_mbp = mbp_csv[mbp_csv['symbol'] == 'CAKE'].copy()

In [5]:
# get tick-by-tick log middle price

# Calculate the log-arithmetic middle price
nvda_mbp['log_arith_middle_price'] = (np.log(nvda_mbp['bid_px_00']) + np.log(nvda_mbp['ask_px_00'])) / 2
cake_mbp['log_arith_middle_price'] = (np.log(cake_mbp['bid_px_00']) + np.log(cake_mbp['ask_px_00'])) / 2


In [16]:
# set parameters

# total number of tick observations
# for ten minute intervals, n = 60 * 10
n = 600 
# number of subsamples per ten minute interval (chosen at will)
K = 20
# total sampling length - number minutes
# 8 hours in a trading day - 8 * 60
T = 60

In [19]:
# [X, X]^K_T
def compute_subsampled_variance(log_returns, K, n):
    subsampled_variance = 0
    log_returns = list(log_returns)
    
    for i in range(n - K + 1):
        pt1 = log_returns[i + K]
        pt2 = log_returns[i]
        subsampled_variance += (pt1 - pt2)**2
        
    subsampled_variance /= K
    return subsampled_variance


# [X, X]^{All}_T
def compute_full_sample_variance(log_returns, n):
    full_sample_variance = 0
    log_returns = list(log_returns)

    for i in range(n - 1):
        
        pt1 = log_returns[i+1]
        pt2 = log_returns[i]
        
        full_sample_variance += (pt1 - pt2) ** 2
        
    return full_sample_variance

full_sample_variance = compute_full_sample_variance(nvda_mbp['log_arith_middle_price'], n)


# z
z = (n - K + 1) / K


# TSRV^{(10m)}_t
def compute_TSRV(subsampled_variance, full_sample_variance, z, n):
    tsrv = (1 - z / n) ** -1 * (subsampled_variance - (z / n) * full_sample_variance)
    return tsrv


def compute_timed_tsrvs(df, K, n, minute):
    
    tsrvs = []
    z = (n - K + 1) / K

    resampling_index = str(minute) + 'T'
    df_resampled = df.set_index('ts_event').resample(resampling_index)
    
    for time_interval, data in df_resampled:

        if len(data) < n:
            continue
        
        if len(log_returns_series) < n:
            continue
        
        log_returns_series = data['log_arith_middle_price'].values[:n]  # Take the first `n` observations
        log_returns_interval = list(log_returns_series)
        
        # Compute TSRV for the n-minute interval
        subsampled_variance = compute_subsampled_variance(log_returns_interval, K, n)
        full_sample_variance = compute_full_sample_variance(log_returns_interval, n)
        tsrv = compute_TSRV(subsampled_variance, full_sample_variance, z, n)

        tsrvs.append((time_interval, tsrv))
        
    return tsrvs


nvda_tsrvs = compute_timed_tsrvs(nvda_mbp, K, n, 10)

print("10 min TSRV for NVDA: ", np.mean(nvda_tsrvs))

cake_tsrvs = compute_timed_tsrvs(cake_mbp, K, n, 10)

print("10 min TSRV for CAKE: ", np.mean(cake_tsrvs))



UnboundLocalError: cannot access local variable 'log_returns_series' where it is not associated with a value