In [1]:
import os
import pandas as pd
from typing import List
from datetime import datetime
import numpy as np

from lib.utils import get_nyse_date_tups, get_oxford_dfs
import pandas_market_calendars as mcal

In [82]:
start, end = '2018-10-11', '2023-10-09'
market_days = [tup[0] for tup in get_nyse_date_tups(start, end, unix=False)]

In [88]:
dfs = get_oxford_dfs(n=2)

In [96]:
def filter_market_hours(dfs):
    market_hours = []
    # outside_hours = [] 
    for df in dfs: # no list comp for clarity
        df['datetime'] = pd.to_datetime(df['t'], utc=True, unit='ms').dt.tz_convert('America/New_York') # convert unix to datetime
        # this bool has to be NON-INCLUSIVE of the end time else later the resampler will create a 7th interval just to hold trades exactly at 4pm
        bool1 = (df['datetime'].dt.time >= pd.to_datetime('09:30:00').time()) & (df['datetime'].dt.time < pd.to_datetime('16:00:00').time()) # 9:30am - 4pm
        bool2 = df['datetime'].dt.date.isin(pd.to_datetime(market_days).date) # only market days (no weekends or holidays)
        bool_ = bool1 & bool2 # combine bools
        market_hours.append(df[bool_].reset_index(drop=True))
        # outside_hours.append(df[~bool_].reset_index(drop=True))
    return market_hours

market_hours = filter_market_hours(dfs)

## Let's Try Some Experiments

In [165]:
test = market_hours[0].copy()

In [105]:
all_date_strs_in_df = test['datetime'].dt.date.astype(str).unique()
known_weekend = '2018-10-13 9:30:00-04:00'
assert known_weekend not in all_date_strs_in_df, 'weekend data found in df, check date filtering above'

In [150]:
test

Unnamed: 0,c,h,l,n,o,t,v,vw,ticker,datetime
0,230.9300,230.9500,230.765,25,230.950,1539264660000,934,230.9806,TMO,2018-10-11 09:31:00-04:00
1,231.0400,232.0899,231.030,141,231.200,1539264720000,44230,231.2495,TMO,2018-10-11 09:32:00-04:00
2,231.1100,231.1100,230.670,74,231.080,1539264780000,5056,230.8307,TMO,2018-10-11 09:33:00-04:00
3,230.7150,231.0900,230.610,143,230.865,1539264840000,11734,230.6950,TMO,2018-10-11 09:34:00-04:00
4,230.3513,230.7200,230.340,66,230.720,1539264900000,3376,230.4611,TMO,2018-10-11 09:35:00-04:00
...,...,...,...,...,...,...,...,...,...,...
468764,495.9850,496.1700,495.770,443,495.770,1696881300000,16163,495.9732,TMO,2023-10-09 15:55:00-04:00
468765,495.7500,496.0300,495.650,251,496.030,1696881360000,8608,495.8342,TMO,2023-10-09 15:56:00-04:00
468766,495.8500,495.9000,495.690,291,495.720,1696881420000,10163,495.7920,TMO,2023-10-09 15:57:00-04:00
468767,495.9300,495.9900,495.770,482,495.860,1696881480000,19078,495.8682,TMO,2023-10-09 15:58:00-04:00


## Daily RV on 1-min returns

In [156]:
def agg_rv(df: pd.DataFrame):
    """Aggregate RV over 65 minute intervals each day, offset by 9:30am"""

    def _rv(x: pd.Series):
        """RV over a single window/bin of time"""
        log_returns = np.log(x / x.shift(1))
        squared_log_returns = log_returns ** 2
        rv = np.log(squared_log_returns.sum() + 1e-16)
        return rv

    def _noop(x: pd.Series): return x

    # group by day, resample to 65T intervals, offset by 9:30am, label left
    df.set_index('datetime').groupby(pd.Grouper(freq='D')).resample('1D', offset='9H30T', label='right').agg({"c" : _noop})
    df.reset_index(inplace=True) # make datetime a column again
    df.rename(columns={'c': 'rv'}, inplace=True) # rename the rv column
    return df

In [166]:
def noop(x): return x
test.set_index("datetime").groupby(pd.Grouper(freq='D')).resample('1D', offset='9H30T', label='right').agg({"c" : noop})
test.reset_index(inplace=True)
test.rename(columns={'c': 'rv'}, inplace=True)

In [167]:
test

Unnamed: 0,index,rv,h,l,n,o,t,v,vw,ticker,datetime
0,0,230.9300,230.9500,230.765,25,230.950,1539264660000,934,230.9806,TMO,2018-10-11 09:31:00-04:00
1,1,231.0400,232.0899,231.030,141,231.200,1539264720000,44230,231.2495,TMO,2018-10-11 09:32:00-04:00
2,2,231.1100,231.1100,230.670,74,231.080,1539264780000,5056,230.8307,TMO,2018-10-11 09:33:00-04:00
3,3,230.7150,231.0900,230.610,143,230.865,1539264840000,11734,230.6950,TMO,2018-10-11 09:34:00-04:00
4,4,230.3513,230.7200,230.340,66,230.720,1539264900000,3376,230.4611,TMO,2018-10-11 09:35:00-04:00
...,...,...,...,...,...,...,...,...,...,...,...
468764,468764,495.9850,496.1700,495.770,443,495.770,1696881300000,16163,495.9732,TMO,2023-10-09 15:55:00-04:00
468765,468765,495.7500,496.0300,495.650,251,496.030,1696881360000,8608,495.8342,TMO,2023-10-09 15:56:00-04:00
468766,468766,495.8500,495.9000,495.690,291,495.720,1696881420000,10163,495.7920,TMO,2023-10-09 15:57:00-04:00
468767,468767,495.9300,495.9900,495.770,482,495.860,1696881480000,19078,495.8682,TMO,2023-10-09 15:58:00-04:00


In [159]:
dailyrv = agg_rv(test) # this is daily rv on 1-minutes returns for a single stock

In [163]:
dailyrv

Unnamed: 0,index,rv,h,l,n,o,t,v,vw,ticker,datetime
0,0,230.9300,230.9500,230.765,25,230.950,1539264660000,934,230.9806,TMO,2018-10-11 09:31:00-04:00
1,1,231.0400,232.0899,231.030,141,231.200,1539264720000,44230,231.2495,TMO,2018-10-11 09:32:00-04:00
2,2,231.1100,231.1100,230.670,74,231.080,1539264780000,5056,230.8307,TMO,2018-10-11 09:33:00-04:00
3,3,230.7150,231.0900,230.610,143,230.865,1539264840000,11734,230.6950,TMO,2018-10-11 09:34:00-04:00
4,4,230.3513,230.7200,230.340,66,230.720,1539264900000,3376,230.4611,TMO,2018-10-11 09:35:00-04:00
...,...,...,...,...,...,...,...,...,...,...,...
468764,468764,495.9850,496.1700,495.770,443,495.770,1696881300000,16163,495.9732,TMO,2023-10-09 15:55:00-04:00
468765,468765,495.7500,496.0300,495.650,251,496.030,1696881360000,8608,495.8342,TMO,2023-10-09 15:56:00-04:00
468766,468766,495.8500,495.9000,495.690,291,495.720,1696881420000,10163,495.7920,TMO,2023-10-09 15:57:00-04:00
468767,468767,495.9300,495.9900,495.770,482,495.860,1696881480000,19078,495.8682,TMO,2023-10-09 15:58:00-04:00


## Daily RV on 10-min returns

In [63]:
def mean(x): return sum(x)/len(x)

In [66]:
for i in test.set_index('datetime').groupby(pd.Grouper(freq='D')):
    print(i)

(Timestamp('2018-10-11 00:00:00-0400', tz='America/New_York'),                                   c         h        l    n        o  \
datetime                                                               
2018-10-11 09:31:00-04:00  230.9300  230.9500  230.765   25  230.950   
2018-10-11 09:32:00-04:00  231.0400  232.0899  231.030  141  231.200   
2018-10-11 09:33:00-04:00  231.1100  231.1100  230.670   74  231.080   
2018-10-11 09:34:00-04:00  230.7150  231.0900  230.610  143  230.865   
2018-10-11 09:35:00-04:00  230.3513  230.7200  230.340   66  230.720   
...                             ...       ...      ...  ...      ...   
2018-10-11 15:55:00-04:00  226.8400  226.9200  226.660  393  226.730   
2018-10-11 15:56:00-04:00  226.4500  226.8350  226.430  337  226.830   
2018-10-11 15:57:00-04:00  225.9900  226.6400  225.920  411  226.370   
2018-10-11 15:58:00-04:00  226.1100  226.2500  225.920  435  225.980   
2018-10-11 15:59:00-04:00  226.0900  226.2599  225.870  602  226.110   



KeyboardInterrupt: 

In [None]:
def agg_rv_10min(df: pd.DataFrame):
    """Aggregate RV over 65 minute intervals each day, offset by 9:30am"""

    def _rv(x: pd.Series):
        """RV over a single window/bin of time"""
        log_returns = np.log(x / x.shift(1))
        squared_log_returns = log_returns ** 2
        rv = np.log(squared_log_returns.sum() + 1e-16)
        return rv

    # group by day, resample to 65T intervals, offset by 9:30am, label left
    out = df.set_index('datetime').groupby(pd.Grouper(freq='D')).agg({"c" : _rv}) # apply rv to each bin
        # .resample('65T', offset='9H30T', label='right')\
    out.reset_index(inplace=True) # make datetime a column again
    out.rename(columns={'c': 'rv'}, inplace=True) # rename the rv column
    return out