In [5]:
import pandas as pd
from typing import List
from datetime import datetime
import numpy as np

from lib.utils import get_nyse_date_tups, get_oxford_dfs

In [2]:
# Study window handed to the project helper get_nyse_date_tups.
# NOTE(review): with unix=True the helper presumably returns per-session boundaries
# as Unix timestamps — confirm against lib.utils. `unix_tups` is not referenced by
# any other cell visible in this notebook.
start, end = '2018-10-11', '2023-10-09'
unix_tups = get_nyse_date_tups(start, end, unix=True)

In [3]:
# Load the input frames through the project helper (one frame per ticker is
# assumed — TODO confirm). Later cells read the `t`, `c` and `ticker` columns.
dfs = get_oxford_dfs()

In [4]:
def filter_market_hours(dfs):
    """Keep only the rows that fall inside the regular session, 09:30 <= t < 16:00 New York time.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Each frame must have a `t` column holding epoch timestamps in milliseconds.

    Returns
    -------
    list of pandas.DataFrame
        One filtered frame per input frame, in the same order, each with a fresh
        0..n-1 index.

    Notes
    -----
    Side effect: a tz-aware `datetime` column (America/New_York) is written onto
    every input frame in place, not just onto the returned copies.
    """
    # The session bounds do not depend on the frame, so build them once.
    session_open = pd.to_datetime('09:30:00').time()
    session_close = pd.to_datetime('16:00:00').time()

    market_hours = []
    for df in dfs:
        # Epoch milliseconds -> UTC -> exchange-local wall clock.
        utc_stamps = pd.to_datetime(df['t'], utc=True, unit='ms')
        df['datetime'] = utc_stamps.dt.tz_convert('America/New_York')

        # Half-open on purpose: a trade stamped exactly 16:00 would otherwise make
        # the later resampler open one extra interval just to hold it.
        wall_clock = df['datetime'].dt.time
        in_session = (wall_clock >= session_open) & (wall_clock < session_close)

        market_hours.append(df[in_session].reset_index(drop=True))
    return market_hours

# Regular-session rows only, one frame per entry of `dfs` (same order).
# Note that this call also adds a `datetime` column to the frames in `dfs`.
market_hours = filter_market_hours(dfs)

In [7]:
# NOTE(review): hard-coded and not referenced by the other visible cells; presumably
# the number of NYSE sessions between `start` and `end` — confirm, or derive it from the data.
unique_days = 1256 # 5 years of trading days

In [49]:
# Scratch copy of the first filtered frame, so experiments leave `market_hours` untouched.
test = market_hours[0].copy()

In [54]:
def agg_rv(
    df: pd.DataFrame,
    freq: str = '65min',
    offset=pd.Timedelta(hours=9, minutes=30),
) -> pd.DataFrame:
    """Aggregate log realized variance (RV) over fixed intraday bins, day by day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like `datetime` column and a price column `c`.
    freq : str, default '65min'
        Bin width passed to ``resample``. Spelled with the lower-case ``min`` alias
        because the upper-case ``T``/``H`` aliases are deprecated in recent pandas.
    offset : pandas.Timedelta or str, default 9h30m
        Shift of the bin edges from midnight, so the first bin opens at 09:30.

    Returns
    -------
    pandas.DataFrame
        Columns `datetime` and `rv`, one row per (day, bin). `datetime` is the
        RIGHT edge of the bin (``label='right'``), e.g. 10:35 for the 09:30-10:35 bin.
    """

    def _rv(x: pd.Series):
        """Log of the summed squared log returns of the prices inside one bin."""
        log_returns = np.log(x / x.shift(1))
        squared_log_returns = log_returns ** 2
        # The first return of a bin is NaN (no previous price inside the bin) and is
        # skipped by sum(); the 1e-16 floor keeps the log finite for bins that hold
        # fewer than two prices.
        return np.log(squared_log_returns.sum() + 1e-16)

    # Group by calendar day first so bins never straddle two sessions, then cut each
    # day into `freq`-wide bins starting at `offset`, labelled by their right edge.
    out = (
        df.set_index('datetime')
        .groupby(pd.Grouper(freq='D'))
        .resample(freq, offset=offset, label='right')
        .agg({"c": _rv})
    )
    out.index = out.index.droplevel(0)  # drop the day level, keep the bin label
    # Bring the bin label back as a column and give the aggregate its final name.
    return out.reset_index().rename(columns={'c': 'rv'})

In [55]:
# Dry run of agg_rv on the single-ticker scratch frame before applying it to every ticker.
processed = agg_rv(test)

In [41]:
# agg_rv() already drops the day level and restores `datetime` as a column, so on a
# fresh run `processed` has a flat index and there is nothing left to flatten here.
# The guard keeps this cell safe under Restart & Run All while still handling a
# (day, bin) MultiIndex produced by an older agg_rv that skipped that step.
if isinstance(processed.index, pd.MultiIndex):
    processed.index = processed.index.droplevel(0)
    processed.reset_index(inplace=True)

In [57]:
# One RV frame per ticker, in the same order as `market_hours`.
all_rv = [agg_rv(df) for df in market_hours]

In [61]:
# Ticker symbol of each input frame, read from its first row.
# Assumes every frame is non-empty and holds a single ticker — TODO confirm.
tickers = [df['ticker'].iloc[0] for df in dfs]

In [63]:
# Destination folder for the per-ticker RV files (absolute, machine-specific path).
base_path = "/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute/raw_rv"
for idx, agg in enumerate(all_rv):
    symbol = tickers[idx]  # all_rv and tickers are both ordered like dfs
    agg['ticker'] = symbol  # tagged in place, so the frames in all_rv keep this column
    path = f"{base_path}/{symbol}_65min.csv"
    agg.to_csv(path, index=False)

In [64]:
# Read one exported file back as a sanity check. The file was written without an
# index, so index_col=0 turns its first column (`datetime`) into the index.
# NOTE(review): no parse_dates is given, so the index presumably stays as strings — confirm.
test_saved = pd.read_csv(f"{base_path}/AAPL_65min.csv", index_col=0)

In [65]:
# Show the first rows of the round-tripped file.
test_saved.head()

Unnamed: 0_level_0,rv,ticker
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-10-11 10:35:00-04:00,-8.253303,AAPL
2018-10-11 11:40:00-04:00,-8.613353,AAPL
2018-10-11 12:45:00-04:00,-9.249611,AAPL
2018-10-11 13:50:00-04:00,-9.63919,AAPL
2018-10-11 14:55:00-04:00,-9.050998,AAPL


In [67]:
# Wide table: one `datetime` column (the bin labels of the first ticker) plus one RV
# column per ticker.
reference_times = all_rv[0]['datetime']
mega_df = pd.DataFrame({'datetime': reference_times})
for df, ticker in zip(all_rv, tickers):
    # Align on the bin timestamp, not on the row position: a ticker that is missing
    # a bin or a whole day would otherwise be shifted against the `datetime` column
    # from that row onwards. Bins a ticker does not have become NaN.
    rv_by_time = df.set_index('datetime')['rv']
    mega_df[ticker] = rv_by_time.reindex(reference_times).to_numpy()

In [69]:
# Persist the wide RV table.
# NOTE(review): unlike the per-ticker export, this call does not pass index=False, so
# the integer row index is written as an unnamed first column — confirm downstream
# readers expect that before changing it.
mega_df.to_csv("/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute/65_min_rv.csv")