In [1]:
import os
import pandas as pd
from typing import List
from datetime import datetime
import numpy as np

from lib.utils import get_nyse_date_tups, get_oxford_dfs
import pandas_market_calendars as mcal

In [2]:
# Study window (inclusive date strings) and the matching NYSE session tuples in unix time.
# NOTE(review): `unix_tups` is not referenced by any other cell visible here — confirm it is still needed.
start = '2018-10-11'
end = '2023-10-09'
unix_tups = get_nyse_date_tups(start, end, unix=True)

In [5]:
# Load the input bars through the project helper and wrap them in a list,
# because the downstream cells iterate over `dfs`.
# NOTE(review): presumably n=1 makes get_oxford_dfs return a single DataFrame — confirm in lib.utils.
# NOTE(review): the recorded wide-table output further down shows many ticker columns,
# so it was presumably produced with more frames than this cell loads.
df = get_oxford_dfs(n=1)
dfs = [df]

In [6]:
def filter_market_hours(dfs):
    """Keep only the bars that fall inside the regular session, 09:30 <= t < 16:00 New York time.

    Args:
        dfs: iterable of DataFrames, each with a ``t`` column holding Unix epoch
            timestamps in milliseconds.

    Returns:
        A list with one new DataFrame per input frame. Each result carries an added
        tz-aware ``datetime`` column (America/New_York), contains only in-session rows
        and has a fresh 0..n-1 index. The input frames are left untouched.
    """
    # Parse the session bounds once instead of on every loop iteration.
    session_open = pd.to_datetime('09:30:00').time()
    session_close = pd.to_datetime('16:00:00').time()

    market_hours = []
    for df in dfs:  # no list comp for clarity
        # Convert unix milliseconds to exchange-local timestamps.
        local_dt = pd.to_datetime(df['t'], utc=True, unit='ms').dt.tz_convert('America/New_York')
        wall_clock = local_dt.dt.time
        # The upper bound must stay NON-INCLUSIVE: a bar stamped exactly 16:00 would make a
        # later resampler open an extra interval just to hold it.
        in_session = (wall_clock >= session_open) & (wall_clock < session_close)
        # `assign` returns a new frame, so the caller's DataFrame does not gain a column.
        market_hours.append(df.assign(datetime=local_dt)[in_session].reset_index(drop=True))
    return market_hours

# One regular-session frame (09:30 <= time < 16:00 New York) per loaded frame.
market_hours = filter_market_hours(dfs)

In [10]:
# Scratch copy of the first filtered frame, so the exploration below cannot alter market_hours.
tmo = market_hours[0].copy()

In [11]:
# Display the scratch frame; the rendered output is truncated to its head and tail rows.
tmo

Unnamed: 0,c,h,l,n,o,t,v,vw,ticker,datetime
0,230.9300,230.9500,230.765,25,230.950,1539264660000,934,230.9806,TMO,2018-10-11 09:31:00-04:00
1,231.0400,232.0899,231.030,141,231.200,1539264720000,44230,231.2495,TMO,2018-10-11 09:32:00-04:00
2,231.1100,231.1100,230.670,74,231.080,1539264780000,5056,230.8307,TMO,2018-10-11 09:33:00-04:00
3,230.7150,231.0900,230.610,143,230.865,1539264840000,11734,230.6950,TMO,2018-10-11 09:34:00-04:00
4,230.3513,230.7200,230.340,66,230.720,1539264900000,3376,230.4611,TMO,2018-10-11 09:35:00-04:00
...,...,...,...,...,...,...,...,...,...,...
468764,495.9850,496.1700,495.770,443,495.770,1696881300000,16163,495.9732,TMO,2023-10-09 15:55:00-04:00
468765,495.7500,496.0300,495.650,251,496.030,1696881360000,8608,495.8342,TMO,2023-10-09 15:56:00-04:00
468766,495.8500,495.9000,495.690,291,495.720,1696881420000,10163,495.7920,TMO,2023-10-09 15:57:00-04:00
468767,495.9300,495.9900,495.770,482,495.860,1696881480000,19078,495.8682,TMO,2023-10-09 15:58:00-04:00


In [13]:
# Index the scratch frame by timestamp so it can be grouped and resampled on time.
# Rebuilt from market_hours instead of mutating `tmo` in place, so re-running this cell
# does not raise KeyError once 'datetime' has already been moved into the index.
tmo = market_hours[0].set_index('datetime')

In [15]:
# Group the bars by calendar day. The previous trailing `.resample('')` used an empty rule,
# which is not a valid frequency; the recorded output is the plain per-day groupby object.
tmo.groupby(pd.Grouper(freq='D'))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1286f0e20>

In [19]:
# Exploratory: for each day and each 65-minute bin, log(last close) - log(first close).
# No `offset` is passed, so the bins are not anchored to the 09:30 open; that is why the
# recorded labels start at 08:40.
(
    tmo.groupby(pd.Grouper(freq='D'))
    .resample('65min')  # 'min' is the supported spelling of the deprecated 'T' alias
    .agg({'c': lambda x: np.log(x.iloc[-1]) - np.log(x.iloc[0])})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,c
datetime,datetime,Unnamed: 2_level_1
2018-10-11 00:00:00-04:00,2018-10-11 08:40:00-04:00,-0.000953
2018-10-11 00:00:00-04:00,2018-10-11 09:45:00-04:00,-0.003989
2018-10-11 00:00:00-04:00,2018-10-11 10:50:00-04:00,-0.000608
2018-10-11 00:00:00-04:00,2018-10-11 11:55:00-04:00,0.002887
2018-10-11 00:00:00-04:00,2018-10-11 13:00:00-04:00,-0.004737
...,...,...
2023-10-09 00:00:00-04:00,2023-10-09 10:50:00-04:00,0.002539
2023-10-09 00:00:00-04:00,2023-10-09 11:55:00-04:00,-0.000467
2023-10-09 00:00:00-04:00,2023-10-09 13:00:00-04:00,0.002837
2023-10-09 00:00:00-04:00,2023-10-09 14:05:00-04:00,0.005812


In [16]:
def agg_rv(df: pd.DataFrame, freq: str = '65min', offset=pd.Timedelta(hours=9, minutes=30)) -> pd.DataFrame:
    """Aggregate log realized variance of the close price over intraday bins, day by day.

    Args:
        df: frame with a ``datetime`` column (timestamps) and a ``c`` column (close price).
        freq: bin width as a pandas frequency string. Defaults to 65 minutes.
        offset: shift of the bin edges from midnight, so that bins start at the market
            open. Defaults to 9 hours 30 minutes. Accepts anything ``resample`` takes
            for ``offset``.

    Returns:
        A new DataFrame with columns ``datetime`` (the RIGHT edge of each bin) and ``rv``
        (log of the summed squared log returns inside the bin). ``df`` is not modified.
    """
    if df.empty:
        # Nothing to bin; return the expected schema instead of failing inside groupby.
        return pd.DataFrame(columns=['datetime', 'rv'])

    def _log_rv(x: pd.Series) -> float:
        """Log realized variance of one bin of prices."""
        # The first return of each bin is NaN (no previous price inside the bin) and is
        # skipped by sum(), so the return spanning a bin boundary is not counted.
        log_returns = np.log(x / x.shift(1))
        realized_var = (log_returns ** 2).sum()
        # The small epsilon keeps the log finite when a bin has zero variance.
        return np.log(realized_var + 1e-16)

    # Group by day so the bins restart every session, then resample inside each day.
    # A Timedelta offset and the 'min' alias avoid the deprecated 'H'/'T' spellings.
    binned = (
        df.set_index('datetime')
        .groupby(pd.Grouper(freq='D'))
        .resample(freq, offset=offset, label='right')
        .agg({'c': _log_rv})
    )
    return (
        binned.droplevel(0)  # drop the per-day group level
        .reset_index()  # make datetime a column again
        .rename(columns={'c': 'rv'})
    )

In [17]:
# Binned log realized variance for every regular-session frame, in the same order.
agg_rvs = list(map(agg_rv, market_hours))

In [18]:
# Inspect the aggregated result for the first frame.
agg_rvs[0]

Unnamed: 0,datetime,rv
0,2018-10-11 10:35:00-04:00,-9.302132
1,2018-10-11 11:40:00-04:00,-9.226515
2,2018-10-11 12:45:00-04:00,-9.818327
3,2018-10-11 13:50:00-04:00,-10.207846
4,2018-10-11 14:55:00-04:00,-8.921766
...,...,...
7511,2023-10-09 11:40:00-04:00,-10.733918
7512,2023-10-09 12:45:00-04:00,-11.575774
7513,2023-10-09 13:50:00-04:00,-11.869460
7514,2023-10-09 14:55:00-04:00,-11.277812


In [98]:
# Symbol of each loaded frame, read from the first row of its 'ticker' column.
tickers = [frame['ticker'].iloc[0] for frame in dfs]

In [99]:
# Write one CSV per ticker with its binned log returns.
# NOTE(review): `all_log_ret` is not defined in any cell visible here; this cell presumably
# relies on a cell that is missing from this view — confirm it survives Restart & Run All.
# NOTE(review): the directory name says "93-minute" while the file names say "65min" — confirm.
base_path = "/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute/raw_log_ret"
# makedirs also creates missing parent directories and is a no-op when the folder already
# exists, which removes the check-then-create race of exists() + mkdir().
os.makedirs(base_path, exist_ok=True)
for idx, agg in enumerate(all_log_ret):
    agg['ticker'] = tickers[idx]  # tags the frame in place, so all_log_ret keeps the column
    path = f"{base_path}/{tickers[idx]}_65min.csv"
    agg.to_csv(path, index=False)

In [100]:
# Wide table: one row per bin timestamp of the first frame, one log-return column per ticker.
# Each ticker is aligned on its 'datetime' value rather than on row position, so a ticker
# with missing or extra bins yields NaN instead of silently shifting against the timestamps.
# NOTE(review): `all_log_ret` is not defined in any cell visible here — confirm its source.
log_ret_by_ticker = {
    ticker: frame.set_index('datetime')['log_ret']
    for frame, ticker in zip(all_log_ret, tickers)
}
mega_df = (
    pd.concat(log_ret_by_ticker, axis=1)
    .reindex(all_log_ret[0]['datetime'])  # keep exactly the first frame's rows and order
    .reset_index()
)

In [101]:
# Preview the first 100 rows of the wide per-ticker table.
mega_df.head(100)

Unnamed: 0,datetime,TMO,ABT,HD,MCD,PG,CAT,DIS,CCI,JNJ,...,COST,IBM,ADP,AVGO,WMT,AMGN,INTU,AXP,MMC,CB
0,2018-10-11 10:35:00-04:00,0.002044,0.000290,0.007970,-0.003542,-0.015252,0.003827,-0.000490,-0.009682,-0.010058,...,0.000760,-0.001296,-0.012762,0.006567,-0.007732,-0.026439,0.020094,0.012177,0.005559,-0.013216
1,2018-10-11 11:40:00-04:00,-0.012958,-0.005812,-0.010241,-0.007665,-0.005534,-0.008440,-0.003210,-0.007077,-0.007419,...,-0.003389,-0.007346,-0.009383,-0.009835,-0.003572,-0.003722,-0.008705,-0.007352,-0.004068,-0.015409
2,2018-10-11 12:45:00-04:00,0.011864,0.009495,0.005513,0.001582,-0.003369,0.004187,0.004273,0.001215,0.002866,...,0.010420,0.003169,0.005284,0.003212,0.000000,0.008531,0.000877,0.004544,0.001974,0.001637
3,2018-10-11 13:50:00-04:00,-0.010124,-0.006878,-0.005410,-0.004801,-0.003759,-0.005166,-0.003026,-0.001871,-0.003086,...,-0.003204,-0.006559,-0.005000,-0.004676,-0.001789,-0.009610,-0.006060,-0.002754,-0.003210,-0.003745
4,2018-10-11 14:55:00-04:00,-0.007333,-0.006271,-0.017991,-0.007464,-0.005163,-0.006255,-0.008510,-0.012835,-0.010583,...,-0.008888,-0.009318,-0.008487,-0.013015,-0.007933,-0.004655,-0.010946,-0.009342,-0.010448,-0.009196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2018-11-01 16:00:00-04:00,0.001756,0.002141,-0.000111,-0.002113,0.006496,-0.000160,0.000129,-0.000091,0.001208,...,0.002442,0.002571,0.002487,0.001611,0.001891,0.001194,0.005431,0.000914,-0.004012,-0.000956
96,2018-11-02 10:35:00-04:00,-0.002111,-0.003257,-0.011604,-0.004373,-0.002674,-0.012378,-0.009302,-0.004500,-0.009292,...,0.004246,-0.007725,-0.010008,-0.015634,-0.001493,-0.026150,0.000656,-0.004453,0.002934,0.002140
97,2018-11-02 11:40:00-04:00,0.000913,-0.003054,0.000000,-0.005656,-0.001116,0.007308,-0.002255,-0.006647,-0.001284,...,-0.008501,-0.000773,-0.002989,0.002264,-0.002829,-0.001304,-0.000094,-0.002599,-0.007530,-0.004647
98,2018-11-02 12:45:00-04:00,-0.004235,-0.003924,-0.005409,0.000201,-0.004255,-0.004211,-0.001870,0.001480,0.000857,...,-0.002057,-0.006895,-0.003537,-0.008791,-0.000298,-0.001377,0.000000,-0.003088,-0.008426,-0.003305


In [102]:
# Persist the wide table. Unlike the per-ticker export, this call does not pass index=False,
# so the row index is written as an unnamed first column of the CSV.
# NOTE(review): confirm whether downstream readers rely on that index column before changing it.
mega_df.to_csv("/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute/65min_log_ret.csv")