In [9]:
import pandas as pd
from typing import List

from lib.utils import get_nyse_date_tups, get_oxford_dfs

In [2]:
# Inclusive date window for the study, as ISO date strings.
start = '2018-10-11'
end = '2023-10-09'
# NYSE date tuples for the window; unix=True presumably requests unix-timestamp
# form -- confirm against lib.utils.get_nyse_date_tups.
unix_tups = get_nyse_date_tups(start, end, unix=True)

In [3]:
# Load the raw frames through the project helper. Later cells read a 't' column
# (unix ms) and a 'ticker' column from each frame.
dfs = get_oxford_dfs()

In [14]:
def filter_market_hours(dfs):
    """Keep only the rows that fall inside regular trading hours in New York.

    Args:
        dfs: Iterable of DataFrames, each with a ``t`` column holding unix
            timestamps in milliseconds.

    Returns:
        A list with one new DataFrame per input frame. Each holds the rows whose
        New York local time lies in [09:30, 16:00), is re-indexed from 0, and
        carries an added tz-aware ``datetime`` column. The input frames are not
        modified.
    """
    # The session bounds never change, so build them once instead of per frame.
    session_open = pd.to_datetime('09:30:00').time()
    session_close = pd.to_datetime('16:00:00').time()

    market_hours = []
    for df in dfs:  # explicit loop kept for clarity
        # unix milliseconds -> tz-aware New York timestamps
        local_dt = pd.to_datetime(df['t'], utc=True, unit='ms').dt.tz_convert('America/New_York')
        local_time = local_dt.dt.time
        # The close is deliberately exclusive: a trade stamped exactly 16:00 would
        # otherwise make the later 65-minute resample open a 7th bar just for it.
        in_session = (local_time >= session_open) & (local_time < session_close)
        # Attach the timestamps to the filtered copy only, so the caller's frame
        # is left exactly as it was passed in.
        market_hours.append(
            df[in_session].assign(datetime=local_dt[in_session]).reset_index(drop=True)
        )
    return market_hours

# Restrict every raw frame to regular-session rows before aggregating.
market_hours = filter_market_hours(dfs)

In [7]:
# Hard-coded trading-day count for the study window; it is not derived from the data.
# NOTE(review): no other cell visible here reads this value -- confirm it is still needed.
unique_days = 1256 # 5 years of trading days

In [10]:
def create_aggs(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate intraday rows into 65-minute OHLCV bars, one calendar day at a time.

    Rows are grouped by calendar day and each day is resampled on its own, so
    bars are anchored at 09:30 of that day rather than running continuously
    across the overnight gap. Days with no rows are expected to contribute no
    bars.

    Args:
        df: Frame with a datetime-like ``datetime`` column plus ``o``, ``h``,
            ``l``, ``c`` and ``v`` columns.

    Returns:
        A new frame with a fresh integer index and the columns ``datetime``
        (left edge of each bar), ``o`` (first), ``h`` (max), ``l`` (min),
        ``c`` (last) and ``v`` (sum). The input frame is not modified.
    """
    aggs = (
        df.set_index('datetime')
        .groupby(pd.Grouper(freq='D'))
        # '65min' and an explicit Timedelta replace the 'T'/'H' aliases that
        # pandas 2.2 deprecated; the bar grid itself is unchanged.
        .resample('65min', offset=pd.Timedelta(hours=9, minutes=30), label='left')
        .agg({'o': 'first', 'h': 'max', 'l': 'min', 'c': 'last', 'v': 'sum'})
    )
    # Drop the outer per-day level added by the groupby so that only the bar
    # timestamps remain, then expose them as a regular 'datetime' column.
    return aggs.droplevel(0).reset_index()

In [15]:
# Build the 65-minute bars for every market-hours frame, preserving order.
all_aggs = list(map(create_aggs, market_hours))

In [27]:
# Work on copies so all_aggs stays untouched. create_aggs already returns frames
# with a flat index, and calling droplevel(0) on a single-level index raises
# ValueError, so only flatten frames that still carry the (day, datetime)
# MultiIndex (e.g. aggregates produced by an older create_aggs).
formatted_aggs = []
for agg in all_aggs:
    formatted = agg.copy()
    if isinstance(formatted.index, pd.MultiIndex):
        formatted.index = formatted.index.droplevel(0)
        formatted = formatted.reset_index()
    formatted_aggs.append(formatted)

In [31]:
# Take the ticker symbol from the first row of each raw frame (assumes every
# frame holds a single ticker -- confirm against get_oxford_dfs).
tickers = [frame['ticker'].iat[0] for frame in dfs]

In [32]:
# Display the ticker list; its order is the order used to name the exported files.
tickers

['TMO',
 'ABT',
 'HD',
 'MCD',
 'PG',
 'CAT',
 'DIS',
 'CCI',
 'JNJ',
 'KO',
 'BA',
 'LLY',
 'BDX',
 'FISV',
 'BMY',
 'TXN',
 'CVS',
 'MMM',
 'PFE',
 'GS',
 'CL',
 'LMT',
 'NFLX',
 'UPS',
 'BSX',
 'XOM',
 'MO',
 'SBUX',
 'COP',
 'WFC',
 'PM',
 'PEP',
 'LOW',
 'TGT',
 'CI',
 'CME',
 'GE',
 'NVDA',
 'HON',
 'GOOG',
 'C',
 'MSFT',
 'PNC',
 'AAPL',
 'D',
 'SCHW',
 'MS',
 'ADBE',
 'QCOM',
 'UNP',
 'CSX',
 'CMCSA',
 'MRK',
 'ISRG',
 'V',
 'CSCO',
 'VZ',
 'SYK',
 'ACN',
 'DHR',
 'MA',
 'NKE',
 'MDT',
 'INTC',
 'BRK.B',
 'CVX',
 'DUK',
 'TJX',
 'BLK',
 'GILD',
 'MU',
 'SO',
 'AMT',
 'ORCL',
 'AMZN',
 'FIS',
 'T',
 'UNH',
 'BAC',
 'JPM',
 'USB',
 'CRM',
 'VRTX',
 'COST',
 'IBM',
 'ADP',
 'AVGO',
 'WMT',
 'AMGN',
 'INTU',
 'AXP',
 'MMC',
 'CB']

In [34]:
base_path = "/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute/raw_aggs"
# Tag each aggregate frame with its ticker (in place) and write it to
# <base_path>/<ticker>_65min.csv without the row index.
for agg, ticker in zip(formatted_aggs, tickers):
    agg['ticker'] = ticker
    path = f"{base_path}/{ticker}_65min.csv"
    agg.to_csv(path, index=False)

In [37]:
# Read one exported file back as a spot check. index_col=0 turns the first CSV
# column into the index; the values are not parsed as dates here.
test_saved = pd.read_csv(f"{base_path}/AAPL_65min.csv", index_col=0)

In [45]:
# Close-price matrix: one row per bar of the first ticker, one column per ticker.
# Every close series is keyed by its bar timestamp so that values are matched on
# 'datetime'. Assigning the raw columns would match on row position instead and
# silently attach closes to the wrong timestamps whenever a ticker has a
# different number of bars.
closes = pd.DataFrame({
    ticker: agg.set_index('datetime')['c']
    for agg, ticker in zip(formatted_aggs, tickers)
})
# Keep the first ticker's bar grid as the row set; bars that a ticker is missing
# become NaN.
mega_df = (
    closes.reindex(formatted_aggs[0]['datetime'])
    .rename_axis('datetime')
    .reset_index()
)

In [47]:
# Persist the combined close-price table. to_csv writes the row index by default,
# so the file gets an unnamed leading index column.
mega_df.to_csv("/Users/beneverman/Documents/Coding/QuantHive/IDVF-Oxford-v1/data/processed-5yr-93-minute/65_min.csv")