In [None]:
import pandas as pd 
import numpy as np
from os import listdir
from os.path import isfile, join

from src.config import TEST_CUTOFF_DATE

import pytz
# pytz timezone object for the 'US/Eastern' zone; it is not referenced by any of the cells visible in this section.
eastern = pytz.timezone('US/Eastern')


In [None]:
# Collect the regular files that sit directly inside the raw IQFeed directory
# (entries that are not files, e.g. sub-directories, are filtered out by isfile).
# NOTE(review): `raw_iq_feed_data_dir` is neither defined nor imported in the cells
# shown here — confirm where it comes from so the notebook survives Restart & Run All.
onlyfiles = list(
    filter(
        lambda name: isfile(join(raw_iq_feed_data_dir, name)),
        listdir(raw_iq_feed_data_dir),
    )
)
# The part of each file name before the first underscore is taken as the ticker symbol.
tickers = [name.partition("_")[0] for name in onlyfiles]
# Symbol used by the cells below.
ticker = "GOOGL"

In [None]:
# Load the 1-minute bars and the daily bars for `ticker`; each raw parquet frame is passed
# through preprocess_iq_feed_prices before it is used.
# NOTE(review): `preprocess_iq_feed_prices` is not imported in the cells shown here — confirm its origin.
intraday_columns = ["time", "close", "open", "volume"]
daily_columns = ["date", "close", "open", "volume"]

prices: pd.DataFrame = preprocess_iq_feed_prices(
    pd.read_parquet(
        path=f"{raw_iq_feed_data_dir}/{ticker}_1min.parquet",
        columns=intraday_columns,
    )
)
prices_daily: pd.DataFrame = preprocess_iq_feed_prices(
    pd.read_parquet(
        path=f"{raw_iq_feed_data_dir}/daily/{ticker}_daily.parquet",
        columns=daily_columns,
    )
)

In [None]:
# Keep only the 09:00 bar of each day and re-index it by calendar date.
# NOTE(review): the filter picks hour == 9 and minute == 0; confirm that this is the bar
# meant to represent the session open for this data feed.
is_open_bar = (prices.index.hour == 9) & (prices.index.minute == 0)
# Row mask + column list must go through .loc; plain prices[mask, cols] is not valid indexing.
# .copy() makes the selection independent of `prices` before its index is replaced.
intra_opens = prices.loc[is_open_bar, ["close", "open"]].copy()
# Take the dates from the filtered rows: prices.index.date has one entry per 1-minute bar
# and would not match the length of the filtered frame.
intra_opens.index = intra_opens.index.date
intra_opens.index.name = "date"

In [None]:
# Join the intraday open bars with the daily (EOD) bars on the calendar date.
# Overlapping column names get the suffixes "_intra" (left) and "_eod" (right).
# A list of column labels is required here: prices_daily["date", "close", "open"] would be
# looked up as ONE tuple-valued key and raise a KeyError.
# NOTE(review): this assumes prices_daily still exposes "date" as a column after
# preprocess_iq_feed_prices and that its values are comparable with the datetime.date
# objects in intra_opens' index — confirm against the helper.
merged = pd.merge(
    intra_opens,
    prices_daily[["date", "close", "open"]],
    on="date",
    suffixes=("_intra", "_eod"),
)

In [None]:
# Order the rows newest-first so that the adjustment step can walk backwards in time.
# Sorting on "date" explicitly (sort_values accepts a column label or an index level name)
# avoids depending on the row order / positional index that the merge happened to produce.
merged = merged.sort_values("date", ascending=False)
# Start with a neutral ratio; stored as float because real ratios are written into it later.
merged["split_ratio"] = 1.0

In [None]:
# Calculate Dividend and Split Events
# To adjust, we go backwards in time.
# When a split has occurred the close will be the first price that is influenced in the historical adjusted time series.
# Assuming the split/dividend has occurred overnight.
def back_adjust_intraday_closes(frame: pd.DataFrame, threshold: float = 0.01) -> pd.DataFrame:
    """Back-adjust ``close_intra`` so that it lines up with ``close_eod``.

    The rows are visited by position from first to last, so ``frame`` is expected to be
    ordered newest-first. For each row the ratio ``close_eod / close_intra`` is computed
    using the already adjusted intraday close. When it deviates from 1 by at least
    ``threshold``, the row is treated as an event: ``close_intra`` of that row and of every
    following (older) row is multiplied by the ratio, and ``split_ratio`` of those rows is
    set to the ratio.

    Parameters
    ----------
    frame : DataFrame with the columns ``close_eod``, ``close_intra`` and ``split_ratio``.
    threshold : minimum absolute deviation of the ratio from 1 that counts as an event.

    Returns
    -------
    A new DataFrame with updated ``close_intra`` and ``split_ratio``; ``frame`` is not modified.
    """
    close_eod = frame["close_eod"].to_numpy(dtype=float)
    close_intra = frame["close_intra"].to_numpy(dtype=float, copy=True)
    ratios = frame["split_ratio"].to_numpy(dtype=float, copy=True)

    for pos in range(len(frame)):
        # Missing or zero prices give nan/inf; silence the numpy warning and skip those rows below.
        with np.errstate(divide="ignore", invalid="ignore"):
            ratio = close_eod[pos] / close_intra[pos]
        if not np.isfinite(ratio) or ratio <= 0:
            continue
        if np.abs(ratio - 1) >= threshold:
            # Everything from this row onwards is older data and inherits the adjustment.
            close_intra[pos:] *= ratio
            # NOTE(review): this stores the ratio of the latest event applied to each row,
            # not the cumulative factor — confirm that this is the intended meaning.
            ratios[pos:] = ratio

    return frame.assign(close_intra=close_intra, split_ratio=ratios)


merged = back_adjust_intraday_closes(merged)

In [None]:
# Check quality by looking at the number of detected splits/dividends.

In [None]:
# Check quality by looking at the difference between the intraday adjusted open and the EOD adjusted open.
# On every day where the split_ratio is not 1, this difference should be smaller than the split_ratio.
# In general, this difference should also be smaller than our threshold for splits (1%?).

In [None]:
# Once this has been checked and the consistency/continuity of the intraday data is assured, we could
# replace all the closes and opens from the intraday data set with those of the EOD data set.
# However, this leads to problems if we miscalculate a single split event.
# If we don't do this replacement, only one day is affected (the split day).
# If we do the replacement, multiple days will be affected by the error.