In [None]:
import pandas as pd 
import numpy as np
from os import listdir
from os.path import isfile, join
import plotly.express as px
import pytz
# US/Eastern time zone object; preprocess_iq_feed_prices localizes the intraday
# "time" column to it.
eastern = pytz.timezone('US/Eastern')

In [None]:
# Directory containing the raw IQFeed parquet files read by this notebook.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
raw_iq_feed_data_dir = "D:/IQFeedData"

# Input-output rows for the neural network training and validation.
train_output_dir = "D:/Data/NN_Training"

# Input rows for testing.
test_output_dir = "D:/Data/NN_Testing"

In [None]:
def filter_trading_hours(df, time_column):
    """Return the rows of ``df`` whose clock time lies in the 09:31-16:01 window.

    Both ends of the window are inclusive. Only the hour and minute of
    ``df[time_column]`` are inspected; the date, seconds and time zone play no
    role. Rows with a missing timestamp are dropped.

    Args:
        df: Frame to filter.
        time_column: Name of a datetime-like column of ``df``.

    Returns:
        A new frame with the matching rows and all columns of ``df``.
    """
    clock = df[time_column].dt
    minute_of_day = clock.hour * 60 + clock.minute

    window_start = 9 * 60 + 31  # 09:31
    window_end = 16 * 60 + 1    # 16:01

    in_window = (minute_of_day >= window_start) & (minute_of_day <= window_end)
    return df.loc[in_window, :]

In [None]:
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw IQFeed price frame without modifying the frame passed in.

    Intraday data (a ``time`` column is present):
        * timestamps are stripped of any time zone and re-localized to ``eastern``,
        * exact duplicate rows and rows containing missing values are dropped,
        * only bars accepted by ``filter_trading_hours`` are kept,
        * if several rows share a timestamp, the one with the largest volume wins,
        * the result is indexed by ``time`` in ascending order.

    Daily data (no ``time`` column):
        * rows containing missing values are dropped,
        * the ``date`` column is converted with ``pd.to_datetime``.

    Args:
        prices: Raw frame; intraday frames need ``time`` and ``volume`` columns,
            daily frames need a ``date`` column.

    Returns:
        A new, cleaned frame. ``prices`` itself is left untouched.

    Raises:
        AssertionError: If the intraday index is not unique after de-duplication.
    """
    if "time" not in prices.columns:
        # Daily data
        return prices.dropna().assign(date=lambda frame: pd.to_datetime(frame["date"]))

    # Intra-day data
    # Replace the column as a whole: writing through ``.loc[:, "time"]`` tries to
    # store the values in the existing array first, which is not a reliable way to
    # switch the column to a tz-aware dtype.
    localized_time = prices["time"].dt.tz_localize(None).dt.tz_localize(eastern)
    intraday = (
        prices
        .assign(time=localized_time)
        .drop_duplicates(keep="first")
        .dropna()
    )

    intraday = filter_trading_hours(df=intraday, time_column="time")

    # Deals with duplicate timestamps which occur when not all the digits of the
    # volume were entered correctly, but only the first 1-3. So keep the largest.
    intraday = (
        intraday
        .sort_values(["time", "volume"], ascending=[True, False])
        .drop_duplicates(subset=["time"], keep="first")
        .set_index("time")
        .sort_index(ascending=True)
    )
    assert intraday.index.is_unique
    return intraday

In [None]:
# Regular files located directly in the raw data directory (sub-directories are skipped).
onlyfiles = list(
    filter(
        lambda entry: isfile(join(raw_iq_feed_data_dir, entry)),
        listdir(raw_iq_feed_data_dir),
    )
)
# Text in front of the first underscore of each file name.
tickers = [file_name.partition("_")[0] for file_name in onlyfiles]
# Symbol processed by the remaining cells.
ticker = "GOOGL"

In [None]:
# Cleaned 1-minute bars of the selected ticker.
prices: pd.DataFrame = preprocess_iq_feed_prices(
    prices=pd.read_parquet(
        columns=["time", "close", "open", "volume"],
        path=f"{raw_iq_feed_data_dir}/{ticker}_1min.parquet",
    )
)

# Cleaned daily bars of the selected ticker.
prices_daily: pd.DataFrame = preprocess_iq_feed_prices(
    prices=pd.read_parquet(
        columns=["date", "close", "open", "volume"],
        path=f"{raw_iq_feed_data_dir}/daily/{ticker}_daily.parquet",
    )
)

In [None]:

# Per-day reference prices taken from the intraday bars, indexed by calendar date
# (tz-naive midnight timestamps) so they can be joined with the daily data, whose
# "date" column is datetime64 as well.
bar_hour = prices.index.hour
bar_minute = prices.index.minute

# filter_trading_hours keeps 09:31-16:01 only, so 09:31 is the first bar of a
# session; a 09:00 bar can never be present in `prices`.
# Boolean row selection combined with a column label requires `.loc`.
intra_opens = prices.loc[(bar_hour == 9) & (bar_minute == 31), "open"].copy()
intra_opens.index = intra_opens.index.tz_localize(None).normalize()
intra_opens.index.name = "date"

# NOTE(review): this used to read the "open" column of the 16:00 bar; the series is
# used as the intraday close, so the bar's "close" is taken instead - confirm that
# the 16:00 bar (and not 16:01) carries the official close in this feed.
intra_closes = prices.loc[(bar_hour == 16) & (bar_minute == 0), "close"].copy()
intra_closes.index = intra_closes.index.tz_localize(None).normalize()
intra_closes.index.name = "date"
intra_closes.name = "intra_close"

In [None]:
# One row per trading day that has an intraday open, an intraday close and an
# end-of-day record. Columns are named explicitly because the following cells read
# "close_intra" and "close_eod".
intra_daily = pd.concat(
    [intra_opens.rename("open_intra"), intra_closes.rename("close_intra")],
    axis=1,
    join="inner",
)
# Accept plain `date` objects as well as timestamps as day labels.
intra_daily.index = pd.to_datetime(intra_daily.index)
intra_daily.index.name = "date"

# Several columns are selected with a list; a bare tuple is treated as ONE column key.
# Assumes prices_daily["date"] holds tz-naive midnight timestamps - TODO confirm.
eod_daily = (
    prices_daily[["date", "close", "open"]]
    .rename(columns={"close": "close_eod", "open": "open_eod"})
    .set_index("date")
)

# Indexed by date so that sorting the index orders the rows in time.
merged = intra_daily.join(eod_daily, how="inner")

In [None]:
# Descending index order, plus a neutral split ratio of 1 on every row.
merged = merged.sort_index(ascending=False).assign(split_ratio=1)

In [None]:
# Detect dividend and split events (they are handled exactly the same way, dividend
# events are just smaller).
# To adjust, we go backwards in time; `merged` is expected to be sorted newest first.
# When an event has occurred, the close of the day before it is the first price that
# is influenced in the historically adjusted time series (the event is assumed to
# happen overnight). From that row on, every older intraday close is rescaled, so the
# next comparison only sees events that have not been accounted for yet.

# Keep the unadjusted closes so that re-running this cell gives the same result.
if "close_intra_raw" not in merged.columns:
    merged["close_intra_raw"] = merged["close_intra"]

eod_close = merged["close_eod"].to_numpy(dtype=float)
adjusted_close = merged["close_intra_raw"].to_numpy(dtype=float).copy()
event_ratio = np.ones(len(merged), dtype=float)

for pos in range(len(merged)):
    reference = adjusted_close[pos]
    # Skip rows that cannot yield a meaningful ratio (missing or zero prices).
    if not np.isfinite(reference) or reference == 0 or not np.isfinite(eod_close[pos]):
        continue
    split_ratio = eod_close[pos] / reference
    if np.abs(split_ratio - 1) >= 0.01:
        # Rescale this row and all older rows, and remember the event on this day.
        adjusted_close[pos:] *= split_ratio
        event_ratio[pos] = split_ratio

merged["close_intra"] = adjusted_close
merged["split_ratio"] = event_ratio

In [None]:
# Carry the per-day event ratios over to the intraday bars.
# The ratio of a day is attached to that day's last bar, because the day's close is
# the first price affected by an event that happens overnight afterwards.
bar_dates = prices.index.tz_localize(None).normalize()
is_last_bar_of_day = ~bar_dates.duplicated(keep="last")

# Day labels of `merged`: its "date" column if present, otherwise its index.
# Assumes these are tz-naive calendar dates - TODO confirm.
event_dates = merged["date"] if "date" in merged.columns else merged.index
daily_ratio = pd.Series(
    merged["split_ratio"].to_numpy(dtype=float),
    index=pd.DatetimeIndex(pd.to_datetime(event_dates)),
)

prices["split_ratio"] = 1.0
# Days without an entry in `merged` keep the neutral ratio 1.
prices.loc[is_last_bar_of_day, "split_ratio"] = (
    daily_ratio.reindex(bar_dates[is_last_bar_of_day]).fillna(1.0).to_numpy()
)

# Ratios compound multiplicatively: a bar is scaled by the product of the ratios of
# its own and all later bars (`prices` is sorted by time in ascending order).
prices["cum_split_ratio"] = prices["split_ratio"][::-1].cumprod()[::-1]

In [None]:
# NOTE: this cell rescales `prices` in place - run it exactly once after the
# cumulative ratios have been computed.

# split_ratio < 1 => price goes down
# Only the price columns that were actually loaded are adjusted. `mul(..., axis=0)`
# scales row by row; a plain `frame * series` would match the series index against
# the column labels instead.
adjusted_price_columns = [
    column for column in ("open", "high", "low", "close") if column in prices.columns
]
prices[adjusted_price_columns] = prices[adjusted_price_columns].mul(
    prices["cum_split_ratio"], axis=0
)

# split_ratio < 1 => volume goes up, since the price goes down and price * volume has
# to stay the same (volume denotes the number of shares traded)
prices["volume"] = prices["volume"] / prices["cum_split_ratio"]

In [None]:
# Plot the intraday closes against the end-of-day closes.
# Several columns are selected with a list; a bare tuple is treated as ONE column key.
px.line(merged[["close_intra", "close_eod"]])

In [None]:
# Check quality looking at the amount of splits/dividends
# Should be a maximum of 5 (one split event and 4 dividends)?

In [None]:
# Check quality looking at the difference between intra_day adj open and eod adj open.
# This difference should be smaller than the split_ratio for all days where the split_ratio is not 1
# This difference should also be smaller in general than our threshold for splits (1%?)

In [None]:
# Once this has been checked and the consistency/continuity of the intraday data is
# assured, we could replace all the closes and opens of the intraday data set with
# those of the EOD data set.
# However, this leads to problems if we miscalculate a single split event:
# without the replacement only one day is affected (the split day),
# with the replacement multiple days will be affected by the error.