In [1]:
import pandas as pd 
import numpy as np
from os import listdir
from os.path import isfile, join
import plotly.express as px
import pytz
eastern = pytz.timezone('US/Eastern')
import yfinance 

from dotmap import DotMap
import yaml
# Load the project configuration. The context manager closes the file handle
# as soon as the YAML document has been parsed instead of leaving it open.
with open("src/config.yaml") as config_file:
    # `_dynamic=False` is forwarded to DotMap unchanged.
    config = DotMap(yaml.safe_load(config_file), _dynamic=False)

In [2]:
def filter_trading_hours(df, time_column):
    """Keep only the rows stamped between 09:31 and 16:01, both inclusive.

    The decision is based solely on the hour and minute exposed by the
    ``.dt`` accessor of ``df[time_column]``; the calendar date is ignored.

    Args:
        df: Frame holding a datetime-like column.
        time_column: Name of that column.

    Returns:
        The rows of ``df`` (all columns) whose time of day is in the window.
    """
    stamps = df[time_column].dt
    # Collapse hour/minute into a single "minutes since midnight" number so
    # each boundary becomes one comparison.
    minute_of_day = stamps.hour * 60 + stamps.minute
    window_start = 9 * 60 + 31   # 09:31
    window_end = 16 * 60 + 1     # 16:01
    inside_window = (minute_of_day >= window_start) & (minute_of_day <= window_end)
    return df.loc[inside_window, :]

In [461]:
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw IQFeed price frame without modifying the caller's object.

    Two layouts are recognised by the presence of a ``time`` column:

    * Intra-day data (``time`` present): timestamps are re-labelled as
      US/Eastern wall-clock time, exact duplicate rows and rows containing
      missing values are dropped, rows outside the window accepted by
      ``filter_trading_hours`` are removed, and for timestamps that still
      occur more than once only the row with the largest ``volume`` is kept.
      The result is indexed by ``time`` in ascending order.
    * Daily data (no ``time`` column): rows containing missing values are
      dropped and ``date`` is parsed with ``pd.to_datetime``.

    Args:
        prices: Raw frame as read from disk. It is left untouched; every step
            below produces a new frame.

    Returns:
        The cleaned frame.

    Raises:
        AssertionError: If the intra-day index is not unique after
            de-duplication.
    """
    if "time" not in prices.columns:
        # Daily data
        return prices.dropna().assign(
            date=lambda frame: pd.to_datetime(frame["date"])
        )

    # Intra-day data
    # Drop whatever timezone is attached and re-attach US/Eastern, keeping the
    # wall-clock values. The column is replaced through `assign` rather than
    # written in place with `.loc[:, "time"] = ...`, because the in-place form
    # alters the caller's frame and changes the column dtype while doing so.
    wall_clock = prices["time"].dt.tz_localize(None)
    cleaned = prices.assign(time=wall_clock.dt.tz_localize(eastern))
    cleaned = cleaned.drop_duplicates(keep="first").dropna()

    cleaned = filter_trading_hours(df=cleaned, time_column="time")

    # Deals with duplicate rows which occur when not all the digits for volume
    # are correctly entered, but only the first 1-3. So keep the largest.
    cleaned = cleaned.sort_values(["time", "volume"], ascending=[True, False])
    cleaned = cleaned.drop_duplicates(subset=["time"], keep="first")

    cleaned = cleaned.set_index("time").sort_index(ascending=True)
    assert cleaned.index.is_unique
    cleaned.index = cleaned.index.astype('datetime64[ns, US/Eastern]')
    return cleaned

In [619]:
# Plain files found directly inside the raw minute-data directory.
onlyfiles = [
    entry
    for entry in listdir(config.data.iqfeed.minute.raw)
    if isfile(join(config.data.iqfeed.minute.raw, entry))
]
# The ticker symbol is the part of each file name before the first underscore.
tickers = [file_name.partition("_")[0] for file_name in onlyfiles]
# Symbol used by the cells below.
ticker = "IBM"

In [646]:
# Minute bars for `ticker`, cleaned by `preprocess_iq_feed_prices`.
prices: pd.DataFrame = preprocess_iq_feed_prices(
    pd.read_parquet(
        path=f"{config.data.iqfeed.minute.raw}/{ticker}_1min.parquet",
        columns=["time", "open", "high", "low", "close", "volume"],
    )
)
# Daily bars for the same ticker, read from the `daily` sub-directory.
prices_daily: pd.DataFrame = preprocess_iq_feed_prices(
    pd.read_parquet(
        path=f"{config.data.iqfeed.minute.raw}/daily/{ticker}_daily.parquet",
        columns=["date", "close", "open", "volume"],
    )
)

# Adjust for Splits and Dividends

In [666]:
def calc_backward_adjustment_factors(ticker, period="14y"):
    """Calculates the backward adjustment factors based on data from yfinance.

    `cum_split_ratio` and `backward_adjustment_factor` are synonymous.

    IMPORTANT: on Yahoo Finance the `Close` is adjusted for splits and the
    `Adj Close` for splits and dividends. Dividends on Yahoo Finance are
    adjusted for splits as well, so `Dividends / Close` compares like with
    like.

    A corporate action with ex-date D only changes prices quoted *before* D.
    The factor of every action is therefore stored on the row preceding its
    ex-date, for splits and dividends alike:

    * split with ratio r on D        -> factor ``1 / r`` on the row before D
    * dividend d with ex-date D      -> factor ``1 - d / Close`` on the row
      before D, using that row's close (the last close before the ex-date)

    When both fall on the same row the two factors are multiplied. The
    returned value for a date is the product of all per-row factors from that
    date to the end of the history, so the most recent row always carries 1.

    Args:
        ticker: Symbol passed to ``yfinance.download``.
        period: History length passed to ``yfinance.download``.

    Returns:
        pd.Series named ``cum_split_ratio``, indexed like the downloaded frame
        in ascending date order.
    """
    df = yfinance.download(ticker, period=period, actions=True)
    if isinstance(df.columns, pd.MultiIndex):
        # NOTE(review): newer yfinance releases appear to return
        # (field, ticker) column pairs even for a single symbol; keep only
        # the field level so the lookups below work either way.
        df.columns = df.columns.get_level_values(0)
    df = df.sort_index(ascending=True)

    # Splits: move the ratio to the row before the split date. Rows without a
    # split hold 0 in the download and are mapped to a neutral factor of 1.
    next_split = df["Stock Splits"].shift(-1, fill_value=0)
    split_factor = 1.0 / next_split.where(next_split > 0, 1.0)

    # Dividends: same placement as splits. The Close is split, but not
    # dividend adjusted, so the payout is related to the last close before
    # the ex-date.
    next_dividend = df["Dividends"].shift(-1, fill_value=0)
    dividend_factor = (1.0 - next_dividend / df["Close"]).where(next_dividend > 0, 1.0)

    per_row_factor = split_factor * dividend_factor

    # Product of the per-row factors from each date to the end of the history.
    cumulative = per_row_factor.iloc[::-1].cumprod().iloc[::-1]
    return cumulative.rename("cum_split_ratio")

In [667]:
# Backward adjustment factors (`cum_split_ratio`) for the selected ticker,
# computed from a yfinance download.
bafs = calc_backward_adjustment_factors(ticker)

[*********************100%***********************]  1 of 1 completed


In [668]:
def calc_adj_prices(prices, bafs):
    """Attach backward adjustment factors to intraday bars and adjust them.

    Each date in ``bafs`` is stamped as 16:01 US/Eastern of that day. Every
    bar in ``prices`` is then matched with the first factor whose stamp is at
    or after the bar's timestamp (``merge_asof`` with ``direction="forward"``).
    Bars later than the last stamp find no factor and end up with NaN in the
    adjusted columns.

    Neither argument is modified, so the function can be called repeatedly
    with the same inputs.

    Args:
        prices: Frame with ``open``, ``high``, ``low``, ``close`` and
            ``volume`` columns and a sorted US/Eastern DatetimeIndex.
        bafs: Series of cumulative adjustment factors on a sorted, tz-naive
            daily DatetimeIndex.

    Returns:
        A new frame with the columns of ``prices`` plus ``Date`` (the matched
        factor stamp), ``cum_split_ratio``, ``adj_open``, ``adj_high``,
        ``adj_low`` and ``adj_close``. ``volume`` is replaced by the original
        volume divided by the factor.
    """
    # Build the lookup table as a fresh frame; assigning to `bafs.index` would
    # change the caller's series and make a second call fail on the then
    # already tz-aware index.
    factor_stamps = bafs.index.tz_localize("US/Eastern") + pd.DateOffset(hours=16, minutes=1)
    factors = pd.DataFrame(
        {"Date": factor_stamps, "cum_split_ratio": bafs.to_numpy()}
    )

    prices_adj = pd.merge_asof(
        prices, factors, left_index=True, right_on="Date", direction="forward"
    )

    # Scale column by column. `DataFrame * Series` would align the series
    # index against the frame's *columns* instead of scaling each row.
    factor = prices_adj["cum_split_ratio"]
    for column in ["open", "high", "low", "close"]:
        prices_adj[f"adj_{column}"] = prices_adj[column] * factor
    prices_adj["volume"] = prices_adj["volume"] / factor
    return prices_adj

In [None]:
# Show the adjusted minute bars. The returned frame is only displayed here;
# it is not bound to a name.
calc_adj_prices(prices, bafs)

In [None]:
# Display the preprocessed minute bars held in `prices`.
prices

# Save adjusted files to disk