## Clean and Adjust Intraday Data

In [None]:
%%capture
# Colab bootstrap cell; %%capture suppresses the output produced below.
# Mount Google Drive so the project files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Project root on the mounted Drive (same path the %cd below switches to).
# NOTE(review): `cwd` is not referenced again in the visible cells.
cwd="/content/drive/MyDrive/NewsTrading/trading_bot"
%cd /content/drive/MyDrive/NewsTrading/trading_bot
%pip install -r requirements_clean.txt

# Authenticate the current Colab user.
from google.colab import auth
auth.authenticate_user()

In [None]:
import pandas as pd
import numpy as np
import pytz
eastern = pytz.timezone('US/Eastern')
from IPython.display import clear_output
import logging
from src.config import config
from src.utils.prices import calc_backward_adjustment_factors, calc_adj_prices
from src.utils.tickers import get_tickers

In [None]:
def filter_trading_hours(df, time_column):
    """Keep only the rows whose timestamp lies between 09:31 and 16:01.

    Both bounds are inclusive and only the hour/minute components of
    ``df[time_column]`` are inspected. Rows with a missing timestamp are
    dropped because they cannot satisfy either bound.

    Args:
        df: Frame holding a datetime-like column.
        time_column: Name of that column.

    Returns:
        The subset of ``df`` (all columns) inside the time window.
    """
    stamps = df[time_column].dt
    # Collapse hour and minute into a single "minutes since midnight" value
    # so the window becomes one simple range check.
    minute_of_day = stamps.hour * 60 + stamps.minute
    session_start = 9 * 60 + 31
    session_end = 16 * 60 + 1
    in_session = (minute_of_day >= session_start) & (minute_of_day <= session_end)
    return df.loc[in_session, :]

In [None]:
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw price frame (intraday or daily) and return the result.

    A frame with a ``"time"`` column is treated as intraday data:

    * any existing timezone is dropped (wall-clock time is kept) and the
      timestamps are then localized with ``eastern``;
    * exact duplicate rows and rows containing NaN are removed;
    * only rows accepted by ``filter_trading_hours`` are kept;
    * when several rows share a timestamp, the one with the largest
      ``volume`` wins;
    * ``"time"`` becomes the sorted, unique index.

    Any other frame is treated as daily data: NaN rows are dropped and the
    ``"date"`` column is parsed with ``pd.to_datetime``.

    The frame passed in is left untouched; a new frame is returned.

    Args:
        prices: Raw prices with either a ``"time"`` column (plus ``"volume"``)
            or a ``"date"`` column.

    Returns:
        The cleaned frame.
    """
    if "time" not in prices.columns:
        # Daily data
        cleaned = prices.dropna()
        return cleaned.assign(date=pd.to_datetime(cleaned["date"]))

    # Intra-day data
    # Replace the column outright instead of writing through `.loc[:, "time"]`:
    # a `.loc` full-column write sets values in place on recent pandas and may
    # not switch the column to the tz-aware dtype.
    wall_clock = prices["time"].dt.tz_localize(None)
    prices = prices.assign(time=wall_clock.dt.tz_localize(eastern))
    prices = prices.drop_duplicates(keep="first").dropna()

    prices = filter_trading_hours(df=prices, time_column="time")

    # Deals with duplicate rows which occur when not all the digits for volume
    # are correctly entered, but only the first 1-3. So keep the largest.
    prices = prices.sort_values(["time", "volume"], ascending=[True, False])
    prices = prices.drop_duplicates(subset=["time"], keep="first")

    prices = prices.set_index("time").sort_index(ascending=True)
    assert prices.index.is_unique
    prices.index = prices.index.astype('datetime64[ns, US/Eastern]')
    return prices

In [None]:
# Tickers to clean; get_tickers is called with its defaults here.
tickers = get_tickers()

In [None]:
# Clean every ticker's raw one-minute bars, adjust them and write the result
# to the cleaned directory. Tickers without adjustment factors are skipped.
for i, ticker in enumerate(tickers):
    # Progress display: replace the previous line instead of scrolling.
    clear_output(wait=True)
    print(f"{i} - {ticker}", flush=True)

    path = f"{config.data.iqfeed.minute.raw}/{ticker}_1min.parquet"
    ohlcv_columns = ["time", "open", "high", "low", "close", "volume"]
    prices = preprocess_iq_feed_prices(
        pd.read_parquet(path=path, columns=ohlcv_columns)
    )

    # Adjust for Splits and Dividends
    bafs = calc_backward_adjustment_factors(ticker, return_dataframe=False)
    if bafs is not None:
        adj_prices = calc_adj_prices(prices, bafs)

        # Save adjusted files to disk
        adj_prices.to_parquet(
            path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet"
        )
    # else: no yfinance data for this stock, nothing is written

## Make Daily Time Series from Intra-Day

In [None]:
from concurrent.futures import ThreadPoolExecutor
import os

In [None]:
def downsample_intraday_prices_to_daily(df: pd.DataFrame):
    """Collapse one day's adjusted intraday bars into a single daily record.

    Args:
        df: Intraday bars of a single day, ordered by time, with a
            datetime-like index and the columns ``adj_open``, ``adj_high``,
            ``adj_low``, ``adj_close``, ``adj_volume`` and
            ``cum_split_ratio``.

    Returns:
        A ``pd.Series`` named after the calendar date of the first bar with
        the entries ``adj_open`` (first bar), ``adj_high`` (maximum),
        ``adj_low`` (minimum), ``adj_close`` (last bar), ``adj_volume``
        (mean over the bars) and ``cum_split_ratio`` (first bar).
    """
    daily_values = {
        "adj_open": df["adj_open"].iloc[0],
        "adj_high": df["adj_high"].max(),
        # Stored under its own key; previously the low overwrote "adj_high".
        "adj_low": df["adj_low"].min(),
        "adj_close": df["adj_close"].iloc[-1],
        # NOTE(review): this is the mean per-bar volume, not the day's total;
        # confirm that a mean (rather than a sum) is what is wanted.
        "adj_volume": df["adj_volume"].mean(),
        "cum_split_ratio": df["cum_split_ratio"].iloc[0],
    }
    # Only the first timestamp is needed, so avoid building a date per row.
    return pd.Series(daily_values, name=df.index[0].date())

In [None]:
# Tickers taken from the cleaned one-minute directory, i.e. the ones the
# daily downsampling below is run for.
tickers = get_tickers(directory=config.data.iqfeed.minute.cleaned)

In [None]:
def downsample_intraday_prices_to_daily_for_ticker_and_save(ticker: str):
    """Build the daily series for ``ticker`` from its cleaned minute bars.

    Reads ``{ticker}_1min.parquet`` from the cleaned minute directory,
    reduces every calendar day to one row with
    ``downsample_intraday_prices_to_daily`` and writes the result to
    ``{ticker}_daily.parquet`` in the cleaned daily directory. The output
    index is named ``"date"`` and holds the US/Eastern midnight of each day.

    If the per-day aggregation fails, the error is logged and printed and
    nothing is written for this ticker.

    Args:
        ticker: Symbol whose cleaned minute file should be downsampled.

    Returns:
        None.
    """
    prices = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")
    # First/last bar of each day are taken positionally, so order matters.
    assert prices.index.is_monotonic_increasing
    try:
        daily_prices = prices.groupby(prices.index.date).apply(downsample_intraday_prices_to_daily)
    except Exception as err:
        # Best effort: one broken ticker must not stop the whole batch.
        logging.info(f"{err=}")
        print(f"{err=}")
        return
    # The group keys are plain dates; convert the *daily* index (not the
    # minute index of `prices`) and assign the localized result back to the
    # index instead of rebinding `daily_prices` to an Index object.
    daily_prices.index = pd.to_datetime(daily_prices.index).tz_localize("US/Eastern")
    daily_prices.index.name = "date"
    daily_prices.to_parquet(path=f"{config.data.iqfeed.daily.cleaned}/{ticker}_daily.parquet")

In [None]:
# Downsample all tickers concurrently. Leave one core free, but never ask for
# fewer than one worker (os.cpu_count() may be 1 or None).
n_workers = max(1, (os.cpu_count() or 2) - 1)
# The context manager shuts the pool down once every ticker is processed.
with ThreadPoolExecutor(max_workers=n_workers) as pool_obj:
    ans = pool_obj.map(downsample_intraday_prices_to_daily_for_ticker_and_save, tickers)
    # Consuming the iterator waits for all tasks and re-raises any exception.
    result = list(ans)

In [None]:
# Sanity check: tickers that have a cleaned minute file but no daily file.
tickers = get_tickers(config.data.iqfeed.minute.cleaned)
daily_tickers = get_tickers(config.data.iqfeed.daily.cleaned)
set(tickers).difference(daily_tickers)