<a href="https://colab.research.google.com/github/adrianbeer/trading_bot/blob/main/src/preprocessing/iq_feed_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Mount Google Drive so the project checkout under MyDrive is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Project root on the mounted drive (same path the %cd below switches to).
cwd="/content/drive/MyDrive/NewsTrading/trading_bot"
# Work from the project root so relative paths (e.g. the YAML config and requirements file) resolve.
%cd /content/drive/MyDrive/NewsTrading/trading_bot
# Install the notebook's dependencies; %pip targets the running kernel's environment.
%pip install -r requirements_clean.txt

In [2]:
# Authenticate the Colab user with Google.
# NOTE(review): presumably required so the google.cloud storage client created in
# get_gcs_tickers can obtain credentials — confirm.
from google.colab import auth
auth.authenticate_user()

In [3]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
import plotly.express as px
import pytz
eastern = pytz.timezone('US/Eastern')
import yfinance
from IPython.display import clear_output

from dotmap import DotMap
import yaml

In [4]:
config_name = "src/config_gcs.yaml"
# Read the YAML inside a context manager so the file handle is closed as soon as
# parsing finishes instead of being left open for the lifetime of the kernel.
# `_dynamic=False` makes access to a missing key fail instead of silently creating it.
with open(config_name) as config_file:
    config = DotMap(yaml.safe_load(config_file), _dynamic=False)

In [5]:
def filter_trading_hours(df, time_column):
    """Keep only the rows whose timestamp falls between 09:31 and 16:01 (inclusive).

    Only the hour and minute of `df[time_column]` are inspected; the date and the
    seconds are ignored. Rows with a missing timestamp are dropped because they
    fail both comparisons.

    Args:
        df: DataFrame holding a datetime-like column.
        time_column: Name of the datetime column to filter on.

    Returns:
        The subset of `df` (original index preserved) inside the time window.
    """
    stamps = df[time_column].dt
    # Collapse hour/minute into "minutes since midnight" so the window is a
    # single range check.
    minute_of_day = stamps.hour * 60 + stamps.minute
    first_bar = 9 * 60 + 31
    last_bar = 16 * 60 + 1
    in_session = (minute_of_day >= first_bar) & (minute_of_day <= last_bar)
    return df.loc[in_session, :]

In [6]:
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw IQFeed price frame without modifying the caller's object.

    Intra-day frames (those with a "time" column) are re-stamped as US/Eastern
    wall-clock time, de-duplicated, stripped of rows containing missing values,
    restricted to trading hours and returned indexed by a unique, ascending
    "time" index. Daily frames (no "time" column) only get rows with missing
    values removed and their "date" column parsed to datetimes.

    Args:
        prices: Raw prices. Intra-day data needs "time" and "volume" columns;
            daily data needs a "date" column.

    Returns:
        A new, cleaned DataFrame; `prices` itself is left untouched.
    """
    if "time" in prices.columns:
        # Intra-day data
        # Drop whatever tz info is attached and reinterpret the wall-clock
        # time as US/Eastern. Done as one expression on a fresh column so the
        # result does not depend on in-place `.loc` dtype handling.
        eastern_time = prices["time"].dt.tz_localize(None).dt.tz_localize(eastern)
        prices = (
            prices
            .assign(time=eastern_time)
            .drop_duplicates(keep="first")
            .dropna()
        )

        prices = filter_trading_hours(df=prices, time_column="time")

        # Deals with duplicate rows which occur when not all the digits for volume are
        # correctly entered, but only the first 1-3. So keep the largest.
        prices = prices.sort_values(["time", "volume"], ascending=[True, False])
        prices = prices.drop_duplicates(subset=["time"], keep="first")

        prices = prices.set_index("time").sort_index(ascending=True)
        assert prices.index.is_unique
        prices.index = prices.index.astype('datetime64[ns, US/Eastern]')
    else:
        # Daily data
        prices = prices.dropna()
        prices = prices.assign(date=pd.to_datetime(prices["date"]))
    return prices

In [7]:
def calc_backward_adjustment_factors(ticker: str, dataframe: bool =False):
    """Calculates the backward adjustment factors based on data from yfinance.

    `cum_split_ratio` and `backward_adjustment_factor` are synonymous.
    IMPORTANT !!!!!!!!!!
    In yahoo finance the `Close` is adj. for splits and the `Adj. Close` for splits and dividends
    Dividends on Yahoo Finance are adjusted for splits!

    Args:
        ticker: Symbol to download from yfinance (full history, with actions).
        dataframe: If True return the whole working frame, otherwise only the
            `cum_split_ratio` column.

    Returns:
        None if yfinance recorded an error for `ticker`; otherwise the frame
        or the `cum_split_ratio` Series, indexed like the yfinance download.
    """
    df = yfinance.download(ticker, period="max",actions=True)
    if ticker in yfinance.shared._ERRORS:
        return None
    df.sort_index(ascending=True, inplace=True)
    # Start as float: the factors written below are fractional, and writing
    # floats into an integer column triggers dtype-coercion warnings.
    df["split_ratio"] = 1.0

    # Add splits
    # The split column is moved one row earlier, so the factor sits on the row
    # before the split row; the last row is filled with the neutral value 1.
    df["Stock Splits"] = df["Stock Splits"].shift(-1, fill_value=1)
    split_mask = df["Stock Splits"] > 0
    df.loc[split_mask, "split_ratio"] = 1 / df.loc[split_mask, "Stock Splits"]

    # Add dividends
    # NOTE(review): unlike splits, the dividend factor is NOT shifted one row
    # earlier, so it also applies to the dividend row itself — confirm intended.
    dividend_mask = (df["Dividends"] > 0)
    # The Close is split, but not dividend adjusted
    previous_close = df["Close"].shift(1)
    dividend_factor = 1 - df.loc[dividend_mask, "Dividends"].values / previous_close[dividend_mask].values
    # Compound with any split factor already on the same row instead of
    # overwriting it (identical to a plain assignment where the row holds 1).
    df.loc[dividend_mask, "split_ratio"] = df.loc[dividend_mask, "split_ratio"].values * dividend_factor

    # Reverse cumulative product: each row holds the product of its own factor
    # and all later ones.
    df["cum_split_ratio"] = df["split_ratio"][::-1].cumprod()[::-1]
    if dataframe:
        return df
    else:
        return df["cum_split_ratio"]

In [8]:
def get_gcs_tickers():
  """List ticker symbols from the blobs under the configured raw minute-data location.

  The configured path is split on "/": element 2 is used as the bucket name and
  the remaining elements as the blob prefix. A ticker is the part of each
  blob's file name before the first underscore; empty names are skipped.

  Returns:
    List of ticker strings, in the order the blobs are listed.
  """
  # Imported lazily so the notebook still loads where google-cloud-storage is absent.
  from google.cloud import storage
  client = storage.Client()
  uri_parts = config.data.iqfeed.minute.raw.split("/")
  bucket_name = uri_parts[2]
  blob_prefix = "/".join(uri_parts[3:]) + "/"
  bucket = client.get_bucket(bucket_name)
  tickers = []
  for blob in bucket.list_blobs(prefix=blob_prefix):
    symbol = blob.name.split("/")[-1].split("_")[0]
    if symbol != '':
      tickers.append(symbol)
  return tickers

def get_local_tickers():
  """List ticker symbols from the files in the configured local raw minute-data directory.

  Sub-directories are ignored. A ticker is the part of each file name before
  the first underscore.

  Returns:
    List of ticker strings, in directory-listing order.
  """
  raw_dir = config.data.iqfeed.minute.raw
  tickers = []
  for entry in listdir(raw_dir):
    if not isfile(join(raw_dir, entry)):
      continue
    tickers.append(entry.split("_")[0])
  return tickers

def get_tickers():
  """Return the available ticker symbols for the configured environment.

  Returns:
    The result of `get_gcs_tickers()` when `config.environment` is "colab",
    or of `get_local_tickers()` when it is "local".

  Raises:
    ValueError: If `config.environment` is neither "colab" nor "local"
      (previously this surfaced as an UnboundLocalError on the return line).
  """
  if config.environment == "colab":
    return get_gcs_tickers()
  if config.environment == "local":
    return get_local_tickers()
  raise ValueError(
      f"Unsupported config.environment {config.environment!r}; expected 'colab' or 'local'"
  )

In [9]:
def calc_adj_prices(prices, bafs):
    """Apply backward adjustment factors to minute prices.

    Each price row is matched to the first factor whose timestamp (factor date
    at 16:01 US/Eastern) is at or after the row's time. Prices are multiplied
    by that factor and volume is divided by it. Rows later than the last
    factor timestamp find no match and therefore end up as NaN.

    Args:
        prices: DataFrame sorted by a tz-aware US/Eastern datetime index, with
            "open", "high", "low", "close" and "volume" columns.
        bafs: Series named "cum_split_ratio" with a tz-naive, sorted datetime
            index named "Date". It is not modified.

    Returns:
        DataFrame indexed like `prices` with columns "adj_open", "adj_high",
        "adj_low", "adj_close", "adj_volume" and "cum_split_ratio".
    """
    price_cols = ["open", "high", "low", "close"]

    # danger: `bafs` is passed by reference, not by value. Work on a copy so
    # the caller's index is not shifted/localised behind its back (which would
    # also make a second call with the same object fail).
    bafs = bafs.copy()
    bafs.index = bafs.index.tz_localize("US/Eastern") + pd.DateOffset(hours=16, minutes=1)

    prices_adj = pd.merge_asof(prices, bafs, left_index=True, right_on="Date", direction="forward")
    # Plain column assignment replaces the columns outright, so an integer
    # volume column can take the fractional results without dtype coercion.
    prices_adj[price_cols] = prices_adj[price_cols].mul(prices_adj["cum_split_ratio"], axis="index")
    prices_adj["volume"] = prices_adj["volume"].div(prices_adj["cum_split_ratio"])
    prices_adj = prices_adj.drop(columns="Date")
    prices_adj = prices_adj.rename(
        columns={col: f"adj_{col}" for col in price_cols + ["volume"]}
    )
    return prices_adj

In [10]:
# Collect the ticker symbols that have raw minute files in the configured storage location.
tickers = get_tickers()

In [None]:
# Clean and adjust the raw minute file of every ticker, one file per ticker.
for i, ticker in enumerate(tickers):
  clear_output(wait=True)
  print(f"{i} - {ticker}", flush=True)

  # Fetch the split/dividend factors first: without them the ticker is skipped,
  # so there is no point in loading and cleaning its minute file beforehand.
  bafs = calc_backward_adjustment_factors(ticker, dataframe=False)
  if bafs is None:
    # No yfinance data for this stock
    continue

  path = f"{config.data.iqfeed.minute.raw}/{ticker}_1min.parquet"
  prices = pd.read_parquet(path=path,
                           columns=["time", "open", "high", "low", "close", "volume"])
  prices = preprocess_iq_feed_prices(prices)

  # Adjust for Splits and Dividends
  adj_prices = calc_adj_prices(prices, bafs)

  # Save adjusted files to disk
  adj_prices.to_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")

15 - ABEV
