## Clean and Adjust Intraday Data

In [1]:
# NOTE(review): absolute, machine-specific path — presumably the project root, so that
# the `src.*` imports below resolve. Adjust this when running on another machine.
%cd /gxfs_work/cau/sunms534/trading_bot/

/gxfs_work/cau/sunms534/trading_bot


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import pandas as pd
import pytz
eastern = pytz.timezone('US/Eastern')
from IPython.display import clear_output
import logging
from src.config import config
from src.utils.prices import calc_backward_adjustment_factors, calc_adj_prices
from src.utils.tickers import get_tickers
from tqdm import tqdm

In [None]:
def filter_trading_hours(df, time_column):
    """Keep only the rows whose timestamp lies between 09:31 and 16:01 (inclusive).

    Only the hour and minute of ``df[time_column]`` are inspected; seconds are
    ignored. Rows with a missing timestamp (NaT) are dropped because every
    comparison against them is False.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding a datetime-like column.
    time_column : str
        Name of that datetime-like column.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` inside the window, all columns, original order.
    """
    stamps = df[time_column].dt
    # Express the wall-clock time as minutes since midnight so that each
    # bound becomes a single comparison.
    minute_of_day = stamps.hour * 60 + stamps.minute
    session_open = 9 * 60 + 31    # 09:31
    session_close = 16 * 60 + 1   # 16:01
    in_session = (minute_of_day >= session_open) & (minute_of_day <= session_close)
    return df.loc[in_session, :]

In [None]:
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw IQFeed price frame without modifying the caller's frame.

    Intra-day frames (those with a ``"time"`` column) are:

    1. stripped of any existing timezone (the wall-clock time is kept),
    2. de-duplicated on full rows and cleared of rows containing NaN,
    3. restricted to trading hours via ``filter_trading_hours``,
    4. re-labelled as US/Eastern,
    5. de-duplicated per timestamp, keeping the row with the largest volume,
    6. indexed by ``"time"`` in ascending order.

    Daily frames (no ``"time"`` column) only get NaN rows dropped and their
    ``"date"`` column parsed with ``pd.to_datetime``.

    Parameters
    ----------
    prices : pd.DataFrame
        Raw prices. Intra-day frames need ``"time"`` (datetime-like) and
        ``"volume"`` columns; daily frames need a ``"date"`` column.

    Returns
    -------
    pd.DataFrame
        A new, cleaned frame. Intra-day results have a unique, sorted
        ``datetime64[ns, US/Eastern]`` index named ``"time"``.

    Raises
    ------
    AssertionError
        If the intra-day index is not unique after de-duplication.
    """
    if "time" not in prices.columns:
        # Daily data
        daily = prices.dropna()
        return daily.assign(date=pd.to_datetime(daily["date"]))

    # Intra-day data. Work on a copy so the caller's frame is left untouched.
    intraday = prices.copy()

    # Plain column assignment replaces the column, so the dtype may change
    # (tz-aware -> tz-naive). `.loc[:, "time"] = ...` tries to write in place
    # and is deprecated for incompatible dtypes in recent pandas versions.
    intraday["time"] = intraday["time"].dt.tz_localize(None)
    intraday = intraday.drop_duplicates(keep="first").dropna()

    # Filter on the naive wall-clock time *before* localizing: hour and minute
    # are the same either way, but overnight stamps that are ambiguous or
    # nonexistent around DST changes would make tz_localize raise even though
    # they are discarded by the trading-hours filter anyway.
    intraday = filter_trading_hours(df=intraday, time_column="time")
    intraday = intraday.assign(time=intraday["time"].dt.tz_localize(eastern))

    # Deals with duplicate rows which occur when not all the digits for volume
    # are correctly entered, but only the first 1-3. So keep the largest.
    intraday = intraday.sort_values(["time", "volume"], ascending=[True, False])
    intraday = intraday.drop_duplicates(subset=["time"], keep="first")

    intraday = intraday.set_index("time").sort_index(ascending=True)
    assert intraday.index.is_unique
    intraday.index = intraday.index.astype('datetime64[ns, US/Eastern]')
    return intraday

In [None]:
# Ticker symbols to process in the cleaning loop.
# NOTE(review): get_tickers() is called with its defaults here — presumably it lists
# the raw minute-data directory; confirm against src.utils.tickers.
tickers = get_tickers()

In [None]:
# For every ticker: load the raw 1-minute bars, clean them, adjust the prices with the
# backward adjustment factors and write the result to the cleaned minute directory.
minute_columns = ["time", "open", "high", "low", "close", "volume"]
for ticker in tqdm(tickers):
    raw_path = f"{config.data.iqfeed.minute.raw}/{ticker}_1min.parquet"
    prices: pd.DataFrame = preprocess_iq_feed_prices(
        pd.read_parquet(path=raw_path, columns=minute_columns)
    )

    # Adjust for splits and dividends. A missing factor set means there is
    # no yfinance data for this stock, in which case nothing is written.
    bafs = calc_backward_adjustment_factors(ticker, return_dataframe=False)
    if bafs is not None:
        adj_prices = calc_adj_prices(prices, bafs)

        # Save adjusted files to disk
        adj_prices.to_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")

## Make Daily Time Series from Intra-Day

In [None]:
from concurrent.futures import ThreadPoolExecutor
import os

In [None]:
def downsample_intraday_prices_to_daily(df: pd.DataFrame):
    """Collapse one trading day of adjusted intra-day bars into a single daily bar.

    Parameters
    ----------
    df : pd.DataFrame
        Bars of a single day in chronological order, indexed by a
        ``DatetimeIndex``, with columns ``adj_open``, ``adj_high``,
        ``adj_low``, ``adj_close``, ``adj_volume`` and ``cum_split_ratio``.

    Returns
    -------
    pd.Series
        Daily values keyed by ``adj_open`` (first bar), ``adj_high`` (maximum),
        ``adj_low`` (minimum), ``adj_close`` (last bar), ``adj_volume``
        (mean over the bars) and ``cum_split_ratio`` (first bar). The series
        name is the calendar date of the first bar.
    """
    daily_values = {
        "adj_open": df["adj_open"].iloc[0],
        "adj_high": df["adj_high"].max(),
        # Previously the daily low was written under the "adj_high" key, which
        # overwrote the high and left the daily bar without an "adj_low".
        "adj_low": df["adj_low"].min(),
        "adj_close": df["adj_close"].iloc[-1],
        # NOTE(review): this is the mean volume per bar, not the day's total
        # volume — confirm that the average is what downstream code expects.
        "adj_volume": df["adj_volume"].mean(),
        "cum_split_ratio": df["cum_split_ratio"].iloc[0],
    }
    return pd.Series(daily_values, name=df.index.date[0])

In [None]:
# Rebind the notebook-level `tickers` using the cleaned minute-data directory as the
# source, so the daily downsampling works from the cleaned files.
tickers = get_tickers(directory=config.data.iqfeed.minute.cleaned)

In [None]:
def downsample_intraday_prices_to_daily_for_ticker_and_save(ticker: str):
    """Build the daily price series of one ticker from its cleaned minute bars and save it.

    Reads ``{ticker}_1min.parquet`` from the cleaned minute directory, collapses
    every calendar day into one row with ``downsample_intraday_prices_to_daily``
    and writes ``{ticker}_daily.parquet`` to the cleaned daily directory. The
    saved index is named ``"date"`` and holds the day's midnight, labelled as
    US/Eastern.

    This is deliberately best-effort: if the per-day aggregation raises, the
    error is logged and printed and no file is written for the ticker.

    Parameters
    ----------
    ticker : str
        Ticker symbol used to build the input and output file names.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the minute index is not sorted in ascending order.
    """
    prices = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")
    assert prices.index.is_monotonic_increasing
    try:
        daily_prices = prices.groupby(prices.index.date).apply(downsample_intraday_prices_to_daily)
    except Exception as err:
        logging.info(f"{err=}")
        print(f"{err=}")
        return

    # The group keys are plain `datetime.date` objects. Convert the *daily*
    # index (not the minute-level `prices.index`, whose length does not match)
    # and assign the localized result back to the index, so `daily_prices`
    # remains a DataFrame.
    daily_prices.index = pd.to_datetime(daily_prices.index).tz_localize("US/Eastern")
    daily_prices.index.name = "date"
    daily_prices.to_parquet(path=f"{config.data.iqfeed.daily.cleaned}/{ticker}_daily.parquet")

In [None]:
# Downsample all tickers concurrently. Leave one core free, but never request fewer
# than one worker: os.cpu_count() may return None, and `cpu_count() - 1` is 0 on a
# single-core machine, which ThreadPoolExecutor rejects.
n_workers = max(1, (os.cpu_count() or 2) - 1)

# The context manager shuts the pool down once all tasks have finished.
with ThreadPoolExecutor(max_workers=n_workers) as pool_obj:
    ans = pool_obj.map(downsample_intraday_prices_to_daily_for_ticker_and_save, tickers)
    # Consuming the iterator waits for every task and re-raises worker exceptions.
    result = list(ans)

In [None]:
# Sanity check: which tickers have cleaned minute data but no daily file?
# An empty set means every ticker was downsampled and saved.
tickers = get_tickers(config.data.iqfeed.minute.cleaned)
daily_tickers = get_tickers(config.data.iqfeed.daily.cleaned)
set(tickers).difference(daily_tickers)

## Clean Daily Time Series (Remove Large Gaps, etc.)

In [31]:
import numpy as np

In [19]:
# Rebind `tickers` using the cleaned daily directory as the source for the gap analysis.
tickers = get_tickers(directory=config.data.iqfeed.daily.cleaned)

In [35]:
# Load a single ticker (AAPJ) as a worked example for the gap detection.
prices = pd.read_parquet(path=f"{config.data.iqfeed.daily.cleaned}/AAPJ_daily.parquet")

In [36]:
# Time elapsed between each observation and the previous one (the first entry has no
# predecessor, so it is NaT and compares False below).
timedeltas = prices.index.diff()
# True for every observation that follows a gap of at least 30 days.
mask = timedeltas >= pd.Timedelta("30 days")

In [37]:
# The running count of gaps acts as a segment id; bincount then gives the number of
# observations in each gap-delimited segment.
np.bincount(mask.cumsum())

array([  1,   5,  35,   5,  14,   5,   8, 226,   2,   1,  13,  11,   1,
         2,  10, 785])

In [38]:
# Segment id per row: increases by one at every observation that follows a >= 30 day gap.
groupers = mask.cumsum()

In [49]:
def f(x):
    """Return a copy of group ``x`` with a ``"sum"`` column holding the group's row count.

    The input group is not modified: mutating the object that ``groupby.apply``
    passes in is not supported by pandas, so a new frame is returned instead.

    NOTE(review): despite its name, ``"sum"`` is the number of rows in the group.
    An earlier, disabled experiment dropped single-row groups here by returning
    an empty slice; it is not applied.
    """
    return x.assign(sum=x.shape[0])
# Apply `f` to every gap-delimited segment; group_keys=False keeps the original date
# index instead of prepending the segment id.
prices.groupby(groupers, group_keys=False).apply(f)

Unnamed: 0_level_0,adj_open,adj_high,adj_close,adj_volume,cum_split_ratio,std_252,dollar_volume,r_intra_(t-1),unadj_open,r,cond_vola,sum
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-04-05,0.0003,0.0003,0.00030,4.750000e+02,1.0,,0.142500,,0.0003,-0.700000,1.365866,1
2010-06-24,0.0010,0.0010,0.00100,8.000000e+02,1.0,,0.800000,0.000,0.0010,4.000000,1.511460,5
2010-07-12,0.0002,0.0002,0.00020,5.000000e+02,1.0,,0.100000,0.000,0.0002,0.000000,1.579210,5
2010-07-26,0.0002,0.0002,0.00020,5.000000e+04,1.0,,10.000000,0.000,0.0002,0.000000,1.612007,5
2010-07-27,0.0002,0.0002,0.00020,5.000000e+03,1.0,,1.000000,0.000,0.0002,0.000000,1.628153,5
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-11,0.0005,0.0003,0.00040,2.735571e+06,1.0,2.754038,1231.007143,0.000,0.0005,0.290323,1.644129,785
2023-12-12,0.0004,0.0003,0.00031,3.333333e+06,1.0,2.763128,1183.333333,-0.200,0.0004,-0.225000,1.644129,785
2023-12-13,0.0004,0.0003,0.00040,6.486867e+06,1.0,2.771334,2594.746971,-0.225,0.0004,0.333333,1.644129,785
2023-12-14,0.0005,0.0003,0.00030,2.154375e+06,1.0,2.776146,861.750000,0.000,0.0005,-0.333333,1.644129,785


In [23]:
# For every ticker, report the first observation after its most recent gap of at
# least 30 days. Tickers without such a gap print nothing.
for ticker in tickers:
    prices = pd.read_parquet(path=f"{config.data.iqfeed.daily.cleaned}/{ticker}_daily.parquet")
    timedeltas = prices.index.diff()
    mask = timedeltas >= pd.Timedelta("30 days")
    if mask.any():
        # `mask` marks the observation *following* each gap, so the last True
        # entry is where the most recent gap-free stretch begins.
        last_gap_index = prices.index[mask][-1]
        print(f"{ticker=}, {last_gap_index=}")
        # TODO: truncate the series to the data after the last gap, e.g.
        #       prices = prices.loc[last_gap_index:, ]

ticker='AABB', last_gap_index=Timestamp('2018-03-12 00:00:00')
ticker='AACAY', last_gap_index=Timestamp('2011-01-24 00:00:00')
ticker='AACIU', last_gap_index=Timestamp('2023-10-30 00:00:00')
ticker='AAGC', last_gap_index=Timestamp('2011-05-18 00:00:00')
ticker='AAMAF', last_gap_index=Timestamp('2023-10-30 00:00:00')
ticker='AAMC', last_gap_index=Timestamp('2022-03-21 00:00:00')
ticker='AAPJ', last_gap_index=Timestamp('2017-08-22 00:00:00')
ticker='AAPT', last_gap_index=Timestamp('2010-02-22 00:00:00')
ticker='AATV', last_gap_index=Timestamp('2023-08-03 00:00:00')
ticker='ABCE', last_gap_index=Timestamp('2023-12-11 00:00:00')
ticker='ABMT', last_gap_index=Timestamp('2023-03-27 00:00:00')
ticker='ABQQ', last_gap_index=Timestamp('2019-12-18 00:00:00')
ticker='ABSSF', last_gap_index=Timestamp('2017-01-09 00:00:00')
ticker='ABTI', last_gap_index=Timestamp('2020-07-30 00:00:00')
ticker='ABTO', last_gap_index=Timestamp('2023-05-22 00:00:00')
ticker='ABVC', last_gap_index=Timestamp('2019-05-20

KeyboardInterrupt: 