In [None]:

import pandas as pd 
import numpy as np
from tqdm import tqdm
import dask.dataframe as dd
import pytz
eastern = pytz.timezone('US/Eastern')

from src.config import config
from src.preprocessing.data_merger_util import (get_appropriate_closing_time,
                                                get_appropriate_entry_time, 
                                                get_primary_ticker)

[Python numpy: cannot convert datetime64[ns] to datetime64[D] (to use with Numba)](https://stackoverflow.com/a/76139900/9079015)

## Import and Preprocess News 

In [None]:
# Read only the three columns used below (timestamp, ticker, article body)
# from the cleaned Benzinga news dataset, via dask.
news = dd.read_parquet(path=config.data.benzinga.cleaned, columns=["time", "stocks", "parsed_body"])

In [None]:
# Materialize the dask frame in memory.
# NOTE: `news` is rebound to the computed result, so this cell cannot be re-run on
# its own — re-run the read cell above first.
news = news.compute()

In [None]:
# Express the publication timestamp in US/Eastern with an explicit nanosecond dtype.
news_time_eastern = news["time"].dt.tz_convert(eastern).astype('datetime64[ns, US/Eastern]')

# TODO: This can be improved: if the news arrives very close to the end of a minute
# (e.g. at :55), skip the next candle (T+1) and use the one after it (T+2).
# Author's note: news time is accurate but candles are right-labeled, hence one minute
# is added (presumably inside `get_appropriate_entry_time` — confirm).
# NOTE(review): the previous comment here said a cast was "necessary to get `us` units",
# but no such cast exists in this cell — confirm whether one is still required.
news = (
    news
    .assign(time=news_time_eastern)
    .rename(columns={"time": "news_time"})
    .assign(
        # Estimated time at which a position would be entered for each news item.
        est_entry_time=news_time_eastern.map(get_appropriate_entry_time),
        # Estimated time at which that position would be closed.
        est_exit_time=news_time_eastern.map(get_appropriate_closing_time),
    )
)

## Consolidate Tickers

In [None]:
# Mapping table used to consolidate tickers; the cells below read its `stocks` and
# `is_primary_ticker` columns.
# NOTE(review): this path is relative to the working directory, unlike the other
# inputs, which are resolved through `config`.
ticker_mapper_consolidated = pd.read_parquet("data_shared/ticker_name_mapper_consolidated.parquet")

In [None]:
# Replace every raw ticker by its consolidated ticker, i.e. the ticker of the price
# time series that is used to construct input-output pairs.
primary_tickers = news["stocks"].map(
    lambda raw_ticker: get_primary_ticker(raw_ticker, mapper=ticker_mapper_consolidated)
)
# Tickers that do not exist are expected to come back as NaN. Note that `dropna()`
# removes every row with a NaN in *any* column, not only in `stocks`.
news = news.assign(stocks=primary_tickers).dropna()

In [None]:
# Per-column memory footprint of the in-memory news frame (displayed as cell output).
news.memory_usage()

## Merge News with Price Time Series

In [None]:
# TODO: Also merge with non-adjusted prices. We don't trade penny stocks:
# if the price is below 1 when the news comes out, we don't trade.

In [None]:
# Minute bars of SPY, used as the market benchmark in the per-ticker merge.
spy: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/SPY_1min.parquet")

# Remove the literal "adj_" prefix. `str.strip("adj_")` is NOT a prefix removal: it strips
# any of the characters a/d/j/_ from BOTH ends of the name (e.g. "dividend" -> "ividen").
spy_columns = [col[len("adj_"):] if col.startswith("adj_") else col for col in spy.columns]

# Namespace every SPY column so it cannot collide with the per-ticker price columns.
# NOTE(review): the merge later joins on "time", which therefore has to be the index
# (it would otherwise be renamed to "SPY_time" here) — confirm.
spy.columns = [f"SPY_{col}" for col in spy_columns]

In [None]:
# Tickers flagged as primary in the mapper; the merge loop iterates over these.
# NOTE(review): despite the name, uniqueness is not enforced here — confirm the mapper
# holds each primary ticker only once.
is_primary = ticker_mapper_consolidated["is_primary_ticker"]
unique_tickers = ticker_mapper_consolidated.loc[is_primary, "stocks"].values

In [None]:
# For every primary ticker: attach entry/exit candles of the stock and of SPY to each
# news item, compute stock and SPY returns, and write the results back onto `news`.

# Maximum tolerated gap between an estimated entry/exit time and the candle actually matched.
max_gap = pd.Timedelta(hours=1)

for ticker in tqdm(unique_tickers):
    # `reset_index()` keeps the original row label in an "index" column so the results
    # can be aligned back onto `news` at the end of the iteration.
    ticker_news = news.loc[news.stocks == ticker, :].reset_index()

    prices: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")
    # Remove the literal "adj_" prefix. `str.strip("adj_")` is NOT a prefix removal: it strips
    # any of the characters a/d/j/_ from BOTH ends of the name (e.g. "dividend" -> "ividen").
    prices.columns = [col[len("adj_"):] if col.startswith("adj_") else col for col in prices.columns]
    prices = prices.reset_index().sort_values("time")

    # We generally need `merge_asof` here instead of a plain `merge`, because sometimes
    # no auction occurred or was recorded at 16:00, or things of this sort.

    # Entry: first candle at or after the estimated entry time (left key must be sorted).
    ticker_news = ticker_news.sort_values("est_entry_time")
    merged = pd.merge_asof(
        ticker_news,
        prices.rename(columns={"time": "entry_time"}),
        left_on="est_entry_time",
        right_on="entry_time",
        direction="forward",
    )

    # Exit: last candle at or before the estimated exit time. Price columns shared with
    # the entry merge receive the "_entry" / "_exit" suffixes.
    merged = pd.merge_asof(
        merged.sort_values("est_exit_time"),
        prices.rename(columns={"time": "exit_time"}),
        left_on="est_exit_time",
        right_on="exit_time",
        suffixes=("_entry", "_exit"),
        direction="backward",
    )
    # We use the O part of the OHLC for intraday candles here for convenience as well.
    merged["r"] = merged["open_exit"] / merged["open_entry"] - 1

    # Ideally we do this for every stock first and then come back with the complete
    # dataframe... (depends on whether it fits in memory).
    # Merge news and stock prices with SPY prices. Rows with any NaN are dropped first,
    # e.g. when there is no exit_time for an est_exit_time.
    merged = merged.sort_values("entry_time").dropna()
    merged = pd.merge_asof(merged, spy, left_on="entry_time", right_on="time", direction="forward")

    # TODO: Don't use intraday as exit here (closing candle) but the actual closing auction...
    # But for that we need the daily time series, not the minute frequency.
    merged = merged.sort_values("exit_time").dropna()
    merged = pd.merge_asof(
        merged,
        spy,
        left_on="exit_time",
        right_on="time",
        suffixes=("_entry", "_exit"),
        direction="backward",
    )

    # Calculated to potentially filter out penny stocks later on.
    merged["unadj_entry_open"] = merged["open_entry"] / merged["cum_split_ratio_entry"]

    # TODO: shouldn't we use open entry and close exit?
    merged["r_spy"] = merged["SPY_close_exit"] / merged["SPY_close_entry"] - 1

    # Flag rows whose matched candle is more than `max_gap` away from the estimate.
    # The entry candle is matched forward (entry_time >= est_entry_time), the exit candle
    # backward (exit_time <= est_exit_time), so the exit gap is est_exit_time - exit_time;
    # the reverse difference is never positive and would never flag anything.
    merged["entry_is_too_far_apart"] = (merged["entry_time"] - merged["est_entry_time"]) > max_gap
    merged["exit_is_too_far_apart"] = (merged["est_exit_time"] - merged["exit_time"]) > max_gap

    merged = merged.set_index("index")

    # Optional news columns are only carried along when they exist in `merged`;
    # selecting a missing label through `.loc` would raise a KeyError.
    keep_columns_from_news = [col for col in ["staleness"] if col in merged.columns]
    keep_columns = [
        "est_entry_time",
        "est_exit_time",
        "r",
        "unadj_entry_open",
        "r_spy",
        "entry_is_too_far_apart",
        "exit_is_too_far_apart",
    ] + keep_columns_from_news

    # Write back by original row label; rows dropped above keep NaN in the new columns.
    news.loc[merged.index, keep_columns] = merged.loc[:, keep_columns]

In [None]:
# Rows whose matched entry or exit candle was flagged as too far from the estimated time.
mask = news["entry_is_too_far_apart"] | news["exit_is_too_far_apart"]
print(f"Filtered rows: {mask.sum()}")
# Keep only the rows that were not flagged.
news = news[~mask]

In [None]:
# Ad-hoc inspection: display the mapper rows for ticker "ALV". The result is not assigned.
ticker_mapper_consolidated[ticker_mapper_consolidated.stocks == "ALV"]

In [None]:
# Drop every row that still has a NaN in any column, computing the filtered frame once
# and reporting the row count before and after.
news_without_nans = news.dropna()
print(f"{news.shape[0]} news before. {news_without_nans.shape[0]} news after dropping NaNs. "
      f"NaNs should occurr, when we don't have a price time series when news occurred.")
news = news_without_nans

In [None]:
# Save to Disk
# The later sections of this notebook read this same path back and overwrite it.
news.to_parquet(config.data.merged)

------------------------------
## Merge with Daily Indicators

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from src.utils.tickers import get_tickers
# Presumably the tickers for which a daily price file exists in this directory — confirm
# against `get_tickers`.
tickers = get_tickers(config.data.iqfeed.daily.cleaned)

In [None]:
# Reload the merged news/returns table and add empty (NaN) placeholder columns for the
# daily indicators that are filled per ticker further below.
dataset = pd.read_parquet(path=config.data.merged)
# `np.nan` instead of the `np.NaN` alias, which was removed in NumPy 2.0.
dataset[["std_252", "dollar_volume", 'r_intra_(t-1)', 'unadj_open']] = np.nan

In [None]:
# Daily indicator columns that are taken from each ticker's daily price file and
# written onto `dataset` below.
indicators = ["std_252", "dollar_volume", 'r_intra_(t-1)', 'unadj_open']

In [None]:
# Single-ticker dry run of the daily-indicator merge, mirroring one iteration of the
# loop in the next cell.
ticker = tickers[0]
prices = pd.read_parquet(path=f"{config.data.iqfeed.daily.cleaned}/{ticker}_daily.parquet")
# Localize the daily index so it is comparable with the tz-aware `est_entry_time`.
prices.index = prices.index.tz_localize("US/Eastern")
ticker_dat = (
    dataset.loc[dataset.stocks == ticker, :]
    .reset_index()
    # Drop the NaN placeholders so the merge does not yield "_x"/"_y" duplicates.
    .drop(columns=indicators)
    # `merge_asof` requires the left key to be sorted.
    .sort_values("est_entry_time")
)
merged = pd.merge_asof(ticker_dat, prices[indicators], left_on="est_entry_time", right_on="date", direction="backward")
merged = merged.set_index("index")

In [None]:
# For every ticker: look up the most recent daily indicator row at or before each
# estimated entry time and write the indicator values onto `dataset`.
# NOTE(review): a backward match against date-stamped daily rows can select the row of
# the entry day itself — confirm the indicators are lagged so that no same-day
# information leaks into the features.
for ticker in tqdm(tickers):
    prices = pd.read_parquet(path=f"{config.data.iqfeed.daily.cleaned}/{ticker}_daily.parquet")
    # Localize the daily index so it is comparable with the tz-aware `est_entry_time`.
    prices.index = prices.index.tz_localize("US/Eastern")

    ticker_dat = (
        dataset.loc[dataset.stocks == ticker, :]
        .reset_index()                   # keep the original row label in an "index" column
        .drop(columns=indicators)        # drop the placeholders, they are re-added by the merge
        .sort_values("est_entry_time")   # merge_asof requires a sorted left key
    )
    merged = pd.merge_asof(
        ticker_dat,
        prices[indicators],
        left_on="est_entry_time",
        right_on="date",
        direction="backward",
    ).set_index("index")

    # Align on the original row labels and fill in the indicator columns.
    dataset.loc[merged.index, indicators] = merged[indicators]

In [None]:
# Persist the table including the daily indicators; this overwrites the file that was
# read at the start of this section.
dataset.to_parquet(path=config.data.merged)

# Calculate target variables

In [None]:
# Reload the merged table (news, returns and daily indicators) written by the previous section.
dat: pd.DataFrame = pd.read_parquet(path=config.data.merged)

In [None]:
# Market-adjusted return: stock return minus the SPY return.
dat["r_mkt_adj"] = dat["r"] - dat["r_spy"]

# std_252 is annualized but the returns aren't, so it is scaled down by sqrt(252) first.
# TODO: Don't annualize in indicator_applicator, or make it clear by naming it properly.
# TODO: Should this be the std of r_mkt_adj rather than whatever std_252 is based on?
daily_std = dat["std_252"] / (252**0.5)
dat["z_score"] = dat["r_mkt_adj"] / daily_std

# TODO: Calculate based on training set split
upper_z_quantile = 0.27
lower_z_quantile = -0.27

# Ordinal labeling: 0 = at or below the lower threshold, 2 = at or above the upper
# threshold, 1 = everything else. Rows with a NaN z-score fail both comparisons and
# therefore stay in class 1.
is_upper = dat["z_score"] >= upper_z_quantile
is_lower = dat["z_score"] <= lower_z_quantile
dat["z_score_class"] = 1
dat.loc[is_upper, "z_score_class"] = 2
dat.loc[is_lower, "z_score_class"] = 0

# Class balance, displayed as cell output.
dat["z_score_class"].value_counts()

In [None]:
# Persist the table including the target variables; this overwrites the file read above.
dat.to_parquet(path=config.data.merged)


------------