In [1]:
import pandas as pd 
import numpy as np

import plotly.express as px
from IPython.display import clear_output
import dask.dataframe as dd

import pytz
eastern = pytz.timezone('US/Eastern')

from src.config import config
from src.preprocessing.data_merger_util import get_appropriate_closing_time,get_appropriate_entry_time, get_time_interval, consolidate_tickers, get_primary_ticker

In [69]:
import logging 
logging.basicConfig(filename='logs/preprocessing/data_merger.log', level=logging.INFO)

[Python numpy: cannot convert datetime64[ns] to datetime64[D] (to use with Numba)](https://stackoverflow.com/a/76139900/9079015)

## Import and Preprocess News 

In [70]:
news = dd.read_parquet(path="data/processed_news", columns=["time", "stocks", "parsed_body"])
# news = pd.read_parquet(path="data/processed_news/data-0.parquet")

In [71]:
news = news.compute()

In [72]:
news["time"] = news.time.dt.tz_convert(eastern).astype('datetime64[ns, US/Eastern]')

# TODO: This can be *improved* by saying that if we are very close to completing the minute e.g. :55, 
# then we dont take the next candle (T+1), but the candle after the next(T+2).
# Watch out, news time is accurate, but candles are right labeled, hence add one minute.
news["entry_time"] = news["time"].map(get_appropriate_entry_time)

# Necessary to get `us` units, otherwise pandas will always convert back to `ns` for some reason.
news["nn_exit_time"] = news["time"].map(get_appropriate_closing_time)

## Consolidate Tickers

In [73]:
"""
Handling of multiple tickers for a the same company.
If there is only one price time series available for the company, we simply group together the tickers.
However in some cases we will have multiple price time series for the same company.

E.g. in case of Alphabet (Google) we have two different tickers and two different stock prices for the same
underlying company. Here `GOOG` and `GOOGL` describe two different classes of stock for the same company.
In this case we will try to only look at the main class. 

We find this class by choosing the Symbol with the longer stock price history, assuming that the history
of it includes(!) the history of the other one completely.
If one time series doesn't include the other we merge the two time series. Ideally based on which time series has more liquidity 
in a given week or but we will simply decide that the newer time series takes precedence for simplicity. 
"""

"\nHandling of multiple tickers for a the same company.\nIf there is only one price time series available for the company, we simply group together the tickers.\nHowever in some cases we will have multiple price time series for the same company.\n\nE.g. in case of Alphabet (Google) we have two different tickers and two different stock prices for the same\nunderlying company. Here `GOOG` and `GOOGL` describe two different classes of stock for the same company.\nIn this case we will try to only look at the main class. \n\nWe find this class by choosing the Symbol with the longer stock price history, assuming that the history\nof it includes(!) the history of the other one completely.\nIf one time series doesn't include the other we merge the two time series. Ideally based on which time series has more liquidity \nin a given week or but we will simply decide that the newer time series takes precedence for simplicity. \n"

In [74]:
do_consolidate_tickers = False
if do_consolidate_tickers:
    ticker_mapper = pd.read_parquet("data_shared/ticker_name_mapper_reduced.parquet")
    ticker_mapper[["first_date", "last_date"]] = np.NaN
    for i in ticker_mapper.index:
        ticker_mapper.loc[i, ["first_date", "last_date"]] = get_time_interval(ticker_mapper.loc[i, "stocks"])
    ticker_mapper.dropna(inplace=True)
    ticker_mapper[["first_date", "last_date"]] = ticker_mapper[["first_date", "last_date"]].apply(pd.to_datetime, axis=0)

    ticker_mapper_consolidated = ticker_mapper.copy(deep=True)
    ticker_mapper_consolidated["is_primary_ticker"] = False
    ticker_mapper_consolidated = ticker_mapper_consolidated.groupby("company_name", as_index=False).apply(consolidate_tickers)
    print(f"{ticker_mapper_consolidated.shape[0]} entries before consolidation. {ticker_mapper_consolidated[ticker_mapper_consolidated.is_primary_ticker].shape} entries after.")
    ticker_mapper_consolidated.to_parquet("data_shared/ticker_name_mapper_consolidated.parquet")

In [75]:
ticker_mapper_consolidated = pd.read_parquet("data_shared/ticker_name_mapper_consolidated.parquet")

In [76]:
# Overwrite tickers with consolidated ticker, i.e. the ticker of the time series we use to construct input-output pairs
news["stocks"] = news.stocks.map(lambda ticker: get_primary_ticker(ticker, mapper=ticker_mapper_consolidated))
# Some tickers don't exist, they will be converted to NaNs
news = news.dropna()

In [77]:
news.memory_usage()

Index              3418496
time               3418496
stocks             3418496
parsed_body     1939337930
entry_time         3418496
nn_exit_time       3418496
dtype: int64

## Merge

In [78]:
# TODO: Also merge with  non-adjusted prices. We don't trade penny stocks.
# If the price is smaller than 1 when the news come out we don't trade.

In [79]:
spy: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/SPY_1min.parquet")
spy.columns = [x.strip("adj_") for x in spy.columns]
spy.columns = [f"SPY_{x}" for x in spy.columns]

In [80]:
unique_tickers = ticker_mapper_consolidated.stocks[ticker_mapper_consolidated.is_primary_ticker == True].values

In [85]:
for i, ticker in enumerate(unique_tickers):
    clear_output(wait=True)
    print(f"{i} - {ticker}", flush=True)

    ticker_news = news.loc[news.stocks == ticker, :].reset_index()

    prices: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")
    prices.columns = [x.strip("adj_") for x in prices.columns]
    prices = prices.sort_values("time")

    # We generally neeed to use `merge_asof` here instad of simple `merge`, because
    # Sometimes no auction occurred or was recorded at 16:00 or things of this sort.
    
    # Left key must be sorted
    ticker_news.sort_values("entry_time", inplace=True)
    merged = pd.merge_asof(ticker_news, prices, left_on="entry_time", right_on="time", direction="forward")
    merged.sort_values("nn_exit_time", inplace=True)
    merged = pd.merge_asof(merged, prices, left_on="nn_exit_time", right_on="time", suffixes=("_entry", "_exit"), direction="backward")
    # We use the O part of the OHLC for intra day candles here for convenienece as well
    merged["r"] = merged["open_exit"] / merged["open_entry"] - 1

    # Ideally we do this for every stock first and then we come back with the complete dataframe... (depends on if it fits in memory)
    # Merge news and stock prices with spy prices
    merged.sort_values("entry_time", inplace=True)
    merged = pd.merge_asof(merged, spy, left_on="entry_time", right_on="time", direction="forward")

    # TODO: Don't use intraday as exit here (closing candle) but the actual closing auction...
    # But for that we need the daily time series, not with minute frequency
    merged.sort_values("nn_exit_time", inplace=True)
    merged = pd.merge_asof(merged, spy, left_on="nn_exit_time", right_on="time", suffixes=("_entry", "_exit"), direction="backward")

    merged.loc[:, "r_spy"] = merged["SPY_close_exit"] / merged["SPY_close_entry"] - 1
    merged.loc[:, "r_mkt_adj"] = merged["r_spy"] - merged["r"]

    # Calculate to potentially filter out penny stocks later on
    merged["unadj_entry_stock_price"] = merged["close_entry"] / merged["cum_split_ratio_entry"]

    merged.set_index("index", inplace=True)
    news.loc[merged.index, ["entry_time", "nn_exit_time", "r"]] = merged.loc[:, ["entry_time", "nn_exit_time", "r"]]

8070 - OSSUY


In [86]:
ticker_mapper_consolidated[ticker_mapper_consolidated.stocks == "ALV"]

Unnamed: 0,Unnamed: 1,stocks,company_name,short_name,first_date,last_date,is_primary_ticker
778,74475,ALV,"Autoliv, Inc.",Autoliv,2010-01-04 09:31:00-05:00,2023-12-15 16:01:00-05:00,True


In [87]:
print(f"{news.shape[0]} news before. {news.dropna().shape[0]} news after dropping NaNs."
      f"NaNs should occurr, when we don't have a price time series when news occurred.")
news = news.dropna()

427312 news before. 419094 news after dropping NaNs.NaNs should occurr, when we don't have a price time series when news occurred.


In [88]:
## Set training, validation and test set indices
news["split"] = "training"
news.loc[news.time >= config.model.data.val_cutoff_date, "split"] = "validation"
news.loc[news.time >= config.model.data.test_cutoff_date, "split"] = "testing"
news["split"] = news["split"].astype("category")

In [103]:
## ALTERNATIVELY Set training, validation and test set indices
N = news.shape[0]
news["split"] = "training"
news.iloc[int(N * 0.7):, : ].loc[:, "split"] = "validation"
news.iloc[int(N * 0.9):, : ].loc[:, "split"] = "testing"
news["split"] = news["split"].astype("category")

In [105]:
train_N = news[news["split"] == "training"].shape[0]
valid_N = news[news["split"] == "validation"].shape[0]
test_N = news[news["split"] == "testing"].shape[0]

print(f"{train_N} samples in training set."
      f"\n {valid_N} samples in validation set."
      f"\n {test_N} samples in testing set.")

293365 samples in training set.
 83819 samples in validation set.
 0 samples in testing set.


In [6]:
# Save to Disk
news.to_parquet(config.data.merged)

------------------------------

In [2]:
news = pd.read_parquet(config.data.merged)