In [1]:
import pandas as pd 
import numpy as np

from os.path import isfile, join
import plotly.express as px
from IPython.display import clear_output
import dask.dataframe as dd
from os import listdir

import pytz
# US/Eastern timezone that the news timestamps are converted to further down.
eastern = pytz.timezone('US/Eastern')

from dotmap import DotMap
import yaml

# Load the project configuration. The context manager closes the file handle
# deterministically instead of leaving an open handle to the garbage collector.
with open("src/config.yaml") as config_file:
    config = DotMap(yaml.safe_load(config_file), _dynamic=False)

from src.preprocessing.util import get_appropriate_closing_time,get_appropriate_entry_time, get_time_interval, consolidate_tickers, get_primary_ticker

In [2]:
import logging 
# Configure logging so that records of level INFO and above are written to the
# preprocessing log file.
logging.basicConfig(filename='logs/preprocessing/data_merger.log', level=logging.INFO)

[Python numpy: cannot convert datetime64[ns] to datetime64[D] (to use with Numba)](https://stackoverflow.com/a/76139900/9079015)

## Import and Preprocess News 

In [3]:
# Open the processed news dataset with dask, restricted to the `time`, `stocks`
# and `parsed_body` columns. The frame is materialized in the next cell.
news = dd.read_parquet(path="data/processed_news", columns=["time", "stocks", "parsed_body"])
# Alternative for quick experiments: read a single parquet file with pandas.
# news = pd.read_parquet(path="data/processed_news/data-0.parquet")

In [4]:
# Materialize the dask frame into an in-memory pandas DataFrame.
# NOTE: `news` is rebound to the pandas result, so re-running this cell without
# re-running the read cell fails (a pandas DataFrame has no `.compute()`).
news = news.compute()

In [5]:
# Express the publication timestamps in US/Eastern and pin the dtype explicitly.
news["time"] = (
    news["time"]
    .dt.tz_convert(eastern)
    .astype('datetime64[ns, US/Eastern]')
)

# TODO: Refine the entry rule: when the news arrives very late within a minute (e.g. at :55),
# skip the next candle (T+1) and enter on the one after it (T+2).
# News timestamps are exact whereas candles are right-labelled, hence the one-minute offset.
news["entry_time"] = news["time"].map(get_appropriate_entry_time)

# Exit timestamp of each news item, derived from its publication time.
# NOTE(review): an earlier comment here said this was "necessary to get `us` units";
# nothing in this cell selects a time unit — confirm whether the helper takes care of it.
news["nn_exit_time"] = news["time"].map(get_appropriate_closing_time)

## Consolidate Tickers

In [6]:
"""
Handling of multiple tickers for the same company.
If there is only one price time series available for the company, we simply group together the tickers.
However, in some cases we will have multiple price time series for the same company.

E.g. in case of Alphabet (Google) we have two different tickers and two different stock prices for the same
underlying company. Here `GOOG` and `GOOGL` describe two different classes of stock for the same company.
In this case we will try to only look at the main class.

We find this class by choosing the symbol with the longer stock price history, assuming that the history
of it includes(!) the history of the other one completely.
If one time series doesn't include the other, we merge the two time series. Ideally this would be based on which time series
has more liquidity in a given week, but for simplicity we let the newer time series take precedence.
"""

"\nHandling of multiple tickers for the same company.\nIf there is only one price time series available for the company, we simply group together the tickers.\nHowever, in some cases we will have multiple price time series for the same company.\n\nE.g. in case of Alphabet (Google) we have two different tickers and two different stock prices for the same\nunderlying company. Here `GOOG` and `GOOGL` describe two different classes of stock for the same company.\nIn this case we will try to only look at the main class.\n\nWe find this class by choosing the symbol with the longer stock price history, assuming that the history\nof it includes(!) the history of the other one completely.\nIf one time series doesn't include the other, we merge the two time series. Ideally this would be based on which time series\nhas more liquidity in a given week, but for simplicity we let the newer time series take precedence.\n"

In [7]:
# Set to True to rebuild the consolidated ticker mapping and overwrite the parquet
# file on disk; with False the rebuild is skipped entirely.
do_consolidate_tickers = False
if do_consolidate_tickers:
    ticker_mapper = pd.read_parquet("data_shared/ticker_name_mapper_reduced.parquet")
    # Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0.
    ticker_mapper[["first_date", "last_date"]] = np.nan
    # Look up the time interval of every ticker, one row at a time.
    for i in ticker_mapper.index:
        ticker_mapper.loc[i, ["first_date", "last_date"]] = get_time_interval(ticker_mapper.loc[i, "stocks"])
    # Discard rows that contain any missing value.
    ticker_mapper = ticker_mapper.dropna()
    ticker_mapper[["first_date", "last_date"]] = ticker_mapper[["first_date", "last_date"]].apply(pd.to_datetime, axis=0)

    ticker_mapper_consolidated = ticker_mapper.copy(deep=True)
    ticker_mapper_consolidated["is_primary_ticker"] = False
    # `consolidate_tickers` is applied per company group.
    ticker_mapper_consolidated = ticker_mapper_consolidated.groupby("company_name", as_index=False).apply(consolidate_tickers)
    # Report row counts on both sides of the message (previously the second figure
    # printed the whole `.shape` tuple instead of the number of rows).
    n_primary_tickers = ticker_mapper_consolidated[ticker_mapper_consolidated.is_primary_ticker].shape[0]
    print(f"{ticker_mapper_consolidated.shape[0]} entries before consolidation. {n_primary_tickers} entries after.")
    ticker_mapper_consolidated.to_parquet("data_shared/ticker_name_mapper_consolidated.parquet")

In [8]:
# Load the consolidated ticker mapping from disk (the file the consolidation cell
# writes when `do_consolidate_tickers` is enabled).
ticker_mapper_consolidated = pd.read_parquet("data_shared/ticker_name_mapper_consolidated.parquet")

In [9]:
# Replace every ticker by its consolidated ticker, i.e. the ticker of the price
# series that is used to construct the input-output pairs.
news["stocks"] = news["stocks"].map(
    lambda symbol: get_primary_ticker(symbol, mapper=ticker_mapper_consolidated)
)
# Tickers that do not exist come back as NaN; discard those rows.
# NOTE(review): dropna() without `subset` also discards rows that have a missing
# value in any other column.
news = news.dropna()

In [10]:
# Memory footprint of the news frame, reported per column (and for the index).
news.memory_usage()

Index              3418496
time               3418496
stocks             3418496
parsed_body     1939337930
entry_time         3418496
nn_exit_time       3418496
dtype: int64

## Merge

In [11]:
# TODO: Also merge with non-adjusted prices. We don't trade penny stocks:
# if the price is below 1 when the news comes out, we don't trade.

In [12]:
# Minute candles of SPY; used further down to compute the benchmark return `r_spy`.
spy: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/SPY_1min.parquet")
# Remove the literal "adj_" prefix from the column names.
# str.strip("adj_") must not be used for this: it removes any of the characters
# 'a', 'd', 'j', '_' from BOTH ends of the name (e.g. "dividend" -> "ividen") and
# only happens to work for names such as "adj_close".
spy.columns = [name[len("adj_"):] if name.startswith("adj_") else name for name in spy.columns]
# Namespace the columns so they cannot collide with the per-ticker price columns.
spy.columns = [f"SPY_{name}" for name in spy.columns]

In [13]:
# Symbols that are flagged as primary ticker in the consolidated mapping.
unique_tickers = ticker_mapper_consolidated.loc[
    ticker_mapper_consolidated["is_primary_ticker"].eq(True), "stocks"
].values

In [14]:
# for i, ticker in enumerate(unique_tickers):
#     clear_output(wait=True)
#     print(f"{i} - {ticker}", flush=True)
    


In [15]:
ticker = "ALV"
###
# Merge the news of one ticker with its minute candles and with the SPY benchmark,
# then write the resulting returns back into `news`.
# NOTE: the ticker is hard-coded here; the loop over `unique_tickers` in the
# previous cell is commented out.

# reset_index() keeps the row labels of `news` in an "index" column so the results
# can be written back at the end. merge_asof requires the left frame to be sorted
# by its key; the stable sort leaves already-chronological news untouched.
ticker_news = (
    news.loc[news.stocks == ticker, :]
    .reset_index()
    .sort_values("entry_time", kind="stable")
)

prices: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")
# Remove the literal "adj_" prefix. str.strip("adj_") would instead strip any of the
# characters 'a', 'd', 'j', '_' from both ends of every column name.
prices.columns = [name[len("adj_"):] if name.startswith("adj_") else name for name in prices.columns]
prices = prices.sort_values("time")

# Entry leg: first candle at or after the intended entry time.
merged = pd.merge_asof(ticker_news, prices, left_on="entry_time", right_on="time", direction="forward")
# TODO: backfill asof here, since sometimes the auction doesn't occur or was recorded
# as taking place at 16:00. Weird -> needs inspection!
# Exit leg: last candle at or before the intended exit time (left frame sorted by the
# exit key for the same reason as above).
merged = pd.merge_asof(
    merged.sort_values("nn_exit_time", kind="stable"),
    prices,
    left_on="nn_exit_time",
    right_on="time",
    suffixes=("_entry", "_exit"),
    direction="backward",
)
# The open of the intraday candles is used on both legs for convenience.
merged["r"] = merged["open_exit"] / merged["open_entry"] - 1

# Ideally we do this for every stock first and then come back with the complete
# dataframe (depends on whether it fits in memory).
# Merge news and stock prices with the SPY prices.
# NOTE(review): these are exact-match joins, whereas the stock legs above are as-of
# joins. When `entry_time` / `nn_exit_time` has no SPY candle (e.g. an entry time on
# a weekend or holiday) the SPY columns are NaN although `r` is still computed.
# merged = pd.merge_asof(merged, spy, left_on="entry_time", right_on="time", direction="forward")
merged = pd.merge(merged, spy, left_on="entry_time", right_on="time", how="left")

# TODO: Don't use the intraday closing candle as exit here but the actual closing
# auction. That requires the daily time series rather than minute frequency.
merged = pd.merge(merged, spy, left_on="nn_exit_time", right_on="time", suffixes=("_entry", "_exit"), how="left")

# NOTE(review): the benchmark return uses close prices while `r` uses open prices —
# confirm that this mismatch is intended.
merged["r_spy"] = merged["SPY_close_exit"] / merged["SPY_close_entry"] - 1
# Market-adjusted return: stock return in excess of the benchmark return.
# (Subtracting in the opposite order yields the negated excess return.)
merged["r_mkt_adj"] = merged["r"] - merged["r_spy"]

# Unadjusted entry price, meant for filtering out penny stocks later on.
# NOTE(review): this column is not part of the selection below and is therefore
# discarded again.
merged["unadj_entry_stock_price"] = merged.close_entry / merged.cum_split_ratio_entry

# Restore the original `news` row labels so that the write-back aligns by label.
merged = merged.set_index("index")

merged = merged.loc[:, ["time", "stocks", "parsed_body", "entry_time", "nn_exit_time", "r", "r_spy", "r_mkt_adj"]]
news.loc[merged.index, ["entry_time", "r", "r_spy", "r_mkt_adj"]] = merged.loc[:, ["entry_time", "r", "r_spy", "r_mkt_adj"]]

In [24]:
# Inspect the SPY candles starting at an entry time for which the SPY join found no match.
# The timestamp literal is kept free of stray whitespace (it previously ended in a tab).
spy.loc["2011-07-24 09:32:00-04:00":]

Unnamed: 0_level_0,SPY_open,SPY_high,SPY_low,SPY_close,SPY_volume,SPY_cum_split_ratio
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-07-25 09:31:00-04:00,105.469291,105.540500,105.461379,105.524676,1.293347e+06,0.791217
2011-07-25 09:32:00-04:00,105.520720,105.532667,105.413906,105.413906,1.013519e+06,0.791217
2011-07-25 09:33:00-04:00,105.413906,105.453466,105.382257,105.437642,8.423487e+05,0.791217
2011-07-25 09:34:00-04:00,105.437642,105.564237,105.429730,105.556325,1.057097e+06,0.791217
2011-07-25 09:35:00-04:00,105.562575,105.587973,105.540500,105.572149,8.321808e+05,0.791217
...,...,...,...,...,...,...
2023-12-15 15:57:00-05:00,468.537352,468.565239,468.112076,468.430784,1.384860e+06,0.995962
2023-12-15 15:58:00-05:00,468.440744,468.460663,468.122036,468.231592,1.379081e+06,0.995962
2023-12-15 15:59:00-05:00,468.241551,468.311269,468.122036,468.122036,1.381041e+06,0.995962
2023-12-15 16:00:00-05:00,468.122036,468.141955,467.444782,467.474660,2.882668e+06,0.995962


In [17]:
# Rows of the per-ticker result that have a missing value in at least one column.
merged.loc[merged.isna().any(axis="columns")]

Unnamed: 0_level_0,time,stocks,parsed_body,entry_time,nn_exit_time,r,r_spy,r_mkt_adj
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1795622,2011-07-24 06:46:05-04:00,ALV,"Regulatory News: the company (NYSE: ALV, and...",2011-07-24 09:32:00-04:00,2011-07-25 16:01:00-04:00,-0.009987,,
3112919,2012-11-22 04:27:07-05:00,ALV,Regulatory News: In addition to the recently a...,2012-11-22 09:32:00-05:00,2012-11-23 16:01:00-05:00,-0.001507,,
3536024,2013-04-28 14:01:06-04:00,ALV,Glancy Binkow & Goldberg LLP announces that a ...,2013-04-28 09:32:00-04:00,2013-04-29 16:01:00-04:00,0.005053,,
3620734,2013-05-25 00:03:31-04:00,ALV,The Law Offices of Todd M.Garber announces tha...,2013-05-25 09:32:00-04:00,2013-05-28 16:01:00-04:00,-0.006215,,
4112542,2013-11-27 04:10:30-05:00,ALV,the company (STO:ALIVSDB) the worldwide leade...,2013-11-27 09:32:00-05:00,2013-11-29 16:01:00-05:00,-0.00399,,
5244134,2015-02-16 14:34:05-05:00,ALV,"the company (STO:ALIVSDB) (NYSE: ALV, and SS...",2015-02-16 09:32:00-05:00,2015-02-17 16:01:00-05:00,0.004294,,
9069438,2017-02-20 02:36:50-05:00,ALV,"the company , the worldwide leader in automot...",2017-02-20 09:32:00-05:00,2017-02-21 16:01:00-05:00,0.005524,,
11221896,2018-02-19 06:42:48-05:00,ALV,"the company , the worldwide leader in automot...",2018-02-19 09:32:00-05:00,2018-02-20 16:01:00-05:00,0.010401,,


In [18]:
# Entry of the consolidated mapping for the ticker under inspection.
ticker_mapper_consolidated.loc[ticker_mapper_consolidated["stocks"].eq("ALV")]

Unnamed: 0,Unnamed: 1,stocks,company_name,short_name,first_date,last_date,is_primary_ticker
778,74475,ALV,"Autoliv, Inc.",Autoliv,2010-01-04 09:31:00-05:00,2023-12-15 16:01:00-05:00,True


# Save to Disk

In [86]:
# Columns of the news frame right before it is written to disk.
news.columns

Index(['time', 'stocks', 'parsed_body', 'entry_time', 'nn_exit_time', 'r',
       'r_spy', 'r_mkt_adj'],
      dtype='object')

In [88]:
# Drop incomplete rows once (instead of evaluating dropna() twice on the full frame),
# report the row counts, then persist the result.
# NOTE(review): rows whose return columns were never filled by the merge cell are
# NaN there and are dropped here as well.
n_news_before = news.shape[0]
news = news.dropna()
print(f"{n_news_before} news before. {news.shape[0]} news after dropping NaNs.")
news.to_parquet(config.data.merged)

367479 news before. 153507 news after dropping NaNs.


------------------------------

In [83]:
# Reload the merged news frame from the location it was saved to above.
news = pd.read_parquet(config.data.merged)

In [232]:
# Chronological train/test split at the configured cutoff date.
# NOTE(review): this splits `merged` (the single-ticker frame), not the `news` frame
# reloaded in the previous cell — confirm that this is intended.
merged_train = merged.loc[merged["time"] < config.model.data.cutoff_date]
merged_test = merged.loc[merged["time"] >= config.model.data.cutoff_date]
# The two partitions together must account for every row.
assert len(merged_test) + len(merged_train) == len(merged)

In [235]:
# Training / testing entries of the model data configuration.
# Only the value of the last expression in a cell is displayed, so the first line
# produces no output.
config.model.data.training
config.model.data.testing

'D:/Data/NN_Training'