In [1]:
import pandas as pd 
import numpy as np

from os.path import isfile, join
import plotly.express as px

import dask.dataframe as dd
from os import listdir

import pytz
eastern = pytz.timezone('US/Eastern')

from dotmap import DotMap
import yaml
config = DotMap(yaml.safe_load(open("src/config.yaml")), _dynamic=False)

from src.preprocessing.util import get_appropriate_closing_time,get_appropriate_entry_time, get_time_interval, consolidate_tickers, get_primary_ticker

[Python numpy: cannot convert datetime64[ns] to datetime64[D] (to use with Numba)](https://stackoverflow.com/a/76139900/9079015)

## Import and Preprocess News 

In [2]:
news = dd.read_parquet(path="data/processed_news", columns=["time", "stocks", "parsed_body"])
# news = pd.read_parquet(path="data/processed_news/data-0.parquet")

In [3]:
news = news.compute()

In [4]:
news["time"] = news.time.dt.tz_convert(eastern).astype('datetime64[ns, US/Eastern]')

# TODO: This can be *improved* by saying that if we are very close to completing the minute e.g. :55, 
# then we dont take the next candle (T+1), but the candle after the next(T+2).
# Watch out, news time is accurate, but candles are right labeled, hence add one minute.
news["entry_time"] = news["time"].map(get_appropriate_entry_time)

# Necessary to get `us` units, otherwise pandas will always convert back to `ns` for some reason.
news["nn_exit_time"] = news["time"].map(get_appropriate_closing_time)

## Consolidate Tickers

In [4]:
"""
Handling of multiple tickers for a the same company.
If there is only one price time series available for the company, we simply group together the tickers.
However in some cases we will have multiple price time series for the same company.

E.g. in case of Alphabet (Google) we have two different tickers and two different stock prices for the same
underlying company. Here `GOOG` and `GOOGL` describe two different classes of stock for the same company.
In this case we will try to only look at the main class. 

We find this class by choosing the Symbol with the longer stock price history, assuming that the history
of it includes(!) the history of the other one completely.
If one time series doesn't include the other we merge the two time series. Ideally based on which time series has more liquidity 
in a given week or but we will simply decide that the newer time series takes precedence for simplicity. 
"""

"\nHandling of multiple tickers for a the same company.\nIf there is only one price time series available for the company, we simply group together the tickers.\nHowever in some cases we will have multiple price time series for the same company.\n\nE.g. in case of Alphabet (Google) we have two different tickers and two different stock prices for the same\nunderlying company. Here `GOOG` and `GOOGL` describe two different classes of stock for the same company.\nIn this case we will try to only look at the main class. \n\nWe find this class by choosing the Symbol with the longer stock price history, assuming that the history\nof it includes(!) the history of the other one completely.\nIf one time series doesn't include the other we merge the two time series. Ideally based on which time series has more liquidity \nin a given week or but we will simply decide that the newer time series takes precedence for simplicity. \n"

In [5]:
do_consolidate_tickers = False
if do_consolidate_tickers:
    ticker_mapper = pd.read_parquet("data_shared/ticker_name_mapper_reduced.parquet")
    ticker_mapper[["first_date", "last_date"]] = np.NaN
    for i in ticker_mapper.index:
        ticker_mapper.loc[i, ["first_date", "last_date"]] = get_time_interval(ticker_mapper.loc[i, "stocks"])
    ticker_mapper.dropna(inplace=True)
    ticker_mapper[["first_date", "last_date"]] = ticker_mapper[["first_date", "last_date"]].apply(pd.to_datetime, axis=0)

    ticker_mapper_consolidated = ticker_mapper.copy(deep=True)
    ticker_mapper_consolidated["is_primary_ticker"] = False
    ticker_mapper_consolidated = ticker_mapper_consolidated.groupby("company_name", as_index=False).apply(consolidate_tickers)
    print(f"{ticker_mapper_consolidated.shape[0]} entries before consolidation. {ticker_mapper_consolidated[ticker_mapper_consolidated.is_primary_ticker].shape} entries after.")
    ticker_mapper_consolidated.to_parquet("data_shared/ticker_name_mapper_consolidated.parquet")

In [5]:
ticker_mapper_consolidated = pd.read_parquet("data_shared/ticker_name_mapper_consolidated.parquet")

In [6]:
# Overwrite tickers with consolidated ticker, i.e. the ticker of the time series we use to construct input-output pairs
news["stocks"] = news.stocks.map(lambda ticker: get_primary_ticker(ticker, mapper=ticker_mapper_consolidated))
# Some tickers don't exist, they will be converted to NaNs
news = news.dropna()

In [7]:
news.memory_usage()

Index              2939832
time               2939832
stocks             2939832
parsed_body     1669553597
entry_time         2939832
nn_exit_time       2939832
dtype: int64

## Merge

In [209]:
# TODO: Also merge with  non-adjusted prices. We don't trade penny stocks.
# If the price is smaller than 1 when the news come out we don't trade.

In [12]:
spy: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/SPY_1min.parquet")
spy.columns = [x.strip("adj_") for x in spy.columns]
spy.columns = [f"SPY_{x}" for x in spy.columns]

In [40]:
unique_tickers = ticker_mapper_consolidated.stocks[ticker_mapper_consolidated.is_primary_ticker == True].values

In [63]:
for i, ticker in enumerate(unique_tickers):
    print(f"{i}, {ticker}")
    ticker_news = news.loc[news.stocks == ticker, :].reset_index()

    prices: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")
    prices.columns = [x.strip("adj_") for x in prices.columns]
    prices = prices.sort_values("time")

    merged = pd.merge(ticker_news, prices, left_on="entry_time", right_on="time", how="left")
    merged = pd.merge(merged, prices, left_on="nn_exit_time", right_on="time", suffixes=("_entry", "_exit"), how="left")
    # We use the O part of the OHLC for intra day candles here for convenienece as well
    merged["r"] = merged["open_exit"] / merged["open_entry"] - 1

    # Ideally we do this for every stock first and then we come back with the complete dataframe... (depends on if it fits in memory)
    # Merge news and stock prices with spy prices
    # merged = pd.merge_asof(merged, spy, left_on="entry_time", right_on="time", direction="forward")
    merged = pd.merge(merged, spy, left_on="entry_time", right_on="time", how="left")

    # TODO: Don't use intraday as exit here (closing candle) but the actual closing auction...
    # But for that we need the daily time series, not with minute frequency
    merged = pd.merge(merged, spy, left_on="nn_exit_time", right_on="time", suffixes=("_entry", "_exit"), how="left")

    merged.loc[:, "r_spy"] = merged["SPY_close_exit"] / merged["SPY_close_entry"] - 1
    merged.loc[:, "r_mkt_adj"] = merged["r_spy"] - merged["r"]

    # Filter out penny_stocks
    merged["unadj_entry_stock_price"] = merged.close_entry / merged.cum_split_ratio_entry
    merged = merged.loc[merged["unadj_entry_stock_price"] > 1, :]

    merged.set_index("index", inplace=True)

    merged = merged.loc[:, ["time", "stocks", "parsed_body", "entry_time", "r", "r_spy", "r_mkt_adj"]]
    news.loc[merged.index, ["entry_time", "r", "r_spy", "r_mkt_adj"]] = merged.loc[:, ["entry_time", "r", "r_spy", "r_mkt_adj"]]

0, OONEF
1, FLWS
2, VCXAU
3, TXG
4, YI
5, RETC


In [None]:
news.head()

# Save to Files 

Columns: `time|  stocks  |body  |entry_time  |r_mkt_adj`

In [232]:
# Splitting training and test set
merged_test = merged.loc[merged.time >= config.model.data.cutoff_date]
merged_train = merged.loc[merged.time < config.model.data.cutoff_date]
assert merged_test.shape[0] + merged_train.shape[0] == merged.shape[0]

In [233]:
merged_train

Unnamed: 0,time,stocks,parsed_body,entry_time,r_mkt_adj
0,2010-01-03 18:05:02-05:00,JBLU,FirstCall the company will waive change...,2010-01-04 09:31:00-05:00,0.004129
1,2010-01-04 11:00:03-05:00,JBLU,"FirstCall To celebrate 2010, the company ...",2010-01-04 11:02:00-05:00,-0.047636
2,2010-01-05 09:03:43-05:00,JBLU,FirstCall the company today kicks off a...,2010-01-05 09:31:00-05:00,-0.058859


In [235]:
config.model.data.training
config.model.data.testing

'D:/Data/NN_Training'