In [1]:
import pandas as pd 
import numpy as np
import pandas_market_calendars as mcal
from os import listdir
from os.path import isfile, join
import plotly.express as px
from dotmap import DotMap
import yaml
import dask.dataframe as dd


import pytz
eastern = pytz.timezone('US/Eastern')
nyse_cal = mcal.get_calendar('NYSE')


config = DotMap(yaml.safe_load(open("src/config.yaml")), _dynamic=False)

[Python numpy: cannot convert datetime64[ns] to datetime64[D] (to use with Numba)](https://stackoverflow.com/a/76139900/9079015)

In [2]:
def get_next_available_candle(prices: pd.DataFrame, 
                              time: pd.Timestamp) -> pd.Series:
    entry_candle_idx = prices.index.get_indexer(target=[time], 
                                                method="bfill")
    entry_candle = prices.take(entry_candle_idx).iloc[0]
    return entry_candle

In [201]:
def get_appropriate_closing_time(time: pd.Timestamp, tz="US/Eastern") -> pd.Timestamp:
    if (time.hour < 9) or ((time.hour == 9) and (time.minute < 30)):
        return pd.Timestamp(year=time.year, month=time.month, day=time.day, hour=16, minute=1, tz=tz)
    else:
        valid_days = [x.date() for x in nyse_cal.valid_days(start_date=time.date(), end_date=time.date() + pd.DateOffset(days=10))]
        i = 1
        while True:
            new_time = time + pd.DateOffset(days=i)
            if new_time.date() in valid_days:
                return pd.Timestamp(year=new_time.year, month=new_time.month, day=new_time.day, hour=16, minute=1, tz=tz)
            if i == 7:
                return ValueError()
            i += 1

In [202]:
def get_appropriate_entry_time(time: pd.Timestamp, tz="US/Eastern") -> pd.Timestamp:
    if (time.hour < 9) or ((time.hour == 9) and (time.minute < 30)):
        return pd.Timestamp(year=time.year, month=time.month, day=time.day, hour=9, minute=31, tz=tz)
    elif (time.hour > 16) or ((time.hour == 16) and (time.minute > 31)):
        valid_days = [x.date() for x in nyse_cal.valid_days(start_date=time.date(), end_date=time.date() + pd.DateOffset(days=10))]
        i = 1
        while True:
            new_time = time + pd.DateOffset(days=i)
            if new_time.date() in valid_days:
                return pd.Timestamp(year=new_time.year, month=new_time.month, day=new_time.day, hour=9, minute=31, tz=tz)
            if i == 7:
                return ValueError()
            i += 1
    else:
        return time.ceil("min")  + pd.Timedelta(minutes=1)

## Import and Preprocess News 

In [203]:
# news = dd.read_parquet(path="data/processed_news")
news = pd.read_parquet(path="data/processed_news/data-0.parquet")

In [204]:
news["time"] = news.time.dt.tz_convert(eastern).astype('datetime64[ns, US/Eastern]')

# TODO: This can be *improved* by saying that if we are very close to completing the minute e.g. :55, 
# then we dont take the next candle (T+1), but the candle after the next(T+2).
# Watch out, news time is accurate, but candles are right labeled, hence add one minute.
news["entry_time"] = news["time"].map(get_appropriate_entry_time)

# Necessary to get `us` units, otherwise pandas will always convert back to `ns` for some reason.
news["nn_exit_time"] = news["time"].map(get_appropriate_closing_time)

## Import and Preprocess Stock Prices

In [151]:
# TODO: preprocessing is largely done in iq_feed_cleaning... probably can just import and use as is 
spy: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/SPY_1min.parquet")
spy.columns = [x.strip("adj_") for x in spy.columns]
spy.columns = [f"SPY_{x}" for x in spy.columns]

In [7]:
onlyfiles = [f for f in listdir(config.data.iqfeed.minute.cleaned) if isfile(join(config.data.iqfeed.minute.cleaned, f))]
tickers = [x.split("_")[0] for x in onlyfiles]
ticker = news.stocks.iloc[0]

In [11]:
prices: pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")
prices.columns = [x.strip("adj_") for x in prices.columns]
prices = prices.sort_values("time")

## Handle Ticker:Company Mapping

In [12]:
"""
Handling of multiple tickers for a the same company.
If there is only one price time series available for the company, we simply group together the tickers.
However in some cases we will have multiple price time series for the same company.

E.g. in case of Alphabet (Google) we have two different tickers and two different stock prices for the same
underlying company. Here `GOOG` and `GOOGL` describe two different classes of stock for the same company.
In this case we will try to only look at the main class. 

We find this class by choosing the Symbol with the longer stock price history, assuming that the history
of it includes(!) the history of the other one completely.
If one time series doesn't include the other we merge the two time series. Ideally based on which time series has more liquidity 
in a given week or but we will simply decide that the newer time series takes precedence for simplicity. 
"""

"\nHandling of multiple tickers for a the same company.\nIf there is only one price time series available for the company, we simply group together the tickers.\nHowever in some cases we will have multiple price time series for the same company.\n\nE.g. in case of Alphabet (Google) we have two different tickers and two different stock prices for the same\nunderlying company. Here `GOOG` and `GOOGL` describe two different classes of stock for the same company.\nIn this case we will try to only look at the main class. \n\nWe find this class by choosing the Symbol with the longer stock price history, assuming that the history\nof it includes(!) the history of the other one completely.\nIf one time series doesn't include the other we merge the two time series. Ideally based on which time series has more liquidity \nin a given week or but we will simply decide that the newer time series takes precedence for simplicity. \n"

In [14]:
def get_time_interval(ticker):
    try:
        price : pd.DataFrame = pd.read_parquet(path=f"{config.data.iqfeed.minute.cleaned}/{ticker}_1min.parquet")
    except FileNotFoundError:
        return (np.NaN, np.NaN)
    time_interval = price.index.min(), price.index.max()
    return time_interval

In [15]:
ticker_mapper = pd.read_parquet("data_shared/ticker_name_mapper_reduced.parquet")
ticker_mapper[["first_date", "last_date"]] = np.NaN

In [16]:
for i in ticker_mapper.index:
    ticker_mapper.loc[i, ["first_date", "last_date"]] = get_time_interval(ticker_mapper.loc[i, "stocks"])

In [17]:
ticker_mapper.dropna(inplace=True)
ticker_mapper[["first_date", "last_date"]] = ticker_mapper[["first_date", "last_date"]].apply(pd.to_datetime, axis=0)

In [28]:
def consolidate_tickers(df: pd.DataFrame) -> pd.DataFrame:
    # We want only one ticker per company name
    df = df.sort_values(["last_date", "first_date"], ascending=[False, True])
    df.loc[df.index[0], "is_primary_ticker"] = True
    return df

In [44]:
ticker_mapper_consolidated = ticker_mapper.copy(deep=True)
ticker_mapper_consolidated["is_primary_ticker"] = False
ticker_mapper_consolidated = ticker_mapper_consolidated.groupby("company_name", as_index=False).apply(consolidate_tickers)
ticker_mapper_consolidated.shape

(8133, 6)

In [45]:
ticker_mapper_consolidated[ticker_mapper_consolidated.is_primary_ticker].shape

(8071, 6)

In [175]:
ticker_mapper_consolidated.to_parquet("data_shared/ticker_name_mapper_consolidated.parquet")

In [70]:
# df = news
def get_primary_ticker(ticker, mapper):
    company_name = mapper.loc[mapper["stocks"] == ticker, "company_name"]
    if len(company_name) == 0:
        # No matching entry
        return None
    else:
        company_name = company_name.iat[0]
    primary_ticker = mapper.loc[(mapper["company_name"] == company_name) & mapper.is_primary_ticker, "stocks"].iat[0]
    return primary_ticker

In [205]:
# Overwrite tickers with consolidated ticker, i.e. the ticker of the time series we use to construct input-output pairs
news["stocks"] = news.stocks.map(lambda ticker: get_primary_ticker(ticker, mapper=ticker_mapper_consolidated))

In [206]:
news.shape

(190, 11)

In [207]:
news.dropna(inplace=True)

In [208]:
news.dropna().shape

(185, 11)

## Merge

In [209]:
# TODO: Also merge with  non-adjusted prices. We don't trade penny stocks.
# If the price is smaller than 1 when the news come out we don't trade.

In [223]:
ticker_news = news[news.stocks == ticker]
merged = pd.merge(ticker_news, prices, left_on="entry_time", right_on="time")

In [224]:
merged = pd.merge(merged, prices, left_on="nn_exit_time", right_on="time", suffixes=("_entry", "_exit"))
# We use the O part of the OHLC for intra day candles here for convenienece as well
merged["r"] = merged["open_exit"] / merged["open_entry"] - 1

In [225]:
merged[["time", "entry_time", "nn_exit_time"]]

Unnamed: 0,time,entry_time,nn_exit_time
0,2010-01-03 18:05:02-05:00,2010-01-04 09:31:00-05:00,2010-01-04 16:01:00-05:00
1,2010-01-04 11:00:03-05:00,2010-01-04 11:02:00-05:00,2010-01-05 16:01:00-05:00
2,2010-01-05 09:03:43-05:00,2010-01-05 09:31:00-05:00,2010-01-05 16:01:00-05:00


In [226]:
# Ideally we do this for every stock first and then we come back with the complete dataframe... (depends on if it fits in memory)
# Merge news and stock prices with spy prices
# merged = pd.merge_asof(merged, spy, left_on="entry_time", right_on="time", direction="forward")
merged = pd.merge(merged, spy, left_on="entry_time", right_on="time")

In [227]:
# TODO: Don't use intraday as exit here (closing candle) but the actual closing auction...
# But for that we need the daily time series, not with minute frequency
merged = pd.merge(merged, spy, left_on="nn_exit_time", right_on="time", suffixes=("_entry", "_exit"))

merged.loc[:, "r_spy"] = merged["SPY_close_exit"] / merged["SPY_close_entry"] - 1
merged.loc[:, "r_mkt_adj"] = merged["r_spy"] - merged["r"]

In [228]:
merged.head(3)

Unnamed: 0,time,stocks,title,channels,body,author,company_name,short_name,parsed_body,entry_time,...,SPY_volume_entry,SPY_cum_split_ratio_entry,SPY_open_exit,SPY_high_exit,SPY_low_exit,SPY_close_exit,SPY_volume_exit,SPY_cum_split_ratio_exit,r_spy,r_mkt_adj
0,2010-01-03 18:05:02-05:00,JBLU,JetBlue to Waive Change Fees and Fare Differen...,[],"NEW YORK, Jan. 3 /PRNewswire-FirstCall/ -- Jet...",PRNewswire,JetBlue Airways Corporation,JetBlue Airways,FirstCall the company will waive change...,2010-01-04 09:31:00-05:00,...,1631609.0,0.768816,87.11458,87.129956,87.076139,87.091515,2674731.0,0.768816,0.00774,0.004129
1,2010-01-04 11:00:03-05:00,JBLU,Celebrate 2010 with JetBlue Airways' Shake Up ...,[],"NEW YORK, Jan. 4 /PRNewswire-FirstCall/ -- To ...",PRNewswire,JetBlue Airways Corporation,JetBlue Airways,"FirstCall To celebrate 2010, the company ...",2010-01-04 11:02:00-05:00,...,186030.1,0.768816,87.360601,87.375977,87.352913,87.368289,927326.8,0.768816,0.004428,-0.047636
2,2010-01-05 09:03:43-05:00,JBLU,JetBlue Expands in San Francisco Today,[],"SAN FRANCISCO, Jan. 5 /PRNewswire-FirstCall/ -...",PRNewswire,JetBlue Airways Corporation,JetBlue Airways,FirstCall the company today kicks off a...,2010-01-05 09:31:00-05:00,...,2149897.0,0.768816,87.360601,87.375977,87.352913,87.368289,927326.8,0.768816,0.002735,-0.058859


In [229]:
merged = merged.loc[:, ["time", "stocks", "parsed_body", "entry_time", "r_mkt_adj"]]

In [230]:
merged.dtypes

time           datetime64[ns, US/Eastern]
stocks                             object
parsed_body                        object
entry_time     datetime64[ns, US/Eastern]
r_mkt_adj                         float64
dtype: object

In [231]:
merged.head(3)

Unnamed: 0,time,stocks,parsed_body,entry_time,r_mkt_adj
0,2010-01-03 18:05:02-05:00,JBLU,FirstCall the company will waive change...,2010-01-04 09:31:00-05:00,0.004129
1,2010-01-04 11:00:03-05:00,JBLU,"FirstCall To celebrate 2010, the company ...",2010-01-04 11:02:00-05:00,-0.047636
2,2010-01-05 09:03:43-05:00,JBLU,FirstCall the company today kicks off a...,2010-01-05 09:31:00-05:00,-0.058859


# Save to Files 

Columns: `time|  stocks  |body  |entry_time  |r_mkt_adj`

In [232]:
# Splitting training and test set
merged_test = merged.loc[merged.time >= config.model.data.cutoff_date]
merged_train = merged.loc[merged.time < config.model.data.cutoff_date]
assert merged_test.shape[0] + merged_train.shape[0] == merged.shape[0]

In [233]:
merged_train

Unnamed: 0,time,stocks,parsed_body,entry_time,r_mkt_adj
0,2010-01-03 18:05:02-05:00,JBLU,FirstCall the company will waive change...,2010-01-04 09:31:00-05:00,0.004129
1,2010-01-04 11:00:03-05:00,JBLU,"FirstCall To celebrate 2010, the company ...",2010-01-04 11:02:00-05:00,-0.047636
2,2010-01-05 09:03:43-05:00,JBLU,FirstCall the company today kicks off a...,2010-01-05 09:31:00-05:00,-0.058859


In [235]:
config.model.data.training
config.model.data.testing

'D:/Data/NN_Training'