In [None]:
import pandas as pd 
import numpy as np
import pandas_market_calendars as mcal
from os import listdir
from os.path import isfile, join
import plotly.express as px

import pytz
# Timezone object used for every intraday timestamp in this notebook.
eastern = pytz.timezone('US/Eastern')
# NYSE calendar, queried for valid session days when choosing an exit date.
nyse_cal = mcal.get_calendar('NYSE')

from dotmap import DotMap
import yaml
# Load the project configuration. The file is opened in a `with` block so the
# handle is closed as soon as the YAML has been parsed.
with open("src/config.yaml") as config_file:
    config = DotMap(yaml.safe_load(config_file), _dynamic=False)

[Python numpy: cannot convert datetime64[ns] to datetime64[D] (to use with Numba)](https://stackoverflow.com/a/76139900/9079015)

In [None]:
def get_next_available_candle(prices: pd.DataFrame, 
                              time: pd.Timestamp) -> pd.Series:
    """Return the first candle whose index label is at or after ``time``.

    Args:
        prices: Candles indexed by timestamp. The index must be sorted, as the
            ``bfill`` lookup requires a monotonic index.
        time: Moment from which to search forward.

    Returns:
        The matching row of ``prices`` as a Series (its ``name`` is the candle's
        index label).

    Raises:
        KeyError: If ``prices`` holds no candle at or after ``time``.
    """
    entry_candle_idx = prices.index.get_indexer(target=[time], method="bfill")
    position = entry_candle_idx[0]
    # get_indexer reports "no match" as -1. Used as a position, -1 would silently
    # select the LAST candle, i.e. one that lies before `time`.
    if position == -1:
        raise KeyError(f"No candle available at or after {time}")
    return prices.iloc[position]

In [None]:
def get_appropriate_closing_time(time: pd.Timestamp) -> pd.Timestamp:
    """Pick the calendar day used as the exit for an event at ``time``.

    A timestamp strictly before 09:30 maps to its own calendar day. Anything
    from 09:30 onwards maps to the first NYSE session day after it.

    Args:
        time: Timestamp of the event; only its wall-clock fields are inspected.

    Returns:
        Midnight of the selected day as a timezone-naive ``pd.Timestamp``.

    Raises:
        ValueError: If none of the seven calendar days following ``time`` is an
            NYSE session day.
    """
    if (time.hour < 9) or ((time.hour == 9) and (time.minute < 30)):
        # NOTE(review): this branch does not check that the day itself is an NYSE
        # session, so a pre-market timestamp on a weekend/holiday maps to a
        # non-trading day — confirm that is intended.
        return pd.Timestamp(year=time.year, month=time.month, day=time.day)

    valid_days = {x.date() for x in nyse_cal.valid_days(start_date=time.date(), end_date=time.date() + pd.DateOffset(days=10))}
    for day_shift in range(1, 8):
        new_time = time + pd.DateOffset(days=day_shift)
        if new_time.date() in valid_days:
            return pd.Timestamp(year=new_time.year, month=new_time.month, day=new_time.day)
    # Previously the exception object was *returned*, so callers received a
    # ValueError instance as if it were a timestamp.
    raise ValueError(f"No NYSE session day found within 7 days after {time}")

In [None]:
def filter_trading_hours(df, time_column):
    """Keep the rows whose time of day lies between 09:31 and 16:01 inclusive.

    Only the hour and minute of ``df[time_column]`` are considered; seconds are
    ignored. Rows with a missing timestamp are dropped because they satisfy
    neither bound.

    Args:
        df: Frame containing a datetime column.
        time_column: Name of that datetime column.

    Returns:
        The rows of ``df`` inside the window, in their original order.
    """
    clock = df[time_column].dt
    minute_of_day = clock.hour * 60 + clock.minute
    first_minute = 9 * 60 + 31   # 09:31
    last_minute = 16 * 60 + 1    # 16:01
    in_session = (minute_of_day >= first_minute) & (minute_of_day <= last_minute)
    return df.loc[in_session, :]

In [None]:
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame: 
    """Clean a raw price frame, without modifying the frame passed in.

    Intraday input (has a ``time`` column):
      * any timezone on ``time`` is discarded (wall-clock values are kept) and
        the values are re-labelled as US/Eastern;
      * exact duplicate rows and rows containing missing values are removed;
      * rows outside the window kept by ``filter_trading_hours`` are removed;
      * where several rows share a timestamp, the one with the largest
        ``volume`` is kept;
      * the result is indexed by ``time`` in ascending order.

    Daily input (no ``time`` column): rows containing missing values are removed
    and ``date`` is converted with ``pd.to_datetime``.

    Args:
        prices: Raw prices. Intraday data needs ``time`` and ``volume`` columns;
            daily data needs a ``date`` column.

    Returns:
        A new, cleaned DataFrame.
    """
    if "time" in prices.columns:
        # Intra-day data
        wall_clock = prices["time"].dt.tz_localize(None)
        # Replace the column outright: the dtype changes from naive to tz-aware,
        # which an in-place `.loc[:, "time"] = ...` write cannot do reliably.
        prices = prices.assign(time=wall_clock.dt.tz_localize(eastern))
        prices = prices.drop_duplicates(keep="first").dropna()

        prices = filter_trading_hours(df=prices, time_column="time")

        # Deals with duplicate rows which occur when not all the digits for volume are
        # correctly entered, but only the first 1-3. So keep the largest.
        prices = prices.sort_values(["time", "volume"], ascending=[True, False])
        prices = prices.drop_duplicates(subset=["time"], keep="first")

        prices = prices.set_index("time").sort_index(ascending=True)
        assert prices.index.is_unique
    else:
        # Daily data
        prices = prices.dropna()
        prices = prices.assign(date=pd.to_datetime(prices["date"]))
    return prices

## Import and Preprocess News 

In [None]:
# Load the news items and express their timestamps in US/Eastern
# (`tz_convert` requires the stored `time` column to be timezone-aware).
df = pd.read_parquet(path="data/unraw2_bzg/data-10.parquet")
df["time"] = df.time.dt.tz_convert(eastern)

# Entry time: the news timestamp rounded up to the next full minute.
# TODO: This can be *improved* by saying that if we are very close to completing the minute, e.g. :55,
# then we don't take the next candle (T+1), but the candle after the next (T+2).
df["entry_time"] = df["time"].dt.ceil("min")


# Exit day for every news item, computed row by row with `get_appropriate_closing_time`.
# NOTE(review): the earlier comment here read "Necessary to get `us` units, otherwise pandas will
# always convert back to `ns` for some reason", but nothing on this line sets a unit — confirm
# whether that remark still applies.
df["nn_exit_time"] = df["time"].map(get_appropriate_closing_time)

## Import and Preprocess Stock Prices

In [None]:
# SPY prices: 1-minute candles (`spy`) and daily rows (`spy_daily`), each cleaned
# with `preprocess_iq_feed_prices`.
spy: pd.DataFrame = preprocess_iq_feed_prices(pd.read_parquet(path=f"{config.data.iqfeed.minute.raw}/SPY_1min.parquet", 
                                                              columns=["time", "close", "volume"]))
spy_daily: pd.DataFrame = preprocess_iq_feed_prices(pd.read_parquet(path=f"{config.data.iqfeed.daily.raw}/SPY_daily.parquet", 
                                                                    columns=["date", "close", "volume"]))

In [None]:
# Regular files found directly in the minute-data directory (sub-directories are skipped).
onlyfiles = [f for f in listdir(config.data.iqfeed.minute.raw) if isfile(join(config.data.iqfeed.minute.raw, f))]
# Ticker = the part of each file name before the first underscore.
tickers = [x.split("_")[0] for x in onlyfiles]
# The single ticker processed by the cells below.
ticker = "GOOGL"

In [None]:
# Minute and daily prices for the selected ticker, each cleaned with `preprocess_iq_feed_prices`.
# NOTE(review): the daily file is read from `<minute.raw>/daily/`, whereas the SPY daily file is
# read from `config.data.iqfeed.daily.raw` — confirm both point at the same directory.
prices: pd.DataFrame = preprocess_iq_feed_prices(pd.read_parquet(path=f"{config.data.iqfeed.minute.raw}/{ticker}_1min.parquet", 
                                                                 columns=["time", "close", "open", "volume"]))
prices_daily: pd.DataFrame = preprocess_iq_feed_prices(pd.read_parquet(path=f"{config.data.iqfeed.minute.raw}/daily/{ticker}_daily.parquet", 
                                                                       columns=["date", "close", "open", "volume"]))

## Merge

In [None]:
# News for the selected ticker. `merge_asof` raises unless the left key is sorted,
# so order by entry time first (a stable sort leaves already-sorted data untouched).
ticker_news = df[df.stocks == ticker].sort_values("entry_time", kind="stable")

# Entry: the first minute candle at or after each entry time.
merged = pd.merge_asof(ticker_news, prices, left_on="entry_time", right_on="time", direction="forward")
# Exit: the daily row whose date equals the exit day. This is an inner join, so news
# without a matching daily row is dropped. Price columns present on both sides get
# the `_entry` (minute candle) / `_exit` (daily row) suffixes.
merged = pd.merge(merged, prices_daily, left_on="nn_exit_time", right_on="date", suffixes=("_entry", "_exit"))
# Simple return from the entry candle's close to the exit day's close.
merged["r"] = merged["close_exit"] / merged["close_entry"] - 1

In [None]:
# Preview a few rows rather than rendering the whole frame.
ticker_news.head(3)

In [None]:
# Ideally we do this for every stock first and then come back with the complete dataframe
# (depends on whether it fits in memory).
# Merge news and stock prices with SPY prices: first the SPY minute candle at or after
# each entry time.
merged = pd.merge_asof(merged, spy, left_on="entry_time", right_on="time", direction="forward")

# TODO: Don't use intraday as exit here (closing candle) but the actual closing auction...
# But for that we need the daily time series, not with minute frequency
# NOTE(review): the exit below already comes from `spy_daily`, so this TODO looks outdated — confirm.
# With the default backward direction each row gets the latest SPY daily row dated on or
# before `nn_exit_time`. Suffixes apply only to columns present on both sides, so
# `close_spy_entry` is the SPY minute close brought in by the merge above.
merged = pd.merge_asof(merged, spy_daily, left_on="nn_exit_time", right_on="date", suffixes=("_spy_entry", "_spy_exit"))

merged["r_spy"] = merged["close_spy_exit"] / merged["close_spy_entry"] - 1
# Market-adjusted return = stock return minus market (SPY) return.
# The operands were previously swapped, which flipped the sign of every value.
merged["r_mkt_adj"] = merged["r"] - merged["r_spy"]

In [None]:
# Inspect the first rows after both SPY merges (all columns still present).
merged.head(3)

In [None]:
# Reduce the frame to the news fields, the entry time and the market-adjusted return.
merged = merged.loc[:, ["time", "stocks", "body", "entry_time", "r_mkt_adj"]]

In [None]:
# Check the column dtypes of the reduced frame.
merged.dtypes

In [None]:
# Inspect the first rows of the reduced frame.
merged.head(3)

## Save to Files

Columns: `time | stocks | body | entry_time | r_mkt_adj`

In [None]:
# Splitting training and test set: rows at or after the configured cutoff date go to the
# test set, earlier rows to the training set.
merged_test = merged.loc[merged.time >= config.model.data.cutoff_date]
merged_train = merged.loc[merged.time < config.model.data.cutoff_date]
# The two parts must account for every row of `merged`.
assert merged_test.shape[0] + merged_train.shape[0] == merged.shape[0]

In [None]:
# Preview a few rows rather than rendering the whole training frame.
merged_train.head(3)

In [None]:
# Evaluates the two configured entries; only the last expression of a cell is echoed,
# so just `testing` is displayed.
# NOTE(review): this section is headed "Save to Files", yet nothing visible here writes
# `merged_train` / `merged_test` anywhere — TODO confirm whether the save step is missing.
config.model.data.training
config.model.data.testing

In [None]:
# Inspect the minute candles stamped exactly 16:00.
prices[(prices.index.hour == 16) & (prices.index.minute==0)]