In [4]:
import pandas as pd 
import numpy as np
import pandas_market_calendars as mcal
from os import listdir
from os.path import isfile, join
import plotly.express as px

from src.config import TEST_CUTOFF_DATE

import pytz
eastern = pytz.timezone('US/Eastern')
nyse_cal = mcal.get_calendar('NYSE')

[Python numpy: cannot convert datetime64[ns] to datetime64[D] (to use with Numba)](https://stackoverflow.com/a/76139900/9079015)

# Minutely Data

## Configuration

In [1]:
# List off functions for modifying the return
raw_iq_feed_data_dir = "D:/IQFeedData"

# Input-Output-Rows for the neural network training and validation
train_output_dir = "D:/Data/NN_Training"

# Input-Rows for testing
test_output_dir = "D:/Data/NN_Testing"

In [5]:
def get_next_available_candle(prices: pd.DataFrame, 
                              time: pd.Timestamp) -> pd.Series:
    entry_candle_idx = prices.index.get_indexer(target=[time], 
                                                method="bfill")
    entry_candle = prices.take(entry_candle_idx).iloc[0]
    return entry_candle

In [6]:
def get_appropriate_closing_time(time: pd.Timestamp) -> pd.Timestamp:
    if (time.hour < 9) or ((time.hour == 9) and (time.minute < 30)):
        return pd.Timestamp(year=time.year, month=time.month, day=time.day)
    else:
        valid_days = [x.date() for x in nyse_cal.valid_days(start_date=time.date(), end_date=time.date() + pd.DateOffset(days=10))]
        i = 1
        while True:
            new_time = time + pd.DateOffset(days=i)
            if new_time.date() in valid_days:
                return pd.Timestamp(year=new_time.year, month=new_time.month, day=new_time.day)
            if i == 7:
                return ValueError()
            i += 1

In [139]:
def filter_trading_hours(df, time_column):
    T = df[time_column].dt
    min_mask = (T.hour >= 10) | ((T.hour == 9) & (T.minute >= 31))
    max_mask = (T.hour < 16) | ((T.hour == 16) & (T.minute <= 1))
    return df.loc[min_mask & max_mask, :]

In [140]:
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame: 
    if "time" in prices.columns:
        # Intra-day data
        prices.loc[:, "time"] = prices.loc[:, "time"].dt.tz_localize(None)
        prices.loc[:, "time"] = prices.loc[:, "time"].dt.tz_localize(eastern)
        prices.drop_duplicates(keep="first", inplace=True)
        prices.dropna(inplace=True)
        
        prices = filter_trading_hours(df=prices, time_column="time")

        # Deals with duplicate rows which occurr when not all the digits for volume are 
        # correctly entered, but only the first 1-3. So keep the largest.
        prices = prices.sort_values(["time", "volume"], ascending=[True, False])
        prices = prices.drop_duplicates(subset=["time"], keep="first")

        prices.set_index("time", inplace=True)
        prices.sort_index(ascending=True, inplace=True)
        assert prices.index.is_unique
    else:
        # Daily data
        prices.dropna(inplace=True)
        prices["date"] = pd.to_datetime(prices.date)
    return prices

## Import and Preprocess News 

In [151]:
df = pd.read_parquet(path="data/unraw2_bzg/data-10.parquet")
df["time"] = df.time.dt.tz_convert(eastern)

# TODO: This can be *improved* by saying that if we are very close to completing the minute e.g. :55, 
# then we dont take the next candle (T+1), but the candle after the next(T+2).
df["entry_time"] = df["time"].dt.ceil("min")


# Necessary to get `us` units, otherwise pandas will always convert back to `ns` for some reason.
df["nn_exit_time"] = df["time"].map(get_appropriate_closing_time)

## Import and Preprocess Stock Prices

In [142]:
spy: pd.DataFrame = preprocess_iq_feed_prices(pd.read_parquet(path=f"{raw_iq_feed_data_dir}/SPY_1min.parquet", 
                                                              columns=["time", "close", "volume"]))
spy_daily: pd.DataFrame = preprocess_iq_feed_prices(pd.read_parquet(path=f"{raw_iq_feed_data_dir}/daily/SPY_daily.parquet", 
                                                                    columns=["date", "close", "volume"]))

In [143]:
onlyfiles = [f for f in listdir(raw_iq_feed_data_dir) if isfile(join(raw_iq_feed_data_dir, f))]
tickers = [x.split("_")[0] for x in onlyfiles]
ticker = "GOOGL"

In [144]:
prices: pd.DataFrame = preprocess_iq_feed_prices(pd.read_parquet(path=f"{raw_iq_feed_data_dir}/{ticker}_1min.parquet", 
                                                                 columns=["time", "close", "open", "volume"]))
prices_daily: pd.DataFrame = preprocess_iq_feed_prices(pd.read_parquet(path=f"{raw_iq_feed_data_dir}/daily/{ticker}_daily.parquet", 
                                                                       columns=["date", "close", "open", "volume"]))

## Merge

In [152]:
ticker_news = df[df.stocks == ticker]
merged = pd.merge_asof(ticker_news, prices, left_on="entry_time", right_on="time", direction="forward")
merged = pd.merge(merged, prices_daily, left_on="nn_exit_time", right_on="date", suffixes=("_entry", "_exit"))
merged["r"] = merged["close_exit"] / merged["close_entry"] - 1

In [153]:
ticker_news

Unnamed: 0_level_0,time,stocks,author,title,channels,body,entry_time,nn_exit_time
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [148]:
# Ideally we do this for every stock first and then we come back with the complete dataframe... (depends on if it fits in memory)
# Merge news and stock prices with spy prices
merged = pd.merge_asof(merged, spy, left_on="entry_time", right_on="time", direction="forward")

# TODO: Don't use intraday as exit here (closing candle) but the actual closing auction...
# But for that we need the daily time series, not with minute frequency
merged = pd.merge_asof(merged, spy_daily, left_on="nn_exit_time", right_on="date", suffixes=("_spy_entry", "_spy_exit"))

merged.loc[:, "r_spy"] = merged["close_spy_exit"] / merged["close_spy_entry"] - 1
merged.loc[:, "r_mkt_adj"] = merged["r_spy"] - merged["r"]

In [149]:
merged.head(3)

Unnamed: 0,time,stocks,author,title,channels,body,entry_time,nn_exit_time,close_entry,open_entry,...,open_exit,volume_exit,r,close_spy_entry,volume_spy_entry,date_spy_exit,close_spy_exit,volume_spy_exit,r_spy,r_mkt_adj


In [None]:
merged = merged.loc[:, ["time", "stocks", "body", "entry_time", "r_mkt_adj"]]

In [14]:
merged.dtypes

time          datetime64[us, US/Eastern]
stocks                    string[python]
body                      string[python]
entry_time    datetime64[us, US/Eastern]
r_mkt_adj                        float64
dtype: object

In [15]:
merged.head(3)

Unnamed: 0,time,stocks,body,entry_time,r_mkt_adj
0,2011-04-11 11:38:40-04:00,AAPL,(via COMTEX News Network)-- SmarTrend has det...,2011-04-11 11:39:00-04:00,-0.964246
1,2011-04-12 14:28:03-04:00,AAPL,(via COMTEX News Network)-- Can Apple Punch T...,2011-04-12 14:29:00-04:00,-0.96389
2,2011-04-18 13:48:06-04:00,AAPL,(via COMTEX News Network)-- Eye on Apple Supp...,2011-04-18 13:49:00-04:00,-0.962895


# Save to Files 

Columns: `time|  stocks  |body  |entry_time  |r_mkt_adj`

In [16]:
# Splitting training and test set
merged_test = merged.loc[merged.time >= TEST_CUTOFF_DATE]
merged_train = merged.loc[merged.time < TEST_CUTOFF_DATE]
assert merged_test.shape[0] + merged_train.shape[0] == merged.shape[0]

In [18]:
merged_train

Unnamed: 0,time,stocks,body,entry_time,r_mkt_adj
0,2011-04-11 11:38:40-04:00,AAPL,(via COMTEX News Network)-- SmarTrend has det...,2011-04-11 11:39:00-04:00,-0.964246
1,2011-04-12 14:28:03-04:00,AAPL,(via COMTEX News Network)-- Can Apple Punch T...,2011-04-12 14:29:00-04:00,-0.96389
2,2011-04-18 13:48:06-04:00,AAPL,(via COMTEX News Network)-- Eye on Apple Supp...,2011-04-18 13:49:00-04:00,-0.962895
3,2011-04-28 12:22:04-04:00,AAPL,(via COMTEX News Network)-- SmarTrend has det...,2011-04-28 12:23:00-04:00,-0.964007
4,2011-05-12 12:22:07-04:00,AAPL,(via COMTEX News Network)-- SmarTrend has det...,2011-05-12 12:23:00-04:00,-0.964722
5,2011-05-16 11:38:03-04:00,AAPL,(via COMTEX News Network)-- SmarTrend has det...,2011-05-16 11:39:00-04:00,-0.96448
6,2011-05-19 12:16:11-04:00,AAPL,(via COMTEX News Network)-- Eye on Apple By M...,2011-05-19 12:17:00-04:00,-0.964797
7,2011-05-24 13:10:08-04:00,AAPL,(via COMTEX News Network)-- SmarTrend has det...,2011-05-24 13:11:00-04:00,-0.963948
8,2011-06-06 13:30:06-04:00,AAPL,(via COMTEX News Network)-- Last updated June...,2011-06-06 13:31:00-04:00,-0.96538
9,2011-06-07 13:00:03-04:00,AAPL,(via COMTEX News Network)-- Apple Testing Sup...,2011-06-07 13:01:00-04:00,-0.964429


In [None]:
train_output_dir
test_output_dir

In [69]:
prices[(prices.index.hour == 16) & (prices.index.minute==0)]

Unnamed: 0_level_0,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-06-29 16:00:00-04:00,23.9400,243648.0
2010-06-30 16:00:00-04:00,23.6300,59289.0
2010-07-01 16:00:00-04:00,21.9000,27885.0
2010-07-02 16:00:00-04:00,19.2000,34925.0
2010-07-06 16:00:00-04:00,15.9800,101401.0
...,...,...
2023-12-11 16:00:00-05:00,239.6238,674959.0
2023-12-12 16:00:00-05:00,236.9300,597481.0
2023-12-13 16:00:00-05:00,239.2900,619644.0
2023-12-14 16:00:00-05:00,251.1401,729480.0
