In [40]:
import pandas as pd 
import numpy as np
import pandas_market_calendars as mcal
from os import listdir
from os.path import isfile, join

from src.config import TEST_CUTOFF_DATE

import pytz
eastern = pytz.timezone('US/Eastern')

# Minutely Data

## Configuration

In [41]:
# List off functions for modifying the return
raw_iq_feed_data_dir = "D:/IQFeedData"

# Input-Output-Rows for the neural network training and validation
train_output_dir = "D:/Data/NN_Training"

# Input-Rows for testing
test_output_dir = "D:/Data/NN_Testing"

## Import IQFeed Stocks

In [42]:
def get_next_available_candle(prices: pd.DataFrame, 
                              time: pd.Timestamp) -> pd.Series:
    entry_candle_idx = prices.index.get_indexer(target=[time], 
                                                method="bfill")
    entry_candle = prices.take(entry_candle_idx).iloc[0]
    return entry_candle

In [43]:
nyse_cal = mcal.get_calendar('NYSE')
def get_appropriate_closing_time(time: pd.Timestamp) -> pd.Timestamp:
    if (time.hour < 9) or ((time.hour == 9) and (time.minute < 30)):
        return pd.Timestamp(year=time.year, month=time.month, day=time.day, hour=16, minute=0, tz=time.tz, unit="us")
    else:
        valid_days = [x.date() for x in nyse_cal.valid_days(start_date=time.date(), end_date=time.date() + pd.DateOffset(days=10))]
        i = 1
        while True:
            new_time = time + pd.DateOffset(days=i)
            if new_time.date() in valid_days:
                return pd.Timestamp(year=new_time.year, month=new_time.month, day=new_time.day, hour=16, minute=0, tz=time.tz, unit="us")
            if i == 7:
                return ValueError()
            i += 1

In [44]:
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame: 
    prices.loc[:, "time"] = prices.loc[:, "time"].dt.tz_localize(None)
    prices.loc[:, "time"] = prices.loc[:, "time"].dt.tz_localize(eastern)
    prices.drop_duplicates(keep="first", inplace=True)
    prices.dropna(inplace=True)

    # Deals with duplicate rows which occurr when not all the digits for volume are 
    # correctly entered, but only the first 1-3. So keep the largest.
    prices = prices.sort_values(["time", "volume"], ascending=[True, False])
    prices = prices.drop_duplicates(subset=["time"], keep="first")

    prices.set_index("time", inplace=True)
    prices.sort_index(ascending=True, inplace=True)
    assert prices.index.is_unique
    return prices

## Import and Preprocess News 

In [45]:
df = pd.read_parquet("data/unraw2_bzg/data-5.parquet")

In [46]:
# Direct assignment doesnt work here, if df.time 
# already has a tz then it doesnt change on assignment 
tmp = df.time.dt.tz_convert(eastern)
df.loc[:, "time"] = 0
df.loc[:, "time"] = tmp

In [47]:
# TODO: This can be *improved* by saying that if we are very close to completing the minute e.g. :55, 
# then we dont take the next candle (T+1), but the candle after the next(T+2).
df.loc[:, "entry_time"] = df.loc[:, "time"].dt.ceil("min")

In [48]:
# Necessary to get `us` units, otherwise pandas will always convert back to `ns` for some reason.
df.loc[:, "nn_exit_time"] = df.loc[:, "time"]
df.loc[:, "nn_exit_time"] = df.loc[:, "time"].map(get_appropriate_closing_time)

## Import and Preprocess Stock Prices

In [49]:
spy: pd.DataFrame = pd.read_parquet(f"{raw_iq_feed_data_dir}/SPY_1min.parquet", columns=["time", "close", "volume"])
spy = preprocess_iq_feed_prices(spy)
spy.head(3)

Unnamed: 0_level_0,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-04 04:08:00-05:00,112.25,1000.0
2010-01-04 04:09:00-05:00,112.25,1000.0
2010-01-04 04:16:00-05:00,112.22,19030.0


In [50]:
ticker = "FITB"
prices: pd.DataFrame = pd.read_parquet(f"{raw_iq_feed_data_dir}/{ticker}_1min.parquet", columns=["time", "close", "volume"])
prices = preprocess_iq_feed_prices(prices)

onlyfiles = [f for f in listdir(raw_iq_feed_data_dir) if isfile(join(raw_iq_feed_data_dir, f))]
tickers = [x.split("_")[0] for x in onlyfiles]

In [51]:
ticker_news = df[df.stocks == ticker]
merged = pd.merge_asof(ticker_news, prices, left_on="entry_time", right_on="time", direction="forward")

In [None]:
merged = pd.merge(merged, prices, left_on="nn_exit_time", right_on="time", suffixes=("_entry", "_exit"))
merged["r"] = merged["close_exit"] / merged["close_entry"] - 1

In [None]:
# Ideally we do this for every stock first and then we come back with the complete dataframe... (depends on if it fits in memory)
# Merge news and stock prices with spy prices
merged = pd.merge_asof(merged, spy, left_on="entry_time", right_on="time", direction="forward")
merged = pd.merge_asof(merged, spy, left_on="nn_exit_time", right_on="time", suffixes=("_spy_entry", "_spy_exit"))
merged.loc[:, "r_spy"] = merged["close_spy_exit"] / merged["close_spy_entry"] - 1
merged.loc[:, "r_mkt_adj"] = merged["r_spy"] = merged["r"]
merged = merged.loc[:, ["time", "stocks", "body", "entry_time", "r_mkt_adj"]]

In [None]:
merged.dtypes

In [13]:
merged.head(3)

Unnamed: 0,time,stocks,body,entry_time,r_mkt_adj
0,2011-04-14 10:40:05-04:00,FITB,(via COMTEX News Network)-- SmarTrend identif...,2011-04-14 10:41:00-04:00,0.017337
1,2011-04-21 10:38:05-04:00,FITB,(via COMTEX News Network)-- SmarTrend identif...,2011-04-21 10:39:00-04:00,-0.000781
2,2011-04-21 11:36:05-04:00,FITB,(via COMTEX News Network)-- SmarTrend has det...,2011-04-21 11:37:00-04:00,-0.01374


# Save to Files 

In [55]:
# Splitting training and test set
merged_test = merged.loc[merged.time >= TEST_CUTOFF_DATE]
merged_train = merged.loc[merged.time < TEST_CUTOFF_DATE]
assert merged_test.shape[0] + merged_train.shape[0] == merged.shape[0]

In [None]:
train_output_dir
test_output_dir