In [1]:
import pandas as pd 
import numpy as np
import json
import pandas_market_calendars as mcal

# Minutely Data

## Configuration

In [2]:
# List of functions for modifying the return

## Import IQFeed Stocks

In [230]:
ticker = "FITB"

In [231]:
def get_next_available_candle(prices: pd.DataFrame, 
                              time: pd.Timestamp) -> pd.Series:
    """Return the first candle whose index value is at or after ``time``.

    Parameters
    ----------
    prices : pd.DataFrame
        Candles indexed by timestamp. The index has to be monotonic, since the
        lookup relies on ``Index.get_indexer(..., method="bfill")``.
    time : pd.Timestamp
        Earliest acceptable candle time.

    Returns
    -------
    pd.Series
        The matching row of ``prices``.

    Raises
    ------
    KeyError
        If ``prices`` holds no candle at or after ``time``.
    """
    position = prices.index.get_indexer(target=[time], method="bfill")[0]
    # get_indexer signals "no match" with -1; used as a position, -1 would
    # silently select the *last* candle, which lies before `time`.
    if position == -1:
        raise KeyError(f"No candle available at or after {time}")
    return prices.iloc[position]

In [350]:
nyse_cal = mcal.get_calendar('NYSE')
def get_appropriate_closing_time(time: pd.Timestamp) -> pd.Timestamp:
    """Map a timestamp to the 16:00 timestamp used as its exit time.

    * If ``time`` is before 09:30 (judged by its own hour/minute fields), the
      result is 16:00 on the same calendar day.
    * Otherwise the result is 16:00 on the first NYSE valid day strictly after
      ``time``'s date, looking at most 7 calendar days ahead.

    The result carries ``time``'s timezone. The hour is always 16:00; sessions
    with a different closing hour are not considered.

    Raises
    ------
    ValueError
        If none of the following 7 calendar days is a valid NYSE day.
    """
    if (time.hour < 9) or ((time.hour == 9) and (time.minute < 30)):
        # NOTE(review): this branch does not check that `time`'s date is itself
        # a trading day (e.g. a Saturday morning) -- confirm this is intended.
        return pd.Timestamp(year=time.year, month=time.month, day=time.day, hour=16, minute=0, tz=time.tz, unit="us")

    valid_days = {
        x.date()
        for x in nyse_cal.valid_days(start_date=time.date(), end_date=time.date() + pd.DateOffset(days=10))
    }
    for days_ahead in range(1, 8):
        new_time = time + pd.DateOffset(days=days_ahead)
        if new_time.date() in valid_days:
            return pd.Timestamp(year=new_time.year, month=new_time.month, day=new_time.day, hour=16, minute=0, tz=time.tz, unit="us")
    # Raise instead of returning the exception object, which callers such as
    # Series.map would otherwise store as if it were a timestamp.
    raise ValueError(f"No NYSE trading day found within 7 days after {time}")

In [351]:
import pytz
def preprocess_iq_feed_prices(prices: pd.DataFrame) -> pd.DataFrame: 
    """Clean raw IQFeed candles and index them by US/Eastern time.

    Steps, in order:
      1. drop any timezone from ``time`` and re-localize the wall-clock values
         as US/Eastern,
      2. drop fully duplicated rows and rows containing NaN,
      3. where a timestamp still occurs more than once, keep the row with the
         largest ``volume``,
      4. set ``time`` as the index and sort ascending.

    Parameters
    ----------
    prices : pd.DataFrame
        Needs at least the columns ``time`` (datetime-like) and ``volume``.
        The frame is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with a unique, ascending, tz-aware ``time`` index.
    """
    # Work on a copy: the caller's frame must not be half-cleaned as a side effect.
    prices = prices.copy()
    # Replace the column as a whole so the tz-aware dtype is adopted directly.
    prices["time"] = prices["time"].dt.tz_localize(None).dt.tz_localize("US/Eastern")
    prices = prices.drop_duplicates(keep="first").dropna()

    # Deals with duplicate rows which occur when not all the digits for volume are
    # correctly entered, but only the first 1-3. So keep the largest.
    prices = prices.sort_values(["time", "volume"], ascending=[True, False])
    prices = prices.drop_duplicates(subset=["time"], keep="first")

    prices = prices.set_index("time").sort_index(ascending=True)
    assert prices.index.is_unique
    return prices

## Import and Preprocess Stock Prices

In [352]:
# Load the 1-minute candles of `ticker` and pass them straight through the IQFeed clean-up.
prices: pd.DataFrame = preprocess_iq_feed_prices(
    pd.read_parquet(
        f"D:/IQFeedData/{ticker}_1min.parquet",
        columns=["time", "close", "volume"],
    )
)
prices.head(3)

Unnamed: 0_level_0,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-04 08:03:00-05:00,9.75,7799.0
2010-01-04 08:06:00-05:00,9.75,67884.0
2010-01-04 08:13:00-05:00,9.84,300.0


In [353]:
# Same load + clean-up as for the stock, applied to the SPY 1-minute candles.
spy: pd.DataFrame = preprocess_iq_feed_prices(
    pd.read_parquet(
        f"D:/IQFeedData/SPY_1min.parquet",
        columns=["time", "close", "volume"],
    )
)
spy.head(3)

Unnamed: 0_level_0,close,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-04 04:08:00-05:00,112.25,1000.0
2010-01-04 04:09:00-05:00,112.25,1000.0
2010-01-04 04:16:00-05:00,112.22,19030.0


## Import and Preprocess News 

In [413]:
df = pd.read_parquet("data/unraw2_bzg/data-5.parquet")

In [414]:
# Convert the news timestamps to US/Eastern.
# The zone is spelled out here because no module-level `eastern` exists (that name
# is only a local inside preprocess_iq_feed_prices), so referring to it raises
# NameError on a fresh kernel.
# Assigning with df["time"] = ... replaces the whole column, so the converted
# tz-aware dtype is adopted without a placeholder assignment through .loc.
df["time"] = df["time"].dt.tz_convert("US/Eastern")

In [415]:
# Sanity check: round one news timestamp (row 5) up to the full minute and
# print it next to the exit time that get_appropriate_closing_time assigns to it.
ts = df.time.dt.ceil("min").iloc[5]
print(ts)
new_ts = get_appropriate_closing_time(ts)
print(new_ts)

2011-04-08 07:00:00-04:00
2011-04-08 16:00:00-04:00


In [428]:
# Entry time: the news timestamp rounded up to the next full minute.
# TODO: This can be *improved* by saying that if we are very close to completing the minute, e.g. :55,
# then we don't take the next candle (T+1), but the candle after the next (T+2).
df.loc[:, "entry_time"] = df.loc[:, "time"].dt.ceil("min")
# Exit time: first create the column as a copy of `time`, then overwrite it with the mapped closing times.
# The first assignment is necessary to get `us` units; otherwise pandas will always convert back to `ns` for some reason.
df.loc[:, "nn_exit_time"] = df.loc[:, "time"]
# Note that the raw `time` (not the rounded `entry_time`) is mapped to its closing time.
df.loc[:, "nn_exit_time"] = df.loc[:, "time"].map(get_appropriate_closing_time)

In [429]:
ticker_news = df[df.stocks == ticker]

In [430]:
news_event = ticker_news.iloc[-1]

In [431]:
df.columns

Index(['time', 'stocks', 'author', 'title', 'channels', 'body', 'entry_time',
       'nn_exit_time'],
      dtype='object')

In [432]:
prices.index.is_monotonic_decreasing

False

## Make Input-Output and Merge news and stock prices

In [433]:
# For every news row pick the first candle at or after its entry minute
# (direction="forward").
merged = pd.merge_asof(
    left=ticker_news,
    right=prices,
    left_on="entry_time",
    right_on="time",
    direction="forward",
)

In [434]:
# Exact-match join on the exit timestamp. The default how="inner" applies, so
# news rows without a candle at exactly that time are dropped. Overlapping
# price columns get the "_entry" / "_exit" suffixes.
merged = merged.merge(
    prices,
    left_on="nn_exit_time",
    right_on="time",
    suffixes=("_entry", "_exit"),
)

In [435]:
# Simple return of the stock between the entry candle and the exit candle.
merged["r"] = merged["close_exit"].div(merged["close_entry"]).sub(1)

In [436]:
# Ideally we do this for every stock and then we come back with the complete dataframe... (depends on if it fits in memory)

In [439]:
# Merge news and stock prices with SPY prices.
# Entry side: first SPY candle at or after the entry minute. There is no column
# name clash yet, so SPY's `close` / `volume` come in unsuffixed.
merged = pd.merge_asof(
    left=merged,
    right=spy,
    left_on="entry_time",
    right_on="time",
    direction="forward",
)
# Exit side: merge_asof's default direction is used here. The second set of
# `close` / `volume` columns now clashes with the first, so both get suffixed.
merged = pd.merge_asof(
    left=merged,
    right=spy,
    left_on="nn_exit_time",
    right_on="time",
    suffixes=("_spy_entry", "_spy_exit"),
)

In [441]:
# Simple return of SPY between its entry candle and its exit candle.
merged.loc[:, "r_spy"] = merged["close_spy_exit"].div(merged["close_spy_entry"]).sub(1)

In [444]:
# Market-adjusted return: the stock's return minus SPY's return over the same window.
# (A chained assignment `a = b = c` here would overwrite `r_spy` with `r` and make
# `r_mkt_adj` a plain copy of `r`.)
merged.loc[:, "r_mkt_adj"] = merged["r"] - merged["r_spy"]

In [446]:
merged.dtypes

time                datetime64[us, US/Eastern]
stocks                          string[python]
author                          string[python]
title                           string[python]
channels                        string[python]
body                            string[python]
entry_time          datetime64[us, US/Eastern]
nn_exit_time        datetime64[us, US/Eastern]
close_entry                            float64
volume_entry                           float64
close_exit                             float64
volume_exit                            float64
r                                      float64
close_spy_entry                        float64
volume_spy_entry                       float64
close_spy_exit                         float64
volume_spy_exit                        float64
r_spy                                  float64
r_mkt_adj                              float64
dtype: object

# Daily Data

## Import Stocks

In [None]:
stocks = pd.read_pickle("data/stocks.pkl")

In [None]:
stocks.head(3)

In [None]:
stock_tickers = set(stocks.index.get_level_values("ID").unique())
len(stock_tickers)
#test

## Import Stories

In [19]:
#stories = pd.read_pickle("data/stories.pkl")
stories = pd.read_parquet("data/raw_bzg/story_df_raw_2019.parquet")

In [None]:
stories_tickers = set(stories.stocks.unique())

## Parse stocks

In [None]:
# The ticker column of the stories must have the same dtype as the second index
# level of `stocks`, otherwise the two cannot be matched later on.
# `.iloc[1]` selects by position explicitly; `index.dtypes` is labelled by level
# name, and plain `[1]` on such a Series is deprecated positional access.
assert stories.stocks.dtype == stocks.index.dtypes.iloc[1]

In [None]:
def add_targets(df):
    """Return a copy of ``df`` with the ``CloseToCloseReturn`` target added.

    ``CloseToCloseReturn`` is ``Close / previous row's Close - 1``; the first
    row therefore has no value (NaN). Rows are assumed to be in chronological
    order for a single instrument -- the function itself neither sorts nor
    groups.

    Parameters
    ----------
    df : pd.DataFrame
        Needs a ``Close`` column. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        New frame with the additional (or replaced) ``CloseToCloseReturn`` column.
    """
    # Other targets, currently disabled:
    #   IntradayReturn  = Close / Open - 1
    #   NextDayReturn   = next Close / next Open - 1
    #   CloseToNextOpen = next Open / Close - 1
    # assign() builds a new frame, so slices passed in (e.g. the result of
    # .query) are not written to.
    return df.assign(CloseToCloseReturn=df["Close"] / df["Close"].shift(1) - 1)

In [None]:
stocks.index.dtypes

In [None]:
# Create the two target columns as all-NaN placeholders.
stocks.loc[:, ["IntradayReturn", "NextDayReturn"]] = np.nan
# Swap the two index levels and sort ascending on both of them.
stocks = stocks.swaplevel(0, 1).sort_index(ascending=[True, True])

In [None]:
# Compute the targets separately per ID, so the row shift inside add_targets
# never crosses from one ID into another.
stocks = stocks.groupby("ID", as_index=False).apply(add_targets)
# Drop the unnamed index level (presumably the one groupby/apply prepends -- verify
# against the pandas version in use).
stocks.index = stocks.index.droplevel(None)

## Parse Stories

In [None]:
from pandas.tseries.offsets import BDay

In [None]:
# PARAMETER 
typ = "CloseToCloseReturn"

In [None]:
def get_appropriate_date(timestamp, typ):
    """Return the business date whose return a news item is attributed to.

    For ``typ == "CloseToCloseReturn"``:

    * news with ``hour < 16`` belongs to its own date, rolled forward to the
      next business day if that date falls on a weekend;
    * news with ``hour >= 16`` belongs to the next business day.

    Business days are Monday to Friday (``BDay``); exchange holidays are not
    taken into account.

    Parameters
    ----------
    timestamp : pd.Timestamp
        Publication time of the news item.
    typ : str
        Target return type. Only ``"CloseToCloseReturn"`` is supported.

    Returns
    -------
    pd.Timestamp
        Midnight, timezone-naive timestamp of the assigned date (``pd.NaT`` for
        a missing ``timestamp``).

    Raises
    ------
    ValueError
        If ``typ`` is not supported.
    """
    if typ != "CloseToCloseReturn":
        # Fail loudly: silently returning None would turn every date into NaT.
        raise ValueError(f"Unsupported return type: {typ!r}")
    if pd.isna(timestamp):
        return pd.NaT

    day = pd.Timestamp(timestamp.date())
    # TODO: Some noise here due to closing auction?
    if timestamp.hour >= 16:
        return day + BDay(1)
    # A pre-close weekend date has no trading session; move it to the next business day.
    return BDay().rollforward(day)

In [None]:
# test
get_appropriate_date(stories.NewsTimestamp.iloc[4], typ="CloseToCloseReturn")

### Date assignment

In [None]:
# Date assignment rule, depending on the target return:
# - intraday return: news should only be between 9:40 am and 4 pm (US trading hours);
# - close-to-close return: news for a day's CTC should be between the previous day's 4 pm and that day's 4 pm.
stories.loc[:, "Date"] = stories["NewsTimestamp"].apply(get_appropriate_date, typ=typ)
stories = stories.astype({"Date": "datetime64[ns]"})

## Merging

In [None]:
# Call the ticker column "ID", the name it is referred to by from here on.
stories.rename(columns={"stocks": "ID"}, inplace=True)

In [None]:
stories.columns

In [None]:
stocks.columns

In [None]:
# Pair every story with the close-to-close return of its ID on the assigned date.
# Inner join: stories without a matching (Date, ID) row in `stocks` are dropped.
# "IntradayReturn" and "NextDayReturn" are currently not selected as targets.
dataset = pd.merge(
    stories[["Date", "NewsTimestamp", "ID", "body"]],
    stocks[["CloseToCloseReturn"]],
    on=["Date", "ID"],
    how="inner",
)

In [None]:
dataset[dataset.isna().sum(axis=1) > 0]

In [None]:
dataset = dataset.dropna()

	Date	NewsTimestamp	ID	body	CloseToCloseReturn

In [None]:
dataset.to_pickle("data/dataset.pkl")

## Create train-test-split 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

In [None]:
dataset = pd.read_pickle("data/dataset.pkl")
assert dataset.index.is_unique

In [None]:
test_size = 0.2
seed = 420
### Train-test split -> TODO: move out of the notebook
# Pass the configured `test_size` instead of repeating the literal, so changing
# the setting above actually changes the split.
train_idx, test_idx = train_test_split(dataset.index, test_size=test_size, random_state=seed)

In [None]:
test_idx

## Filter training set for general stock market events

In [None]:
# Select S&P
SnP = stocks.query("ID == 'A0AET0'")

In [None]:
SnP = add_targets(SnP)

In [None]:
alpha = 0.1 # Percentage observations classified as too extreme to be used in the training set  

In [None]:
target_col = "CloseToCloseReturn"

In [None]:
lower_quantile = SnP.loc[:, target_col].quantile(alpha/2)

In [None]:
upper_quantile = SnP.loc[:, target_col].quantile(1-alpha/2)

In [None]:
print(f"Upper Quantile: {upper_quantile:.4f}. Lower Quantile: {lower_quantile:.4f}")

In [None]:
# True where the S&P return lies inside [lower_quantile, upper_quantile], bounds included.
mask = SnP[target_col].between(lower_quantile, upper_quantile)

# Only select dates where SnP behaved calmly
allowed_dates = SnP.loc[mask].index.get_level_values("Date")

In [None]:
# Now trim the training set: keep only training rows dated on one of the allowed (calm) days.
train_dat = dataset.loc[train_idx]
adj_train_idx = train_dat[train_dat["Date"].isin(allowed_dates)].index

In [None]:
train_idx

In [None]:
adj_train_idx

## Save train and test indices

In [None]:
# Persist the (trimmed train index, test index) pair as one pickled tuple.
with open('data/dataset_train_test_idx.pkl', mode='wb') as idx_file:
    pickle.dump((adj_train_idx, test_idx), idx_file)