In [2]:
import pandas as pd 
import numpy as np
import json
# import plotly.express as px

## Import IQFeed Stocks

In [10]:
df = pd.read_parquet("D:/IQFeedData/FITB_1min.parquet")

In [14]:
df.head()

Unnamed: 0,time,open,high,low,close,volume
0,2010-12-14 09:13:00+00:00,14.49,14.49,14.49,14.49,7800.0
1,2010-12-14 09:30:00+00:00,14.51,14.51,14.51,14.51,276.0
2,2010-12-14 09:31:00+00:00,14.48,14.5,14.46,14.48,62801.0
3,2010-12-14 09:32:00+00:00,14.475,14.54,14.475,14.53,56274.0
4,2010-12-14 09:33:00+00:00,14.53,14.55,14.52,14.55,24412.0


# Import Stocks

In [None]:
stocks = pd.read_pickle("data/stocks.pkl")

In [None]:
stocks.head(3)

In [None]:
stock_tickers = set(stocks.index.get_level_values("ID").unique())
len(stock_tickers)
#test

# Import Stories

In [None]:
#stories = pd.read_pickle("data/stories.pkl")
stories = pd.read_pickle("data/story_df_raw_2022.pkl")

In [None]:
stories_tickers = set(stories.stocks.unique())

In [None]:
stories.time

## Analysis: Channel occurrence

In [None]:
channels = set()
for i in stories.index:
    s = pd.Series(json.loads(stories.loc[i].channels)).value_counts()
    channels = channels.union(set(s.index))

In [None]:
df = pd.DataFrame(data=0, index=list(channels), columns=["count"])
for i in stories.index:
    s = pd.Series(json.loads(stories.loc[i].channels)).value_counts()
    df.loc[s.index, "count"] += s.values

In [None]:
df = df.sort_values("count", ascending=False)
df.head(7)

## Parse stocks

In [None]:
# The story tickers must have the same dtype as the second index level of
# `stocks` (presumably the "ID" level that is used as a merge key later on).
# `.iloc[1]` makes the positional lookup explicit: `dtypes` is a Series indexed
# by level name, and plain `[1]` on it is deprecated integer-as-position access.
assert stories.stocks.dtype == stocks.index.dtypes.iloc[1]

In [None]:
def add_targets(df):
    """Add the close-to-close return column to a price frame.

    The return of a row is ``Close / Close of the previous row - 1``; the first
    row therefore has no return (NaN).  The caller is responsible for passing
    rows of a single group in the intended order, because the previous row is
    taken purely by position.

    Parameters
    ----------
    df : pandas.DataFrame
        Price frame with at least a ``"Close"`` column.

    Returns
    -------
    pandas.DataFrame
        The very object that was passed in; ``"CloseToCloseReturn"`` is added
        (or overwritten) in place.

    Raises
    ------
    KeyError
        If ``df`` has no ``"Close"`` column.
    """
    if "Close" not in df.columns:
        raise KeyError("add_targets requires a 'Close' column")
    close = df["Close"]
    # df.loc[:, "IntradayReturn"] = df["Close"]/df["Open"] - 1
    # Shift only the column that is needed instead of the whole frame.
    df.loc[:, "CloseToCloseReturn"] = close / close.shift(1) - 1
    # df.loc[:, "NextDayReturn"] = df.shift(-1)["Close"] / df.shift(-1)["Open"] - 1
    # df.loc[:, "CloseToNextOpen"] = df.shift(-1)["Open"] / df["Close"] - 1
    return df

In [None]:
stocks.index.dtypes

In [None]:
stocks.loc[:, ["IntradayReturn", "NextDayReturn"]] = np.nan
stocks = stocks.swaplevel(0, 1).sort_index(ascending=[True, True])

In [None]:
# Run `add_targets` separately on every "ID" group, so the row shift inside it
# never reaches across two different IDs.
stocks = stocks.groupby("ID", as_index=False).apply(add_targets)
# Drop the index level whose name is None.
# NOTE(review): presumably this is the extra unnamed level that the grouped
# apply above prepends - confirm for the installed pandas version.
stocks.index = stocks.index.droplevel(None)

## Parse Stories

In [None]:
from pandas.tseries.offsets import BDay

In [None]:
# PARAMETER: name of the return definition used as target.
# It is passed to `get_appropriate_date` when stories are assigned to dates.
typ = "CloseToCloseReturn"

In [None]:
def get_appropriate_date(timestamp, typ):
    """Map a news timestamp to the business day whose return it is assigned to.

    For ``typ == "CloseToCloseReturn"`` a story published before hour 16 is
    assigned to its own calendar day, a story published at or after hour 16 to
    the following business day.  A resulting day that falls on a weekend is
    rolled forward to the next business day, so weekend news published before
    hour 16 is treated like weekend news published after it instead of being
    left on a non-trading date.

    Parameters
    ----------
    timestamp : datetime-like
        Publication time; only ``.hour`` and ``.date()`` are used.
        NOTE(review): hour 16 is compared in whatever timezone the timestamp
        carries - confirm that this matches the exchange close.
    typ : str
        Target definition; only ``"CloseToCloseReturn"`` is supported.

    Returns
    -------
    pandas.Timestamp
        Midnight (timezone-naive) of the assigned business day, or ``pandas.NaT``
        when ``timestamp`` is missing.  Both branches now return the same type;
        previously the early branch returned a ``datetime.date``.

    Raises
    ------
    ValueError
        If ``typ`` is not a supported target definition (previously ``None``
        was returned silently).
    """
    if typ != "CloseToCloseReturn":
        raise ValueError(f"Unsupported target type: {typ!r}")
    if pd.isna(timestamp):
        # A missing publication time cannot be assigned to any day.
        return pd.NaT
    # TODO: Some noise here due to closing auction?
    day = pd.Timestamp(timestamp.date())
    if timestamp.hour >= 16:
        day = day + BDay(1)
    # Default Monday-Friday calendar; exchange holidays are not handled here.
    return BDay().rollforward(day)

In [None]:
# Smoke test: date assigned to the story at row position 4.
get_appropriate_date(stories.NewsTimestamp.iloc[4], typ="CloseToCloseReturn")

### Date assignment

In [None]:
# Which news belongs to a target date:
#   * intraday return: only news between 9:40 am and 4 pm (US trading hours);
#   * close-to-close return: news between yesterday 4 pm and today 4 pm.
assigned_dates = stories["NewsTimestamp"].map(
    lambda news_time: get_appropriate_date(news_time, typ)
)
stories.loc[:, "Date"] = assigned_dates
stories = stories.astype({"Date": "datetime64[ns]"})

## Merging

In [None]:
stories.rename(columns=dict(stocks="ID"), inplace=True)

In [None]:
stories.columns

In [None]:
stocks.columns

In [None]:
dataset = stories[["Date", "NewsTimestamp", "ID", "body"]].\
    merge(stocks[
        [
            # "IntradayReturn", 
            # "NextDayReturn", 
            "CloseToCloseReturn"]
         ], on=["Date", "ID"], how="inner")

In [None]:
dataset[dataset.isna().sum(axis=1) > 0]

In [None]:
dataset = dataset.dropna()

In [None]:
dataset.to_pickle("data/dataset.pkl")

# Create train-test-split 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

In [None]:
dataset = pd.read_pickle("data/dataset.pkl")
assert dataset.index.is_unique

In [None]:
test_size = 0.2
seed = 420
### Train-test split -> Auslagern
train_idx, test_idx = train_test_split(dataset.index, test_size=0.2, random_state=seed)

In [None]:
test_idx

## Filter training set for general stock market events

In [None]:
# Select S&P
SnP = stocks.query("ID == 'A0AET0'")

In [None]:
SnP = add_targets(SnP)

In [None]:
alpha = 0.1 # Percentage observations classified as too extreme to be used in the training set  

In [None]:
target_col = "CloseToCloseReturn"

In [None]:
lower_quantile = SnP.loc[:, target_col].quantile(alpha/2)

In [None]:
upper_quantile = SnP.loc[:, target_col].quantile(1-alpha/2)

In [None]:
print(f"Upper Quantile: {upper_quantile:.4f}. Lower Quantile: {lower_quantile:.4f}")

In [None]:
mask = (SnP.loc[:, target_col] >= lower_quantile) & (SnP.loc[:, target_col] <= upper_quantile)

# Only select dates where SnP behaved calmly
allowed_dates = SnP.loc[mask, :].index.get_level_values("Date")

In [None]:
# Now trim  training set
train_dat = dataset.loc[train_idx, :]
adj_train_idx = train_dat.loc[train_dat.Date.isin(allowed_dates)].index

In [None]:
train_idx

In [None]:
adj_train_idx

## Save train and test indices

In [None]:
# Persist the trimmed training labels together with the test labels as one tuple.
split_indices = (adj_train_idx, test_idx)
with open('data/dataset_train_test_idx.pkl', 'wb') as out_file:
    pickle.dump(split_indices, out_file)