In [51]:
import pandas as pd 
import numpy as np
import json
import plotly.express as px

# Import Stocks

In [52]:
# Load the price table from a local pickle file. Unpickling can execute
# arbitrary code, so this file must come from a trusted source.
stocks = pd.read_pickle("data/stocks.pkl")

In [53]:
# Preview the first three rows of the price table.
stocks.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,High,Low,Open
Date,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1976-01-05,ARNC,5.779,5.7965,5.4802,5.4802
1976-01-05,GT,11.279,11.279,11.0298,11.0298
1976-01-06,ARNC,5.9369,5.9545,5.7965,5.7965


In [54]:
# Gather the distinct values of the "ID" index level into a set and report
# how many there are.
stock_tickers = {*stocks.index.get_level_values("ID").unique()}
len(stock_tickers)

1959

# Import Stories

In [55]:
# Load the news stories from a local pickle file. Unpickling can execute
# arbitrary code, so this file must come from a trusted source.
stories = pd.read_pickle("data/stories.pkl")

In [56]:
# Distinct values of the `stocks` column of the stories table, as a set.
stories_tickers = {*stories["stocks"].unique()}

In [57]:
# Preview the first rows of the stories table.
stories.head()

Unnamed: 0,stocks,author,title,channels,body,html_body,NewsTimestamp
33058275,BSET,Globe Newswire,Bassett Announces Fiscal Second Quarter Results,"[""Earnings"", ""Press Releases""]","Bassett Furniture Industries,\nInc. announced...","<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-29 09:00:00-04:00
33055733,SMPL,Globe Newswire,The Simply Good Foods Company Reports Third Qu...,"[""Earnings"", ""Restaurants"", ""Press Releases"", ...",the company\n\nand seller of branded nutrition...,"<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-29 07:00:00-04:00
33054892,GBX,PRNewswire,Greenbrier Reports Third Quarter Results,"[""Earnings"", ""Press Releases""]",GAAP EPS of $0.64 includes $13 million loss re...,"<p xmlns=""http://www.w3.org/1999/xhtml"" class=...",2023-06-29 06:00:00-04:00
33054856,EGBN,Globe Newswire,"Eagle Bancorp, Inc. Announces Cash Dividend","[""News"", ""Dividends"", ""Press Releases""]",the company (the\n\ncash dividend for the seco...,"<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-29 06:00:00-04:00
33050541,AOUT,PRNewswire,"American Outdoor Brands, Inc. Reports Fourth Q...","[""Earnings"", ""Press Releases""]",_• FY23_ _Net Sales $191.2 Million _\n\n_•_ ...,"<p xmlns=""http://www.w3.org/1999/xhtml"" class=...",2023-06-28 16:29:00-04:00


## Analysis: Channel occurrence

In [58]:
# Collect the distinct channel labels across all stories. Each `channels`
# cell holds a JSON-encoded list, so decode the whole column once and flatten
# it, rather than fetching rows one at a time with `.loc` (slow, and it fails
# when the index contains duplicate labels). Stories with an empty list
# flatten to NaN, which is dropped.
channels = set(stories["channels"].map(json.loads).explode().dropna())

In [59]:
# Count how often each channel label appears over all stories. The JSON lists
# are decoded and flattened in one pass and tallied with `value_counts`,
# instead of updating a counter frame row by row through `.loc`.
# The result is a one-column frame ("count") indexed by channel label, with
# an unnamed index.
df = (
    stories["channels"]
    .map(json.loads)
    .explode()
    .value_counts()
    .rename("count")
    .rename_axis(None)
    .to_frame()
)

In [60]:
# Order the channels from most to least frequent and show the top seven.
df = df.sort_values(by=["count"], ascending=[False])
df.iloc[:7]

Unnamed: 0,count
Press Releases,2026
Earnings,1588
Dividends,348
News,332
Financing,100
Real Estate,86
Commodities,85


## Parse stocks

In [61]:
# The ticker column of the stories and the "ID" level of the price index must
# share one dtype. Look the level up by name: `dtypes[1]` relies on positional
# access to a label-indexed Series (deprecated in pandas) and would point at
# a different level once the index levels are reordered.
assert stories.stocks.dtype == stocks.index.dtypes["ID"], (
    "ticker dtype differs between stories.stocks and the stocks 'ID' index level"
)

In [62]:
def add_targets(df):
    """Add return targets computed from the Open/Close columns.

    Assumes the rows belong to a single instrument and are in chronological
    order, because the "next day" values are read from the following row.

    Three columns are written into ``df`` (the frame is modified in place):

    * ``IntradayReturn``  -- Close / Open - 1 of the same row.
    * ``NextDayReturn``   -- Close / Open - 1 of the following row.
    * ``CloseToNextOpen`` -- Open of the following row / Close of this row - 1.

    The last row has no following row, so its two forward-looking targets
    are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Close`` and ``Open``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the target columns set.

    Raises
    ------
    KeyError
        If ``Close`` or ``Open`` is missing.
    """
    # Only the columns that are actually read are required; fail early with
    # a message that names what is missing.
    required_columns = ("Close", "Open")
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"add_targets: missing required column(s) {missing}")

    # Shift once: row i of `next_bar` holds the Open/Close of row i + 1.
    next_bar = df[["Open", "Close"]].shift(-1)

    df.loc[:, "IntradayReturn"] = df["Close"] / df["Open"] - 1
    df.loc[:, "NextDayReturn"] = next_bar["Close"] / next_bar["Open"] - 1
    df.loc[:, "CloseToNextOpen"] = next_bar["Open"] / df["Close"] - 1
    return df

In [63]:
# Show the dtype of each level of the price table's index.
stocks.index.dtypes

Date    datetime64[ns]
ID            category
dtype: object

In [64]:
# Create the two return columns as NaN placeholders.
stocks.loc[:, ["IntradayReturn", "NextDayReturn"]] = np.nan
# Put the ticker level first and sort, so each ticker's rows are contiguous
# and in date order. The levels are addressed by name rather than by swapping
# positions 0 and 1, which keeps this cell idempotent: running it a second
# time does not flip the index back to (Date, ID).
stocks = stocks.reorder_levels(["ID", "Date"]).sort_index(ascending=[True, True])

In [65]:
# Run `add_targets` separately on the rows of each "ID" group.
stocks = stocks.groupby("ID", as_index=False).apply(add_targets)
# Drop the index level whose name is None.
# NOTE(review): presumably this is an extra unnamed level that `apply` adds in
# front of the original index when `as_index=False` — confirm on the pandas
# version in use.
stocks.index = stocks.index.droplevel(None)

## Parse Stories

In [66]:
# Derive the calendar day of each story, in the timestamp's own time zone, as
# a tz-naive midnight datetime64[ns] value. Dropping the time zone and
# normalizing stays vectorized and avoids the round trip through Python
# `date` objects that `.dt.date` followed by `astype` performs.
# NOTE(review): a story published late in the day is still assigned to that
# same day — confirm this is the intended alignment with the daily targets.
stories = stories.assign(
    Date=(
        stories["NewsTimestamp"]
        .dt.tz_localize(None)
        .dt.normalize()
        .astype("datetime64[ns]")
    )
)

## Merging

In [67]:
# Rename the ticker column to "ID". Rebinding the result instead of using
# `inplace=True` avoids mutating the frame behind earlier cells' backs and
# keeps the statement chainable.
stories = stories.rename(columns={"stocks": "ID"})

In [68]:
# List the column names of the stories table.
stories.columns

Index(['ID', 'author', 'title', 'channels', 'body', 'html_body',
       'NewsTimestamp', 'Date'],
      dtype='object')

In [69]:
# List the column names of the price table.
stocks.columns

Index(['Close', 'High', 'Low', 'Open', 'IntradayReturn', 'NextDayReturn',
       'CloseToNextOpen'],
      dtype='object')

In [70]:
# Attach the three return targets to each story by matching on "Date" and
# "ID"; the inner join keeps only stories that have a matching price row.
dataset = pd.merge(
    stories[["Date", "NewsTimestamp", "ID", "body"]],
    stocks[["IntradayReturn", "NextDayReturn", "CloseToNextOpen"]],
    how="inner",
    on=["Date", "ID"],
)

In [84]:
# Show every row that has at least one missing value.
dataset[dataset.isna().any(axis=1)]

Unnamed: 0,Date,NewsTimestamp,ID,body,IntradayReturn,NextDayReturn,CloseToNextOpen


In [83]:
# Discard every row that contains a missing value in any column.
dataset = dataset.dropna(axis=0, how="any")

In [85]:
# Persist the merged dataset as a pickle file under data/.
dataset.to_pickle("data/dataset.pkl")

# Create train-test-split 

In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

In [87]:
# Reload the merged dataset and verify that no index label occurs twice.
dataset = pd.read_pickle("data/dataset.pkl")
assert not dataset.index.has_duplicates

In [88]:
test_size = 0.2
seed = 420
# Train-test split. TODO: move this out of the notebook (e.g. into a module).
# Pass the `test_size` variable through so the setting above actually
# controls the split instead of a second, hard-coded 0.2.
train_idx, test_idx = train_test_split(dataset.index, test_size=test_size, random_state=seed)
# Store the index labels of both partitions together as one pickled tuple.
with open('data/dataset_train_test_idx.pkl', 'wb') as f:
    pickle.dump((train_idx, test_idx), f)

In [89]:
# Display the index labels assigned to the test partition.
test_idx

Index([1279, 1260,  884,  732,  235, 1119,  456,  687,  695, 1943,
       ...
       1871, 1379,  562,  645, 1209,    3, 1733,  944, 1044,  396],
      dtype='int64', length=401)