In [11]:
import pandas as pd 
import numpy as np
import json
import plotly.express as px

# Import Stocks

In [19]:
# Load the stock price table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
stocks = pd.read_pickle("data/stocks.pkl")

In [23]:
# Preview the first three rows to check the index levels and price columns.
stocks.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,High,Low,Open
Date,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1976-01-05,ARNC,5.779,5.7965,5.4802,5.4802
1976-01-05,GT,11.279,11.279,11.0298,11.0298
1976-01-06,ARNC,5.9369,5.9545,5.7965,5.7965


In [25]:
# Collect the distinct values of the "ID" index level and report how many there are.
id_level = stocks.index.get_level_values("ID")
stock_tickers = set(id_level.unique())
len(stock_tickers)

1956

# Import Stories

In [26]:
# Load the news stories table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
stories = pd.read_pickle("data/stories.pkl")

In [28]:
# Distinct values of the `stocks` column, i.e. every ticker that has at least one story.
ticker_column = stories["stocks"]
stories_tickers = set(ticker_column.unique())

In [27]:
# Preview the first rows of the stories table.
stories.head()

Unnamed: 0,stocks,author,title,channels,body,html_body,NewsTimestamp
33019447,STNG,Globe Newswire,Scorpio Tankers Inc. Announces Commitments for...,"[""News"", ""Financing"", ""Press Releases""]",the company \n\ncommitments from a group of fi...,"<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-27 06:45:01-04:00
33013700,ARR,Globe Newswire,"ARMOUR Residential REIT, Inc. Announces Guidan...","[""News"", ""Dividends"", ""Press Releases""]",ARMOUR Residential\n\nthe July 2023 cash divid...,"<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-26 16:15:00-04:00
33002576,TNXP,Globe Newswire,Tonix Pharmaceuticals Enters into Agreement to...,"[""M&A"", ""News"", ""Financing"", ""Press Releases""]",Zembrace® SymTouch® (sumatriptan injection) an...,"<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-26 07:00:00-04:00
32982803,ANDE,PRNewswire,"The Andersons, Inc. Declares Cash Dividend for...","[""Dividends"", ""Press Releases""]","MAUMEE, Ohio, todayCNW/ -- the company \nannou...","<p xmlns=""http://www.w3.org/1999/xhtml""><span ...",2023-06-23 08:30:00-04:00
32982798,ANDE,PRNewswire,"The Andersons, Inc. Declares Cash Dividend for...","[""Dividends"", ""Press Releases""]",the company \nannounces a third quarter 2023 c...,"<p xmlns=""http://www.w3.org/1999/xhtml""><span ...",2023-06-23 08:30:00-04:00


## Analysis: Channel occurrence

In [31]:
# Collect every distinct channel label used across all stories.
# Each `channels` cell holds a JSON-encoded list, so decode it and fold its
# labels into one set. Iterating the column directly avoids a label-based row
# lookup per story, which is slow and fails when an index label occurs twice
# (`.loc` then returns a frame instead of a single row).
channels = set()
for raw_channels in stories["channels"]:
    channels.update(json.loads(raw_channels))

In [33]:
# Count how often each channel label occurs across all stories.
# Decode every story's JSON channel list once and flatten the labels, so the
# counting is a single value_counts() call instead of one row lookup, one
# value_counts() and one .loc write per story.
channel_labels = [
    label
    for raw_channels in stories["channels"]
    for label in json.loads(raw_channels)
]
# dtype=object avoids dtype inference on an empty label list.
channel_counts = pd.Series(channel_labels, dtype=object).value_counts()

# One row per known channel, starting at zero; channels that never occur stay at 0.
df = pd.DataFrame(data=0, index=list(channels), columns=["count"])
if not channel_counts.empty:
    df.loc[channel_counts.index, "count"] += channel_counts.values

In [35]:
# Order the channels from most to least frequent, then show the top seven.
df = df.sort_values(by="count", ascending=False)
df.head(n=7)

Unnamed: 0,count
Press Releases,73
News,40
Dividends,37
Financing,18
Earnings,17
Commodities,6
Real Estate,6


## Parse stocks

In [None]:
# Sanity check before merging: the ticker column of `stories` and the "ID"
# index level of `stocks` are expected to share one dtype.
# The level dtype is looked up by name; a positional `[1]` on the
# label-indexed dtypes Series is deprecated and depends on the level order.
# NOTE(review): this cell has no execution count, and the "ID" level is
# reported as `category` below - confirm `stories.stocks` is categorical too,
# otherwise this assertion fails on a full run.
assert stories.stocks.dtype == stocks.index.dtypes["ID"], (
    "stories.stocks and the 'ID' index level of stocks have different dtypes"
)

In [52]:
def add_targets(df):
    """Return a copy of ``df`` with three return columns added.

    The "next" row is taken with ``shift(-1)``, so the rows are assumed to be
    in chronological order and to belong to a single instrument (the caller
    is expected to group and sort beforehand - TODO confirm for new callers).

    Parameters
    ----------
    df : pandas.DataFrame
        Price rows; must contain ``Open`` and ``Close`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with these columns added, or overwritten if present:

        * ``IntradayReturn``  - ``Close / Open - 1`` of the same row.
        * ``NextDayReturn``   - ``Close / Open - 1`` of the following row.
        * ``CloseToNextOpen`` - following row's ``Open`` over this row's
          ``Close``, minus 1.

        The last row has no successor, so its two forward-looking values
        are NaN.

    Raises
    ------
    KeyError
        If ``Open`` or ``Close`` is missing from ``df``.
    """
    required_columns = ("Close", "Open")
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"add_targets requires the columns {missing}")

    close, open_ = df["Close"], df["Open"]
    # Shift only the two columns that are needed instead of the whole frame twice.
    next_close, next_open = close.shift(-1), open_.shift(-1)

    # assign() builds a new frame, so the input is left untouched; functions
    # handed to groupby().apply() must not mutate the group they receive.
    return df.assign(
        IntradayReturn=close / open_ - 1,
        NextDayReturn=next_close / next_open - 1,
        CloseToNextOpen=next_open / close - 1,
    )

In [44]:
# Show the dtype of each index level of `stocks`.
stocks.index.dtypes

Date    datetime64[ns]
ID            category
dtype: object

In [48]:
# Create the two return columns as NaN placeholders (presumably so they exist
# before add_targets fills them per ID - confirm they are still needed).
stocks.loc[:, ["IntradayReturn", "NextDayReturn"]] = np.nan
# Put the index in (ID, Date) order and sort it, so each ID's rows are
# contiguous and chronological. The levels are addressed by name because
# swaplevel(0, 1) toggles the order: re-running the cell flipped it back to
# (Date, ID). reorder_levels gives the same result on every run.
stocks = stocks.reorder_levels(["ID", "Date"]).sort_index(ascending=[True, True])

In [53]:
# Run add_targets separately on each "ID" group so that the shift-based
# targets are computed within one ID and never reach into another ID's rows.
stocks = stocks.groupby("ID", as_index=False).apply(add_targets)
# Drop the index level whose name is None - presumably the extra level that
# apply() prepends to the result; verify the remaining levels are (ID, Date).
stocks.index = stocks.index.droplevel(None)

## Parse Stories

In [87]:
# Derive a calendar-date column from the story timestamp and store it as
# datetime64[ns].
news_dates = stories["NewsTimestamp"].dt.date
stories.loc[:, "Date"] = news_dates
stories = stories.astype({"Date": "datetime64[ns]"})

## Merging

In [64]:
# Rename the ticker column to "ID" so it carries the same name as the "ID"
# index level of `stocks`, which is the key used for merging.
# Cells that read `stories.stocks` have to run before this one.
stories.rename(columns={"stocks": "ID"}, inplace=True)

In [66]:
# List the story columns to confirm the rename and the new Date column.
stories.columns

Index(['ID', 'author', 'title', 'channels', 'body', 'html_body',
       'NewsTimestamp', 'Date'],
      dtype='object')

In [65]:
# List the stock columns to confirm the return targets were added.
stocks.columns

Index(['Close', 'High', 'Low', 'Open', 'IntradayReturn', 'NextDayReturn',
       'CloseToNextOpen'],
      dtype='object')

In [91]:
# Attach the return targets to each story by matching on (Date, ID).
# The inner join keeps only stories that have a price row for that date and ID.
# NOTE(review): a story is matched to the price row of its calendar date
# regardless of the time of day - confirm that same-day targets are valid for
# stories published late in the day.
story_columns = ["Date", "NewsTimestamp", "ID", "body"]
target_columns = ["IntradayReturn", "NextDayReturn", "CloseToNextOpen"]
dataset = pd.merge(
    stories[story_columns],
    stocks[target_columns],
    on=["Date", "ID"],
    how="inner",
)

In [24]:
# Persist the merged dataset so the train-test split section can reload it.
dataset.to_pickle("data/dataset.pkl")

# Create train-test-split 

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

In [28]:
# Reload the merged dataset from disk.
dataset = pd.read_pickle("data/dataset.pkl")
# The split is stored as index labels, so every label must identify exactly one row.
assert not dataset.index.has_duplicates

In [45]:
test_size = 0.2
seed = 420
### Train-test split -> TODO: move this out of the notebook
# Split the index labels of the dataset (not the rows themselves) and store
# both label sets on disk.
# `test_size` is passed through as a variable; previously the literal 0.2 was
# hard-coded in the call, so changing the variable above had no effect.
train_idx, test_idx = train_test_split(dataset.index, test_size=test_size, random_state=seed)
with open('data/dataset_train_test_idx.pkl', 'wb') as f:
    pickle.dump((train_idx, test_idx), f)

In [48]:
# Show the index labels assigned to the test set.
test_idx

Index([364,  82, 674, 339, 590, 311, 622, 254,  85, 553,
       ...
       707, 764,  31, 685, 515, 283, 668,  16, 211, 660],
      dtype='int64', length=163)