In [None]:
import pandas as pd 
import numpy as np
import json
import plotly.express as px

# Import Stocks

In [None]:
# Load the stocks table from a local pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
stocks = pd.read_pickle("data/stocks.pkl")

In [None]:
# Preview the first three rows of the stocks table.
stocks.head(3)

In [None]:
# Distinct values of the "ID" index level of `stocks`, followed by how many there are.
stock_tickers = set(stocks.index.unique(level="ID"))
len(stock_tickers)

# Import Stories

In [None]:
# Load the stories table from a local pickle file.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
stories = pd.read_pickle("data/stories.pkl")

In [None]:
# Distinct values found in the "stocks" column of the stories table.
stories_tickers = set(stories["stocks"].unique())

In [None]:
# Preview the first rows of the stories table.
stories.head()

## Analysis: Channel occurrence

In [None]:
# Collect every distinct channel label that appears in any story.
# Assumes each "channels" cell holds a JSON-encoded array of labels — TODO confirm.
# The column is iterated directly: a per-row `stories.loc[i]` lookup builds a full
# row Series on every iteration and fails if the index contains duplicate labels.
channels = set()
for raw_channels in stories["channels"]:
    # dropna() mirrors value_counts(), which ignores missing entries by default.
    channels.update(pd.Series(json.loads(raw_channels)).dropna().unique())

In [None]:
# Count, per channel label, how often it occurs across all stories.
# Start every known channel at zero, then add each story's own label counts.
df = pd.DataFrame(data=0, index=list(channels), columns=["count"])
# Iterate the column directly instead of looking each row up with `.loc[i]`:
# that lookup is slow and breaks when the index contains duplicate labels.
for raw_channels in stories["channels"]:
    per_story = pd.Series(json.loads(raw_channels)).value_counts()
    # value_counts() yields unique labels, so each target row is updated once.
    df.loc[per_story.index, "count"] += per_story.values

In [None]:
# Order channels from most to least frequent and show the seven most common.
df = df.sort_values(by="count", ascending=False)
df.head(n=7)

## Parse stocks

In [None]:
# Sanity check before merging: the story-side ticker column and the "ID" index level
# of `stocks` must share a dtype. The level is looked up by name rather than by
# position, so the check does not depend on the current level order and avoids
# positional indexing on a label-indexed Series.
assert stories["stocks"].dtype == stocks.index.dtypes["ID"]

In [None]:
def add_targets(df):
    """Add return-based target columns to one instrument's price rows.

    "Next day" means the next row, so the rows are expected to already be in
    chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Open`` and ``Close``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the argument is left untouched) with three columns added,
        or overwritten if already present:

        * ``IntradayReturn``  – ``Close / Open - 1`` of the same row.
        * ``NextDayReturn``   – ``Close / Open - 1`` of the following row.
        * ``CloseToNextOpen`` – following row's ``Open`` over this row's ``Close``, minus 1.

        The last row has no successor, so the latter two are NaN there.

    Raises
    ------
    KeyError
        If ``Open`` or ``Close`` is missing from ``df``.
    """
    open_, close = df["Open"], df["Close"]
    # Shift only the two columns that are needed instead of the whole frame twice.
    next_open, next_close = open_.shift(-1), close.shift(-1)
    # Build a new frame rather than writing into the argument: groupby.apply passes
    # in group objects, and pandas does not support functions that mutate them.
    return df.assign(
        IntradayReturn=close / open_ - 1,
        NextDayReturn=next_close / next_open - 1,
        CloseToNextOpen=next_open / close - 1,
    )

In [None]:
# Show the dtype of each level of the stocks index.
stocks.index.dtypes

In [None]:
# Reset two of the target columns to NaN and put the index into (ID, Date) order,
# sorted ascending on both levels, so that each ID's rows are contiguous and ordered
# by Date before the per-ID processing that follows.
# The levels are addressed by name instead of swapping positions 0 and 1, and the
# frame is rebuilt rather than edited in place, so re-running this cell gives the
# same result instead of flipping the level order back.
stocks = (
    stocks
    .assign(IntradayReturn=np.nan, NextDayReturn=np.nan)
    .reorder_levels(["ID", "Date"])
    .sort_index(ascending=[True, True])
)

In [None]:
# Run add_targets separately on each "ID" group, so the row shifts inside it never
# reach across from one ID into another.
stocks = stocks.groupby("ID", as_index=False).apply(add_targets)
# Drop the index level whose name is None — presumably the unnamed group-position
# level that apply() prepends here; TODO confirm against the pandas version in use.
stocks.index = stocks.index.droplevel(None)

## Parse Stories

In [None]:
# Derive a day-resolution "Date" column from the news timestamp, then cast it from
# Python date objects to datetime64[ns].
stories = (
    stories
    .assign(Date=stories["NewsTimestamp"].dt.date)
    .astype({"Date": "datetime64[ns]"})
)

## Merging

In [None]:
# Rename the story-side ticker column to "ID", the key name used for the merge.
stories = stories.rename(columns={"stocks": "ID"})

In [None]:
# List the columns of the stories table (sanity check before merging).
stories.columns

In [None]:
# List the columns of the stocks table (sanity check before merging).
stocks.columns

In [None]:
# Attach the return targets to each story by matching on Date and ID.
# An inner join keeps only stories that have a matching row in `stocks`.
dataset = pd.merge(
    stories[["Date", "NewsTimestamp", "ID", "body"]],
    stocks[["IntradayReturn", "NextDayReturn", "CloseToNextOpen"]],
    on=["Date", "ID"],
    how="inner",
)

In [None]:
# Show the merged rows that contain at least one missing value.
dataset[dataset.isna().any(axis=1)]

In [None]:
# Discard every row that contains at least one missing value.
dataset = dataset.dropna()

In [None]:
# Persist the merged dataset; the train-test split section reloads it from this path.
dataset.to_pickle("data/dataset.pkl")

# Create train-test-split 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

In [None]:
# Reload the merged dataset from disk.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
dataset = pd.read_pickle("data/dataset.pkl")
# The split below is stored as index labels, so every row needs a unique label.
assert dataset.index.is_unique

In [None]:
# Fraction of rows held out for testing, and the seed that makes the split reproducible.
test_size = 0.2
seed = 420
### Train-test split -> to be moved out of this notebook
# Split the index labels rather than the rows, so only the partition is stored.
# `test_size` is passed through (it used to be hard-coded in the call), so changing
# the variable above now actually changes the split.
train_idx, test_idx = train_test_split(dataset.index, test_size=test_size, random_state=seed)
# Store both label sets together as one (train, test) tuple.
with open('data/dataset_train_test_idx.pkl', 'wb') as f:
    pickle.dump((train_idx, test_idx), f)

In [None]:
# Show the index labels assigned to the test partition.
test_idx