In [1]:
import pandas as pd 
import numpy as np
import json
import plotly.express as px

# Import Stocks

In [2]:
stocks = pd.read_csv("data/stocks.csv")

In [3]:
stocks.head()

Unnamed: 0,Date,ID,Close,High,Low,Open
0,1976-01-05,ARNC,57790,57965,54802,54802
1,1976-01-05,GT,112790,112790,110298,110298
2,1976-01-06,ARNC,59369,59545,57965,57965
3,1976-01-06,GT,114660,114660,113413,113413
4,1976-01-07,ARNC,60072,60774,59019,59369


In [4]:
# Datetime parsing: convert "Date" (expected as YYYY-MM-DD) to datetimes.
# The parsed values are staged in a temporary "date" column that then replaces
# the original, which leaves "Date" as the last column of the frame.
parsed_dates = pd.to_datetime(stocks["Date"], format="%Y-%m-%d")
stocks = (
    stocks
    .assign(date=parsed_dates)
    .drop(columns="Date")
    .rename(columns={"date": "Date"})
)


In [5]:
# Type conversion
price_cols = ["Close", "High", "Low", "Open"]
stocks = stocks.astype({"ID": "string"})
# Replace a decimal comma with a dot and parse every price column in a single
# vectorised pass (DataFrame.applymap is deprecated since pandas 2.1).
# Assigning whole columns replaces them, so the float dtype is kept without a
# separate cast; "nan" strings are parsed as NaN by pd.to_numeric.
stocks[price_cols] = stocks[price_cols].apply(
    lambda col: pd.to_numeric(
        col.astype(str).str.replace(",", ".", regex=False)
    ).astype(float)
)
# Drop every row that has a missing value in any column.
stocks = stocks.dropna()

In [6]:
stock_tickers = set(stocks.ID.unique())

In [7]:
len(stock_tickers)

1956

In [8]:
stocks.head()

Unnamed: 0,ID,Close,High,Low,Open,Date
0,ARNC,5.779,5.7965,5.4802,5.4802,1976-01-05
1,GT,11.279,11.279,11.0298,11.0298,1976-01-05
2,ARNC,5.9369,5.9545,5.7965,5.7965,1976-01-06
3,GT,11.466,11.466,11.3413,11.3413,1976-01-06
4,ARNC,6.0072,6.0774,5.9019,5.9369,1976-01-07


# Import Stories

In [9]:
stories = pd.read_csv("data/stories.csv")

In [10]:
# Parse the "time" column once and store the result under two names:
# "Date" (reduced to a merge key further down) and "NewsTimestamp".
news_time = pd.to_datetime(stories["time"])
stories = stories.assign(Date=news_time, NewsTimestamp=news_time)

In [11]:
stories_tickers = set(stories.stocks.unique())

In [12]:
stories.head()

Unnamed: 0.1,Unnamed: 0,time,stocks,author,title,channels,body,html_body,Date,NewsTimestamp
0,32904261,2023-06-16 22:50:00-04:00,FGEN,Globe Newswire,FIBROGEN INVESTIGATION CONTINUED BY FORMER LOU...,"[""News"", ""Legal"", ""Press Releases""]","Former Attorney General of\nLouisiana, Charles...","<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-16 22:50:00-04:00,2023-06-16 22:50:00-04:00
1,32904263,2023-06-16 22:50:00-04:00,FATE,Globe Newswire,FATE THERAPEUTICS INVESTIGATION INITIATED BY F...,"[""News"", ""Legal"", ""Press Releases""]","Former Attorney General of\nLouisiana, Charles...","<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-16 22:50:00-04:00,2023-06-16 22:50:00-04:00
2,32904264,2023-06-16 22:50:00-04:00,EYE,Globe Newswire,NATIONAL VISION INVESTIGATION INITIATED BY FOR...,"[""News"", ""Legal"", ""Press Releases""]","Former Attorney General of\nLouisiana, Charles...","<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-16 22:50:00-04:00,2023-06-16 22:50:00-04:00
3,32903996,2023-06-16 20:56:00-04:00,MTEM,Globe Newswire,Molecular Templates Announces Debt Payoff and ...,"[""News"", ""Press Releases""]",* **_Restructuring agreement with K2 HealthVen...,"<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-16 20:56:00-04:00,2023-06-16 20:56:00-04:00
4,32903760,2023-06-16 19:45:43-04:00,DZSI,Globe Newswire,DZS Inc.: Please contact the Portnoy Law Firm ...,"[""News"", ""Legal"", ""Press Releases""]",The Portnoy Law\nFirm can provide a compliment...,"<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-16 19:45:43-04:00,2023-06-16 19:45:43-04:00


In [13]:
## List of possible channels
# Each "channels" cell holds a JSON-encoded list; collect the distinct entries.
# Iterating over the column values avoids pairing index labels with positional
# `iloc` lookups, which only agree for a default RangeIndex, and avoids building
# a row Series per story.
channels = set()
for raw_channels in stories["channels"]:
    channels.update(json.loads(raw_channels))

In [14]:
# Count how often each channel occurs across all stories. The JSON lists are
# parsed and flattened in one vectorised pass instead of a per-row loop that
# mixed index labels with positional `iloc` lookups.
channel_counts = stories["channels"].map(json.loads).explode().value_counts()
# Re-order to the known channel list (0 for channels that never occur) and
# expose the result as a one-column frame named "count" with an unnamed index.
df = (
    channel_counts
    .reindex(list(channels), fill_value=0)
    .rename_axis(None)
    .to_frame("count")
)

In [15]:
df = df.sort_values("count", ascending=False)
df.head(10)

Unnamed: 0,count
Press Releases,1316
News,1149
Legal,397
Earnings,128
Management,105
Dividends,103
Health Care,62
Contracts,53
General,52
Financing,41


# Find common tickers and merge

In [16]:
# Tickers that appear in both the price data and the news stories.
common_tickers = stock_tickers & stories_tickers
common_tickers

{'AAON',
 'ABM',
 'ABOS',
 'ABSI',
 'ABUS',
 'ACAD',
 'ACCD',
 'ACRS',
 'ADC',
 'ADV',
 'ADVM',
 'AFCG',
 'AFIB',
 'AFMD',
 'AGIO',
 'AHH',
 'AIP',
 'AIR',
 'AIRS',
 'AKBA',
 'AKRO',
 'AKYA',
 'ALEC',
 'ALGS',
 'ALGT',
 'ALHC',
 'ALKS',
 'ALKT',
 'ALLK',
 'ALLO',
 'ALT',
 'ALTR',
 'ALX',
 'AMBA',
 'AMCX',
 'AMK',
 'AMNB',
 'AMPE',
 'AMPH',
 'AMRK',
 'AMRS',
 'AMSC',
 'AMTB',
 'ANAB',
 'ANF',
 'ANNX',
 'APLS',
 'APLT',
 'APPN',
 'APPS',
 'ARDX',
 'ARI',
 'ARKO',
 'AROW',
 'ARQT',
 'ARR',
 'ARVN',
 'ASO',
 'ATEX',
 'ATHA',
 'ATNI',
 'ATOS',
 'ATRI',
 'AVA',
 'AVAH',
 'AVDX',
 'AVID',
 'AVIR',
 'AVNT',
 'AVO',
 'AVTA',
 'AVTE',
 'AVTX',
 'AVXL',
 'AWH',
 'AXGN',
 'AXTI',
 'BALY',
 'BASE',
 'BBIO',
 'BCAB',
 'BCRX',
 'BDN',
 'BDTX',
 'BHE',
 'BHVN',
 'BIG',
 'BJRI',
 'BKH',
 'BL',
 'BNGO',
 'BOLT',
 'BOOM',
 'BRC',
 'BRT',
 'BSET',
 'BTAI',
 'BVS',
 'BYSI',
 'CARA',
 'CASA',
 'CATO',
 'CBAY',
 'CBNK',
 'CBRL',
 'CCBG',
 'CCS',
 'CDMO',
 'CDNA',
 'CELC',
 'CENX',
 'CERE',
 'CHRS',
 'CHS',
 

## Parse stocks

In [17]:
def add_targets(df):
    """Add return targets computed from one ticker's price rows.

    "Next day" means the following row, so the rows are expected to be in
    chronological order for a single ticker.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Close" and "Open".

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with three columns added
        or overwritten:

        * ``IntradayReturn``  = Close / Open - 1
        * ``NextDayReturn``   = next Close / next Open - 1
        * ``CloseToNextOpen`` = next Open / Close - 1

        The last row has NaN in the two "next" columns because there is no
        following row.

    Raises
    ------
    KeyError
        If "Close" or "Open" is missing from ``df``.
    """
    required_columns = ["Close", "Open"]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"add_targets requires the columns {missing}")

    # Shift once per column instead of shifting the whole frame three times.
    next_close = df["Close"].shift(-1)
    next_open = df["Open"].shift(-1)

    # `assign` returns a copy: functions passed to groupby.apply must not
    # mutate the group they receive.
    return df.assign(
        IntradayReturn=df["Close"] / df["Open"] - 1,
        NextDayReturn=next_close / next_open - 1,
        CloseToNextOpen=next_open / df["Close"] - 1,
    )
# Pre-create two of the target columns as NaN; add_targets fills them per ticker
# when it is applied below.
# NOTE(review): "CloseToNextOpen" is not pre-created here, it is only added
# inside add_targets.
stocks.loc[:, ["IntradayReturn", "NextDayReturn"]] = np.nan

In [18]:
# Index the prices by (ID, Date), failing on duplicate pairs, then sort the
# index ascending.
stocks = (
    stocks
    .set_index(["ID", "Date"], verify_integrity=True)
    .sort_index(ascending=True)
)

In [19]:
# Compute the return targets separately for every ticker (the "ID" index level).
stocks = stocks.groupby("ID", as_index=False).apply(add_targets)
# NOTE(review): the apply result presumably carries an extra unnamed outer index
# level; dropping the level named None leaves the (ID, Date) index. Confirm on
# the pandas version in use.
stocks.index = stocks.index.droplevel(None)
stocks.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,High,Low,Open,IntradayReturn,NextDayReturn,CloseToNextOpen
ID,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAN,1992-11-09,5.7972,5.8342,5.7231,5.7416,0.009684,0.0,-0.821034
AAN,1992-11-10,1.0375,1.0375,1.0375,1.0375,0.0,0.0,0.0
AAN,1992-11-11,1.0375,1.0375,1.0375,1.0375,0.0,0.0,0.035663
AAN,1992-11-12,1.0745,1.1116,1.0745,1.0745,0.0,0.0,0.0
AAN,1992-11-13,1.0745,1.1116,1.0745,1.0745,0.0,0.0,0.0


## Parse Stories

In [20]:
# Reduce the news timestamp to a calendar-day merge key. `normalize()` keeps the
# column as datetime64 (at midnight), matching the dtype of the "Date" level in
# `stocks`; writing Python `date` objects instead leaves the resulting dtype to
# pandas' setitem casting rules.
# NOTE(review): tz_convert(None) converts to UTC before dropping the timezone,
# so late-evening local timestamps fall on the next UTC date. Confirm this is
# intended.
stories["Date"] = stories["Date"].dt.tz_convert(None).dt.normalize()

## Merging

In [21]:
# Rename the ticker column to "ID" so it can serve as a merge key together
# with "Date".
stories = stories.rename(columns={"stocks": "ID"})
stories.columns

Index(['Unnamed: 0', 'time', 'ID', 'author', 'title', 'channels', 'body',
       'html_body', 'Date', 'NewsTimestamp'],
      dtype='object')

In [22]:
stocks.columns

Index(['Close', 'High', 'Low', 'Open', 'IntradayReturn', 'NextDayReturn',
       'CloseToNextOpen'],
      dtype='object')

In [23]:
# Inner-join every story with the return targets of the same ticker and day.
news_columns = ["Date", "NewsTimestamp", "ID", "body"]
target_columns = ["IntradayReturn", "NextDayReturn", "CloseToNextOpen"]
dataset = pd.merge(
    stories[news_columns],
    stocks[target_columns],
    on=["Date", "ID"],
    how="inner",
)

In [24]:
dataset.to_pickle("data/dataset.pkl")

# Create train-test-split 

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

In [28]:
# Reload the merged dataset written earlier in the notebook.
dataset = pd.read_pickle("data/dataset.pkl")
# The split below selects rows by index label, so labels must be unique.
assert dataset.index.is_unique

In [45]:
test_size = 0.2
seed = 420
### Train-test split -> TODO: move this out into a separate module
# Split the index labels rather than the rows; `test_size` and `seed` above are
# the single source of truth for the split configuration.
train_idx, test_idx = train_test_split(dataset.index, test_size=test_size, random_state=seed)
# Persist both label sets together.
with open('data/dataset_train_test_idx.pkl', 'wb') as f:
    pickle.dump((train_idx, test_idx), f)

In [48]:
test_idx

Index([364,  82, 674, 339, 590, 311, 622, 254,  85, 553,
       ...
       707, 764,  31, 685, 515, 283, 668,  16, 211, 660],
      dtype='int64', length=163)