In [89]:
import pandas as pd 
import numpy as np
import json
import plotly.express as px

# Import Stocks

In [90]:
stocks = pd.read_csv("../../data/stocks.csv")

In [91]:
stocks.head()

Unnamed: 0,Date,ID,Close,High,Low,Open
0,1976-01-05,ARNC,57790,57965,54802,54802
1,1976-01-05,GT,112790,112790,110298,110298
2,1976-01-06,ARNC,59369,59545,57965,57965
3,1976-01-06,GT,114660,114660,113413,113413
4,1976-01-07,ARNC,60072,60774,59019,59369


In [92]:
# Datetime parsing
stocks.loc[:, "date"] = pd.to_datetime(stocks.Date, format="%Y-%m-%d")
stocks.drop(columns="Date", inplace=True)
stocks.rename(columns={"date":"Date"}, inplace=True)


In [93]:
# Type conversion
stocks = stocks.astype({"ID":"string"})
stocks.loc[:,["Close", "High", "Low", "Open"]] = stocks.loc[:,["Close", "High", "Low", "Open"]].applymap(lambda x: str(x).replace(",", ".")).replace("nan", np.nan)
stocks.dropna(inplace=True)
stocks.loc[:,["Close", "High", "Low", "Open"]] = stocks[["Close", "High", "Low", "Open"]].apply(pd.to_numeric)
stocks = stocks.astype(dict(zip(["Close", "High", "Low", "Open"], [float]*4)))

In [94]:
stock_tickers = set(stocks.ID.unique())

In [95]:
len(stock_tickers)

1956

In [96]:
stocks.head()

Unnamed: 0,ID,Close,High,Low,Open,Date
0,ARNC,5.779,5.7965,5.4802,5.4802,1976-01-05
1,GT,11.279,11.279,11.0298,11.0298,1976-01-05
2,ARNC,5.9369,5.9545,5.7965,5.7965,1976-01-06
3,GT,11.466,11.466,11.3413,11.3413,1976-01-06
4,ARNC,6.0072,6.0774,5.9019,5.9369,1976-01-07


# Import Stories

In [97]:
stories = pd.read_csv("../../data/stories.csv")

In [98]:
stories.loc[:, "Date"] = pd.to_datetime(stories.time)

In [99]:
stories_tickers = set(stories.stocks.unique())

In [100]:
stories.head()

Unnamed: 0.1,Unnamed: 0,time,stocks,author,title,channels,body,html_body,Date
0,32903740,"Fri, 16 Jun 2023 19:41:00 -0400",IEP,Newsfile,Icahn Enterprises Shareholder Action Reminder,"[""Press Releases""]",Securities Litigation Partner James (Josh) Wil...,<p>Securities Litigation Partner James (Josh) ...,2023-06-16 19:41:00-04:00
1,32903731,"Fri, 16 Jun 2023 19:39:00 -0400",BYND,Newsfile,Beyond Meat Shareholder Action Reminder,"[""Press Releases""]",Securities Litigation Partner James (Josh) Wil...,<p>Securities Litigation Partner James (Josh) ...,2023-06-16 19:39:00-04:00
2,32903714,"Fri, 16 Jun 2023 19:37:00 -0400",HBNC,Newsfile,Horizon Shareholder Action Reminder,"[""Press Releases""]",Securities Litigation Partner James (Josh) Wil...,<p>Securities Litigation Partner James (Josh) ...,2023-06-16 19:37:00-04:00
3,32903713,"Fri, 16 Jun 2023 19:36:18 -0400",BGNE,Globe Newswire,"BeiGene, Ltd. Investors: Please contact the Po...","[""News"", ""Legal"", ""Press Releases""]",_Investors can_ _ _contact__ _the law firm at ...,"<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-16 19:36:18-04:00
4,32903693,"Fri, 16 Jun 2023 19:31:00 -0400",VTRS,Globe Newswire,"ROSEN, A LEADING AND RANKED FIRM, Encourages V...","[""News"", ""Legal"", ""Press Releases""]","NEW YORK, June 16, 2023 (GLOBE NEWSWIRE) -- **...","<link type=""text/css"" rel=""stylesheet"" href=""h...",2023-06-16 19:31:00-04:00


In [101]:
## List of possible channels
channels = set()
for i in stories.index:
    s = pd.Series(json.loads(stories.iloc[i].channels)).value_counts()
    channels = channels.union(set(s.index))

In [102]:
df = pd.DataFrame(data=0, index=list(channels), columns=["count"])
for i in stories.index:
    s = pd.Series(json.loads(stories.iloc[i].channels)).value_counts()
    df.loc[s.index, "count"] += s.values

In [113]:
df = df.sort_values("count", ascending=False)
df.head(10)

Unnamed: 0,count
Press Releases,2916
News,568
Legal,220
General,178
Commodities,100
Small Cap,99
Dividends,95
Real Estate,59
Health Care,54
Entertainment,51


# Make common denominator and merge

In [104]:
common_tickers = stock_tickers.intersection(stories_tickers)
common_tickers

{'ABM',
 'ABOS',
 'ABSI',
 'ACAD',
 'ACCD',
 'ACCO',
 'ACIW',
 'ACRS',
 'ADC',
 'ADEA',
 'ADTN',
 'AFCG',
 'AHH',
 'AIN',
 'AIR',
 'AKBA',
 'AKTS',
 'AKYA',
 'ALDX',
 'ALGT',
 'ALKS',
 'ALLO',
 'ALT',
 'ALTR',
 'AMBC',
 'AMK',
 'AMKR',
 'AMN',
 'AMRK',
 'AMRX',
 'AMTB',
 'AMTX',
 'AOUT',
 'APLT',
 'APOG',
 'APYX',
 'ARAY',
 'ARI',
 'AROW',
 'ARVN',
 'ASAN',
 'ASGN',
 'ASPN',
 'ATEX',
 'ATNI',
 'ATOS',
 'AVA',
 'AVAH',
 'AVAV',
 'AVDX',
 'AVIR',
 'AVNS',
 'AVXL',
 'AXDX',
 'AXL',
 'AZZ',
 'B',
 'BAND',
 'BBIO',
 'BDC',
 'BDTX',
 'BECN',
 'BHE',
 'BHR',
 'BIGC',
 'BJ',
 'BKH',
 'BLFS',
 'BLNK',
 'BNGO',
 'BRSP',
 'BRT',
 'BXMT',
 'CADE',
 'CALX',
 'CARG',
 'CARM',
 'CASA',
 'CBNK',
 'CBT',
 'CCO',
 'CCRN',
 'CCS',
 'CDE',
 'CDMO',
 'CDXC',
 'CDZI',
 'CELL',
 'CERE',
 'CERS',
 'CHS',
 'CIA',
 'CIM',
 'CLDT',
 'CLNN',
 'CLSK',
 'CMBM',
 'CMCO',
 'CMPR',
 'CMTL',
 'CNDT',
 'COHR',
 'COLB',
 'COUR',
 'CRMD',
 'CRNC',
 'CROX',
 'CSSE',
 'CSTL',
 'CSTM',
 'CSTR',
 'CTLP',
 'CTSO',
 'CTXR',
 'C

## Parse stocks

In [105]:
def add_targets(df):
    required_columns = ["Date", "Close", "High", "Low", "Open"]
    df.loc[:, "IntradayReturn"] = df["Close"]/df["Open"] - 1
    df.loc[:, "NextDayReturn"] = df.shift(-1)["Close"] / df.shift(-1)["Open"] - 1
    return df
stocks.loc[:, ["IntradayReturn", "NextDayReturn"]] = np.nan

In [106]:
#stocks[["ID", "Date"]].dtypes
stocks.set_index(["ID", "Date"], verify_integrity=True, inplace=True)
stocks.sort_index(ascending=True, inplace=True)

In [107]:
stocks = stocks.groupby("ID", as_index=False).apply(add_targets)
stocks.index = stocks.index.droplevel(None)
stocks.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,High,Low,Open,IntradayReturn,NextDayReturn
ID,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAN,1992-11-09,5.7972,5.8342,5.7231,5.7416,0.009684,0.0
AAN,1992-11-10,1.0375,1.0375,1.0375,1.0375,0.0,0.0
AAN,1992-11-11,1.0375,1.0375,1.0375,1.0375,0.0,0.0
AAN,1992-11-12,1.0745,1.1116,1.0745,1.0745,0.0,0.0
AAN,1992-11-13,1.0745,1.1116,1.0745,1.0745,0.0,0.0


## Parse Stories

In [108]:
stories.loc[:, "Date"] = stories.Date.dt.tz_convert(None)
stories.loc[:, "Date"] = stories.Date.dt.date

## Merging

In [109]:
stories.rename(columns=dict(stocks="ID"), inplace=True)
stories.columns

Index(['Unnamed: 0', 'time', 'ID', 'author', 'title', 'channels', 'body',
       'html_body', 'Date'],
      dtype='object')

In [110]:
stocks.columns

Index(['Close', 'High', 'Low', 'Open', 'IntradayReturn', 'NextDayReturn'], dtype='object')

In [111]:
stories[["Date", "ID", "body"]].merge(stocks[["IntradayReturn", "NextDayReturn"]], on=["Date", "ID"], how="inner")

Unnamed: 0,Date,ID,body,IntradayReturn,NextDayReturn
