In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

In [2]:
df_finbert = pd.read_csv('finBert_sentiment.csv')
df_finbert.head()

Unnamed: 0.1,Unnamed: 0,title,date,stock,sentiment
0,0.0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:00-04:00,A,neutral
1,1.0,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:00-04:00,A,neutral
2,2.0,71 Biggest Movers From Friday,2020-05-26 04:30:00-04:00,A,neutral
3,3.0,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:00-04:00,A,neutral
4,4.0,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:00-04:00,A,positive


In [4]:
df_finbert.shape

(333509, 5)

## Data processing notebook
Here are the files for stock and news. Some basic processing were done to make it more concise

In [2]:
# df_stock = pd.read_csv("SPX.csv")
# df_stock.head()

In [3]:
# df_stock['Date'] = pd.to_datetime(df_stock['Date'])
# df_stock.set_index('Date', inplace=True)
# df_stock = df_stock.iloc[::-1]
# df_stock.head()

In [4]:
df_snp = pd.read_csv("all_stocks_5yr.csv")
df_snp['date'] = pd.to_datetime(df_snp['date'])
df_snp.set_index('date', inplace=True)
df_snp = df_snp.iloc[::-1]
df_snp.head()

Unnamed: 0_level_0,open,high,low,close,volume,Name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-07,72.7,75.0,72.69,73.86,4534912,ZTS
2018-02-06,72.74,74.56,72.13,73.27,4924323,ZTS
2018-02-05,76.64,76.92,73.18,73.83,2962031,ZTS
2018-02-02,77.53,78.12,76.73,76.78,2595187,ZTS
2018-02-01,76.84,78.27,76.69,77.82,2982259,ZTS


In [5]:
snp500_tickers = df_snp['Name'].unique()
snp500_tickers = snp500_tickers.tolist()

In [6]:
df_news = pd.read_csv("analyst_ratings_processed.csv")
# df_news.head()

In [7]:
df_news = df_news.dropna(subset=['title'])  # doesn't seem necessary
del df_news['Unnamed: 0']  # this was the index
df_news['date'] = pd.to_datetime(df_news['date'], errors='coerce', utc=True)
df_news = df_news.dropna(subset=['date'])
df_news.set_index('date', inplace=True)
df_news.head()

Unnamed: 0_level_0,title,stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-05 14:30:00+00:00,Stocks That Hit 52-Week Highs On Friday,A
2020-06-03 14:45:00+00:00,Stocks That Hit 52-Week Highs On Wednesday,A
2020-05-26 08:30:00+00:00,71 Biggest Movers From Friday,A
2020-05-22 16:45:00+00:00,46 Stocks Moving In Friday's Mid-Day Session,A
2020-05-22 15:38:00+00:00,B of A Securities Maintains Neutral on Agilent...,A


In [8]:
df_news = df_news[df_news['stock'].isin(snp500_tickers)]


In [9]:
df_news.shape, df_snp.shape

((383051, 2), (619040, 6))

In [10]:
df_snp.columns, df_news.columns

(Index(['open', 'high', 'low', 'close', 'volume', 'Name'], dtype='object'),
 Index(['title', 'stock'], dtype='object'))

In [11]:
df_snp.rename(columns={'Name': 'stock'}, inplace=True)

df_snp.index = pd.to_datetime(df_snp.index).tz_localize('UTC').date
df_news.index = pd.to_datetime(df_news.index).date

df_snp = df_snp.reset_index().rename(columns={'index': 'date'})
df_news = df_news.reset_index().rename(columns={'index': 'date'})


In [12]:
df_merged = df_snp.merge(
    df_news,
    on=['date', 'stock'],
    how='left'
)

In [17]:
df_with_news = df_merged[df_merged['title'].notna()]

In [22]:
df_with_news

Unnamed: 0,date,open,high,low,close,volume,stock,title
21,2018-01-08,73.43,74.42,73.1607,74.24,3631552,ZTS,Zoetis 8-K Shows Expectation For Net Reduction...
32,2017-12-20,72.84,73.16,72.1700,72.71,1806257,ZTS,Zoetis Spikes to High of $73.08 on Volume; Act...
38,2017-12-12,71.77,72.66,71.6400,72.23,1939851,ZTS,Zoetis Launches Vanguard CIV H3N2/H3N8 Vaccine...
39,2017-12-11,71.84,71.97,71.5200,71.77,1172513,ZTS,Zoetis Raises Qtr. Dividend From $.105/Share T...
47,2017-11-29,71.99,72.10,71.2800,71.61,1618690,ZTS,Analyst: Zoetis Deserves To Trade At A Premium
...,...,...,...,...,...,...,...,...
697721,2013-04-10,79.00,81.85,79.0000,81.23,1758927,AAP,"Advance Auto Parts CFO on Webcast, Says Q1 Has..."
697725,2013-04-04,82.20,82.97,81.9300,82.20,819476,AAP,George Sherman Appointed President of Advance ...
697726,2013-04-04,82.20,82.97,81.9300,82.20,819476,AAP,Advance Auto Parts Names George Sherman President
697763,2013-02-08,78.34,79.72,78.0100,78.90,1298137,AAP,UPDATE: JP Morgan Upgrades Advance Auto Parts ...


In [24]:
stock_counts = df_with_news['stock'].value_counts()
print(stock_counts.describe())


count     392.000000
mean      412.696429
std       348.523091
min         1.000000
25%       179.000000
50%       297.000000
75%       495.000000
max      1984.000000
Name: count, dtype: float64
