In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

## Data processing notebook
Here are the files for stock and news data. Some basic processing was done to make them more concise.

In [2]:
# Read the raw price history from SPX.csv and preview the first few rows.
spx_csv_path = "SPX.csv"
df_stock = pd.read_csv(spx_csv_path)
df_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1927-12-30,17.66,17.66,17.66,17.66,17.66,0
1,1928-01-03,17.76,17.76,17.76,17.76,17.76,0
2,1928-01-04,17.719999,17.719999,17.719999,17.719999,17.719999,0
3,1928-01-05,17.549999,17.549999,17.549999,17.549999,17.549999,0
4,1928-01-06,17.66,17.66,17.66,17.66,17.66,0


In [3]:
# Index the price history by parsed date, newest row first.
# The guard lets this cell be re-run safely: once 'Date' has become the index
# it is no longer a column, and parsing it again would raise a KeyError.
if 'Date' in df_stock.columns:
    df_stock = (
        df_stock
        .assign(Date=pd.to_datetime(df_stock['Date']))
        .set_index('Date')
    )
# Sort descending instead of blindly reversing, so running the cell twice does
# not flip the frame back to oldest-first.
# NOTE(review): assumes one row per date; with duplicate dates the relative
# order of ties may differ from a plain reversal.
df_stock = df_stock.sort_index(ascending=False)
df_stock.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-11-04,3406.459961,3486.25,3405.169922,3443.439941,3443.439941,4783040000
2020-11-03,3336.25,3389.48999,3336.25,3369.159912,3369.159912,4220070000
2020-11-02,3296.199951,3330.139893,3279.73999,3310.23999,3310.23999,4310590000
2020-10-30,3293.590088,3304.929932,3233.939941,3269.959961,3269.959961,4840450000
2020-10-29,3277.169922,3341.050049,3259.820068,3310.110107,3310.110107,4903070000


In [19]:
# Fetch the S&P 500 ticker symbols from Wikipedia (requires network access).
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; it must run before the cell that filters df_news with
# snp500_tickers, so confirm the notebook survives Restart & Run All.
# NOTE(review): the page presumably lists today's constituents, which may
# differ from index membership during the period the news covers -- confirm
# that this is the intended universe.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
# Select tables by content rather than trusting that the constituents table is
# the very first table on the page; the first table mentioning 'Symbol' is used.
df = pd.read_html(url, match='Symbol')[0]

snp500_tickers = df['Symbol'].tolist()

In [4]:
# Read analyst_ratings_processed.csv (one headline per row, per the preview
# below) and show the first few rows.
news_csv_path = "analyst_ratings_processed.csv"
df_news = pd.read_csv(news_csv_path)
df_news.head()

Unnamed: 0.1,Unnamed: 0,title,date,stock
0,0.0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:00-04:00,A
1,1.0,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:00-04:00,A
2,2.0,71 Biggest Movers From Friday,2020-05-26 04:30:00-04:00,A
3,3.0,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:00-04:00,A
4,4.0,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:00-04:00,A


In [5]:
# Clean the headlines and index them by UTC timestamp.
# The guard makes the cell safe to re-run: after the first run 'date' is the
# index and 'Unnamed: 0' is gone, so repeating the steps would raise KeyError.
if 'date' in df_news.columns:
    df_news = (
        df_news
        .dropna(subset=['title'])  # doesn't seem necessary; kept as a safeguard
        .drop(columns=['Unnamed: 0'])  # this was the index in the exported CSV
        # Unparseable timestamps become NaT (errors='coerce') and are removed by
        # the dropna below; utc=True converts every parsed timestamp to UTC.
        .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce', utc=True))
        .dropna(subset=['date'])
        .set_index('date')
    )
df_news.head()

Unnamed: 0_level_0,title,stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-05 14:30:00+00:00,Stocks That Hit 52-Week Highs On Friday,A
2020-06-03 14:45:00+00:00,Stocks That Hit 52-Week Highs On Wednesday,A
2020-05-26 08:30:00+00:00,71 Biggest Movers From Friday,A
2020-05-22 16:45:00+00:00,46 Stocks Moving In Friday's Mid-Day Session,A
2020-05-22 15:38:00+00:00,B of A Securities Maintains Neutral on Agilent...,A


In [6]:
# Compare the sizes of the two frames as (rows, columns) pairs: news first, then stock.
news_dims = df_news.shape
stock_dims = df_stock.shape
news_dims, stock_dims

((1397891, 2), (23323, 6))

In [7]:
# Inspect the final row of the news frame; it is returned as a Series whose
# name is that row's timestamp.
last_row_position = len(df_news) - 1
df_news.iloc[last_row_position]

title    China Zenix Auto International Opens For Tradi...
stock                                                   ZX
Name: 2011-05-12 13:36:00+00:00, dtype: object

In [8]:
# Keep only the price rows dated on or after the cutoff, then preview the result.
start_date = '2010-01-01'
on_or_after_start = df_stock.index >= start_date
df_stock = df_stock.loc[on_or_after_start]
df_stock.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-11-04,3406.459961,3486.25,3405.169922,3443.439941,3443.439941,4783040000
2020-11-03,3336.25,3389.48999,3336.25,3369.159912,3369.159912,4220070000
2020-11-02,3296.199951,3330.139893,3279.73999,3310.23999,3310.23999,4310590000
2020-10-30,3293.590088,3304.929932,3233.939941,3269.959961,3269.959961,4840450000
2020-10-29,3277.169922,3341.050049,3259.820068,3310.110107,3310.110107,4903070000


In [12]:
# Headline counts per ticker, most frequent first, limited to the top 1000 tickers.
ticker_counts = df_news['stock'].value_counts()
ticker_counts.head(1000)

stock
MRK     3334
MS      3242
MU      3144
NVDA    3133
QQQ     3100
        ... 
FOX      385
XEL      385
ABB      385
ITT      385
TAL      385
Name: count, Length: 1000, dtype: int64

In [23]:
# Keep only the headlines whose ticker appears in the scraped S&P 500 symbol list.
is_snp_ticker = df_news['stock'].isin(snp500_tickers)
df_snp = df_news.loc[is_snp_ticker]


In [24]:
# Display the S&P 500 subset of the news frame; the notebook renders a
# truncated view (first and last rows) rather than every row.
df_snp

Unnamed: 0_level_0,title,stock
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-06-05 14:30:00+00:00,Stocks That Hit 52-Week Highs On Friday,A
2020-06-03 14:45:00+00:00,Stocks That Hit 52-Week Highs On Wednesday,A
2020-05-26 08:30:00+00:00,71 Biggest Movers From Friday,A
2020-05-22 16:45:00+00:00,46 Stocks Moving In Friday's Mid-Day Session,A
2020-05-22 15:38:00+00:00,B of A Securities Maintains Neutral on Agilent...,A
...,...,...
2013-01-31 22:42:00+00:00,IPO for Pfizer's Zoetis Prices 86.1M Shares at...,ZTS
2013-01-31 21:12:00+00:00,"ISI Group Initiates Coverage on Zoetis at Buy,...",ZTS
2013-01-23 23:18:00+00:00,"Pfizer, Spinoff Zoetis Receive Positive Mad Mo...",ZTS
2013-01-22 18:19:00+00:00,Will These 2 IPOs Flourish?,ZTS


['A',
 'AAPL',
 'ABBV',
 'ABNB',
 'ABT',
 'ACGL',
 'ACN',
 'ADBE',
 'ADI',
 'ADM',
 'ADP',
 'ADSK',
 'AEE',
 'AEP',
 'AES',
 'AFL',
 'AIG',
 'AIZ',
 'AJG',
 'AKAM',
 'ALB',
 'ALGN',
 'ALL',
 'ALLE',
 'AMAT',
 'AMCR',
 'AMD',
 'AME',
 'AMGN',
 'AMP',
 'AMT',
 'AMZN',
 'ANET',
 'ANSS',
 'AON',
 'AOS',
 'APA',
 'APD',
 'APH',
 'APO',
 'APTV',
 'ARE',
 'ATO',
 'AVB',
 'AVGO',
 'AVY',
 'AWK',
 'AXON',
 'AXP',
 'AZO',
 'BA',
 'BAC',
 'BALL',
 'BAX',
 'BBY',
 'BDX',
 'BEN',
 'BF.B',
 'BG',
 'BIIB',
 'BK',
 'BKNG',
 'BKR',
 'BLDR',
 'BLK',
 'BMY',
 'BR',
 'BRK.B',
 'BRO',
 'BSX',
 'BX',
 'BXP',
 'C',
 'CAG',
 'CAH',
 'CARR',
 'CAT',
 'CB',
 'CBOE',
 'CBRE',
 'CCI',
 'CCL',
 'CDNS',
 'CDW',
 'CEG',
 'CF',
 'CFG',
 'CHD',
 'CHRW',
 'CHTR',
 'CI',
 'CINF',
 'CL',
 'CLX',
 'CMCSA',
 'CME',
 'CMG',
 'CMI',
 'CMS',
 'CNC',
 'CNP',
 'COF',
 'COO',
 'COP',
 'COR',
 'COST',
 'CPAY',
 'CPB',
 'CPRT',
 'CPT',
 'CRL',
 'CRM',
 'CRWD',
 'CSCO',
 'CSGP',
 'CSX',
 'CTAS',
 'CTRA',
 'CTSH',
 'CTVA',
 'CVS',
 