In [26]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import dateutil.parser

# Netflix

In [223]:
# dictionary of text dates and links
url_dict = {'04/16/2019':'https://www.reuters.com/article/usa-stocks/us-stocks-wall-street-rises-as-netflix-boosts-tech-internet-stocks-idUSL3N1ZF4XW',
            '04/29/2019':'https://www.reuters.com/article/us-television-13reasonswhy/u-s-youth-suicides-up-after-netflix-show-cause-unclear-study-idUSKCN1S5257',
            '04/30/2019':'https://www.reuters.com/article/us-apple-television/in-streaming-wars-apple-says-it-can-coexist-with-netflix-idUSKCN1S730H',
            '05/09/2019':'https://www.reuters.com/article/us-storybots-m-a-netflix/netflix-buys-kids-show-producer-storybots-idUSKCN1SF1ZK',
            '05/15/2019':'https://www.reuters.com/article/us-investment-funds/big-u-s-hedge-funds-regain-ardor-for-faangs-in-first-quarter-filings-idUSKCN1SL2QL',
           '05/28/2019':'https://www.reuters.com/article/usa-abortion-netflix/netflix-to-rethink-investment-in-georgia-if-abortion-law-takes-effect-idUSL2N2340Y9'}


In [224]:
# extract info from each website and store it in dataframe
df_text = pd.DataFrame(columns = ['Date','Header','Text'])
header = []
text = []
date = []
for key in url_dict:
    page = requests.get(url_dict[key]).text
    soup = BeautifulSoup(page, 'html.parser')
    date.append(datetime.strptime(key, '%m/%d/%Y').date())
    url_header = soup.find('h1').text
    header.append(url_header)
    url_text = soup.find('div', class_ = 'ArticleBodyWrapper').text
    text.append(url_text)
df_text['Date']=date
df_text['Header']=header
df_text['Text']=text


In [225]:
df_text

Unnamed: 0,Date,Header,Text
0,2019-04-16,US STOCKS-Wall Street rises as Netflix boosts ...,By April Joyner4 Min Read* Netflix jumps after...
1,2019-04-29,"U.S. youth suicides up after Netflix show, cau...",By Reuters Staff3 Min ReadLOS ANGELES (Reuters...
2,2019-04-30,"In streaming wars, Apple says it can coexist w...",By Stephen Nellis2 Min ReadFILE PHOTO: Tim Coo...
3,2019-05-09,Netflix buys kids show producer StoryBots,By Reuters Staff2 Min ReadThe Netflix logo is ...
4,2019-05-15,Big U.S. hedge funds regain ardor for FAANGs i...,"By Jennifer Ablan, Noel Randewich4 Min Read(Re..."
5,2019-05-28,Netflix to 'rethink' investment in Georgia if ...,"By Reuters Staff3 Min ReadLOS ANGELES, May 28 ..."


In [226]:
# import stock data of netflix
df_stock = pd.read_csv('NFLX.csv', header=0)
df_stock['Date']=df_stock['Date'].apply(lambda x: datetime.strptime(x, '%m/%d/%Y').date())
df_stock.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-12-16,119.800003,123.0,118.089996,122.639999,122.639999,13181000
1,2015-12-17,123.970001,126.349998,122.419998,122.510002,122.510002,17284900
2,2015-12-18,120.849998,122.190002,117.919998,118.019997,118.019997,17948100
3,2015-12-21,119.510002,119.589996,115.660004,116.629997,116.629997,11670000
4,2015-12-22,117.300003,117.43,114.860001,116.239998,116.239998,9689000


In [227]:
# Merge two dataframes
df = pd.merge(df_text, df_stock, how='inner', on='Date')
df = df.sort_values('Date')

In [244]:
# show % change in stock price after news article publish
pct_stock = df['Open'].pct_change()*100
df['% Change in Stock Price'] = pct_stock
pct_volume = df['Volume'].pct_change()*100
df['% Change in Volume'] = pct_volume
df.head()

Unnamed: 0,Date,Header,Text,Open,High,Low,Close,Adj Close,Volume,% Change in Stock Price,% Change in Volume
0,2019-04-16,US STOCKS-Wall Street rises as Netflix boosts ...,By April Joyner4 Min Read* Netflix jumps after...,355.0,364.480011,352.720001,359.459991,359.459991,18740200,,
1,2019-04-29,"U.S. youth suicides up after Netflix show, cau...",By Reuters Staff3 Min ReadLOS ANGELES (Reuters...,373.679993,374.579987,369.119995,371.829987,371.829987,3821700,5.26197,-79.606941
2,2019-04-30,"In streaming wars, Apple says it can coexist w...",By Stephen Nellis2 Min ReadFILE PHOTO: Tim Coo...,369.559998,374.5,368.350006,370.540009,370.540009,3870100,-1.102546,1.266452
3,2019-05-09,Netflix buys kids show producer StoryBots,By Reuters Staff2 Min ReadThe Netflix logo is ...,360.899994,364.200012,352.75,362.75,362.75,5882600,-2.343328,52.00124
4,2019-05-15,Big U.S. hedge funds regain ardor for FAANGs i...,"By Jennifer Ablan, Noel Randewich4 Min Read(Re...",343.339996,356.5,341.390015,354.98999,354.98999,6340100,-4.865613,7.777173


# Processing the text

In [229]:
# import spacy libraries
import spacy
import en_core_web_sm
from  spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en')

In [231]:
text_nlp = nlp(df['Text'][1]) 
# Remove stop words
text_nlp_clean = [word for word in text_nlp if word.is_stop == False]
# Lemmatize words
text_nlp_clean = [word.lemma_ for word in text_nlp_clean]

In [242]:
import string

text_clean = []
for ind, row in df.iterrows():
    
    # lower casing
    text_lower = row['Text'].lower()
    
    # tokenization
    text_nlp = nlp(row['Text']) 
    
    # remove punctuation and empty space
    text_nlp_clean = [word for word in text_nlp if not word.is_punct | word.is_space]
    
    # Remove stop words
    text_nlp_clean = [word for word in text_nlp_clean if word.is_stop == False]
    
    # Lemmatize tokens
    text_nlp_clean = [word.lemma_ for word in text_nlp_clean]
    
    # add preprocessed text to list
    text_clean.append(text_nlp_clean)
    

In [243]:
text_clean[4]

['Jennifer',
 'Ablan',
 'Noel',
 'Randewich4',
 'Min',
 'Read(Reuters',
 'big',
 'high',
 'profile',
 'U.S.',
 'hedge',
 'fund',
 'investor',
 'money',
 'manager',
 'fall',
 'love',
 'faang',
 'quarter',
 'accord',
 'regulatory',
 'filing',
 'release',
 'Wednesday',
 'Facebook',
 'Amazon',
 'Netflix',
 'Google',
 'logo',
 'see',
 'combination',
 'photo',
 'Reuters',
 'file',
 'reuter',
 'File',
 'PhotosAfter',
 'dumping',
 'share',
 'Facebook',
 'Inc',
 'Apple',
 'Inc',
 'Amazon.com',
 'Inc',
 'Netflix',
 'Inc',
 'Alphabet',
 'Inc',
 'FAANG',
 'component',
 'prominent',
 'hedge',
 'fund',
 'manager',
 'include',
 'Tiger',
 'Global',
 'Management',
 'LLC',
 'move',
 'popular',
 'group',
 'Tiger',
 'manage',
 'Chase',
 'Coleman',
 'boost',
 'Facebook',
 'stake',
 '64.5',
 '8.8',
 'million',
 'class',
 'share',
 'quarter',
 'accord',
 'filing',
 'Securities',
 'Exchange',
 'Commission',
 'increase',
 'stake',
 'stream',
 'company',
 'Netflix',
 '42.8',
 '2.1',
 'million',
 'share',
 'Soro

# Sentiment

### Method 1: Determine sentiment and compare with stock price (Create rule based system)

### Method 2: Use pretrained model 

# Testing