In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import dateutil.parser

In [None]:
# Reuters article URLs. The trailing comment on each line is the date the
# article is filed under; `url_dict` below maps those dates, written as
# 'MM/DD/YYYY' strings, to the corresponding URL.
url1 = 'https://www.reuters.com/article/us-toyota-sales-idUSKCN0Q805F20150803'  # August 2, 2015
url2 = 'https://uk.reuters.com/article/uk-china-blast-toyota/global-automakers-divert-shipments-from-chinas-tianjin-port-after-blasts-idUKKCN0QO07W20150819'  # August 18, 2015
url3 = 'https://www.reuters.com/article/us-volkswagen-chairman-poetsch/vws-finance-chief-set-to-become-new-chairman-idUSKCN0R31B620150903'  # September 3, 2015
url4 = 'https://www.reuters.com/article/us-toyota-research-robotics-idUSKCN0R41X220150904'  # September 4, 2015
url5 = 'https://www.reuters.com/article/us-usa-volkswagen-idUSKCN0RI1VK20150918'  # September 18, 2015
url6 = 'https://www.reuters.com/article/us-usa-volkswagen-emission-idUSKCN0RL2EI20150922'  # September 21, 2015
url7 = 'https://www.reuters.com/article/usa-volkswagen-deception-idUSL1N11U1OB20150924'  # September 24, 2015
url8 = 'https://www.reuters.com/article/volkswagen-emissions-technology-idUSL1N11Z1XQ20150929'  # September 29, 2015
url9 = 'https://www.reuters.com/article/us-volkswagen-emissions-consumers-insigh-idUSKCN0S20CK20151008'  # October 8, 2015
url10 = 'https://in.reuters.com/article/us-volkswagen-emissions-investment/vw-looks-to-cutbacks-and-electric-cars-to-overcome-scandal-idUKKCN0S710020151013'  # October 13, 2015
url11 = 'https://ca.reuters.com/article/businessNews/idCAKCN0SF1FU20151021'  # October 21, 2015
url12 = 'https://www.reuters.com/article/us-volkswagen-emissions-brazil-idUSKCN0SG1N720151022'  # October 22, 2015
url13 = 'https://www.reuters.com/article/us-volkswagen-emissions-dealers-idUSKCN0SM2SG20151028'  # October 28, 2015
url14 = 'https://www.reuters.com/article/us-volkswagen-emissions-idUSKCN0ST1VY20151104'  # November 4, 2015
url15 = 'https://www.reuters.com/article/volkswagen-emissions/vw-engineers-have-admitted-manipulating-co2-emissions-data-paper-idUKL8N1320KD20151108'  # November 8, 2015

# Date string -> article URL, in chronological (insertion) order
url_dict = {
    '08/02/2015': url1,
    '08/18/2015': url2,
    '09/03/2015': url3,
    '09/04/2015': url4,
    '09/18/2015': url5,
    '09/21/2015': url6,
    '09/24/2015': url7,
    '09/29/2015': url8,
    '10/08/2015': url9,
    '10/13/2015': url10,
    '10/21/2015': url11,
    '10/22/2015': url12,
    '10/28/2015': url13,
    '11/04/2015': url14,
    '11/08/2015': url15,
}


In [None]:
# Extract the headline and body text of each article and store them in a dataframe
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout requests.get() may block indefinitely


def _fetch_article(url):
    """Download one article page and return its (headline, body text).

    Raises:
        requests.HTTPError: the server answered with an error status.
        ValueError: the page has no <h1> or no <div class="ArticleBodyWrapper">.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on 4xx/5xx instead of parsing an error page as an article
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    headline = soup.find('h1')
    body = soup.find('div', class_='ArticleBodyWrapper')
    if headline is None or body is None:
        # soup.find() returns None when nothing matches; name the offending URL
        # rather than failing later with an opaque AttributeError
        raise ValueError('Could not find headline/article body in ' + url)
    return headline.text, body.text


# One record per article, then a single DataFrame construction
records = []
for key, url in url_dict.items():
    url_header, url_text = _fetch_article(url)
    records.append({
        # keys of url_dict are 'MM/DD/YYYY' strings; store them as date objects
        'Date': datetime.strptime(key, '%m/%d/%Y').date(),
        'Header': url_header,
        'Text': url_text,
    })
df_text = pd.DataFrame(records, columns=['Date', 'Header', 'Text'])


In [None]:
# Preview the first rows of the scraped articles
df_text.head()

In [None]:
# Import the Volkswagen stock data
df_stock = pd.read_csv('Volkswagen.csv', header=0)

# Parse 'MM/DD/YYYY' strings into date objects
df_stock['Date'] = pd.to_datetime(df_stock['Date'], format='%m/%d/%Y').dt.date

# Drop rows without a traded volume. The comparison is done on the stripped
# string so it matches both ' N/A' and 'N/A'. .copy() makes the result an
# independent frame, so the column assignments below do not write into a slice
# of the original (which triggers pandas' SettingWithCopyWarning).
has_volume = df_stock[' Volume'].astype(str).str.strip() != 'N/A'
df_stock = df_stock[has_volume].copy()

# Remove surrounding whitespace and the leading '$' sign, then convert to float.
# (Blindly dropping the first character, as x[1:] does, breaks as soon as the
# value starts with a space instead of the '$'.)
df_stock[' Open'] = (
    df_stock[' Open'].astype(str).str.strip().str.lstrip('$').astype(float)
)
df_stock[' Volume'] = df_stock[' Volume'].astype(float)

# High, Low and Close/Last are removed here, so they are dropped without
# being cleaned or converted first.
columns_remove = [' High', ' Low', ' Close/Last']
df_stock = df_stock.drop(columns=columns_remove)


df_stock.head()

In [None]:
# Join each article to the stock row of the same date (inner join: only dates
# present in both frames survive), then order the result by date
df = df_text.merge(df_stock, how='inner', on='Date').sort_values('Date')

In [None]:
# Percentage change of the opening price and of the volume, each measured
# against the previous row of df (i.e. the previous article date)
for source_col, target_col in ((' Open', '% Change in Stock Price'),
                               (' Volume', '% Change in Volume')):
    df[target_col] = df[source_col].pct_change() * 100
df.head()

# Processing the text


## Import spaCy libraries
import spacy
import en_core_web_sm
from  spacy.lang.en.stop_words import STOP_WORDS
# Load the 'en_core_web_sm' pipeline once; `nlp` is called on the article text in the cells below
nlp = spacy.load('en_core_web_sm')

In [None]:
# Try the pipeline on a single article (the row labelled 1 in df)
text_nlp = nlp(df.loc[1, 'Text'])
# Keep the lemma of every token that is not a stop word
text_nlp_clean = [token.lemma_ for token in text_nlp if not token.is_stop]

In [None]:
# Preprocess every article: tokenize, drop punctuation / whitespace / stop
# words, lemmatize, and lower-case the result.
text_clean = []
for article in df['Text']:

    # tokenization (on the original-case text, so the pipeline still sees the
    # casing the article was written in)
    doc = nlp(article)

    # Filter and lemmatize in one pass. The lower-casing step is applied to
    # the lemmas here: previously a lower-cased copy of the text was computed
    # but never used, so the output was not lower-cased at all.
    article_tokens = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_punct or token.is_space or token.is_stop)
    ]

    # add preprocessed text to list
    text_clean.append(article_tokens)

# Using a Pretrained Model 

In [None]:
from nltk.sentiment import SentimentAnalyzer

In [None]:
# a function to return sentiment score
def polarity_score(text, sid=None):
    """Return the VADER compound sentiment score of `text`.

    Args:
        text: the string to score.
        sid: optional analyzer exposing `polarity_scores(text)`. Pass one
            instance to reuse it across many calls; when omitted, a new
            NLTK SentimentIntensityAnalyzer is created for this call.

    Returns:
        The value stored under 'compound' in `sid.polarity_scores(text)`.
    """
    if sid is None:
        # Imported here because the notebook only imports
        # nltk.sentiment.SentimentAnalyzer, a different class, which left
        # SentimentIntensityAnalyzer undefined (NameError on first call).
        # NOTE: the analyzer needs the NLTK 'vader_lexicon' resource
        # (nltk.download('vader_lexicon')).
        from nltk.sentiment.vader import SentimentIntensityAnalyzer
        sid = SentimentIntensityAnalyzer()
    return sid.polarity_scores(text)['compound']

# Score every article's text and store the scores as a new column of df
vader_score = [polarity_score(article) for article in df['Text']]
df['Vader Sentiment Score'] = vader_score

In [None]:
# Preview df, now including the 'Vader Sentiment Score' column
df.head()

## Sentiment Analysis

In [None]:
# run this cell to import nltk
import nltk
from os import getcwd
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 
from utils import process_tweet, build_freqs

In [None]:
# Fetch the NLTK resources for this section: the labelled tweet corpus read
# via twitter_samples below, and the stopword list
# (presumably needed by utils.process_tweet — confirm).
nltk.download('twitter_samples')
nltk.download('stopwords')

In [None]:

# Read the tweet texts of the positive and the negative corpus file
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [None]:
# Per class, the first 4000 tweets are used for training and everything after
# them for testing (validation)
split_at = 4000
train_pos, test_pos = all_positive_tweets[:split_at], all_positive_tweets[split_at:]
train_neg, test_neg = all_negative_tweets[:split_at], all_negative_tweets[split_at:]

# Positives first, negatives second, in both sets
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [None]:
# Column vectors of labels in the same order as the texts:
# 1 for every positive tweet, followed by 0 for every negative tweet
train_y = np.vstack([np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))])
test_y = np.vstack([np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))])

In [None]:
# Report the shapes of the training and test label arrays
print("train_y.shape = {}".format(train_y.shape))
print("test_y.shape = {}".format(test_y.shape))

In [None]:
# Build the frequency dictionary from the training tweets and their labels
freqs = build_freqs(train_x, train_y)

# Sanity check: report its type and number of entries
print("type(freqs) = {}".format(type(freqs)))
print("len(freqs) = {}".format(len(freqs)))

In [None]:
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def extract_features(tweet, freqs):
    """Build the 1 x 3 feature row for one text.

    Args:
        tweet: the raw text; it is handed to process_tweet and the result is
            iterated word by word.
        freqs: mapping from (word, label) tuples to counts; pairs that are
            missing count as 0.

    Returns:
        A numpy array of shape (1, 3) holding
        [1 (bias), sum of freqs[(word, 1)], sum of freqs[(word, 0)]].
    """
    # Running totals of the label-1 and label-0 counts over all words
    positive_total = 0
    negative_total = 0
    # process_tweet comes from utils; per the original note it tokenizes,
    # stems, and removes stopwords
    for token in process_tweet(tweet):
        positive_total += freqs.get((token, 1), 0)
        negative_total += freqs.get((token, 0), 0)

    # First entry is the constant bias term
    x = np.array([[1, positive_total, negative_total]], dtype=float)
    assert x.shape == (1, 3)
    return x

In [None]:
# Feature matrix: one extract_features row per training tweet, stacked to (n, 3)
X = np.array(
    [extract_features(tweet, freqs) for tweet in train_x]
).reshape(len(train_x), 3)

# Labels aligned row-for-row with X
Y = train_y

In [None]:
# Show the shape of the training label array
Y.shape

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline



### SVM Parameters

In [None]:
# Candidate values of the SVM parameter C to try in the grid search
param_grid = dict(C=[1e-4, 1e-3, 1e-2, 1e-1, 1])

In [None]:

# Grid-search C for an SVM with 5-fold cross-validation.
# probability=True is required: predict_tweet_prob() calls
# classifier.predict_proba(), which SVC only provides when it is constructed
# with probability=True (the default is False).
# random_state makes the probability calibration reproducible between runs.
gs = GridSearchCV(estimator=svm.SVC(probability=True, random_state=42),
                  param_grid=param_grid,
                  cv=5)
# scikit-learn expects a 1-D label vector, hence ravel() on the (n, 1) array
gs.fit(X, Y.ravel())

# Keep the best estimator found by the search
classifier = gs.best_estimator_

print(classifier)


In [None]:

# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def predict_tweet(tweet, freqs):
    """Predict the label of one text with the fitted `classifier`.

    Args:
        tweet: the text to classify (a string).
        freqs: mapping from (word, label) tuples to counts, forwarded to
            extract_features.

    Returns:
        The result of `classifier.predict` on the text's single feature row.
    """
    features = extract_features(tweet, freqs)
    return classifier.predict(features)

In [None]:
# UNQ_C4 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def predict_tweet_prob(tweet, freqs):
    """Return the fitted `classifier`'s class probabilities for one text.

    Args:
        tweet: the text to score (a string).
        freqs: mapping from (word, label) tuples to counts, forwarded to
            extract_features.

    Returns:
        The result of `classifier.predict_proba` on the text's single
        feature row.
    """
    # NOTE(review): scikit-learn's SVC only offers predict_proba when it was
    # constructed with probability=True — confirm the classifier cell does so.
    features = extract_features(tweet, freqs)
    return classifier.predict_proba(features)

In [None]:

# Sanity-check the classifier on a hand-written example sentence
my_tweet = 'The plot was terrible and I was sad until the ending!'
predict_tweet(my_tweet, freqs)

# Testing the Model 

In [None]:
# Column 2 of df holds the article text ('Text', given the column order used
# when df_text was created)
articles = df.iloc[:, 2]
# Predicted label per article (first element of the prediction array)
sentiment = [predict_tweet(article, freqs)[0] for article in articles]
# Second entry of the single probability row returned per article
# (presumably the probability of label 1 — confirm against classifier.classes_)
sentiment_prob = [predict_tweet_prob(article, freqs)[0][1] for article in articles]

In [None]:
# Inspect the predicted labels, one per article
sentiment


In [None]:
# Inspect the predicted probabilities, one per article
sentiment_prob

In [None]:

# Attach the classifier's outputs to df as two new columns and display the frame
df['sentiment'], df['sentiment_prob'] = sentiment, sentiment_prob
df