In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
from nltk import word_tokenize
from nltk.corpus import sentiwordnet as swn
from string import punctuation
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
tqdm.pandas(tqdm_notebook)

  from pandas import Panel


# Sentiment analysis on the articles
This notebook implements a naive sentiment analysis on our dataset. Naive sentiment analysis employs the dictionary method to assign a sentiment to each word. By using the Sentiwordnet, it not only takes into account the actual word but also the word meaning. The cell below loads in our article dataset and also corrects the type of certain columns.

In [2]:
articles = pd.read_json('articles.json')
articles.publicationDay = pd.to_datetime(articles.publicationDay, unit='ms')
articles.publicationWeek = pd.to_datetime(articles.publicationWeek, unit='ms')

In the next cell we create a set of english stopwords and also a list of punctuation symbols. These combined list of words will be entirely ignored when calculating polarities. Each word will be ignored when using this list.

In [3]:
stopwords=set(stopwords.words('english')+list(punctuation))

In [4]:
def remove_words(article):
    article = word_tokenize(article)
    return [word for word in article if not word in stopwords]

['10', 'December', '2019', 'Dear', 'Boris', 'Johnson', 'In', 'medicine', '–', 'unlike', 'politics', 'anything', 'seems', 'go', 'days', '–', 'situations', 'called', '“', 'never', 'events', '”', 'These', 'instances', 'occur', 'patient', 'seriously', 'harmed', 'spite', 'protocols', 'protective', 'measures', 'prevent', 'happening', '“', 'Never', 'events', '”', 'serious', 'manmade', 'disasters', 'clinicians', 'involved', 'bear', 'burden', 'tragic', 'events', 'rest', 'careers', 'Like', 'many', 'junior', 'doctors', 'worked', 'overwhelmed', 'understaffed', 'A', 'amp', 'E', 'departments', 'I', '’', 'seen', 'things', 'happen', 'result', 'overstretched', 'conditions', 'I', 'believe', 'classed', '“', 'never', 'events', '”', 'Since', '2016', 'nearly', '5,500', 'patients', 'died', 'England', 'alone', 'direct', 'result', 'waited', 'long', 'admitted', 'hospital', 'To', 'put', 'perspective', '’', 'nearly', 'twice', 'number', 'people', 'killed', 'terror', 'attacks', 'UK', 'since', '1970', 'We', 'outrage

In the next cell we create a function which we can give the text of an article. This function will look at each word in article. Each word will get a sentiment-score `weight` for each different meaning of the word in the dictonary. These weights are then averaged and added to the total article polarity `articlePolarity` which represents the cumulative averaged sentiment score of each word.

In [5]:
def naiveSentiment(article):
    articlePolarity = 0
    numExceptions = 0
    for word in article:
        word = word.lower()
        numMeanings = 0
        weight = 0.0
        try:
            for meaning in swn.senti_synsets(word):
                if meaning.pos_score() > meaning.neg_score():
                    weight += (meaning.pos_score() - meaning.neg_score())
                    numMeanings += 1
                elif meaning.pos_score() < meaning.neg_score():
                    weight -= (meaning.neg_score() - meaning.pos_score())
                    numMeanings += 1
        except:
            numExceptions += 1
        if numMeanings > 0:
            articlePolarity += (weight/numMeanings)
    return articlePolarity

In [25]:
print(remove_words(articles.bodyText[0]))
x = []
for word in remove_words(articles.bodyText[0]):
    x.append(naiveSentiment(word))
print(sum(x)/len(x))

['10', 'December', '2019', 'Dear', 'Boris', 'Johnson', 'In', 'medicine', '–', 'unlike', 'politics', 'anything', 'seems', 'go', 'days', '–', 'situations', 'called', '“', 'never', 'events', '”', 'These', 'instances', 'occur', 'patient', 'seriously', 'harmed', 'spite', 'protocols', 'protective', 'measures', 'prevent', 'happening', '“', 'Never', 'events', '”', 'serious', 'manmade', 'disasters', 'clinicians', 'involved', 'bear', 'burden', 'tragic', 'events', 'rest', 'careers', 'Like', 'many', 'junior', 'doctors', 'worked', 'overwhelmed', 'understaffed', 'A', 'amp', 'E', 'departments', 'I', '’', 'seen', 'things', 'happen', 'result', 'overstretched', 'conditions', 'I', 'believe', 'classed', '“', 'never', 'events', '”', 'Since', '2016', 'nearly', '5,500', 'patients', 'died', 'England', 'alone', 'direct', 'result', 'waited', 'long', 'admitted', 'hospital', 'To', 'put', 'perspective', '’', 'nearly', 'twice', 'number', 'people', 'killed', 'terror', 'attacks', 'UK', 'since', '1970', 'We', 'outrage

In [18]:
articles['filteredbodyText'] = articles.bodyText.progress_apply(remove_words)
articles['filteredwordcount'] = articles.filteredbodyText.apply(len)

100%|██████████████████████████████████████████████████████████████████████████████| 7880/7880 [01:24<00:00, 93.57it/s]


In [7]:
articles['articlePolarity'] = articles.filteredbodyText.progress_apply(naiveSentiment)

100%|██████████████████████████████████████████████████████████████████████████████| 7880/7880 [07:14<00:00, 18.14it/s]


In [13]:
articles.articlePolarity.describe()

count    7880.000000
mean       14.416065
std        26.769659
min       -27.378959
25%         5.222044
50%        10.119936
75%        16.333662
max       582.724918
Name: articlePolarity, dtype: float64

In [19]:
articles['articlePolarityNormalized'] = articles.articlePolarity / articles.filteredwordcount

In [20]:
articles.articlePolarityNormalized.describe()

count    7880.000000
mean        0.024709
std         0.016693
min        -0.055556
25%         0.013772
50%         0.024361
75%         0.035184
max         0.105938
Name: articlePolarityNormalized, dtype: float64

In [None]:
resp = articles.groupby('publicationDay').articlePolarity.mean()
plt.plot(resp)
plt.title('Average sentiment score per Day')
plt.xlabel('Days')
plt.ylabel('sentiment')

In [None]:
resp = articles.groupby('publicationWeek').articlePolarity.mean()
plt.plot(resp)
plt.title('Average sentiment score per Week')
plt.xlabel('Weeks')
plt.ylabel('Sentiment')

In [None]:
articles.to_json('articles_sentiment.json')