In [1]:
import pandas as pd
import altair as alt

In [2]:
# Read the cleaned briefing transcripts from CSV into a DataFrame and show it.
briefings_csv_path = '../data/cleaned_briefings.csv'
briefings_df = pd.read_csv(briefings_csv_path)
briefings_df

Unnamed: 0,date,timestamp,speaker,text
0,2020-02-26,05:39,Donald Trump,Thank you very much everybody. Thank you very ...
1,2020-02-26,06:59,Donald Trump,A lot of people thought we shouldn’t have done...
2,2020-02-26,07:51,Donald Trump,We have a total of 15. We took in some from Ja...
3,2020-02-26,09:58,Donald Trump,China you know about. Where it started. I spok...
4,2020-02-26,10:52,Donald Trump,"We’re bringing in a specialist, a very highly ..."
...,...,...,...,...
9674,2020-04-27,01:01:23,Unnamed (Reporter),[crosstalk 00:13:23].
9675,2020-04-27,01:01:24,Donald Trump,"Let’s do one more. Please, in the back."
9676,2020-04-27,01:01:26,Unnamed,If an American president loses more Americans ...
9677,2020-04-27,01:01:36,Donald Trump,"So yeah, we’ve lost a lot of people, but if yo..."


### TextBlob Sentiment Analysis

In [3]:
from textblob import TextBlob

In [4]:
# For each paragraph, determine TextBlob polarity and subjectivity.
# The sentiment is computed once per paragraph and both fields are read from
# that single result, instead of building and analysing a TextBlob twice.
tb_sentiments = [TextBlob(text).sentiment for text in briefings_df['text']]

# Lists are assigned positionally, so row order matches briefings_df['text'].
briefings_df['tb_polarity'] = [sentiment.polarity for sentiment in tb_sentiments]
briefings_df['tb_subjectivity'] = [sentiment.subjectivity for sentiment in tb_sentiments]

In [5]:
# Display the frame to inspect the newly added tb_polarity / tb_subjectivity columns.
briefings_df

Unnamed: 0,date,timestamp,speaker,text,tb_polarity,tb_subjectivity
0,2020-02-26,05:39,Donald Trump,Thank you very much everybody. Thank you very ...,0.078559,0.562093
1,2020-02-26,06:59,Donald Trump,A lot of people thought we shouldn’t have done...,0.284714,0.431381
2,2020-02-26,07:51,Donald Trump,We have a total of 15. We took in some from Ja...,0.221088,0.506516
3,2020-02-26,09:58,Donald Trump,China you know about. Where it started. I spok...,-0.038796,0.439352
4,2020-02-26,10:52,Donald Trump,"We’re bringing in a specialist, a very highly ...",0.036440,0.635832
...,...,...,...,...,...,...
9674,2020-04-27,01:01:23,Unnamed (Reporter),[crosstalk 00:13:23].,0.000000,0.000000
9675,2020-04-27,01:01:24,Donald Trump,"Let’s do one more. Please, in the back.",0.250000,0.250000
9676,2020-04-27,01:01:26,Unnamed,If an American president loses more Americans ...,0.066667,0.200000
9677,2020-04-27,01:01:36,Donald Trump,"So yeah, we’ve lost a lot of people, but if yo...",0.241667,0.527083


### Vader Sentiment Analysis

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [7]:
# Create one VADER analyzer instance; the scoring cell below reuses it for every paragraph.
analyzer = SentimentIntensityAnalyzer()

In [8]:
# VADER: score every paragraph and keep only the 'compound' entry of its scores.
def vader_compound(text):
    """Return the 'compound' value from the analyzer's polarity scores for `text`."""
    return analyzer.polarity_scores(text)['compound']


briefings_df['v_compound_polarity'] = briefings_df['text'].apply(vader_compound)

In [9]:
# Display the frame to inspect the newly added v_compound_polarity column.
briefings_df

Unnamed: 0,date,timestamp,speaker,text,tb_polarity,tb_subjectivity,v_compound_polarity
0,2020-02-26,05:39,Donald Trump,Thank you very much everybody. Thank you very ...,0.078559,0.562093,0.7316
1,2020-02-26,06:59,Donald Trump,A lot of people thought we shouldn’t have done...,0.284714,0.431381,0.9510
2,2020-02-26,07:51,Donald Trump,We have a total of 15. We took in some from Ja...,0.221088,0.506516,0.9888
3,2020-02-26,09:58,Donald Trump,China you know about. Where it started. I spok...,-0.038796,0.439352,0.9124
4,2020-02-26,10:52,Donald Trump,"We’re bringing in a specialist, a very highly ...",0.036440,0.635832,-0.8626
...,...,...,...,...,...,...,...
9674,2020-04-27,01:01:23,Unnamed (Reporter),[crosstalk 00:13:23].,0.000000,0.000000,0.0000
9675,2020-04-27,01:01:24,Donald Trump,"Let’s do one more. Please, in the back.",0.250000,0.250000,0.3804
9676,2020-04-27,01:01:26,Unnamed,If an American president loses more Americans ...,0.066667,0.200000,-0.8689
9677,2020-04-27,01:01:36,Donald Trump,"So yeah, we’ve lost a lot of people, but if yo...",0.241667,0.527083,0.9225


### Emotion Analysis using the NRC Emotion Lexicon

- Let's see what we can uncover using the popular open-sourced emotion lexicon published by the NRC (National Research Council Canada).
- In addition to 'positive' and 'negative', we have word associations for 8 overarching emotion categories.

In [10]:
# Load the raw NRC emotion lexicon: a tab-separated file read in long format,
# one (word, emotion, association) triple per line, skipping the first line.
filepath = "../NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
emolex_long = pd.read_csv(
    filepath,
    sep='\t',
    skiprows=1,
    names=["word", "emotion", "association"],
)

# Reshape to wide format: one row per word, one column per emotion.
# NOTE(review): the resulting frame contains a row whose word is missing —
# possibly a lexicon entry such as "null" that read_csv parsed as NaN; confirm.
emolex_wide = emolex_long.pivot(index='word', columns='emotion', values='association')
emolex_df = emolex_wide.reset_index()

# Name the columns axis, then prefix the generic positive/negative columns with 'nrc_'.
emolex_df.columns.name = 'index'
emolex_df = emolex_df.rename(columns={"positive": "nrc_positive", "negative": "nrc_negative"})

emolex_df

index,word,anger,anticipation,disgust,fear,joy,nrc_negative,nrc_positive,sadness,surprise,trust
0,,0,0,0,0,0,0,0,0,0,0
1,aback,0,0,0,0,0,0,0,0,0,0
2,abacus,0,0,0,0,0,0,0,0,0,1
3,abandon,0,0,0,1,0,1,0,1,0,0
4,abandoned,1,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
14177,zone,0,0,0,0,0,0,0,0,0,0
14178,zoo,0,0,0,0,0,0,0,0,0,0
14179,zoological,0,0,0,0,0,0,0,0,0,0
14180,zoology,0,0,0,0,0,0,0,0,0,0


#### Using this lexicon, we can now easily look up all words from a single paragraph of the corpus:

In [11]:
# Look up the lexicon rows for every whitespace-separated token of one sample
# paragraph (row label 500). `Series.isin` builds the row mask directly; the
# earlier `pd.DataFrame(...).isin(...).any(1)` form passed `axis` positionally,
# which pandas >= 2.0 rejects.
paragraph_words = briefings_df.text[500].split()
emolex_df[emolex_df.word.isin(paragraph_words)]

index,word,anger,anticipation,disgust,fear,joy,nrc_negative,nrc_positive,sadness,surprise,trust
1238,big,0,0,0,0,0,0,0,0,0,0
1811,care,0,0,0,0,0,0,0,0,0,0
3088,cut,0,0,0,0,0,0,0,0,0,0
5094,force,1,0,0,1,0,1,0,0,0,0
6334,important,0,0,0,0,0,0,1,0,0,1
7759,matter,0,0,0,0,0,0,0,0,0,0
7814,meeting,0,0,0,0,0,0,0,0,0,0
9956,public,0,1,0,0,0,0,1,0,0,0
12549,task,0,0,0,0,0,0,1,0,0,0
12563,tax,0,0,0,0,0,1,0,1,0,0


#### Let's calculate and store aggregate emotion scores for each paragraph in the corpus:

In [12]:
# Aggregate NRC emotion scores per paragraph: for each paragraph, select the
# lexicon rows whose word appears among the paragraph's whitespace-separated
# tokens and average every emotion column over those rows.
#
# NOTE(review): `text.split()` keeps case and attached punctuation, so tokens
# such as "Thank" or "people," will not match lowercase, bare lexicon words.
# Tokenisation is left unchanged so scores stay comparable with earlier runs —
# confirm whether normalising tokens is wanted.

# every lexicon column except the 'word' key is an emotion/sentiment score
emotion_cols = [col for col in emolex_df.columns if col != 'word']


def paragraph_emotion_means(text):
    """Average each emotion column over the lexicon words present in `text`.

    Each matching lexicon word counts once, however often it occurs in the
    paragraph. Returns a dict keyed by emotion column; every value is NaN
    when no lexicon word matches.
    """
    paragraph_words = text.split()
    matched = emolex_df.loc[emolex_df['word'].isin(paragraph_words), emotion_cols]
    return matched.mean().to_dict()


# Build all rows first and construct the frame once: growing a DataFrame row by
# row is quadratic, and `DataFrame.append` no longer exists in pandas >= 2.0.
data = pd.DataFrame(
    [paragraph_emotion_means(text) for text in briefings_df['text']],
    index=briefings_df.index,
    columns=emotion_cols,
)

# combine aggregated emotion scores with transcript df; dropping emotion columns
# left over from a previous run keeps this cell safe to re-execute
briefings_df = briefings_df.drop(columns=emotion_cols, errors='ignore').join(data)

# paragraphs without any lexicon match have NaN means: score them as zero, but
# only in the emotion columns so missing transcript fields are not overwritten
briefings_df[emotion_cols] = briefings_df[emotion_cols].fillna(0)

briefings_df

Unnamed: 0,date,timestamp,speaker,text,tb_polarity,tb_subjectivity,v_compound_polarity,anger,anticipation,disgust,fear,joy,nrc_negative,nrc_positive,sadness,surprise,trust
0,2020-02-26,05:39,Donald Trump,Thank you very much everybody. Thank you very ...,0.078559,0.562093,0.7316,0.136364,0.136364,0.090909,0.181818,0.090909,0.272727,0.181818,0.136364,0.090909,0.090909
1,2020-02-26,06:59,Donald Trump,A lot of people thought we shouldn’t have done...,0.284714,0.431381,0.9510,0.055556,0.222222,0.111111,0.166667,0.055556,0.166667,0.222222,0.055556,0.055556,0.166667
2,2020-02-26,07:51,Donald Trump,We have a total of 15. We took in some from Ja...,0.221088,0.506516,0.9888,0.000000,0.130435,0.000000,0.043478,0.173913,0.000000,0.391304,0.043478,0.086957,0.217391
3,2020-02-26,09:58,Donald Trump,China you know about. Where it started. I spok...,-0.038796,0.439352,0.9124,0.142857,0.214286,0.000000,0.142857,0.214286,0.214286,0.357143,0.000000,0.071429,0.214286
4,2020-02-26,10:52,Donald Trump,"We’re bringing in a specialist, a very highly ...",0.036440,0.635832,-0.8626,0.000000,0.153846,0.000000,0.076923,0.076923,0.230769,0.153846,0.076923,0.000000,0.153846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9674,2020-04-27,01:01:23,Unnamed (Reporter),[crosstalk 00:13:23].,0.000000,0.000000,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9675,2020-04-27,01:01:24,Donald Trump,"Let’s do one more. Please, in the back.",0.250000,0.250000,0.3804,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9676,2020-04-27,01:01:26,Unnamed,If an American president loses more Americans ...,0.066667,0.200000,-0.8689,0.333333,0.333333,0.000000,0.000000,0.000000,0.000000,0.666667,0.000000,0.000000,0.666667
9677,2020-04-27,01:01:36,Donald Trump,"So yeah, we’ve lost a lot of people, but if yo...",0.241667,0.527083,0.9225,0.071429,0.142857,0.000000,0.071429,0.071429,0.142857,0.214286,0.071429,0.071429,0.071429


In [14]:
# Write the scored briefings frame to CSV, leaving out the row index.
scored_csv_path = "../data/scored_briefings.csv"
briefings_df.to_csv(scored_csv_path, index=False)