In [1]:
import os
import tqdm
import pandas as pd
import nltk
nltk.download(['stopwords', 'vader_lexicon'])
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords as CorpusStopWords

DATA_DIR = '../data_collection'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Serra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Serra\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
sia = SentimentIntensityAnalyzer()

def sentiment_analyzer(content):
    # Clean extra spaces
    content = ' '.join([w.strip() for w in content.split()])
    # Clean English stop words
    content = ' '.join([w for w in content.split() if w.lower() not in CorpusStopWords.words('english')])
    # Clean media URLs
    content = ' '.join([w for w in content.split() if not w.startswith('http')])
    # Clean unwanted characters
    content = ''.join([c for c in content if c.isalpha() or c.isnumeric() or c in ' .,!'])
    # Run sentiment analysis
    polarity = sia.polarity_scores(content)
    return polarity

# Example
print(sentiment_analyzer('This is a test, and it is awesome!'))

{'neg': 0.0, 'neu': 0.185, 'pos': 0.815, 'compound': 0.6588}


In [3]:
def add_sentiments(tweet_file):
    df = pd.read_csv(tweet_file, index_col=0)
    polarity_results = []
    for s in tqdm.tqdm(df['rawContent'], desc='Analyzing Tweet Contents', position=1, mininterval=10):
        polarity_results.append(sentiment_analyzer(s))
    df['compound'] = [p['compound'] for p in polarity_results]
    df['pos'] = [p['pos'] for p in polarity_results]
    df['neu'] = [p['neu'] for p in polarity_results]
    df['neg'] = [p['neg'] for p in polarity_results]
    return df

In [4]:
# Example file
new_df = add_sentiments(DATA_DIR + '/2020-02.csv')
new_df[['rawContent', 'pos', 'neu', 'neg', 'compound']].sample(10)


Analyzing Tweet Contents: 100%|█████████████████████████████████████████████████████| 141/141 [00:01<00:00, 132.49it/s][A


Unnamed: 0,rawContent,pos,neu,neg,compound
53,1. South Korea has posted its morning (Feb. 24...,0.0,1.0,0.0,0.0
74,UPDATE: MoH advises Kenyans against non-essent...,0.049,0.951,0.0,0.1027
24,🇨🇦 pleased to provide $2M to @WHO to help coun...,0.376,0.506,0.118,0.8481
58,Good question from @VOANews reporter asking ab...,0.131,0.633,0.235,-0.3182
46,Italians being Italians: took every kind of pa...,0.193,0.625,0.182,0.0516
35,South Korea is doing testing 🧪 correctly - com...,0.247,0.753,0.0,0.6786
122,"Unfortunately, it’s going to get much worse, b...",0.246,0.517,0.237,0.0516
126,Stop using this health emergency to “praise” t...,0.184,0.51,0.306,-0.4215
22,"Till 1800 (JST) today, no new #COVID-19 positi...",0.262,0.738,0.0,0.8176
75,It saddens me to hear that this year‘s Newroz ...,0.249,0.571,0.18,0.0772


In [5]:
# Iterate files and add sentiment
for filename in tqdm.tqdm(os.listdir(DATA_DIR), desc='Tweet files', position=0):
    if not filename.endswith('.csv'):
        continue
    full_path = os.path.join(DATA_DIR, filename)
    new_df = add_sentiments(full_path)
    save_path = os.path.join('data', filename)
    new_df.to_csv(save_path)

Tweet files:   0%|                                                                              | 0/42 [00:00<?, ?it/s]
Analyzing Tweet Contents: 100%|█████████████████████████████████████████████████████| 141/141 [00:01<00:00, 139.43it/s][A
Tweet files:   5%|███▎                                                                  | 2/42 [00:01<00:20,  1.94it/s]
Analyzing Tweet Contents:   0%|                                                               | 0/4601 [00:00<?, ?it/s][A
Analyzing Tweet Contents:  29%|██████████████▉                                    | 1351/4601 [00:10<00:24, 135.04it/s][A
Analyzing Tweet Contents:  59%|█████████████████████████████▉                     | 2702/4601 [00:20<00:14, 132.40it/s][A
Analyzing Tweet Contents: 100%|███████████████████████████████████████████████████| 4601/4601 [00:34<00:00, 132.52it/s][A
Tweet files:   7%|█████                                                                 | 3/42 [00:35<09:38, 14.83s/it]
Analyzing Tweet Contents:

Analyzing Tweet Contents: 100%|███████████████████████████████████████████████████| 1475/1475 [00:11<00:00, 124.91it/s][A
Tweet files:  50%|██████████████████████████████████▌                                  | 21/42 [06:33<05:28, 15.63s/it]
Analyzing Tweet Contents: 100%|███████████████████████████████████████████████████| 1147/1147 [00:09<00:00, 121.28it/s][A
Tweet files:  52%|████████████████████████████████████▏                                | 22/42 [06:42<04:35, 13.80s/it]
Analyzing Tweet Contents: 100%|███████████████████████████████████████████████████| 1156/1156 [00:09<00:00, 125.30it/s][A
Tweet files:  55%|█████████████████████████████████████▊                               | 23/42 [06:51<03:56, 12.44s/it]
Analyzing Tweet Contents:   0%|                                                               | 0/1907 [00:00<?, ?it/s][A
Analyzing Tweet Contents: 100%|███████████████████████████████████████████████████| 1907/1907 [00:15<00:00, 119.57it/s][A
Tweet files:  57%|███████