In [1]:
import os
import tqdm
import pandas as pd
import nltk
nltk.download(['stopwords', 'vader_lexicon'])
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords as CorpusStopWords

# Directory the tweet CSV files are read from (relative path); used by the
# example cell and by the batch loop further down.
DATA_DIR = '../data_collection'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Serra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Serra\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Single shared analyzer instance, created once here and reused by
# sentiment_analyzer() for every text it scores.
sia = SentimentIntensityAnalyzer()

# English stop words, materialised once. Calling CorpusStopWords.words('english')
# inside the per-token filter rebuilt the word list for every single token and
# then scanned it linearly; a frozenset gives constant-time membership tests.
_ENGLISH_STOP_WORDS = frozenset(CorpusStopWords.words('english'))


def sentiment_analyzer(content):
    """Clean a piece of text and return its polarity scores.

    Cleaning steps, in order:
      1. split on whitespace (this also collapses repeated spaces),
      2. drop English stop words (case-insensitive) and tokens that start
         with ``http`` (media/URL links),
      3. keep only letters, digits, spaces and the characters ``.,!``.

    Parameters
    ----------
    content : str
        Raw text to score. ``None`` and NaN (e.g. an empty CSV cell read by
        pandas) are treated as empty text; any other non-string value is
        converted with ``str()``.

    Returns
    -------
    dict
        The result of ``sia.polarity_scores`` for the cleaned text, with the
        keys ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'``.
    """
    # Missing values arrive from pandas as float NaN (NaN != NaN); without this
    # guard ``content.split()`` raises AttributeError.
    if content is None or (isinstance(content, float) and content != content):
        content = ''
    elif not isinstance(content, str):
        content = str(content)

    # NOTE(review): the stop-word list presumably contains negations such as
    # "not" / "no"; removing them before scoring may change the polarity of
    # negated phrases -- confirm this is intended.
    # Both token filters are independent, so one pass is equivalent to the
    # original separate stop-word and URL passes.
    kept_tokens = [
        w for w in content.split()
        if w.lower() not in _ENGLISH_STOP_WORDS and not w.startswith('http')
    ]
    content = ' '.join(kept_tokens)

    # Clean unwanted characters (hashtag marks, emoji, other punctuation)
    content = ''.join([c for c in content if c.isalpha() or c.isnumeric() or c in ' .,!'])

    # Run sentiment analysis
    return sia.polarity_scores(content)

# Example: score one short sentence and print the resulting polarity dict
print(sentiment_analyzer('This is a test, and it is awesome!'))

{'neg': 0.0, 'neu': 0.185, 'pos': 0.815, 'compound': 0.6588}


In [3]:
def add_sentiments(tweet_file):
    """Load a tweet CSV and append one column per polarity score.

    The file is read with its first column as the index. Every value of the
    ``rawContent`` column is passed through ``sentiment_analyzer`` (with a
    nested progress bar), and the resulting scores are stored in the new
    columns ``compound``, ``pos``, ``neu`` and ``neg``, in that order.

    Parameters
    ----------
    tweet_file : path to the CSV file to read.

    Returns
    -------
    The loaded DataFrame with the four score columns added.
    """
    tweets = pd.read_csv(tweet_file, index_col=0)

    # position=1 keeps this bar below an outer bar; mininterval throttles redraws
    progress = tqdm.tqdm(tweets['rawContent'], desc='Analyzing Tweet Contents', position=1, mininterval=10)
    scores = [sentiment_analyzer(text) for text in progress]

    # Same column order as before: compound first, then pos / neu / neg
    for key in ('compound', 'pos', 'neu', 'neg'):
        tweets[key] = [score[key] for score in scores]
    return tweets

In [4]:
# Example file: run the pipeline on a single month and preview the result.
# The path is built with os.path.join, matching the batch loop further down.
new_df = add_sentiments(os.path.join(DATA_DIR, '2020-02.csv'))
# Fixed seed so re-running the cell shows the same rows; the size is capped so
# a file with fewer than ten tweets does not make sample() raise.
new_df[['rawContent', 'pos', 'neu', 'neg', 'compound']].sample(min(10, len(new_df)), random_state=42)


Analyzing Tweet Contents:   0%|                                                                                                      | 0/4863 [00:00<?, ?it/s][A
Analyzing Tweet Contents:  37%|█████████████████████████████████▌                                                        | 1811/4863 [00:10<00:16, 181.06it/s][A
Analyzing Tweet Contents: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 4863/4863 [00:27<00:00, 177.60it/s][A


Unnamed: 0,rawContent,pos,neu,neg,compound
2951,these countries in #africa are more vulnerable...,0.0,0.84,0.16,-0.2263
4573,Cooles Dashboard #Coronavirus #COVID -19 (Mobi...,0.0,1.0,0.0,0.0
2654,these are the #iran elections in a time of unc...,0.124,0.783,0.092,0.1779
3715,#COVIDー19 #COVID19 #COVID #COVID2019 #coronavi...,0.0,0.758,0.242,-0.4939
2945,https://t.co/v0sUZuYYzO 3M 8233 N100 Disposabl...,0.095,0.905,0.0,0.3182
1490,#COVID19 cases are climbing across the globe. ...,0.224,0.776,0.0,0.3818
2659,What progress in 6 weeks - super impressive #C...,0.714,0.286,0.0,0.875
3307,When Politics and #COVID #flu #pandemic #influ...,0.0,0.506,0.494,-0.4404
7,"Make this happen soonest, please, for humanity...",0.277,0.723,0.0,0.3182
466,"In this blog, we introduce our Novel #Coronavi...",0.103,0.897,0.0,0.3182


In [5]:
# Iterate files and add sentiment
OUTPUT_DIR = 'data'
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before the (long) loop starts.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Filter first so the progress-bar total counts only the CSV files that are
# actually processed; sort because os.listdir returns entries in arbitrary order.
csv_files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.csv'))

for filename in tqdm.tqdm(csv_files, desc='Tweet files', position=0):
    full_path = os.path.join(DATA_DIR, filename)
    new_df = add_sentiments(full_path)
    # Written under the same file name, into the output directory
    save_path = os.path.join(OUTPUT_DIR, filename)
    new_df.to_csv(save_path)

Tweet files:   0%|                                                                                                                     | 0/42 [00:00<?, ?it/s]
Analyzing Tweet Contents:   0%|                                                                                                      | 0/4863 [00:00<?, ?it/s][A
Analyzing Tweet Contents:  36%|████████████████████████████████                                                          | 1733/4863 [00:10<00:18, 173.28it/s][A
Analyzing Tweet Contents: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 4863/4863 [00:28<00:00, 168.78it/s][A
Tweet files:   5%|█████▏                                                                                                       | 2/42 [00:28<09:39, 14.48s/it]
Analyzing Tweet Contents:   0%|                                                                                                      | 0/7936 [00:00<?, ?it/s][A
Analyzing Tweet Contents:  22%|███

Analyzing Tweet Contents:  63%|█████████████████████████████████████████████████████████                                 | 5036/7936 [00:30<00:17, 165.25it/s][A
Analyzing Tweet Contents: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 7936/7936 [00:47<00:00, 165.41it/s][A
Tweet files:  24%|█████████████████████████▋                                                                                  | 10/42 [06:43<24:40, 46.26s/it]
Analyzing Tweet Contents:   0%|                                                                                                      | 0/7680 [00:00<?, ?it/s][A
Analyzing Tweet Contents:  22%|███████████████████▍                                                                      | 1658/7680 [00:10<00:36, 165.78it/s][A
Analyzing Tweet Contents:  43%|██████████████████████████████████████▊                                                   | 3316/7680 [00:20<00:26, 165.08it/s][A
Analyzing Tweet Contents:  65%|

Tweet files:  43%|██████████████████████████████████████████████▎                                                             | 18/42 [12:59<18:47, 46.96s/it]
Analyzing Tweet Contents:   0%|                                                                                                      | 0/7936 [00:00<?, ?it/s][A
Analyzing Tweet Contents:  19%|█████████████████▍                                                                        | 1537/7936 [00:10<00:41, 153.63it/s][A
Analyzing Tweet Contents:  39%|██████████████████████████████████▊                                                       | 3074/7936 [00:20<00:32, 150.41it/s][A
Analyzing Tweet Contents:  58%|███████████████████████████████████████████████████▉                                      | 4583/7936 [00:30<00:22, 150.56it/s][A
Analyzing Tweet Contents:  77%|█████████████████████████████████████████████████████████████████████                     | 6091/7936 [00:40<00:12, 148.98it/s][A
Analyzing Tweet Contents: 100%|

Analyzing Tweet Contents:  67%|████████████████████████████████████████████████████████████                              | 4611/6915 [00:30<00:15, 151.10it/s][A
Analyzing Tweet Contents: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 6915/6915 [00:45<00:00, 153.21it/s][A
Tweet files:  62%|██████████████████████████████████████████████████████████████████▊                                         | 26/42 [19:38<12:57, 48.61s/it]
Analyzing Tweet Contents:   0%|                                                                                                      | 0/7936 [00:00<?, ?it/s][A
Analyzing Tweet Contents:  19%|█████████████████                                                                         | 1505/7936 [00:10<00:42, 150.50it/s][A
Analyzing Tweet Contents:  39%|██████████████████████████████████▉                                                       | 3084/7936 [00:20<00:31, 154.85it/s][A
Analyzing Tweet Contents:  59%|

Tweet files:  79%|████████████████████████████████████████████████████████████████████████████████████▊                       | 33/42 [25:43<07:46, 51.86s/it]
Analyzing Tweet Contents:   0%|                                                                                                      | 0/7880 [00:00<?, ?it/s][A
Analyzing Tweet Contents:  19%|████████████████▊                                                                         | 1470/7880 [00:10<00:43, 147.00it/s][A
Analyzing Tweet Contents:  38%|█████████████████████████████████▊                                                        | 2955/7880 [00:20<00:33, 147.88it/s][A
Analyzing Tweet Contents:  57%|███████████████████████████████████████████████████▏                                      | 4483/7880 [00:30<00:22, 150.12it/s][A
Analyzing Tweet Contents:  76%|████████████████████████████████████████████████████████████████████▋                     | 6011/7880 [00:40<00:12, 150.72it/s][A
Analyzing Tweet Contents: 100%|

Tweet files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [32:19<00:00, 46.17s/it]
