
## Sentiment and Emotion Analysis of COVID-19 Related Tweets Using NLP

This notebook analyzes the sentiment and emotion of COVID-19 related tweets collected via Twitter streams. Natural Language Processing (NLP) techniques are used to preprocess the collected tweets and then score them using ML models. 

The sentiment and emotion scores of the tweets are then averaged over each set of 20 tweets, and the resulting averages are written to the file `avg_labels.gzip`. This approach allows for an overall assessment of the sentiment and emotion trends related to COVID-19 on Twitter.

In [4]:
# utilities
import re
import numpy as np
import pandas as pd
import json
import pandas as pd
import gzip, pickle
import os
import nltk
import re
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from functions import preprocess, writeFile

# Fetch the NLTK resources used by this notebook. Each call is effectively a
# no-op when the package is already present locally (it just reports
# "already up-to-date"); the last call's return value is what the cell displays.
# 'vader_lexicon' is the lexicon behind SentimentIntensityAnalyzer (imported above).
# NOTE(review): 'wordnet', 'stopwords' and 'punkt' are presumably required by the
# project-local `preprocess` helper -- TODO confirm against functions.py.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/christiedjidjev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christiedjidjev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/christiedjidjev/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/christiedjidjev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# function taken from https://github.com/mhoikka/twitsent/blob/main/twitsent/src/twitsent/parse_sentiment.py
def parse(interval_lists, analyzer=None):
    """
    Convert text into VADER compound sentiment scores, preserving grouping.

    Every element of every inner list is passed unchanged to the analyzer's
    ``polarity_scores`` method and the ``'compound'`` entry of the result is
    kept. No stop-word filtering or other preprocessing is applied here.

    Parameters
    --------
    interval_lists : list of lists
        Cleaned tweet text grouped by the time interval of data collection
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` key. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created. Pass a shared instance when
        calling this function repeatedly so it is not rebuilt on every call.

    Returns
    --------
    all_sentiment : list of lists
        Sentiment score data for each tweet in interval_lists, grouped
        correspondingly by time interval (an empty interval yields an
        empty list)

    Raises
    --------
    Whatever ``SentimentIntensityAnalyzer()`` raises when it cannot be
    constructed (only when ``analyzer`` is not supplied).
    """
    if analyzer is None:
        # initialize the default sentiment analysis tool
        analyzer = SentimentIntensityAnalyzer()

    # Build a fresh list per interval so the result never aliases the input.
    return [
        [analyzer.polarity_scores(tweet)['compound'] for tweet in interval]
        for interval in interval_lists
    ]

In [6]:
# --- Configuration ----------------------------------------------------------
# NOTE(review): both paths are absolute and machine-specific; consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
fileName = '/Users/christiedjidjev/Library/CloudStorage/OneDrive-Personal/Classes/Twitter Sentiment/training.txt'
folder_path = '/Users/christiedjidjev/Library/CloudStorage/OneDrive-Personal/Classes/Twitter Sentiment/twitter data by 20 covid/'

# Which trained classifier to use for each emotion label. Defined once here
# because it does not change from file to file.
best_method = {'anger':'SVCmodel', 'anticipation':'SVCmodel', 'fear':'BernoulliNB','joy':'SVCmodel',\
            'love':'SVCmodel','optimism':'log_regression','pessimism':'BernoulliNB'}
emotions = list(best_method.keys())

# One list per output series; each processed file appends exactly one average.
avg_labels = {em:[] for em in emotions}
avg_labels['positive_sentiment'] = []
avg_labels['negative_sentiment'] = []

# loads a trained model and a vectorizer from a pickle file
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with gzip.GzipFile(fileName, 'rb') as model_file:
    trained_models, vectoriser = pickle.load(model_file)


def _read_tweet_records(path):
    """Return the 'data' payload of every JSON object stored in ``path``.

    Lines are concatenated until a line consisting solely of "}" is seen, at
    which point the accumulated text is decoded as JSON. If decoding fails the
    text is kept and accumulation continues (the object may simply not be
    complete yet). After any successful decode the buffer is cleared, so an
    object without a 'data' key is skipped on its own instead of corrupting
    every record that follows it.
    """
    records = []
    buffer = ""
    # NOTE(review): the files are read as ISO-8859-1; confirm this matches how
    # they were written, otherwise non-ASCII characters will be mis-decoded.
    with open(path, 'r', encoding= "ISO-8859-1") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            buffer += line
            if line != "}":
                continue
            try:
                record = json.loads(buffer)
            except json.JSONDecodeError:
                # Not (yet) a complete JSON document: keep accumulating.
                continue
            buffer = ""
            if 'data' in record:
                records.append(record['data'])
    return records


# --- Per-file processing ----------------------------------------------------
# os.listdir returns names in arbitrary order; sort so the order of the
# appended averages is reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    f = os.path.join(folder_path, filename)
    # Skip hidden files such as '.DS_Store' and anything that is not a file.
    if filename.startswith('.') or not os.path.isfile(f):
        continue

    data = _read_tweet_records(f)
    if not data:
        # Nothing to average; appending would require dividing by zero.
        print(f"Skipping {filename}: no tweet records could be parsed")
        continue

    df_20 = pd.DataFrame(data)

    # Work on a copy of the text column, lower-cased before preprocessing.
    dataset = df_20[['text']].copy()
    dataset['text'] = dataset['text'].str.lower()

    dataset = preprocess(dataset)

    # putting words back into a single string
    # (assumes preprocess leaves an iterable of tokens in 'text' -- TODO confirm)
    dataset['text'] = dataset['text'].apply(lambda x: ' '.join(x))

    # Positional extraction: does not depend on the index labels that
    # preprocess returns.
    tweet_texts = dataset['text'].tolist()
    if not tweet_texts:
        print(f"Skipping {filename}: no tweets left after preprocessing")
        continue

    X = vectoriser.transform(dataset['text'])

    # Emotion labels predicted by the selected classifier for each emotion.
    labels = {em: trained_models[em][best_method[em]].predict(X) for em in emotions}

    # NOTE(review): each tweet is split into words and handed to parse() as if
    # it were an "interval", so the analyzer scores every word on its own and
    # the per-tweet value below is the mean of those word scores. Scoring the
    # whole tweet string instead would give different numbers; confirm which
    # behaviour is intended before changing it.
    sentiment_list = parse([text.split() for text in tweet_texts])

    # mean score per tweet, using zero for tweets that produced no scores
    avg_sent = [
        sum(scores) / len(scores) if len(scores) != 0 else 0
        for scores in sentiment_list
    ]

    # Fraction of tweets in this file flagged with each emotion.
    for em, predicted in labels.items():
        avg_labels[em].append(sum(int(x) for x in predicted) / len(predicted))

    # Split the per-tweet means into positive-only and negative-only series
    # (the other sign contributes 0) and average each over the file.
    sentiment_pos = [s if s > 0 else 0 for s in avg_sent]
    sentiment_neg = [s if s < 0 else 0 for s in avg_sent]

    avg_labels['positive_sentiment'].append(sum(sentiment_pos)/len(sentiment_pos))
    avg_labels['negative_sentiment'].append(sum(sentiment_neg)/len(sentiment_neg))

writeFile('avg_labels.gzip', avg_labels)