In [1]:
import numpy as np
import pandas as pd

import textblob
from afinn import Afinn
import spacy
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn import metrics

from datetime import datetime

from IPython.core.display import HTML
import tba3102
import model_evaluation_utils as meu



# Inject CSS so <pre> output keeps its whitespace and is not wrapped
# (wide text reports stay aligned in the rendered notebook).
display(HTML("<style>pre { white-space: pre !important; }</style>"))
# Project helper from the local tba3102 module; presumably sets pandas display
# options -- its effect is not visible from this notebook.
tba3102.set_default_pandas_options()

# Shared AFINN scorer used by lexicon_sentiment_analysis; created with emoticons=True.
afn = Afinn(emoticons=True)
# Shared spaCy pipeline; analyze_sentiment_sentiwordnet_lexicon uses it for
# tokenization and POS tags.
nlp = spacy.load('en_core_web_sm')

In [2]:
def analyze_sentiment_sentiwordnet_lexicon(text, verbose=False):
    """Classify ``text`` as 'positive' or 'negative' using SentiWordNet.

    The text is tokenized and POS-tagged with the module-level spaCy pipeline
    ``nlp``. For every token whose tag contains 'NN', 'VB', 'JJ' or 'RB', the
    first SentiWordNet synset for the matching WordNet part of speech (if one
    exists) contributes its positive, negative and objectivity scores.

    Parameters
    ----------
    text : str
        Text to score.
    verbose : bool, default False
        When True, print the predicted label together with the per-token
        averages of the objectivity, positive, negative and overall scores.

    Returns
    -------
    str
        'positive' when the average (positive - negative) score over the
        scored tokens is >= 0 -- this includes text in which no token could
        be scored -- otherwise 'negative'.
    """
    # (substring looked for in the POS tag, WordNet POS code), tried in this order.
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    pos_score = neg_score = obj_score = 0
    token_count = 0

    for token in nlp(text):
        word, tag = token.text, token.tag_

        ss_set = None
        for tag_marker, wordnet_pos in tag_to_wordnet_pos:
            if tag_marker in tag:
                # Query SentiWordNet once per candidate POS and reuse the result.
                synsets = list(swn.senti_synsets(word, wordnet_pos))
                if synsets:
                    # Only the first (most common) sense is scored.
                    ss_set = synsets[0]
                    break

        # Tokens without a matching senti-synset do not influence the result.
        if ss_set is None:
            continue

        pos_score += ss_set.pos_score()
        neg_score += ss_set.neg_score()
        obj_score += ss_set.obj_score()
        token_count += 1

    def per_token(total):
        # With no scored tokens there is nothing to average over; report the
        # raw total (0) instead of dividing by zero.
        if token_count == 0:
            return total
        return round(float(total) / token_count, 2)

    # aggregate final scores
    norm_final_score = per_token(pos_score - neg_score)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'

    if verbose:
        print('SENTIMENT STATS:')
        print('Predicted Sentiment', final_sentiment)
        print('Objectivity', per_token(obj_score))
        print('Positive', per_token(pos_score))
        print('Negative', per_token(neg_score))
        print('Overall', norm_final_score)

    return final_sentiment



def analyze_sentiment_vader_lexicon(text, threshold=0.1, verbose=False):
    """Classify ``text`` as 'positive' or 'negative' using the VADER lexicon.

    Parameters
    ----------
    text : str
        Text to score.
    threshold : float, default 0.1
        Minimum VADER compound score for the text to be labelled 'positive'.
    verbose : bool, default False
        When True, print the predicted label, the rounded compound score and
        the positive / negative / neutral proportions as percentages.

    Returns
    -------
    str
        'positive' if the compound score is >= ``threshold``, else 'negative'.
    """
    # The analyzer's constructor loads the VADER lexicon, so build it once and
    # reuse it; this function is called once per document.
    analyzer = getattr(analyze_sentiment_vader_lexicon, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        analyze_sentiment_vader_lexicon._analyzer = analyzer

    # analyze the sentiment for text
    scores = analyzer.polarity_scores(text)

    # get aggregate scores and final sentiment
    agg_score = scores['compound']
    final_sentiment = 'positive' if agg_score >= threshold else 'negative'

    if verbose:

        def as_percent(proportion):
            # Fixed one-decimal formatting avoids float artefacts such as
            # '28.999999999999996%' from round(x, 2) * 100.
            return '{:.1f}%'.format(round(proportion, 2) * 100)

        # display detailed sentiment statistics
        print('SENTIMENT STATS:')
        print('Predicted Sentiment', final_sentiment)
        print('Polarity Score', round(agg_score, 2))
        print('Positive', as_percent(scores['pos']))
        print('Negative', as_percent(scores['neg']))
        print('Neutral', as_percent(scores['neu']))

    return final_sentiment



def lexicon_sentiment_analysis(dataset_name, df, column_name_text, column_name_actual_sentiment, lexicon):
    """Predict sentiment for every text in ``df`` with one lexicon and print the evaluation.

    Parameters
    ----------
    dataset_name : str
        Label used only in the printed header.
    df : pandas.DataFrame
        Frame holding the texts and the true sentiment labels.
    column_name_text : str
        Name of the column containing the text to score.
    column_name_actual_sentiment : str
        Name of the column containing the true labels, which are compared
        against the predicted 'positive' / 'negative' strings.
    lexicon : str
        One of 'textblob', 'afinn', 'sentiwordnet' or 'vader'.

    Raises
    ------
    ValueError
        If ``lexicon`` is not one of the supported names.

    Returns
    -------
    None
        Metrics, the classification report and the confusion matrix are printed.
    """
    supported_lexicons = ('textblob', 'afinn', 'sentiwordnet', 'vader')

    # Fail fast: an unknown lexicon would otherwise leave the predictions
    # empty and surface later as an unrelated error inside the metrics code.
    if lexicon not in supported_lexicons:
        raise ValueError('Unsupported lexicon {!r}; expected one of {}'.format(
            lexicon, ', '.join(supported_lexicons)))

    texts = df[column_name_text].array
    true_labels = df[column_name_actual_sentiment].array
    class_labels = ['positive', 'negative']

    print('Lexicon-based Sentiment Analysis: {} with {}'.format(dataset_name, lexicon))
    print('-' * 80)

    if lexicon == 'textblob':

        # TextBlob polarity at or above 0.1 counts as positive.
        sentiment_polarity = [textblob.TextBlob(text).sentiment.polarity for text in texts]
        predicted_sentiments = ['positive' if score >= 0.1 else 'negative' for score in sentiment_polarity]

    elif lexicon == 'afinn':

        # AFINN score at or above 1.0 counts as positive.
        sentiment_polarity = [afn.score(text) for text in texts]
        predicted_sentiments = ['positive' if score >= 1.0 else 'negative' for score in sentiment_polarity]

    elif lexicon == 'sentiwordnet':

        predicted_sentiments = [analyze_sentiment_sentiwordnet_lexicon(text, verbose=False) for text in texts]

    else:  # 'vader' -- the only remaining supported lexicon

        predicted_sentiments = [analyze_sentiment_vader_lexicon(text, verbose=False) for text in texts]

    meu.get_metrics(true_labels=true_labels, predicted_labels=predicted_sentiments)
    meu.display_classification_report(true_labels=true_labels, predicted_labels=predicted_sentiments,
                                      classes=class_labels)
    print(metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_sentiments, labels=class_labels))
    print('=' * 80)

In [3]:
# Log the wall-clock start time; the last cell prints the matching end time,
# so the total run time can be read off the two outputs.
print('Text processing started at {}'.format(datetime.now()))

Text processing started at 2023-04-04 20:39:38.621019


In [4]:
# Variants of the cleaned training set and the lexicons to evaluate on each.
datasets = ['without_emoticon', 'with_emoticon', 'with_emoticon_nospelling_nolemma']
lexicons = ['textblob', 'afinn', 'sentiwordnet', 'vader']

# Numeric polarity codes in the CSV -> class labels produced by the lexicon classifiers.
polarity_labels = {0: 'negative', 4: 'positive'}

for dataset in datasets:

    dataset_filename = 'training_16000_cleaned_' + dataset + '.csv'

    df = pd.read_csv('../data/' + dataset_filename, encoding='ISO-8859-1')

    # Build the label column in one pass. Writing strings into the integer
    # column through .loc relies on silent dtype upcasting, which pandas >= 2.1
    # deprecates (FutureWarning). Codes without a mapping are left unchanged.
    df['polarity'] = df['polarity'].map(lambda code: polarity_labels.get(code, code))

    for lexicon in lexicons:

        lexicon_sentiment_analysis(dataset, df, 'cleaned_tweet', 'polarity', lexicon)

Lexicon-based Sentiment Analysis: without_emoticon with textblob
--------------------------------------------------------------------------------
Accuracy: 0.6214
Precision: 0.6303
Recall: 0.6214
F1 Score: 0.6149
              precision    recall  f1-score   support

    positive       0.66      0.49      0.56      7998
    negative       0.60      0.75      0.67      7996

    accuracy                           0.62     15994
   macro avg       0.63      0.62      0.61     15994
weighted avg       0.63      0.62      0.61     15994

[[3927 4071]
 [1984 6012]]
Lexicon-based Sentiment Analysis: without_emoticon with afinn
--------------------------------------------------------------------------------
Accuracy: 0.6351
Precision: 0.6387
Recall: 0.6351
F1 Score: 0.6328
              precision    recall  f1-score   support

    positive       0.66      0.56      0.60      7998
    negative       0.62      0.71      0.66      7996

    accuracy                           0.64     15994
   ma

In [5]:
# Log the wall-clock end time; compare with the start time printed before the
# evaluation loop to get the total run time.
print('Text processing ended at {}'.format(datetime.now()))

Text processing ended at 2023-04-04 20:47:10.240434
