In [5]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics import classification_report
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
# Load the tweet sentiment data into a DataFrame.
# 'data.csv' is a relative path, so it is resolved against the notebook's working directory.
df = pd.read_csv('data.csv')

In [7]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [8]:
# Inspect column dtypes and non-null counts (reveals missing values per column).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [9]:
# Drop the textID and selected_text columns; only `text` and `sentiment` are kept.
# Reassigning (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns have already been removed.
df = df.drop(columns=['textID', 'selected_text'], errors='ignore')

In [10]:
# Confirm that only the `text` and `sentiment` columns remain after the drop.
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [11]:
# Class distribution of the sentiment labels (number of rows per label).
df["sentiment"].value_counts()

sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

In [12]:
# `text` contains a missing value (see the non-null counts from df.info()).
# A bare astype(str) would turn that NaN into the literal word "nan", which would
# then be tokenised and scored as if it were tweet content. Replace missing
# entries with an empty string first, then make sure every entry is a str.
df["text"] = df["text"].fillna("").astype(str)

In [13]:
# Load the spaCy pipeline once; run_vader() reuses this module-level object for
# every text it processes.
nlp = spacy.load("en_core_web_sm")

# Single shared VADER analyzer, also reused by run_vader().
# SentimentIntensityAnalyzer is imported in the import cell at the top of the notebook.
vader_model = SentimentIntensityAnalyzer()


def vader_output_to_label(vader_output, neutral_threshold=0.0):
    """
    Map a VADER output dict, e.g.,
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}
    to a sentiment label based on its 'compound' score:
    a) compound >  neutral_threshold       -> 'positive'
    b) compound < -neutral_threshold       -> 'negative'
    c) otherwise (|compound| <= threshold) -> 'neutral'

    With the default threshold of 0.0 only a compound score of exactly 0.0
    is labelled 'neutral'.

    :param dict vader_output: output dict from vader; must contain 'compound'
    :param float neutral_threshold: half-width of the band around 0.0 that is
    labelled 'neutral' (band is inclusive); must be >= 0

    :rtype: str
    :return: 'negative' | 'neutral' | 'positive'

    :raises KeyError: if vader_output has no 'compound' key
    :raises ValueError: if neutral_threshold is negative or the compound
    score is NaN
    """
    import math

    if neutral_threshold < 0:
        raise ValueError('neutral_threshold must be >= 0, got %r' % (neutral_threshold,))

    compound = vader_output['compound']

    # NaN fails every comparison below; without this guard the function would
    # silently return None instead of one of the three documented labels.
    if math.isnan(compound):
        raise ValueError('compound score is NaN')

    if compound > neutral_threshold:
        return 'positive'
    if compound < -neutral_threshold:
        return 'negative'
    return 'neutral'
    

def run_vader(textual_unit, 
              lemmatize=False, 
              parts_of_speech_to_consider=None,
              verbose=0):
    """
    Run VADER on a textual unit after processing it with spaCy.

    The text is tokenised with the module-level spaCy pipeline `nlp`; the
    selected tokens (or their lemmas) are joined with single spaces and the
    resulting string is scored by the module-level `vader_model`.

    :param str textual_unit: a textual unit, e.g., sentence, sentences (one string)
    :param bool lemmatize: If True, provide lemmas to VADER instead of words
    :param set parts_of_speech_to_consider:
    -None or empty set: all parts of speech are provided
    -non-empty set: only tokens whose spaCy `pos_` is in the set are provided
    :param int verbose: if >= 1, information is printed
    about input and output

    :rtype: dict
    :return: vader output dict
    """
    doc = nlp(textual_unit)

    input_to_vader = []

    # Iterate the tokens of the document directly. The sentences of a doc
    # partition its tokens, so this visits the same tokens in the same order as
    # looping over doc.sents, but does not require sentence boundaries to be set.
    for token in doc:

        # None or an empty set means "no part-of-speech filter".
        if parts_of_speech_to_consider and token.pos_ not in parts_of_speech_to_consider:
            continue

        to_add = token.text

        if lemmatize:
            to_add = token.lemma_

            # Some spaCy versions lemmatize pronouns to the placeholder
            # '-PRON-'; fall back to the surface form in that case.
            if to_add == '-PRON-':
                to_add = token.text

        input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))

    if verbose >= 1:
        print()
        # Print the full input. The previous code printed the leaked loop
        # variable `sent`, i.e. only the last sentence, and failed with an
        # UnboundLocalError when the text contained no sentence at all.
        print('INPUT SENTENCE', textual_unit)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores

In [14]:
# Settings
to_lemmatize = True  # pass lemmas rather than surface forms to VADER
pos = set()          # empty set: run_vader applies no part-of-speech filter

# Storage (parallel lists: entry k of each list belongs to the same tweet)
tweets = []
all_vader_output = []  # predicted labels ('negative' | 'neutral' | 'positive'), not raw scores
gold = []

# Apply VADER.
# Iterate the two needed columns directly: df.iterrows() builds a full Series
# object for every row, which is slow and not needed here.
for the_tweet, gold_label in tqdm(zip(df['text'], df['sentiment']),
                                  total=len(df),
                                  desc="Processing tweets"):
    vader_output = run_vader(the_tweet, lemmatize=to_lemmatize, parts_of_speech_to_consider=pos)
    vader_label = vader_output_to_label(vader_output)

    tweets.append(the_tweet)
    all_vader_output.append(vader_label)
    gold.append(gold_label)

# Evaluate
print(classification_report(gold, all_vader_output))

Processing tweets: 100%|██████████| 27481/27481 [05:26<00:00, 84.08it/s]


              precision    recall  f1-score   support

    negative       0.70      0.63      0.66      7781
     neutral       0.74      0.43      0.54     11118
    positive       0.55      0.89      0.68      8582

    accuracy                           0.63     27481
   macro avg       0.66      0.65      0.63     27481
weighted avg       0.67      0.63      0.62     27481



In [15]:
# Print one example at a specific index: the tweet text, the label predicted from
# VADER (all_vader_output stores labels, not the raw score dict), and the gold label.
tweet_index = 43
print(f"tweet: {tweets[tweet_index]}")
print(f"VADER output: {all_vader_output[tweet_index]}")
print(f"gold: {gold[tweet_index]}")

tweet: RATT ROCKED NASHVILLE TONITE..ONE THING SUCKED, NO ENCORE!  LIKE IN THE 80`S THEY STILL HAVE A FUN SHOW. PEARCY HAS THAT HOTT BAD BOY LOOK
VADER output: negative
gold: neutral
