In [31]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import ssl
from textblob import TextBlob
import numpy as np

In [2]:
# One-time setup: download the VADER lexicon that SentimentIntensityAnalyzer needs.
# The ssl override works around certificate-verification failures during the download.
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download('vader_lexicon')

In [25]:
# Read the labelled dataset and keep only the text column and its sentiment label.
all_data = pd.read_csv("full_sentiment_dataset.csv").loc[:, ['text', 'sentiment']]
all_data

Unnamed: 0,text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,neutral
1,Oh! Good idea about putting them on ice cream,positive
2,says good (or should i say bad?) afternoon! h...,neutral
3,i dont think you can vote anymore! i tried,negative
4,haha better drunken tweeting you mean?,positive
...,...,...
57622,Those** PICK UP THE SLACK YOU FUCK BOYS @Apple,negative
57623,Finally got my iPhone 6 in the mail and it com...,negative
57624,@umo_games @Apple ended up getting a new compu...,neutral
57625,The 19-Year-Old #WizKid Who Turned Down @Apple...,neutral


In [26]:
# First 1000 rows of the dataset, for quick experiments.
short_data = all_data.head(1000)
short_data

Unnamed: 0,text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,neutral
1,Oh! Good idea about putting them on ice cream,positive
2,says good (or should i say bad?) afternoon! h...,neutral
3,i dont think you can vote anymore! i tried,negative
4,haha better drunken tweeting you mean?,positive
...,...,...
995,is gutted she isnt going out tonight,negative
996,_Punk_Robot wtf.....winter isn't due til mond...,negative
997,"So I got up, went outside planted a few flower...",positive
998,this week of mine was not easy! but finally i...,negative


In [75]:
def get_text_sentiment(text, algo, threshold=0.1):
    """
    Gets the sentiment of input text using either VADER or TextBlob.

    Parameters
    ----------
    text : str
        The text to classify.
    algo : str
        'vader' scores the text with VADER's compound score. Any other
        value (e.g. 'textblob') scores it with TextBlob's polarity.
    threshold : float, optional
        Half-width of the neutral band around zero (default 0.1).

    Returns
    -------
    str
        'negative' if score < -threshold, 'positive' if score > threshold,
        otherwise 'neutral'.
    """
    # get the score
    if algo == 'vader':
        # Constructing a SentimentIntensityAnalyzer loads and parses the VADER
        # lexicon, so build it once and reuse it instead of paying that cost
        # on every call (this function is called once per row).
        analyzer = getattr(get_text_sentiment, '_vader_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            get_text_sentiment._vader_analyzer = analyzer
        score = analyzer.polarity_scores(text)['compound']
    else:
        score = TextBlob(text).sentiment.polarity

    # format it as a label
    if score < -threshold:
        return 'negative'
    if score > threshold:
        return 'positive'
    return 'neutral'

In [60]:
def total_disagreements(original_label, textblob_label, vader_label):
    """
    Counts how many of the two predicted labels differ from the original label.

    Returns 0 when both predictions match, 1 when exactly one differs,
    and 2 when both differ.
    """
    predictions = (textblob_label, vader_label)
    return sum(1 for predicted in predictions if original_label != predicted)

In [29]:
# Counters for rows where 0, 1 or 2 of the classifiers disagree with the dataset label.
num_0_disagree = 0; num_1_disagree = 0; num_2_disagree = 0

disagreed = []  # (text, label) pairs where exactly one classifier disagreed
num_printed = 0  # number of one-disagreement rows seen; only the first 10 are printed

total_rows = len(all_data)
for i, row in all_data.iterrows():

    try:
        # log progress every 1000 rows
        if (i + 1) % 1000 == 0:
            print("row {} of {} reached".format(i + 1, total_rows))

        text = row['text']; label = row['sentiment'] # get the data & label
        vader_pred = get_text_sentiment(text, 'vader') # get pred for VADER
        textblob_pred = get_text_sentiment(text, 'textblob') # get pred for TextBlob

        # get the statistics we need
        total_disagrees = total_disagreements(label, textblob_pred, vader_pred)
        if total_disagrees == 0:
            num_0_disagree += 1
        elif total_disagrees == 1:
            num_1_disagree += 1
            disagreed.append((text, label))
            if num_printed < 10:
                print(text, label)
            num_printed += 1
        else:
            num_2_disagree += 1

    except Exception as exc:
        # Skip rows that cannot be scored, but report why. Catching Exception
        # (not a bare except) lets KeyboardInterrupt still stop this long loop.
        print("error on row {}: {!r}".format(i, exc))

says good (or should i say bad?) afternoon!  http://plurk.com/p/wxpdj neutral
 haha better drunken tweeting you mean? positive
had an awsome salad! I recommend getting the Spicey buffalo chicken salad! positive
 Thank a yoou  how are you? #TwitterTakeover positive
so i have like no more friends it's kinda sad negative
i have perused the #fieldnotes website and it is good.  too bad i must return to work neutral
Also I popped the phone open and got all that goddamn dust out, but I wore out a clip on the camera panel so I had to glue it shut negative
 aaawww  no worries fresh start to work on growing it out again positive
is sooo tired and too busy to tweet  im glad the weekend is here... yay 4 day-weekend positive
 oh he is so cute... is he in uniteddogs.com? Poppy is there positive
row 999 of 57626 reached
row 1999 of 57626 reached
row 2999 of 57626 reached
row 3999 of 57626 reached
row 4999 of 57626 reached
row 5999 of 57626 reached
row 6999 of 57626 reached
row 7999 of 57626 reached
r

In [30]:
# Raw counts of rows where 0, 1 and 2 classifiers disagree with the original label.
disagreement_counts = (num_0_disagree, num_1_disagree, num_2_disagree)
print(*disagreement_counts)

total = sum(disagreement_counts)

# Fractions of the total, printed in order:
#   1. both VADER & TextBlob agree w/ original label
#   2. exactly one of VADER / TextBlob disagrees w/ original label
#   3. both VADER & TextBlob disagree w/ original label
for count in disagreement_counts:
    print(count / total)

21187 17773 18666
0.3676639017110332
0.30841981050220385
0.3239162877867629


In [76]:
# randomly select 1000 rows (sampled with replacement) and recompute the counts
num_0_disagree = 0; num_1_disagree = 0; num_2_disagree = 0
# randint's `high` bound is exclusive, so len(all_data) keeps every drawn
# position valid for .iloc (a hard-coded bound past the length can raise IndexError).
rand_indices = np.random.randint(low=0, high=len(all_data), size=1000, dtype=int)
for i in rand_indices:
    row = all_data.iloc[i]
    text = row['text']; label = row['sentiment'] # get the data & label
    vader_pred = get_text_sentiment(text, 'vader') # get pred for VADER
    textblob_pred = get_text_sentiment(text, 'textblob') # get pred for TextBlob

    # get the statistics we need
    total_disagrees = total_disagreements(label, textblob_pred, vader_pred)
    if total_disagrees == 0:
        num_0_disagree += 1
    elif total_disagrees == 1:
        num_1_disagree += 1
        # NOTE: `disagreed` and `num_printed` are not reset in this cell; they
        # continue from whatever values earlier cells left in them.
        disagreed.append((text, label))
        if num_printed < 10:
            print(text, label)
        num_printed += 1
    else:
        num_2_disagree += 1

In [77]:
# Raw counts of rows where 0, 1 and 2 classifiers disagree with the original label.
disagreement_counts = (num_0_disagree, num_1_disagree, num_2_disagree)
print(*disagreement_counts)

total = sum(disagreement_counts)

# Fractions of the total, printed in order:
#   1. both VADER & TextBlob agree w/ original label
#   2. exactly one of VADER / TextBlob disagrees w/ original label
#   3. both VADER & TextBlob disagree w/ original label
for count in disagreement_counts:
    print(count / total)

380 314 306
0.38
0.314
0.306
