This notebook removes the data samples that we decided were "bad", e.g. both TextBlob and VADER disagreed with the original label.

In [44]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import ssl
from textblob import TextBlob
import numpy as np

In [45]:
all_data = pd.read_csv("full_sentiment_dataset.csv") # data without the rows removed
all_data = all_data[['text', 'sentiment']]
all_data

Unnamed: 0,text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,neutral
1,Oh! Good idea about putting them on ice cream,positive
2,says good (or should i say bad?) afternoon! h...,neutral
3,i dont think you can vote anymore! i tried,negative
4,haha better drunken tweeting you mean?,positive
...,...,...
57622,Those** PICK UP THE SLACK YOU FUCK BOYS @Apple,negative
57623,Finally got my iPhone 6 in the mail and it com...,negative
57624,@umo_games @Apple ended up getting a new compu...,neutral
57625,The 19-Year-Old #WizKid Who Turned Down @Apple...,neutral


In [46]:
def get_text_sentiment(text, algo):
    """
    Gets the sentiment of input text using either VADER or TextBlob.
    """
    # get the score
    score = None # variable for the output score
    if algo == 'vader':
        analyzer = SentimentIntensityAnalyzer()
        score = analyzer.polarity_scores(text)['compound']
    else:
        analyzer = TextBlob(text)
        score = analyzer.sentiment.polarity
    
    # format it as a label
    threshold = 0.15
    if score < -threshold:
        return 'negative'
    elif score > threshold:
        return 'positive'
    else:
        return 'neutral'

In [47]:
def total_disagreements(original_label, textblob_label, vader_label):
    total = 0
    if original_label != textblob_label:
        total += 1
    if original_label != vader_label:
        total += 1
    return total

In [48]:
num_0_disagree = 0; num_1_disagree = 0; num_2_disagree = 0

for i, (index_label, row) in enumerate(all_data.iterrows()):
    
    try:
        # log progress
        if (i + 1) % 1000 == 0:
            print("row {} of 57626 reached".format(i))

        text = row['text']; label = row['sentiment'] # get the data & label
        vader_pred = get_text_sentiment(text, 'vader') # get pred for VADER
        textblob_pred = get_text_sentiment(text, 'textblob') # get pred for TextBlob

        # get the statistics we need
        total_disagrees = total_disagreements(label, textblob_pred, vader_pred)
        if total_disagrees == 0: # keep it in the data frame
            num_0_disagree += 1
        elif total_disagrees == 1: # keep in DF
            num_1_disagree += 1
        else: # remove from DF
            num_2_disagree += 1
            all_data.drop(index_label, inplace=True)
            
    except Exception as e:
        print("error on row {}: {}".format(i, e))

row 999 of 57626 reached
row 1999 of 57626 reached
row 2999 of 57626 reached
row 3999 of 57626 reached
row 4999 of 57626 reached
row 5999 of 57626 reached
row 6999 of 57626 reached
row 7999 of 57626 reached
row 8999 of 57626 reached
row 9999 of 57626 reached
row 10999 of 57626 reached
row 11999 of 57626 reached
row 12999 of 57626 reached
error on row 13133: 'float' object has no attribute 'encode'
row 13999 of 57626 reached
row 14999 of 57626 reached
row 15999 of 57626 reached
row 16999 of 57626 reached
row 17999 of 57626 reached
row 18999 of 57626 reached
row 19999 of 57626 reached
row 20999 of 57626 reached
row 21999 of 57626 reached
row 22999 of 57626 reached
row 23999 of 57626 reached
row 24999 of 57626 reached
row 25999 of 57626 reached
row 26999 of 57626 reached
row 27999 of 57626 reached
row 28999 of 57626 reached
row 29999 of 57626 reached
row 30999 of 57626 reached
row 31999 of 57626 reached
row 32999 of 57626 reached
row 33999 of 57626 reached
row 34999 of 57626 reached
row 3

In [49]:
print(num_0_disagree, num_1_disagree, num_2_disagree)

21187 17773 18666


In [50]:
all_data

Unnamed: 0,text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,neutral
1,Oh! Good idea about putting them on ice cream,positive
2,says good (or should i say bad?) afternoon! h...,neutral
4,haha better drunken tweeting you mean?,positive
6,had an awsome salad! I recommend getting the S...,positive
...,...,...
57618,RT @toricolelli: My phones been charging for a...,negative
57621,'@WhoaBiebz: GET YOUR SHIT TOGETHER OR I'LL GU...,negative
57622,Those** PICK UP THE SLACK YOU FUCK BOYS @Apple,negative
57624,@umo_games @Apple ended up getting a new compu...,neutral


In [51]:
# list of strings
lst = ['Geeks', 'For', 'Geeks', 'is',
            'portal', 'for', 'Geeks']
 
# Calling DataFrame constructor on list
df = pd.DataFrame(lst)
print(df)

        0
0   Geeks
1     For
2   Geeks
3      is
4  portal
5     for
6   Geeks


In [52]:
df.drop(0, inplace=True)
df

Unnamed: 0,0
1,For
2,Geeks
3,is
4,portal
5,for
6,Geeks


In [53]:
df.drop(1, inplace=True)

In [54]:
df

Unnamed: 0,0
2,Geeks
3,is
4,portal
5,for
6,Geeks


In [55]:
all_data

Unnamed: 0,text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,neutral
1,Oh! Good idea about putting them on ice cream,positive
2,says good (or should i say bad?) afternoon! h...,neutral
4,haha better drunken tweeting you mean?,positive
6,had an awsome salad! I recommend getting the S...,positive
...,...,...
57618,RT @toricolelli: My phones been charging for a...,negative
57621,'@WhoaBiebz: GET YOUR SHIT TOGETHER OR I'LL GU...,negative
57622,Those** PICK UP THE SLACK YOU FUCK BOYS @Apple,negative
57624,@umo_games @Apple ended up getting a new compu...,neutral


In [56]:
len(all_data)

38961

In [57]:
38961+18666

57627

In [58]:
new_data = all_data.reset_index()

In [61]:
new_data = new_data[['text', 'sentiment']]

In [62]:
new_data

Unnamed: 0,text,sentiment
0,Spent the entire morning in a meeting w/ a ven...,neutral
1,Oh! Good idea about putting them on ice cream,positive
2,says good (or should i say bad?) afternoon! h...,neutral
3,haha better drunken tweeting you mean?,positive
4,had an awsome salad! I recommend getting the S...,positive
...,...,...
38956,RT @toricolelli: My phones been charging for a...,negative
38957,'@WhoaBiebz: GET YOUR SHIT TOGETHER OR I'LL GU...,negative
38958,Those** PICK UP THE SLACK YOU FUCK BOYS @Apple,negative
38959,@umo_games @Apple ended up getting a new compu...,neutral


In [63]:
# save to file
# comment out to save

# filename = "cleaned_data.csv"
# new_data.to_csv(filename, index=False)