### Testing how well VADER performs on the emoji tweets

In [1]:
import VADER
import VADER.vaderSentiment
from VADER.vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.metrics import accuracy_score
import emoji
import re

# Notebook display settings: show long tweet text in full (up to 500
# characters per cell) and cap rendered DataFrames at 50 rows.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 50)

In [2]:
def regex_contains_emoji(s, return_filtered = False, return_dict = False):
    """Find the single-code-point emojis contained in the string `s`.

    Only keys of `emoji.UNICODE_EMOJI` that are exactly one character long
    are recognised; every other character of `s` (including whitespace) is
    discarded.

    Parameters
    ----------
    s : str
        Text to scan.
    return_filtered : bool, default False
        If true, return the recognised emojis as one string, in their
        original order and with repeats kept. Takes precedence over
        `return_dict` when both flags are set.
    return_dict : bool, default False
        If true, return a dict mapping each recognised emoji to the number
        of times it occurs in `s`.

    Returns
    -------
    str, dict or bool
        The filtered string or the count dict as selected above; with
        neither flag set, True when `s` contains at least one recognised
        emoji and False otherwise.
    """
    # A regex character class matches one code point at a time, so only
    # one-character emoji keys can be listed in it. re.escape protects the
    # class against any key that happens to be a regex metacharacter.
    emojis = "".join(
        re.escape(emoj) for emoj in emoji.UNICODE_EMOJI.keys() if len(emoj) == 1
    )

    # Delete every run of characters that is not a recognised emoji. This
    # also removes whitespace, so no separate whitespace pass is needed.
    subbed = re.sub('[^' + emojis + ']+', '', s)

    if return_filtered:
        return subbed
    if return_dict:
        # Single pass over the filtered string to count each emoji.
        emoji_dict = {}
        for found in subbed:
            emoji_dict[found] = emoji_dict.get(found, 0) + 1
        return emoji_dict
    return bool(subbed)

In [3]:
def decayed_count(count, k=2):
    """Weight `count` repeats so that each extra repeat counts 1/k as much as the previous one.

    Returns the geometric sum 1 + 1/k + 1/k^2 + ... + 1/k^(count-1),
    e.g. count=5, k=2 gives 1 + 1/2 + 1/4 + 1/8 + 1/16 = 1.9375.

    Parameters
    ----------
    count : number of repeats (count=0 yields 0, count=1 yields 1).
    k : decay base; larger values make additional repeats matter less.
    """
    if k == 1:
        # With k == 1 every term of the series equals 1, so the sum is
        # `count` itself (the closed form below would divide by zero).
        return count
    # Closed form of the geometric series above.
    return (k - k**(1-count))/(k-1)

In [4]:
# Load the train and test tweet tables (train first, then test). Later cells
# read their `text` and `sentiment` columns.
all_emoji_tweets_train, all_emoji_tweets_test = (
    pd.read_csv(csv_path)
    for csv_path in ('all_emoji_tweets_train.csv', 'all_emoji_tweets_test.csv')
)

In [5]:
# VADER sentiment analyser instance; its polarity_scores() is called on every
# test tweet when the baseline predictions are computed.
analyser = SentimentIntensityAnalyzer()

In [6]:
# Baseline: the 'compound' entry of VADER's polarity scores for each test
# tweet, in the same order as all_emoji_tweets_test.
orig_predictions = [
    analyser.polarity_scores(tweet_text)['compound']
    for tweet_text in all_emoji_tweets_test.text
]

In [7]:
# Convert each compound score into a class label:
#   score >  0.05  -> 'positive'
#   score < -0.05  -> 'negative'
#   otherwise      -> 'neutral'
# The labels are built directly from orig_predictions, so the list always has
# exactly one label per score and can never contain leftover placeholders.
# NOTE(review): the comparisons are strict; VADER's documentation appears to
# use inclusive bounds (>= 0.05 / <= -0.05). Kept strict here so the reported
# accuracy is unchanged -- confirm which is intended.
predictions = [
    'positive' if compound > 0.05
    else 'negative' if compound < -0.05
    else 'neutral'
    for compound in orig_predictions
]

In [8]:
# Fraction of test tweets whose thresholded VADER label matches the
# `sentiment` column of the test set.
print(accuracy_score(all_emoji_tweets_test.sentiment, predictions))

0.6209150326797386


### Emojis and the sentiments of tweets they appear in:

<p>emoji_senti with all repeats included:</p>

In [9]:
# repeats
# r_emoji_senti[emoji][label] = number of times `emoji` occurs in training
# tweets with that sentiment label. Every occurrence counts, so a tweet that
# contains the same emoji three times adds 3.
r_emoji_senti = {
    emoj: {"positive": 0, "neutral": 0, "negative": 0}
    for emoj in emoji.UNICODE_EMOJI.keys()
    if len(emoj) == 1
}

import string  # not used in this cell; kept in case other cells rely on it
for text, sentiment in zip(all_emoji_tweets_train["text"],
                           all_emoji_tweets_train["sentiment"]):
    # One regex pass per tweet: the filtered string holds the tweet's
    # emojis in order, repeats included.
    tweet_emojis = regex_contains_emoji(text, True)
    if not tweet_emojis:
        continue
    label = sentiment.lower()
    for emoj in tweet_emojis:
        if label in r_emoji_senti[emoj]:
            r_emoji_senti[emoj][label] += 1
        else:
            # Unrecognised sentiment label; flagged once per emoji occurrence.
            print("?")

<p>emoji_senti with decayed repeats:</p>

In [10]:
# decayed repeats
# d_emoji_senti[emoji][label] accumulates, per training tweet with that
# sentiment label, decayed_count(n) where n is how often `emoji` occurs in
# the tweet -- so repeated emojis inside one tweet count progressively less.
d_emoji_senti = {
    emoj: {"positive": 0, "neutral": 0, "negative": 0}
    for emoj in emoji.UNICODE_EMOJI.keys()
    if len(emoj) == 1
}

import string  # not used in this cell; kept in case other cells rely on it
for text, sentiment in zip(all_emoji_tweets_train["text"],
                           all_emoji_tweets_train["sentiment"]):
    # One regex pass per tweet: {emoji: occurrences in this tweet}.
    emoji_counts = regex_contains_emoji(text, False, True)
    if not emoji_counts:
        continue
    label = sentiment.lower()
    for emoj, occurrences in emoji_counts.items():
        if label in d_emoji_senti[emoj]:
            d_emoji_senti[emoj][label] += decayed_count(occurrences)
        else:
            # Unrecognised sentiment label; flagged once per distinct emoji.
            print("wtf?/")

<p>emoji_senti with no repeats:</p>

In [11]:
# no repeats
# n_emoji_senti[emoji][label] = number of training tweets with that sentiment
# label that contain `emoji` at least once; repeats inside a single tweet are
# counted only once.
n_emoji_senti = {
    emoj: {"positive": 0, "neutral": 0, "negative": 0}
    for emoj in emoji.UNICODE_EMOJI.keys()
    if len(emoj) == 1
}

import string  # not used in this cell; kept in case other cells rely on it
for text, sentiment in zip(all_emoji_tweets_train["text"],
                           all_emoji_tweets_train["sentiment"]):
    # One regex pass per tweet; only the distinct emojis (dict keys) matter here.
    distinct_emojis = regex_contains_emoji(text, False, True)
    if not distinct_emojis:
        continue
    label = sentiment.lower()
    for emoj in distinct_emojis:
        if label in n_emoji_senti[emoj]:
            n_emoji_senti[emoj][label] += 1
        else:
            # Unrecognised sentiment label; flagged once per distinct emoji.
            print("?")

### Creating a DataFrame from `r_emoji_senti` (all repeats counted) for easier analysis:

In [12]:
# Flatten r_emoji_senti into one row per emoji: [emoji, positive, neutral, negative].
# Counts are looked up by key, so the columns stay correct regardless of the
# order in which the inner dicts were populated.
emoji_senti_list = [
    [emoj, counts["positive"], counts["neutral"], counts["negative"]]
    for emoj, counts in r_emoji_senti.items()
]

emoji_senti_df = pd.DataFrame(emoji_senti_list, columns=['emoji', 'positive', 'neutral', 'negative'])

In [13]:
# Keep only the emojis that were seen at least once in the training tweets.
# reset_index() (without drop) stores the previous row labels in a new
# `index` column, so this cell is not idempotent: re-running it on its own
# output changes the columns again.
in_data = emoji_senti_df[['positive', 'negative', 'neutral']].sum(axis=1) > 0
emoji_senti_df = emoji_senti_df.loc[in_data].reset_index()

In [14]:
# Total number of times each emoji was counted across the three sentiment classes.
sum_column = emoji_senti_df['positive'] + emoji_senti_df['negative'] + emoji_senti_df['neutral']
emoji_senti_df['sum_col'] = sum_column

# Derived score: 4 * (positive - negative) / (total + 12).
# The +12 in the denominator pulls the score of rarely seen emojis toward 0,
# and for non-negative counts the magnitude always stays below 4.
# NOTE(review): the factor 4 presumably matches the valence scale of the
# VADER lexicon and 12 is an ad-hoc smoothing constant -- confirm.
vader_sentiment = (
    (emoji_senti_df['positive'] - emoji_senti_df['negative'])
    .mul(4)
    .div(emoji_senti_df['sum_col'] + 12)
)
emoji_senti_df['vader_sentiment'] = vader_sentiment

In [15]:
# Display the emojis ranked from highest to lowest derived score. The sorted
# frame is only shown, not assigned, so emoji_senti_df keeps its row order.
emoji_senti_df.sort_values(by=['vader_sentiment'], ascending=False)

Unnamed: 0,index,emoji,positive,neutral,negative,sum_col,vader_sentiment
295,1036,😍,85,9,2,96,3.074074
123,459,🙏,119,22,3,144,2.974359
223,807,🎉,33,0,0,33,2.933333
265,933,❤,105,7,13,125,2.686131
45,160,💙,26,1,1,28,2.500000
...,...,...,...,...,...,...,...
17,82,😠,0,0,15,15,-2.222222
347,1179,😒,1,3,28,32,-2.454545
330,1136,👎,1,1,30,32,-2.636364
75,324,💨,0,0,32,32,-2.909091


### The file written below (`emoji_sentiment.txt`) contains the entries we added to `vader_lexicon.txt`

In [16]:
# Export one tab-separated "<emoji>\t<score>" line per emoji, with no header
# row and no index column.
(
    emoji_senti_df
    .filter(items=['emoji', 'vader_sentiment'])
    .to_csv('emoji_sentiment.txt', sep='\t', header=False, index=False)
)