# 1: Import Packages and Define Functions

In [667]:
# Import some libraries that will be used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mysql.connector
import sys
sys.path.insert(1, '/Users/brianmccabe/DataScience/Flatiron/mod5/Emoji_Analysis/Scripts/')
import config



pd.set_option('display.max_columns', 300)

%matplotlib inline

In [668]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from matplotlib import cm
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
import string
import scipy
import emoji
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re

# Fetch the NLTK resources requested by this notebook (stop-word lists, the
# punkt tokenizer data and WordNet); the captured output below shows that
# packages which are already current are left as they are.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brianmccabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brianmccabe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brianmccabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [669]:
def extract_emojis(s):
    """Return the characters of ``s`` that are keys of ``emoji.UNICODE_EMOJI``.

    The input is scanned one character at a time; characters that pass the
    membership test are concatenated in their original order and everything
    else is dropped.
    """
    emoji_chars = [ch for ch in s if ch in emoji.UNICODE_EMOJI]
    return ''.join(emoji_chars)

In [670]:
def check_for_flags(s, flag_list=None):
    """Report whether ``s`` contains any flag emoji.

    Parameters
    ----------
    s : object
        Value to inspect. It is converted with ``str()`` first, so
        non-string values do not raise.
    flag_list : iterable of str, optional
        Flag strings to search for. When omitted, the notebook-level
        ``flags`` list is looked up at call time, so existing one-argument
        calls such as ``df.emoji.apply(check_for_flags)`` keep working.

    Returns
    -------
    bool
        True if at least one entry of the flag list is a substring of ``s``.
    """
    text = str(s)
    candidates = flags if flag_list is None else flag_list
    # any() stops at the first hit instead of scanning every remaining flag.
    return any(candidate in text for candidate in candidates)

In [671]:
def unique_emoji_count(s):
    """Return how many distinct characters ``s`` contains."""
    distinct_chars = set(s)
    return len(distinct_chars)

In [672]:
def condence_emojis(s):
    """Collapse ``s`` to a single one of its characters.

    Returns one element of ``set(s)``. When ``s`` contains only one distinct
    character (the case this notebook uses it for, after filtering on
    ``unique_emoji_count == 1``) that character is returned; with several
    distinct characters the element chosen is arbitrary.

    Returns
    -------
    str or None
        The chosen character, or ``None`` when ``s`` is empty or is not
        iterable.
    """
    try:
        return set(s).pop()
    # KeyError: pop() on an empty set. TypeError: ``s`` is not iterable or
    # holds unhashable items. Anything else (e.g. KeyboardInterrupt) is no
    # longer swallowed as it was with the bare ``except:``.
    except (KeyError, TypeError):
        return None

In [673]:
# One shared analyzer instance, created once and reused for every call.
analyzer = SentimentIntensityAnalyzer()
def sentiment_scores(s):
    """Return the 'compound' entry of the analyzer's polarity scores for ``s``."""
    polarity = analyzer.polarity_scores(s)
    return polarity['compound']

In [674]:
def remove_emojis(s):
    """Return the word tokens of ``s`` joined by single spaces.

    Every maximal run of ``\\w`` characters is kept; everything between the
    runs is replaced by one space. This drops emoji, but it also drops
    punctuation and collapses whitespace (``"it's"`` becomes ``"it s"``).

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string when ``s`` has no word characters.
    """
    # Raw string: "\w" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about.
    word_pattern = r"\w+"
    return ' '.join(re.findall(word_pattern, s))

In [675]:
def impute_top_emoji(s, mappings=None, data=None):
    """Pick the character of ``s`` whose mapped sentiment is closest to the tweet's.

    Parameters
    ----------
    s : str
        Emoji string of a tweet, as stored in the ``emoji`` column.
    mappings : pandas.DataFrame, optional
        Frame indexed by single emoji with a ``sentiment_score`` column.
        Defaults to the notebook-level ``emoji_sentiment_mappings``.
    data : pandas.DataFrame, optional
        Frame with ``emoji`` and ``sentiment_score`` columns. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    str
        The character of ``s`` whose score in ``mappings`` has the smallest
        absolute difference from the tweet's score. Ties go to the character
        that appears first in ``s``. If no character of ``s`` is present in
        ``mappings``, the first character of ``s`` is returned.

    Raises
    ------
    IndexError
        If no row of ``data`` has ``emoji == s``, or if ``s`` is empty.

    Notes
    -----
    The reference score is taken from the FIRST row of ``data`` whose emoji
    string equals ``s``; when several tweets share an emoji string they all
    use that row's score. Each call scans ``data`` once.
    """
    # Resolve the defaults at call time. Binding them in the signature made
    # the ``def`` itself fail on a fresh kernel, because neither global exists
    # yet when this cell runs, and the arguments were then ignored anyway.
    if mappings is None:
        mappings = emoji_sentiment_mappings
    if data is None:
        data = df

    matching_scores = data.loc[data.emoji == s, 'sentiment_score'].values
    if len(matching_scores) == 0:
        raise IndexError("no row in `data` has emoji == %r" % (s,))
    val = matching_scores[0]

    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # fallback and tie-breaking no longer depend on set iteration order.
    candidates = list(dict.fromkeys(s))
    known_scores = mappings.sentiment_score

    top = candidates[0]
    closest = float('inf')
    for candidate in candidates:
        # Characters without a mapped score cannot be compared; skip them.
        if candidate not in known_scores.index:
            continue
        distance = abs(val - known_scores.loc[candidate])
        if distance < closest:
            closest = distance
            top = candidate
    return top

# 2: Load and Clean the Data

In [676]:
# Connect to MySQL with the settings held in the local `config` module.
connection = mysql.connector.connect(
    host=config.host,
    user=config.user,
    port=config.port,
    password=config.password,
    database=config.database,
    auth_plugin='mysql_native_password',
)

cursor = connection.cursor()

# The RLIKE pattern walks the hex dump two digits (one byte) at a time and
# matches tweets containing a byte whose first hex digit is F -- presumably a
# way of selecting tweets with 4-byte UTF-8 characters such as emoji.
query = "SELECT tweet FROM Tweets WHERE HEX(tweet) RLIKE '^(..)*F.'"

df = pd.read_sql(query, connection)

In [677]:
# Store the emoji characters found in each tweet in their own column.
df['emoji'] = df['tweet'].apply(extract_emojis)

In [678]:
# Show row 9 of the emoji column with leading/trailing whitespace stripped
# (whitespace would distort the unique emoji count computed below).
# NOTE: the stripped Series is only displayed here; it is not assigned back to df.
df.emoji.apply(str.strip).loc[9]

'🤣😭'

In [679]:
# Split one emoji string into its individual characters. Flag emojis have
# strange representations when split this way, so I will remove them from
# the dataset.
list(df.emoji.loc[9])

['🤣', '😭']

In [680]:
import flag
from bs4 import BeautifulSoup
import requests

In [681]:
# Download the Wikipedia page on regional indicator symbols and parse its HTML.
url = 'https://en.wikipedia.org/wiki/Regional_indicator_symbol'
# requests waits indefinitely unless a timeout is given, which would hang the
# notebook if the server never answers.
page = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page and breaking
# later when the expected table is missing.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [682]:
# Build the list of flag emojis from the first table body on the page. The
# first row is skipped (presumably the header row); the second cell of every
# remaining row holds the code passed to flag.flag().
table_rows = soup.find('tbody').find_all('tr')[1:]
flags = [flag.flag(row.find_all('td')[1].text) for row in table_rows]

flags[0:5]

['🇦🇨', '🇦🇩', '🇦🇪', '🇦🇫', '🇦🇬']

In [683]:
# Count how many emoji strings do (True) and do not (False) contain a flag.
df.emoji.apply(check_for_flags).value_counts()

False    27311
True      1001
Name: emoji, dtype: int64

In [684]:
# Keep only the tweets whose emoji string contains no flag.
# A boolean mask replaces the temporary 'has_flag' column and the `== False`
# comparison; .copy() makes the result an independent frame, so the column
# assignments in later cells do not write into a slice of the old one.
has_flag = df.emoji.apply(check_for_flags).astype(bool)
df = df[~has_flag].copy()

In [685]:
# Number of distinct emoji characters per tweet.
df['unique_emoji_count'] = df['emoji'].apply(unique_emoji_count)

In [686]:
# Score the tweet text as loaded; this runs before the cell that strips the
# tweet down to word tokens, so the analyzer sees the unmodified text.
df['sentiment_score'] = df['tweet'].apply(sentiment_scores)

In [687]:
# Replace each tweet with its word tokens only (emoji and punctuation removed).
df['tweet'] = df['tweet'].apply(remove_emojis)

In [688]:
# Number of missing values in the emoji column.
df.emoji.isna().sum()

0

In [689]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [690]:
# Missing-value count per column, to confirm nothing is left after dropna.
df.isna().sum()

tweet                 0
emoji                 0
unique_emoji_count    0
sentiment_score       0
dtype: int64

In [691]:
# Keep only tweets that contain at least one emoji character.
df = df.loc[df['unique_emoji_count'] > 0]

In [692]:
# Renumber the rows from 0; the previous index is kept as an 'index' column,
# which a later cell drops again.
df = df.reset_index()

In [693]:
# Map each single emoji to the mean sentiment score of the tweets in which it
# is the only distinct emoji.
# Filter first, then copy: the original copied the whole frame and afterwards
# assigned into a filtered slice of it, which triggers pandas'
# SettingWithCopyWarning.
df2 = df[df.unique_emoji_count == 1].copy()
# Collapse repeated emoji (e.g. the same emoji three times) to one character.
df2['emoji'] = df2.emoji.apply(condence_emojis)
emoji_sentiment_mappings = df2.groupby('emoji')[['sentiment_score']].mean()

In [694]:
# For every tweet, pick the one emoji whose mapped sentiment is closest to
# the tweet's sentiment score.
df['top_emoji'] = df['emoji'].apply(impute_top_emoji)

In [695]:
# Remove the helper columns that are no longer needed, including the 'index'
# column left behind by reset_index.
df = df.drop(columns=["unique_emoji_count", "emoji", "index"])

In [696]:
# Number of tweets per top emoji, as an emoji -> count dictionary.
top_emoji_totals = df['top_emoji'].value_counts()
emoji_counts = dict(top_emoji_totals)

In [697]:
# Attach to every row how often its top emoji occurs in the dataset.
df['emoji_frequency'] = df['top_emoji'].map(emoji_counts)

In [698]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,tweet,sentiment_score,top_emoji,emoji_frequency
0,One day honest citizens are going to stand up ...,0.8288,🤣,992
1,You are not alone There s a place where FEAR h...,-0.0926,🎶,61
2,5 Books That Changed My Life As An Investor Ex...,0.5719,👏,177
3,8616 new cases in 24 hours it s scary stay saf...,-0.0772,🤍,56
4,absolutely losing it over this picture of my b...,0.7187,💕,207


# 3: Explore the Data and Create Initial Insights

In [699]:
# Summary statistics of how often each row's top emoji occurs.
df.emoji_frequency.describe()

count    27184.000000
mean       562.530312
std        841.303087
min          1.000000
25%         61.000000
50%        204.000000
75%        453.000000
max       2730.000000
Name: emoji_frequency, dtype: float64

In [700]:
# Mean sentiment score (and frequency) per top emoji, restricted to emojis
# that occur more than 60 times.
frequent_rows = df[df['emoji_frequency'] > 60]
emojis = (
    frequent_rows
    .groupby('top_emoji')[['sentiment_score', 'emoji_frequency']]
    .mean()
)

In [701]:
# The 25 emojis with the lowest mean sentiment score.
emojis.sort_values(by='sentiment_score').head(25)

Unnamed: 0_level_0,sentiment_score,emoji_frequency
top_emoji,Unnamed: 1_level_1,Unnamed: 2_level_1
😱,-0.716108,102
😞,-0.581378,109
😭,-0.538639,1741
😢,-0.52327,131
😫,-0.40508,75
😕,-0.380187,61
😬,-0.335974,132
😩,-0.321308,197
👻,-0.314107,224
💥,-0.310835,103


In [702]:
# The 25 emojis with the highest mean sentiment score.
emojis.sort_values(by='sentiment_score', ascending=False).head(25)

Unnamed: 0_level_0,sentiment_score,emoji_frequency
top_emoji,Unnamed: 1_level_1,Unnamed: 2_level_1
💖,0.893985,161
🥰,0.865512,318
💜,0.834893,248
💗,0.824041,116
😊,0.822267,302
♥,0.816308,61
💙,0.796727,218
💕,0.78446,207
💚,0.770853,85
👌,0.770022,178
