In [1]:
import pandas as pd
import emoji
import re
import string
import csv

# Notebook-wide pandas display settings: allow up to 500 characters per cell so
# tweet text is not cut off, and render at most 50 rows of a frame.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 50)

In [2]:
# Filled lazily on first use so the emoji table is scanned once per kernel
# session instead of once per tweet.
_SINGLE_CHAR_EMOJI_CACHE = {}


def _single_char_emojis():
    """Return the one-character keys of ``emoji.UNICODE_EMOJI`` as a frozenset (built once, then reused)."""
    if 'chars' not in _SINGLE_CHAR_EMOJI_CACHE:
        _SINGLE_CHAR_EMOJI_CACHE['chars'] = frozenset(
            emoj for emoj in emoji.UNICODE_EMOJI.keys() if len(emoj) == 1
        )
    return _SINGLE_CHAR_EMOJI_CACHE['chars']


def regex_contains_emoji(s, return_filtered = False):
    """Detect (or extract) single-character emoji in a piece of text.

    Only keys of ``emoji.UNICODE_EMOJI`` that are exactly one character long
    are recognised; longer emoji sequences are not part of the lookup set.

    The name is kept for existing callers, but matching is done with a set
    lookup rather than a character-class regex. For string input the result is
    the same, without interpolating unescaped emoji into a pattern and without
    rebuilding that pattern on every call.

    Parameters
    ----------
    s : str
        Text to inspect. Any non-string value (for example the float NaN that
        pandas uses for a missing cell) is treated as containing no emoji.
    return_filtered : bool, default False
        When true, return the recognised emoji characters found in ``s``,
        concatenated in their original order, instead of a boolean.

    Returns
    -------
    bool or str
        ``True``/``False`` for "at least one recognised emoji is present", or
        the filtered emoji string when ``return_filtered`` is true.
    """
    if not isinstance(s, str):
        return '' if return_filtered else False

    known = _single_char_emojis()
    subbed = ''.join(ch for ch in s if ch in known)

    if return_filtered:
        return subbed
    return bool(subbed)

### Reading data

In [3]:
# Datasets that are read with pandas' default CSV parser.
airline_tweets, semeval_tweets, gop_debate_tweets, smile_tweets = (
    pd.read_csv(csv_path)
    for csv_path in (
        'airline_tweets.csv',
        'semeval_tweets.csv',
        'gop_debate_tweets.csv',
        'smile_tweets.csv',
    )
)

# The per-emotion SemEval files are read with the pure-Python parser engine.
semeval_anger_tweets, semeval_fear_tweets, semeval_joy_tweets, semeval_sad_tweets = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in (
        'semeval_anger_tweets.csv',
        'semeval_fear_tweets.csv',
        'semeval_joy_tweets.csv',
        'semeval_sad_tweets.csv',
    )
)

In [4]:
# Keep only the anger rows whose 'Intensity Class' is 2 or higher and label
# every surviving row 'negative'. reset_index() (without drop) stores the old
# row labels in an 'index' column.
angry = semeval_anger_tweets['Intensity Class'].ge(2)
semeval_anger_tweets = semeval_anger_tweets.loc[angry].reset_index()
list_of_negative = ['negative'] * len(semeval_anger_tweets)
semeval_anger_tweets['sentiment'] = list_of_negative

In [5]:
# Keep only the fear rows whose 'Intensity Class' is 2 or higher and label
# every surviving row 'negative'. reset_index() (without drop) stores the old
# row labels in an 'index' column.
fearful = semeval_fear_tweets['Intensity Class'].ge(2)
semeval_fear_tweets = semeval_fear_tweets.loc[fearful].reset_index()
list_of_negative = ['negative'] * len(semeval_fear_tweets)
semeval_fear_tweets['sentiment'] = list_of_negative

In [6]:
# Keep only the joy rows whose 'Intensity Class' is 2 or higher and label
# every surviving row 'positive'. reset_index() (without drop) stores the old
# row labels in an 'index' column.
joy = semeval_joy_tweets['Intensity Class'].ge(2)
semeval_joy_tweets = semeval_joy_tweets.loc[joy].reset_index()
list_of_positive = ['positive'] * len(semeval_joy_tweets)
semeval_joy_tweets['sentiment'] = list_of_positive

In [7]:
# Keep only the sadness rows whose 'Intensity Class' is 2 or higher and label
# every surviving row 'negative'. reset_index() (without drop) stores the old
# row labels in an 'index' column.
sad = semeval_sad_tweets['Intensity Class'].ge(2)
semeval_sad_tweets = semeval_sad_tweets.loc[sad].reset_index()
list_of_negative = ['negative'] * len(semeval_sad_tweets)
semeval_sad_tweets['sentiment'] = list_of_negative

In [8]:
def extract_emoji_tweets(data):
    """Return the rows of ``data`` whose ``text`` contains at least one emoji.

    The rows are selected with a single boolean mask instead of growing a
    frame one ``DataFrame.append`` call at a time: that was quadratic in the
    number of matches, and ``DataFrame.append`` no longer exists in pandas 2.x.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column. Cells that are not strings (for example
        NaN for a missing tweet) are treated as containing no emoji.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``data``, containing only the
        matching rows in their original order and a fresh 0..n-1 index.
    """
    # Build the mask positionally (as a plain ndarray) so the result does not
    # depend on what the index labels of ``data`` look like.
    keep = pd.Series(
        [isinstance(tweet, str) and regex_contains_emoji(tweet) for tweet in data.text],
        dtype=bool,
    ).values
    return data.loc[keep].reset_index(drop=True)

In [9]:
# Run the emoji filter over every loaded dataset, in the same order as the
# target names on the left-hand side.
(
    emoji_airline_tweets,
    emoji_semeval_tweets,
    emoji_gop_debate_tweets,
    emoji_smile_tweets,
    emoji_se_anger_tweets,
    emoji_se_joy_tweets,
    emoji_se_fear_tweets,
    emoji_se_sad_tweets,
) = map(
    extract_emoji_tweets,
    (
        airline_tweets,
        semeval_tweets,
        gop_debate_tweets,
        smile_tweets,
        semeval_anger_tweets,
        semeval_joy_tweets,
        semeval_fear_tweets,
        semeval_sad_tweets,
    ),
)

In [10]:
# Rows of the smile set whose 'code' is 'happy' become positive examples.
# reset_index() (without drop) stores the old row labels in an 'index' column.
happy = emoji_smile_tweets['code'].eq('happy')
happy_emoji_smile_tweets = emoji_smile_tweets.loc[happy].reset_index()
list_of_positive = ['positive'] * len(happy_emoji_smile_tweets)
happy_emoji_smile_tweets['sentiment'] = list_of_positive

In [11]:
# Rows of the smile set whose 'code' is 'sad' become negative examples.
# reset_index() (without drop) stores the old row labels in an 'index' column.
sad = emoji_smile_tweets['code'].eq('sad')
sad_emoji_smile_tweets = emoji_smile_tweets.loc[sad].reset_index()
list_of_negative = ['negative'] * len(sad_emoji_smile_tweets)
sad_emoji_smile_tweets['sentiment'] = list_of_negative

In [12]:
from emoji import UNICODE_EMOJI
# Matches runs of non-word characters and underscores. Written as a raw string
# so that \W reaches the regex engine without an invalid-escape warning.
# contains() below no longer uses it; the name stays defined in case other
# cells reference it.
pattern = re.compile(r'[\W_]+', re.UNICODE)

def contains(s, e):
    """Return True when the substring ``e`` occurs anywhere in ``s``.

    Earlier revisions first stripped alphanumeric characters from both ends of
    ``s`` and then called ``s.count(e)`` once per character. That was quadratic
    in the tweet length and returned False for needles made of letters or
    digits sitting at the edge of the text (e.g. ``contains('abc', 'abc')``).
    A plain substring test gives the same answer for emoji needles and the
    expected answer everywhere else.

    Parameters
    ----------
    s : str
        Text to search. Non-string values (e.g. NaN from a missing DataFrame
        cell) are reported as not containing ``e``.
    e : str
        Substring (typically an emoji sequence) to look for.

    Returns
    -------
    bool
    """
    if not isinstance(s, str):
        return False
    return e in s

In [13]:
# There are just too many American flag tweets, so drop every GOP-debate tweet
# that contains the flag emoji.
# The rows are selected with one boolean mask rather than appended one by one:
# row-wise DataFrame.append is quadratic and was removed in pandas 2.x.
has_flag = pd.Series(
    [contains(tweet, '🇺🇸') for tweet in emoji_gop_debate_tweets.text],
    dtype=bool,
).values
# reset_index(drop=True) gives the same fresh 0..n-1 index that
# append(..., ignore_index=True) used to produce.
emoji_nf_gop_debate_tweets = emoji_gop_debate_tweets.loc[~has_flag].reset_index(drop=True)

### Combining all tweets into a DataFrame

In [14]:
# Stack every emoji-bearing subset into one frame. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.x.
# sort=True orders the (differing) column sets alphabetically, so the columns
# left after the drop below come out as 'sentiment', 'text' - the order the
# train/test split cell relies on when it labels columns positionally.
# Per-frame row labels are kept as-is (no ignore_index), matching the previous
# append-based behaviour.
all_emoji_tweets = pd.concat(
    [
        emoji_airline_tweets,
        emoji_semeval_tweets,
        emoji_nf_gop_debate_tweets,
        happy_emoji_smile_tweets,
        sad_emoji_smile_tweets,
        emoji_se_anger_tweets,
        emoji_se_fear_tweets,
        emoji_se_joy_tweets,
        emoji_se_sad_tweets,
    ],
    sort=True,
)
# Drop the identifier / bookkeeping columns; a missing label still raises
# KeyError as before. Reassigning (rather than inplace=True) keeps the cell a
# plain expression chain.
all_emoji_tweets = all_emoji_tweets.drop(
    columns=['tweet_id', 'id', 'index', 'code', 'Intensity Class']
)
all_emoji_tweets.shape

(2042, 2)

In [15]:
# Persist the combined emoji tweets; with to_csv's defaults the row index is
# written as the first, unnamed CSV column.
all_emoji_tweets.to_csv(path_or_buf='all_emoji_tweets.csv')

In [17]:
import random
import numpy as np

# Seed for the shuffle. The previous `random.seed = 9876544443` *assigned* over
# the seeding function instead of calling it, so the shuffle was never seeded
# and the split changed on every run.
SPLIT_SEED = 9876544443
# Share of rows written to the training file. 0.85 of the 2042 rows reported
# by the combining cell reproduces the former hard-coded cut at row 1736.
TRAIN_FRACTION = 0.85
# Columns written to the split files, selected by name so the labels can never
# be attached to the wrong data.
SPLIT_COLUMNS = ['sentiment', 'text']

n_rows = len(all_emoji_tweets)
n_train = int(round(n_rows * TRAIN_FRACTION))

# A dedicated Random instance keeps the shuffle reproducible without touching
# (or depending on) the global `random` state.
shuffler = list(range(n_rows))
random.Random(SPLIT_SEED).shuffle(shuffler)
shuffler = np.asarray(shuffler, dtype=int)

# Positional shuffle of the two columns as a plain 2-D array.
all_emoji_tweets_shuffled = np.asarray(all_emoji_tweets[SPLIT_COLUMNS])
all_emoji_tweets_shuffled = all_emoji_tweets_shuffled[shuffler]

all_emoji_tweets_train = pd.DataFrame(all_emoji_tweets_shuffled[:n_train], columns=SPLIT_COLUMNS)
all_emoji_tweets_test = pd.DataFrame(all_emoji_tweets_shuffled[n_train:], columns=SPLIT_COLUMNS)

all_emoji_tweets_train.to_csv("all_emoji_tweets_train.csv")
all_emoji_tweets_test.to_csv("all_emoji_tweets_test.csv")