In [16]:
import os
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#!pip install neattext
import neattext.functions as nfx

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import demoji
# NOTE(review): the output of this cell echoes this very call, which looks like
# the tail of a warning raised here. Presumably recent demoji releases bundle the
# emoji codes and deprecate download_codes() -- confirm against the installed version.
demoji.download_codes()

import nltk
# nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

from keras.preprocessing.text import Tokenizer, tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout, Input, Conv1D, MaxPooling1D, Flatten, GlobalMaxPooling1D
from keras.models import load_model
import tensorflow as tf

  demoji.download_codes()


In [5]:
# load model
# Location of the saved Keras model, relative to the notebook's working directory.
MODEL_PATH = 'how_do_you_feel_my_dear/final_model/model.h5'
model = load_model(MODEL_PATH)

# summarize model (layer names, output shapes, parameter counts)
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 200)           2180000   
                                                                 
 dropout_27 (Dropout)        (None, 50, 200)           0         
                                                                 
 bidirectional_27 (Bidirecti  (None, 50, 100)          100400    
 onal)                                                           
                                                                 
 dropout_28 (Dropout)        (None, 50, 100)           0         
                                                                 
 bidirectional_28 (Bidirecti  (None, 50, 300)          301200    
 onal)                                                           
                                                                 
 dropout_29 (Dropout)        (None, 50, 300)          

In [33]:
# load in variables the data just downloaded
# Folder holding the per-emotion files. It is built with os.path.join, like the
# reads below, instead of gluing the parts together with a hard-coded "/".
DIR = os.path.join(os.getcwd(), "how_do_you_feel_my_dear", "emotions")

# The files are tab-separated and have no header row.
joy_test = pd.read_csv(os.path.join(DIR, "joy_test"), sep="\t", header=None)
sadness_test = pd.read_csv(os.path.join(DIR, "sadness_test"), sep="\t", header=None)
fear_test = pd.read_csv(os.path.join(DIR, "fear_test"), sep="\t", header=None)
anger_test = pd.read_csv(os.path.join(DIR, "anger_test"), sep="\t", header=None)

In [34]:
# load in variables the data just downloaded

# Name of the intensity column for each emotion (keys are the file-name prefixes).
EMOTION_COLUMNS = {"joy": "j", "sadness": "s", "fear": "f", "anger": "a"}
# Column layout shared by every frame, so that they can be concatenated.
COMMON_COLUMNS = ["id", "text", "sentiment", "j", "s", "f", "a"]


def load_emotion_split(emotion, split):
    """Read the file ``<emotion>_<split>`` from DIR and return it in the common layout.

    The file is read as tab-separated text without a header row. Its columns
    0..3 are renamed to ``id``, ``text``, ``sentiment`` and the emotion's own
    intensity column; the intensity columns of the other emotions are added
    and filled with 0.

    Parameters
    ----------
    emotion : str
        One of the keys of EMOTION_COLUMNS ("joy", "sadness", "fear", "anger").
    split : str
        File-name suffix, e.g. "train", "val" or "test".

    Returns
    -------
    pandas.DataFrame
        A frame with exactly the columns listed in COMMON_COLUMNS.

    Raises
    ------
    KeyError
        If ``emotion`` is not a key of EMOTION_COLUMNS.
    """
    own_column = EMOTION_COLUMNS[emotion]
    frame = pd.read_csv(os.path.join(DIR, emotion + "_" + split), sep="\t", header=None)
    frame = frame.rename(columns={0: "id", 1: "text", 2: "sentiment", 3: own_column})
    for column in EMOTION_COLUMNS.values():
        if column != own_column:
            frame[column] = 0
    return frame[COMMON_COLUMNS]


# One frame per emotion and split; the variable names are kept so that other
# cells can keep referring to them.
joy_train = load_emotion_split("joy", "train")
sadness_train = load_emotion_split("sadness", "train")
fear_train = load_emotion_split("fear", "train")
anger_train = load_emotion_split("anger", "train")

joy_val = load_emotion_split("joy", "val")
sadness_val = load_emotion_split("sadness", "val")
fear_val = load_emotion_split("fear", "val")
anger_val = load_emotion_split("anger", "val")

joy_test = load_emotion_split("joy", "test")
sadness_test = load_emotion_split("sadness", "test")
fear_test = load_emotion_split("fear", "test")
anger_test = load_emotion_split("anger", "test")

# Only the four test frames are concatenated here; the train and validation
# frames are loaded but not combined in this cell.
# The index is deliberately not reset, so each row keeps its position within
# its original file (row labels are therefore not unique across emotions).
data = pd.concat([joy_test,
                  sadness_test,
                  fear_test,
                  anger_test])

In [35]:
# clean text functions
# https://github.com/Jcharis/neattext/blob/master/neattext/functions/functions.py

def clean_emoji_output(text):
    """Return ``text`` with every ':' replaced by a single space.

    Used in clean_text right after demoji.replace_with_desc, so that the
    colons left in the text do not survive into the cleaned string.
    A plain str.replace is enough here: the pattern is one literal
    character, so no regular expression is needed.
    """
    return text.replace(":", " ")

def strip_lowercase(text):
    """Return ``text`` without leading/trailing whitespace and converted to lowercase."""
    trimmed = text.strip()
    return trimmed.lower()

# tokenize
# Shared TweetTokenizer instance; clean_text uses tt.tokenize to split the
# cleaned text into tokens.
tt = TweetTokenizer()

# lemmatize
def lemmatize_text(text):
    """Return a list with every token of ``text`` passed through WordNetLemmatizer.lemmatize.

    ``text`` is an iterable of tokens (clean_text passes the token lists
    produced by ``tt.tokenize``); the result has one entry per input token.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, text))

# function which cleans texts
def clean_text(data):
    """Add cleaned, tokenized and lemmatized versions of ``data['text']`` to ``data``.

    The frame is modified in place and nothing is returned. Four columns are
    written:

    - ``clean_text``: the text after all cleaning steps listed below,
    - ``tokenize``: ``clean_text`` lowercased again and split by ``tt.tokenize``,
    - ``tokenize_lemmatized``: the tokens passed through ``lemmatize_text``,
    - ``final_text``: the lemmatized tokens joined back into one string.
    """
    # The order of the steps matters: each one receives the output of the
    # previous one.
    cleaning_steps = (
        nfx.remove_emails,
        nfx.remove_numbers,
        nfx.remove_urls,
        nfx.remove_userhandles,
        demoji.replace_with_desc,
        clean_emoji_output,
        nfx.remove_special_characters,
        nfx.remove_bad_quotes,
        nfx.remove_html_tags,
        nfx.remove_punctuations,
        nfx.remove_stopwords,
        nfx.remove_multiple_spaces,
        strip_lowercase,
    )
    cleaned = data['text']
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    data['clean_text'] = cleaned

    # tokenize, then lemmatize token by token
    data['tokenize'] = data['clean_text'].str.lower().apply(tt.tokenize)
    data['tokenize_lemmatized'] = data['tokenize'].apply(lemmatize_text)

    # detokenize
    detokenizer = TreebankWordDetokenizer()
    data['final_text'] = data['tokenize_lemmatized'].apply(detokenizer.detokenize)

In [36]:
# clean and shuffle
# clean_text adds its columns to `data` in place (it returns nothing).
clean_text(data)
# Fixed random_state so the row order is the same on every run.
data = shuffle(data, random_state=42)

In [37]:
# Inspect the last rows of the shuffled frame, including the columns added by clean_text.
data.tail()

Unnamed: 0,id,text,sentiment,j,s,f,a,clean_text,tokenize,tokenize_lemmatized,final_text
710,11651,maps by the yeah yeah yeahs came on the radio ...,anger,0.0,0.0,0.0,0.292,maps yeah yeah yeahs came radio today burst tears,"[maps, yeah, yeah, yeahs, came, radio, today, ...","[map, yeah, yeah, yeahs, came, radio, today, b...",map yeah yeah yeahs came radio today burst tear
381,41241,Dreams dashed and divided like million stars i...,sadness,0.0,0.688,0.0,0.0,dreams dashed divided like million stars night...,"[dreams, dashed, divided, like, million, stars...","[dream, dashed, divided, like, million, star, ...",dream dashed divided like million star night sky
416,41276,"I did the dishes yesterday, fell asleep, woke ...",sadness,0.0,0.625,0.0,0.0,dishes yesterday fell asleep woke sink didnt e...,"[dishes, yesterday, fell, asleep, woke, sink, ...","[dish, yesterday, fell, asleep, woke, sink, di...",dish yesterday fell asleep woke sink didnt eat...
580,41440,nooooo. Poor Blue Bell! not again.,sadness,0.0,0.604,0.0,0.0,nooooo poor blue bell,"[nooooo, poor, blue, bell]","[nooooo, poor, blue, bell]",nooooo poor blue bell
146,41006,Are you #serious that #fredsirieix lives in Pe...,sadness,0.0,0.312,0.0,0.0,fredsirieix lives peckhamsouth londons love gu...,"[fredsirieix, lives, peckhamsouth, londons, lo...","[fredsirieix, life, peckhamsouth, london, love...",fredsirieix life peckhamsouth london love gurubig


In [38]:
# numerical encoding of our labels
# The encoder is fitted on the full, fixed label set rather than on the labels
# that happen to occur in `data`. This way the ids (alphabetical order:
# 0 anger, 1 fear, 2 joy, 3 sadness) stay the same even if a class is missing
# from the evaluated data, and an unexpected label raises a ValueError in
# transform instead of silently shifting the ids.
EMOTION_LABELS = ["anger", "fear", "joy", "sadness"]
label_encoder = LabelEncoder()
label_encoder.fit(EMOTION_LABELS)
y_enc = label_encoder.transform(data.sentiment)

# one-hot-encode them
# num_classes is pinned so that y always has one column per known label.
y = to_categorical(y_enc, num_classes=len(label_encoder.classes_))

In [39]:
y[-5:]

# Column order of the one-hot rows (alphabetical order of the labels):
# 0: anger
# 1: fear
# 2: joy
# 3: sadness

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

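## Load the tokenizer the model was trained with
The model only works with the tokenizer that was fitted during training: a different tokenizer would map words to different integer ids, so the embedding layer would receive the wrong inputs.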

In [40]:
# Read the saved tokenizer. The encoding is given explicitly so that the file
# is decoded the same way on every platform (open() otherwise falls back to
# the locale's default encoding).
# NOTE(review): assumes tokenizer.json was written as UTF-8 -- confirm.
with open('tokenizer.json', encoding='utf-8') as tokenizer_file:
    # presumably the file holds the JSON-encoded output of Tokenizer.to_json(),
    # so json.load gives back the string tokenizer_from_json expects -- verify
    # against the code that saved it
    data_json = json.load(tokenizer_file)

tokenizer = tokenizer_from_json(data_json)

In [41]:
# Convert texts into some numeric sequences and make the length of all numeric sequences equal
# Every text becomes a sequence of integer ids from the loaded tokenizer; shorter
# sequences are filled with zeros at the end (padding='post') up to length 50.

X_seq = tokenizer.texts_to_sequences(data.final_text)
X_pad = np.array(pad_sequences(X_seq, maxlen=50, padding='post'))

X_pad[:3]

array([[  15,  303, 1521,  485,  795,  438,  485,  795,  438,  485,  795,
         438, 4007, 4008, 4009,  210, 4010,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0],
       [  68,  165, 9635,  349,  234,  165,  165, 5317,  339, 3771,  349,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0],
       [ 169, 1440,  517,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   

In [42]:
# First three cleaned texts, selected by position (the row labels are shuffled).
data["final_text"].iloc[:3]

407    happy birthday seed double exclamation mark do...
588    n word normalized medium wow word word frog bo...
350                            depressing freaking close
Name: final_text, dtype: object

In [43]:
# One-hot targets of the first three rows.
y[:3]

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]], dtype=float32)

In [44]:
# check external validity on the test set
# evaluate returns the loss first and the accuracy second (see the prints below).
score = model.evaluate(X_pad, y, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

Test loss: 0.18835455179214478
Test accuracy: 0.931253969669342


In [45]:
# Class scores for every sample, one column per class. The same prediction is
# computed again in the cell that builds `predictions`.
model.predict(X_pad)

array([[9.81675982e-01, 1.42456861e-02, 2.26224517e-03, 1.81610230e-03],
       [9.98395979e-01, 7.80888135e-04, 4.14176786e-04, 4.08927124e-04],
       [1.80836429e-03, 1.54076482e-03, 1.18682778e-03, 9.95464027e-01],
       ...,
       [1.39050861e-03, 1.52586838e-02, 1.79792685e-03, 9.81552899e-01],
       [1.36306568e-03, 1.08365493e-03, 1.48600689e-03, 9.96067345e-01],
       [3.22623044e-01, 1.03773475e-01, 2.17085972e-01, 3.56517524e-01]],
      dtype=float32)

In [46]:
# One-hot targets, shown for comparison with the predicted scores.
y

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [47]:
# Reduce the per-class scores and the one-hot targets to class ids.
predictions = np.argmax(model.predict(X_pad), axis=1)
target = np.argmax(y, axis=1)

# Positions (row numbers, not index labels) where the predicted class differs
# from the target, as a plain list of ints.
mistakes = np.flatnonzero(predictions != target).tolist()

In [57]:
# alphabetical order
# 0: anger
# 1: fear
# 2: joy
# 3: sadness

code_to_strings = {0: "anger",
                   1: "fear",
                   2: "joy",
                   3: "sadness"}

# Translate the predicted class ids to label names and store them row by row;
# .values makes the assignment positional, so the shuffled (non-unique) index
# of `data` plays no role.
data["prediction"] = pd.Series(predictions).map(code_to_strings).values
data.head()

Unnamed: 0,id,text,sentiment,j,s,f,a,clean_text,tokenize,tokenize_lemmatized,final_text,prediction
407,11348,@RiveraJose39 happy birthday to my seed‼️‼️‼️ ...,anger,0.0,0.0,0.0,0.138,happy birthday seed double exclamation mark do...,"[happy, birthday, seed, double, exclamation, m...","[happy, birthday, seed, double, exclamation, m...",happy birthday seed double exclamation mark do...,anger
588,11529,Now that the n word is normalized by the media...,anger,0.0,0.0,0.0,0.509,n word normalized media wow words words frog b...,"[n, word, normalized, media, wow, words, words...","[n, word, normalized, medium, wow, word, word,...",n word normalized medium wow word word frog bo...,anger
350,41210,@MariamVeiszadeh #depressing it's so freaking ...,sadness,0.0,0.679,0.0,0.0,depressing freaking close,"[depressing, freaking, close]","[depressing, freaking, close]",depressing freaking close,sadness
513,11454,@Grace_thomas473 you keep talkin shit and tryi...,anger,0.0,0.0,0.0,0.646,talkin shit trying offend youre petty fuck,"[talkin, shit, trying, offend, youre, petty, f...","[talkin, shit, trying, offend, youre, petty, f...",talkin shit trying offend youre petty fuck,anger
607,11548,"Ultimately, #KeithScott wasn't the man they we...",anger,0.0,0.0,0.0,0.741,ultimately keithscott wasnt man hand arrest wa...,"[ultimately, keithscott, wasnt, man, hand, arr...","[ultimately, keithscott, wasnt, man, hand, arr...",ultimately keithscott wasnt man hand arrest wa...,anger


In [58]:
# Number of misclassified samples.
len(mistakes)

216

In [59]:
# Total number of evaluated samples.
len(X_pad)

3142

In [60]:
# Share of correctly classified samples, as a percentage rounded to two decimals.
accuracy_pct = round((1 - len(mistakes) / len(X_pad)) * 100, 2)
print("Accuracy in predicting all the data we have:", str(accuracy_pct) + "%")

Accuracy in predicting all the data we have: 93.13%


In [64]:
# Show the first misclassified samples (at most 10). Slicing `mistakes` instead
# of looping over range(10) avoids an IndexError when fewer than 10 samples
# were misclassified.
# `mistakes` holds row positions, hence the positional lookup through .values.
for row_pos in mistakes[:10]:
    print("---------------------------------------------------------------------------")
    print("MISTAKEN;", data['final_text'].values[row_pos])
    print("target:", data['sentiment'].values[row_pos])
    print("prediction:", data['prediction'].values[row_pos])

---------------------------------------------------------------------------
MISTAKEN; didnt bad situation great mate cant believe make despair humanity
target: sadness
prediction: fear
---------------------------------------------------------------------------
MISTAKEN; need pout time getting nerve gbbo
target: sadness
prediction: anger
---------------------------------------------------------------------------
MISTAKEN; lol dk actually dropped table surprisingly arm warrior dp moment
target: fear
prediction: joy
---------------------------------------------------------------------------
MISTAKEN; finger crossed finish work early friday time catch lib frowning face open mouth timetogrind
target: fear
prediction: anger
---------------------------------------------------------------------------
MISTAKEN; dreadful franglaise
target: sadness
prediction: fear
---------------------------------------------------------------------------
MISTAKEN; internationaldayofpeace white supremacist terro