# Emojify
## Map a piece of text to an emoji according to its context

## What is covered?
1. Data Engineering
2. Load Embedding Vectors
3. Train the model
4. Test the model
5. Results

In [2]:
import pandas as pd
from util import Utils
import numpy as np
import emoji

## 1. Data Engineering

In [3]:
# Load the train and test CSV files. Both are read with header=None, so the
# four columns get their names from `cols`.
cols = ["Doc", "Label", "c3", "c4"]
df, df2 = (
    pd.read_csv(csv_path, header=None, names=cols)
    for csv_path in ("emojify_data.csv", "test_emoji.csv")
)
df.head()

Unnamed: 0,Doc,Label,c3,c4
0,French macaroon is so tasty,4,,
1,work is horrible,3,,
2,I am upset,3,,[3]
3,throw the ball,1,,[2]
4,Good joke,2,,


### Labels to Emoji
<p>Each text is labeled with an integer from 0 to 4, and each integer corresponds to a specific emoji.</p>

In [3]:
# Label -> emoji lookup. Keys are the class labels as strings; values are
# either a literal Unicode sequence (label "0") or a ":name:" code that is
# resolved by emoji.emojize().
emoji_dictionary = {"0": "\u2764\uFE0F",
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}


def label_to_emoji(label):
    """Return the emoji string for a class label.

    Args:
        label: Class label. It is converted with ``str`` before the lookup,
            so both ``3`` and ``"3"`` are accepted.

    Returns:
        The string produced by ``emoji.emojize`` for the matching entry of
        ``emoji_dictionary``.

    Raises:
        KeyError: If ``str(label)`` is not a key of ``emoji_dictionary``.
    """
    code = emoji_dictionary[str(label)]
    try:
        return emoji.emojize(code, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 no longer accepts ``use_aliases``; alias names are
        # selected through the ``language`` argument instead.
        return emoji.emojize(code, language="alias")


for i in range(5):
    print("label", i,label_to_emoji(i))

label 0 ❤️
label 1 ⚾
label 2 😄
label 3 😞
label 4 🍴


In [4]:
# Sentence and label columns of the train / test frames
docs = df["Doc"]
labels = df["Label"]
docs_test = df2["Doc"]
labels_test = df2["Label"]

# Create tokenized documents (whitespace split) and collect their labels.
# Labels are taken by position via tolist(): labels[i] is a label-based
# lookup and only lines up with an enumerate() counter when the frame has
# the default 0..n-1 index.
X = [doc.split() for doc in docs]
y = labels.tolist()

X_test = [doc.split() for doc in docs_test]
y_test = labels_test.tolist()

#print first example 
print(X[0],label_to_emoji(y[0]))
print(X_test[0],label_to_emoji(y_test[0]))

['French', 'macaroon', 'is', 'so', 'tasty'] 🍴
['I', 'want', 'to', 'eat'] 🍴


## 2. Load GloVe Embedding Vectors

In [5]:
util = Utils()
# Raw string: the Windows path contains backslash sequences that are not valid
# string escapes. A raw literal keeps the exact same value without relying on
# Python's handling of invalid escape sequences.
emb_file = r'D:\Resources\Glove_Embeddings\glove.6B.50d.txt'
# Width of the vectors requested from the embedding file (the file name says "50d").
# The spelling "dimention" matches the keyword read_emb_vec is called with below.
dimention = 50
word_to_index, index_to_word, word_to_vec_map = util.read_emb_vec(file_name=emb_file, dimention = dimention)

## 3. Train the Keras Model

In [6]:
import os

# Silence TensorFlow's native logging. The variable is set before keras /
# tensorflow are imported so that it is already in the environment when
# TensorFlow loads; setting it after the imports may have no effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import tensorflow as tf

# Seed NumPy's global RNG once. The original cell seeded with 0 and then with
# 1; only the last call determines the state used by the cells that follow.
np.random.seed(1)

Using TensorFlow backend.


In [7]:
#convert tokenize docs to the indices representation of glove embedding
def sentences_to_indices(X, word_to_index, max_len):
    """Convert tokenized sentences into a zero-padded matrix of word indices.

    Args:
        X: Sequence of sentences, each a sequence of string tokens.
        word_to_index: Mapping from lower-cased word to its integer index.
        max_len: Number of columns of the result, i.e. the maximum number of
            tokens kept per sentence.

    Returns:
        A float ``numpy`` array of shape ``(len(X), max_len)``. Row ``i``
        holds the indices of the known tokens of sentence ``i`` in order;
        unused trailing positions stay 0.

    Notes:
        Tokens are lower-cased before the lookup. Tokens that are missing
        from ``word_to_index`` are skipped, and tokens beyond ``max_len`` are
        dropped, so a sentence longer than the training maximum (or one
        containing an unknown word) no longer raises IndexError / KeyError.
    """
    X_indices = np.zeros((len(X), max_len))
    for i, tokens in enumerate(X):
        j = 0  # next free column in row i
        for w in tokens:
            if j >= max_len:
                # Row is full: truncate the remainder of the sentence.
                break
            key = w.lower()
            if key not in word_to_index:
                # Out-of-vocabulary token: skip it without leaving a gap.
                continue
            X_indices[i, j] = word_to_index[key]
            j += 1
    return X_indices

In [8]:
# Sanity check: index the first two training sentences into rows of width 5
example = [X[0], X[1]]
example_indices = sentences_to_indices(example, word_to_index, max_len=5)
print("X1 =", example)
print("X1_indices =", example_indices)

X1 = [['French', 'macaroon', 'is', 'so', 'tasty'], ['work', 'is', 'horrible']]
X1_indices = [[153730. 229211. 192973. 336115. 353731.]
 [389837. 192973. 181872.      0.      0.]]


In [9]:
#create an embedding layer with GloVe Data for the Keras Model
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a frozen Keras ``Embedding`` layer initialised from word vectors.

    Args:
        word_to_vec_map: Mapping from word to its embedding vector (an object
            with a ``shape`` attribute, e.g. a 1-D numpy array).
        word_to_index: Mapping from word to the row index it occupies in the
            embedding matrix.

    Returns:
        A built ``Embedding`` layer with ``trainable=False`` whose weights are
        set to the assembled embedding matrix.

    Raises:
        StopIteration: If ``word_to_vec_map`` is empty.
        KeyError: If a word of ``word_to_index`` has no vector in
            ``word_to_vec_map``.
    """
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    # Read the dimensionality from an arbitrary vector instead of a hard-coded
    # word, so the function also works for vocabularies without that word.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Rows not assigned below stay all-zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    print(emb_matrix.shape)
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)
    # The layer has to be built before its weights can be set.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [10]:
# Keras emojify LSTM Model
def emojify_model(input_shape, word_to_vec_map, word_to_index):
    """Build the two-layer LSTM classifier over pretrained embeddings.

    Args:
        input_shape: Shape of one input sample, passed to ``Input`` (the
            notebook calls this with ``(maxLen,)``).
        word_to_vec_map: Mapping from word to embedding vector, forwarded to
            ``pretrained_embedding_layer``.
        word_to_index: Mapping from word to index, forwarded to
            ``pretrained_embedding_layer``.

    Returns:
        An uncompiled Keras ``Model`` mapping int32 word indices to a 5-way
        softmax output.
    """
    sentence_indices = Input(shape=input_shape, dtype='int32')
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # First LSTM returns the full sequence so that a second LSTM can consume it.
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    # Second LSTM returns only its final output, which feeds the classifier head.
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(0.5)(X)
    X = Dense(5)(X)
    X = Activation('softmax')(X)

    # Keras 2 keyword names; ``input=`` / ``output=`` are the Keras 1 spelling.
    model = Model(inputs=sentence_indices, outputs=X)
    return model


In [11]:
def getMaxLen(X):
    """Return the length of the longest element of ``X``.

    Args:
        X: Iterable of sized items (e.g. a list of token lists).

    Returns:
        The largest ``len(x)`` over the items of ``X``, or 0 when ``X`` is empty.
    """
    # Uses the builtin max() directly instead of shadowing it with a local
    # variable of the same name.
    return max((len(x) for x in X), default=0)

# The longest training sentence sets the sequence length given to the model
maxLen = getMaxLen(X)
print("Max length of doc is ", maxLen)

model = emojify_model(
    input_shape=(maxLen,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
model.summary()

W0717 15:58:08.000837  5096 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0717 15:58:08.017757  5096 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



Max length of doc is  10
(400002, 50)


W0717 15:58:08.559925  5096 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0717 15:58:08.567904  5096 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0717 15:58:08.569908  5096 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0717 15:58:10.842415  5096 deprecation.py:506] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be re

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 50)            20000100  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
__________

  del sys.path[0]


In [12]:
# Configure training: categorical cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

W0717 15:58:11.068811  5096 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [13]:
# Encode both splits: sentences become index rows of width maxLen, labels go
# through util.convert_to_one_hot with C = 5 classes.
X_train_indices = sentences_to_indices(X, word_to_index, max_len=maxLen)
Y_train_oh = util.convert_to_one_hot(np.array(y), C=5)

X_test_indices = sentences_to_indices(X_test, word_to_index, max_len=maxLen)
Y_test_oh = util.convert_to_one_hot(np.array(y_test), C=5)

In [15]:
# Train for 20 epochs in shuffled batches of 32, validating on the test split
model.fit(
    X_train_indices,
    Y_train_oh,
    epochs=20,
    batch_size=32,
    shuffle=True,
    validation_data=(X_test_indices, Y_test_oh),
)

Train on 183 samples, validate on 56 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1daacefd898>

In [18]:
# This code allows you to see the mislabelled examples

Y_test = y_test
pred = model.predict(X_test_indices)
# Walk the test sentences together with their expected label and the model's
# score vector; print only the rows where the arg-max class differs.
for tokens, expected, scores in zip(X_test, Y_test, pred):
    num = np.argmax(scores)
    # Rebuild the sentence as " w1 w2 ... " (leading space, one space after each token).
    sentence = " " + "".join(w + " " for w in tokens)
    if num != expected:
        print('Expected emoji:'+ label_to_emoji(expected) + sentence+ ', prediction: ' + label_to_emoji(num).strip())

Expected emoji:❤️ I am upset , prediction: 😞
Expected emoji:😞 work is hard , prediction: 😄
Expected emoji:😞 go away , prediction: ⚾
