In [64]:
import numpy as np
import csv
import emoji
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

# Loading Dataset

In [65]:
def read_csv(filename = 'data/emojify_data.csv'):
    """Load (phrase, label) pairs from a CSV file.

    Column 0 of every non-empty row is taken as the phrase and column 1 as
    its integer class label; any further columns are ignored. Completely
    empty rows (blank lines) are skipped instead of raising ``IndexError``.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    X : numpy.ndarray
        1-D array holding the phrases, one entry per data row.
    Y : numpy.ndarray
        1-D integer array of labels, aligned with ``X``.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two columns.
    ValueError
        If a label cannot be converted to an integer.
    """
    phrases = []
    # Called ``labels`` (not ``emoji``) so the imported ``emoji`` module is
    # not shadowed inside this function.
    labels = []

    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(filename, newline='') as csvDataFile:
        for row in csv.reader(csvDataFile):
            if not row:
                # csv.reader yields [] for a blank line; nothing to load.
                continue
            phrases.append(row[0])
            labels.append(row[1])

    X = np.asarray(phrases)
    Y = np.asarray(labels, dtype=int)

    return X, Y

In [66]:
# Load both splits. read_csv returns (phrases, integer labels) for each file;
# the paths are relative to the notebook's working directory.
X_train , y_train = read_csv('train_emoji.csv')
X_test , y_test = read_csv('test_emoji.csv')

In [67]:
# Sanity check: within each split the phrase array and the label array
# should report the same length.
print("X_train: ",X_train.shape)
print("y_train: ",y_train.shape)
print("X_test: ",X_test.shape)
print("y_test: ",y_test.shape)

X_train:  (132,)
y_train:  (132,)
X_test:  (56,)
y_test:  (56,)


In [68]:
# Maps a class label (as a string, because label_to_emoji looks it up with
# str(label)) to the text handed to emoji.emojize: either literal Unicode or
# a ":name:" alias.
emoji_dictionary = {"0": "\u2764\uFE0F",    # U+2764 heart + U+FE0F variation selector (literal, not an alias)
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}

In [69]:
def label_to_emoji(label):
    """Return the emoji string for a class label.

    Parameters
    ----------
    label : int or str
        Class label; it is converted with ``str(label)`` and looked up in
        ``emoji_dictionary``.

    Returns
    -------
    str
        The dictionary entry with any ``:alias:`` codes rendered as emoji.

    Raises
    ------
    KeyError
        If ``str(label)`` is not a key of ``emoji_dictionary``.
    """
    text = emoji_dictionary[str(label)]
    try:
        # Original call; valid on emoji releases that still accept use_aliases.
        return emoji.emojize(text, use_aliases=True)
    except TypeError:
        # emoji >= 2.0 removed the ``use_aliases`` keyword; the equivalent
        # there is selecting the alias table via ``language='alias'``.
        return emoji.emojize(text, language='alias')

In [70]:
# Spot check: show one training phrase next to the emoji for its label.
print(X_train[4],label_to_emoji(y_train[4]))

food is life 🍴


# GloVe Vectors

In [71]:
def read_glove_vecs(glove_file):
    """Parse a whitespace-separated word-vector text file.

    Every non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as its float64 vector. Blank lines
    are skipped instead of raising ``IndexError``. If a word occurs more than
    once, the last occurrence wins.

    Parameters
    ----------
    glove_file : str
        Path of the UTF-8 encoded vector file.

    Returns
    -------
    words_to_index : dict
        Word -> integer index. Indices are assigned in sorted word order and
        start at 1, so index 0 is never assigned to a word.
    index_to_words : dict
        Inverse of ``words_to_index``.
    word_to_vec_map : dict
        Word -> 1-D ``numpy.ndarray`` of dtype float64.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank or whitespace-only line: no word to register.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # The file is no longer needed; build both index tables from the
    # collected keys (the dict already de-duplicates words).
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [72]:
# Load the word vectors. read_glove_vecs numbers words from 1 in sorted order
# and returns (word -> index, index -> word, word -> vector).
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [73]:
# Demonstrate that the two lookup tables are inverse views of the vocabulary:
# word -> index for one sample word, index -> word for one sample index.
word = "food"
index = 289846  # arbitrary example index; must exist in index_to_word
print("the index of", word, "in the vocabulary is", word_to_index[word])
print("the", str(index) + "th word in the vocabulary is", index_to_word[index])

the index of food in the vocabulary is 151204
the 289846th word in the vocabulary is potatos


In [74]:
# Largest number of words in any training sentence. Count words per sentence
# and take the maximum; choosing the sentence with the most *characters* first
# can under-count when a shorter string contains more (shorter) words.
# Both spellings are kept because later cells use max_len and maxLen.
max_len = maxLen = max(len(sentence.split()) for sentence in X_train)

In [75]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into a zero-padded matrix of vocabulary indices.

    Each sentence is lower-cased and split on whitespace. Row ``i`` of the
    result holds the indices of the words of ``X[i]`` from left to right;
    unused trailing positions stay 0.

    Sentences with more than ``max_len`` words are truncated to their first
    ``max_len`` words, and words missing from ``word_to_index`` are written
    as 0 (the same value used for padding), so unseen or over-long input no
    longer raises ``IndexError`` / ``KeyError``.

    Parameters
    ----------
    X : numpy.ndarray
        1-D array of sentence strings.
    word_to_index : dict
        Word -> integer index.
    max_len : int
        Number of columns of the returned matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(X.shape[0], max_len)``.
    """
    m = X.shape[0]
    X_indices = np.zeros(shape=(m, max_len))

    for i in range(m):
        # Keep at most max_len tokens so the column index never overflows.
        sentence_words = X[i].lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            # Out-of-vocabulary words fall back to 0.
            X_indices[i, j] = word_to_index.get(w, 0)

    return X_indices

In [76]:
def convert_to_one_hot(Y, C):
    """One-hot encode integer labels.

    Y is flattened to 1-D and each label selects the matching row of a
    C x C identity matrix, giving an array of shape (number of labels, C).
    """
    identity = np.eye(C)
    flat_labels = Y.reshape(-1)
    return identity[flat_labels]

In [77]:
def emb_matrix(word_to_vec_map , word_to_index):
    """Build the embedding weight matrix for the vocabulary.

    Row ``word_to_index[w]`` of the result holds ``word_to_vec_map[w]``.
    The matrix has ``len(word_to_index) + 1`` rows, so any row whose index is
    not assigned to a word stays all zeros.

    The embedding dimension is taken from the first vector in
    ``word_to_vec_map`` rather than from a fixed word, so the function also
    works for vocabularies that do not contain any particular token.

    Parameters
    ----------
    word_to_vec_map : dict
        Word -> 1-D ``numpy.ndarray``; all vectors must have the same length.
    word_to_index : dict
        Word -> integer row index.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_to_index) + 1, embedding_dim)``.

    Raises
    ------
    ValueError
        If ``word_to_vec_map`` is empty.
    KeyError
        If a word of ``word_to_index`` has no vector in ``word_to_vec_map``.
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    vocab_len = len(word_to_index) + 1
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Local name differs from the function name to avoid shadowing it.
    matrix = np.zeros(shape=(vocab_len, emb_dim))
    for word, index in word_to_index.items():
        matrix[index, :] = word_to_vec_map[word]
    return matrix

# Model

In [78]:
def Emojify(Input_shape , word_to_vec_map , word_to_index):
    """Build the (uncompiled) sentence-to-emoji Keras model.

    Architecture: a frozen embedding layer initialised from the matrix
    returned by ``emb_matrix``, three stacked 128-unit LSTM layers each
    followed by Dropout(0.5) (only the last LSTM returns a single vector
    instead of a sequence), two 128-unit ReLU dense layers and a 5-unit
    softmax output.

    Parameters
    ----------
    Input_shape : tuple or int
        Shape of one input sample, e.g. ``(max_len,)``. A bare integer is
        accepted and treated as a 1-tuple.
    word_to_vec_map : dict
        Word -> embedding vector.
    word_to_index : dict
        Word -> integer index used in the model input.

    Returns
    -------
    keras.models.Model
        Model mapping index sequences to 5 class probabilities.
    """
    # ``(n)`` without a trailing comma is just the int n; normalise it so
    # Input always receives a proper shape tuple.
    if isinstance(Input_shape, (int, np.integer)):
        Input_shape = (int(Input_shape),)

    # Build the weight matrix once and read both layer dimensions from it,
    # instead of probing the vector of one hard-coded word.
    pretrained = emb_matrix(word_to_vec_map, word_to_index)
    vocab_len, emb_dim = pretrained.shape

    sentence_indices = Input(Input_shape)
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    # The layer must be built before its weights can be replaced.
    embedding_layer.build((None,))
    embedding_layer.set_weights([pretrained])
    embeddings = embedding_layer(sentence_indices)

    X = LSTM(128 , return_sequences = True)(embeddings)
    X = Dropout(0.5)(X)
    X = LSTM(128 , return_sequences = True)(X)
    X = Dropout(0.5)(X)
    # Last recurrent layer: keep only the final output for classification.
    X = LSTM(128 , return_sequences = False)(X)
    X = Dropout(0.5)(X)
    X = Dense(128,activation = 'relu')(X)
    X = Dense(128 , activation = 'relu')(X)
    X = Dense(5 , activation = 'softmax')(X)

    model = Model(sentence_indices , X)
    return model

In [79]:
# Pass the input shape as a real 1-tuple: ``(max_len)`` without the trailing
# comma is just the integer max_len, which not every Keras version accepts
# as a shape.
model = Emojify((max_len,) , word_to_vec_map , word_to_index)

In [80]:
# Categorical cross-entropy matches the model's softmax output and the
# one-hot labels produced by convert_to_one_hot; accuracy is tracked as the
# reported metric.
model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam' , metrics = ['accuracy'])

In [81]:
# Turn both splits into model-ready arrays: sentences become zero-padded
# index matrices with max_len columns, labels become one-hot rows over
# 5 classes.
X_train_ind = sentences_to_indices(X_train , word_to_index , max_len)
y_train_hot = convert_to_one_hot(y_train , 5)
X_test_ind = sentences_to_indices(X_test , word_to_index , max_len)
y_test_hot = convert_to_one_hot(y_test , 5)

In [82]:
# Train for 50 epochs in shuffled mini-batches of 32. fit() returns a History
# object, which the notebook echoes as this cell's output.
model.fit(X_train_ind , y_train_hot , epochs = 50 , batch_size = 32 , shuffle = True )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x272c0940488>

In [86]:
# Evaluate on the held-out split. With the single 'accuracy' metric set in
# compile, evaluate() yields two values: the loss and the accuracy.
loss , acur = model.evaluate(X_test_ind , y_test_hot)
print("Loss : ", loss)
print("Accuracy of test set : " , acur)

Loss :  0.2687842845916748
Accuracy of test set :  0.8928571343421936


In [87]:
# Presumably the number of emoji classes; not read in this cell, kept in case
# another cell relies on it.
C = 5
pred = model.predict(X_test_ind)
# List every misclassified test sentence. The predicted class is the column
# with the highest probability in each row of ``pred``; it is computed once
# for all rows, and the dead per-iteration ``x = X_test_ind`` assignment is
# gone.
for i, num in enumerate(np.argmax(pred, axis=1)):
    if(num != y_test[i]):
        # Output layout: expected emoji, then the sentence followed by the
        # predicted emoji after the "prediction:" label.
        print('Expected emoji:'+ label_to_emoji(y_test[i]) + ' prediction: '+ X_test[i] + label_to_emoji(num).strip())

Expected emoji:😄 prediction: she got me a nice present	❤️
Expected emoji:😞 prediction: work is hard	😄
Expected emoji:😞 prediction: This girl is messing with me	❤️
Expected emoji:😞 prediction: work is horrible	😄
Expected emoji:🍴 prediction: any suggestions for dinner	😄
Expected emoji:❤️ prediction: I love taking breaks	😄


In [141]:
# Try the trained model on a hand-written sentence. It is wrapped in a
# one-element array because sentences_to_indices reads X.shape[0].
x = np.array(['i love to eat pizza'])
X_ind = sentences_to_indices(x, word_to_index, maxLen)
# argmax over the single prediction row gives the class label to render.
print(x[0] +' '+  label_to_emoji(np.argmax(model.predict(X_ind))))

i love to eat pizza 🍴
