In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [1]:
import os
# Make the project folder on the mounted Drive the working directory, so the
# relative paths used in later cells (e.g. './Dataset/emoji_data.csv') resolve against it.
os.chdir('/content/gdrive/MyDrive/Colab Notebooks/Emojify')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from model_utils import *

Read the dataset

In [3]:
# Load the dataset from CSV, explicitly selecting pandas' Python parsing engine.
data = pd.read_csv('./Dataset/emoji_data.csv', engine = 'python')

In [4]:
# Preview the first rows of the dataset.
data.head()

Unnamed: 0,Text,Label
0,During the period of falling in love each tim...,joy
1,When I was involved in a traffic accident .,fear
2,When I was driving home after several days of ...,anger
3,When I lost the person who meant the most to me .,sadness
4,The time I knocked a deer down - the sight of ...,disgust


In [5]:
# Pull the sentence column and the emotion-label column out of the DataFrame.
text, label = data['Text'], data['Label']

Unique labels

In [6]:
# Distinct values found in the label column.
labels = label.unique()
print(labels)

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


C = Number of unique labels in the dataset

In [7]:
# Number of classes; used as the width of the one-hot label vectors.
C = len(labels)

In [8]:
# Emotion name -> emoji displayed for that emotion.
emoji_dict = {
    "joy": "😅",
    "fear": "😱",
    "anger": "😠",
    "sadness": "😢",
    "disgust": "😒",
    "shame": "😔",
    "guilt": "😳",
}
# Emotion name -> integer class id, numbered in the order the emotions are listed above.
emoji_label = {emotion: class_id for class_id, emotion in enumerate(emoji_dict)}
# Integer class id -> emoji, used to render a predicted class index.
no_emoji_dict = {class_id: emoji_dict[emotion] for emotion, class_id in emoji_label.items()}

In [9]:
# Encode the emotion names as integer class ids.
# Series.map builds a NEW Series from the untouched DataFrame column, so this cell
# can be re-run safely (the previous element-by-element assignment overwrote the
# strings in place and raised KeyError on a second run) and avoids pandas'
# chained-assignment path.
label = data['Label'].map(emoji_label)
# map() yields NaN for names missing from emoji_label; keep failing loudly in that case.
if label.isna().any():
    raise KeyError("Unknown label(s): {}".format(sorted(set(data['Label'][label.isna()]))))

Converting labels to one hot vector

In [10]:
# One-hot encode the integer class ids with C classes.
# convert_to_one_hot is not defined in this notebook; presumably it comes from the
# `model_utils` star import and returns one length-C row per example — verify there.
output_label = convert_to_one_hot(label, C)

In [11]:
# NumPy array holding the contents of the sentence column.
texts = np.array(text)

In [12]:
# Number of whitespace-separated tokens in each sentence.
# A comprehension is used (and the iteration variable is not named `text`) so that
# the `text` Series defined earlier is no longer overwritten by the last sentence.
count = [len(sentence.split()) for sentence in texts]

In [13]:
# Distribution of sentence lengths: token count -> number of sentences with that length.
se = pd.Series(np.array(count)).value_counts()
# se[0: 30]

In [14]:
# Maximum sentence length in tokens. Used below to filter the dataset and as the
# fixed input length of the model.
max_length = 10
# Total number of sentences before filtering.
print(len(texts))

7468


In [15]:
# Keep only the sentences with at most max_length whitespace-separated tokens,
# together with their one-hot labels.
kept_idx = [i for i in range(len(texts)) if len(texts[i].split()) <= max_length]
final_texts = [texts[i] for i in kept_idx]
final_labels = [output_label[i] for i in kept_idx]

In [16]:
# Convert the kept sentences to a NumPy array.
final_texts = np.array(final_texts);
# Number of sentences that passed the length filter.
print(len(final_texts))

1279


Splitting data into training set and test set

In [17]:
# Hold out 5% of the examples for testing. random_state pins the shuffle so that the
# split (and therefore the reported accuracy) is reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(final_texts, final_labels, test_size = 0.05, shuffle = True, random_state = 42)

In [18]:
# Sizes of the training and test splits.
print(X_train.shape)
print(X_test.shape)

(1215,)
(64,)


In [19]:
# Show the first five training sentences next to the emoji of their label
# (argmax of the one-hot vector gives the class id).
for i in range(5):
    print(X_train[i], no_emoji_dict[np.argmax(Y_train[i])])
    print()

My father left us for another woman . 😢

When I had a little accident with my boss'car . 😳

After each successfully passed exam I am happy . 😅

Not having good marks like other people for homeworks . 😔

I couldn't fulfill a promise . 😳



* **word_to_index**: dictionary mapping from words to their indices in the vocabulary
(400,000 words in the GloVe file loaded below; the embedding matrix built later has one extra row, so its valid indices range from 0 to 400,000)
* **index_to_word**: dictionary mapping from indices to their corresponding words in the vocabulary
* **word_to_vec_map**: dictionary mapping words to their GloVe vector representation.

In [20]:
# !wget -O achive 'https://storage.googleapis.com/kaggle-data-sets/13926/18767/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210524%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210524T072942Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=6a3751b0f2cd83dbbee80d2590ec9e9c59245acfd000ee6e3a52abc4b289e95fd94244e5e93f8af55a5f98ae194e45b8561c1df139d0c65d448087f9935f3f7a69287b44ba5cbd71b844f7db89393e52926bb0402d72c43fc42bb1019fb86be7e5e2bcc5d82845ecd525620c04348ba650fac2bb90c5250cb97ab16b202e2654444aaaef4cd016c11545e37ee27b270c959d443768eab6edaa13b4e4f0904605bb5ea8f9a85264f9a7ffdfd299203706f245f0cdad40f1293b3725c3b399c2d8a29ba7ee6f1af8929cee78644953fdc29bb1e1a9d719dc4e5251e1e6df5527c867545e9f560def84587e4b644b71746d7bfa3ca831b7756c30fad2fcebcf7c89'

In [21]:
# !unzip achive

In [22]:
# Load the GloVe vocabulary and vectors from the 200-dimensional glove.6B file.
# read_glove_vecs is not defined in this notebook; presumably it comes from the
# `model_utils` star import — verify there.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('./glove.6B.200d.txt')

In [23]:
# Vocabulary size.
len(word_to_index)

400000

checking whether it works

In [24]:
# Sanity check: look up the vocabulary index of a known word.
word = 'hello'
print('Index of ', word, ' in the voucabulary is: ', word_to_index[word])

Index of  hello  in the voucabulary is:  176468


In [25]:
# Sanity check of the reverse mapping, using the index printed for 'hello' above.
ind = 176468
print('word at ', ind, ' index is: ', index_to_word[ind])

word at  176468  index is:  hello


# MODEL
Let's build an LSTM model that takes word sequence as input
* We will feed word embeddings into a Bidirectional LSTM.
* The LSTM will learn to predict the most appropriate emoji.

In [26]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [31]:
def sentences_to_indices(X, word_to_index, max_length):
    """
    Convert an array of sentences into a zero-padded matrix of vocabulary indices.

    Each sentence is lower-cased and split on whitespace. Words that are not in
    `word_to_index` are replaced by the token 'unk'. Sentences shorter than
    `max_length` are right-padded with zeros; sentences longer than `max_length`
    are truncated to their first `max_length` words (previously they raised an
    IndexError).

    Arguments:
    X -- array of sentences (strings), of shape (m,)
    word_to_index -- dictionary mapping each word to its index
    max_length -- number of columns of the output, i.e. maximum words kept per sentence

    Returns:
    X_indices -- float array of shape (m, max_length) holding the word indices

    Raises:
    KeyError -- if an out-of-vocabulary word is met and 'unk' is not in `word_to_index`
    """
    # number of examples
    m = X.shape[0]
    X_indices = np.zeros((m, max_length))

    for i in range(m):
        # Only the first max_length words have a column to go to.
        sentence_words = X[i].lower().split()[:max_length]
        for j, w in enumerate(sentence_words):
            # Fall back to the 'unk' token for out-of-vocabulary words.
            if w not in word_to_index:
                w = 'unk'
            X_indices[i, j] = word_to_index[w]

    return X_indices

In [32]:
# Quick check of sentences_to_indices on two short sentences; unused positions
# up to max_length stay at 0.
X_check = np.array(["I love problem solving", "Always keep smiling"])
X_check_indices = sentences_to_indices(X_check, word_to_index, max_length = max_length)
print("X_check =", X_check)
print("X_check_indices =\n", X_check_indices)

X_check = ['I love problem solving' 'Always keep smiling']
X_check_indices =
 [[185457. 226278. 292794. 337201.      0.      0.      0.      0.      0.
       0.]
 [ 52879. 204679. 335373.      0.      0.      0.      0.      0.      0.
       0.]]


In [33]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Create a non-trainable Keras Embedding layer initialised with pre-trained word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation
    word_to_index -- dictionary mapping words to their row index in the embedding matrix

    Returns:
    embedding_layer -- Keras Embedding layer whose weights are the word vectors
    """
    # One extra row on top of the vocabulary size (Keras Embedding requirement:
    # input_dim must exceed the largest index).
    vocab_len = len(word_to_index) + 1

    # Dimensionality of the word vectors, read from an arbitrary entry so the
    # function does not depend on a particular word being in the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Embedding matrix: row `idx` holds the vector of the word whose index is `idx`;
    # rows not assigned to any word stay all-zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Embedding layer with matching input/output sizes, frozen during training.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)

    # The layer must be built before its weights can be set.
    embedding_layer.build((None, ))

    # Load the pre-trained vectors into the layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [34]:
# Build a frozen GloVe embedding layer for inspection in the next cells.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

In [35]:
# Shape of the layer's weight list: (number of weight arrays, matrix rows, embedding dimension).
np.array(embedding_layer.get_weights()).shape

(1, 400001, 200)

In [36]:
# Spot-check a single entry of the embedding matrix (row 1, component 2).
print("weights[0][1][2] =", embedding_layer.get_weights()[0][1][2])

weights[0][1][2] = -0.49917


In [37]:
def Emojify(input_shape, word_to_vec_map, word_to_index, num_classes = 7):
    """
    Build the Emojify classifier: frozen pre-trained embeddings, two stacked
    bidirectional LSTM layers with dropout in between, and a softmax output layer.

    Arguments:
    input_shape -- shape of one input example, e.g. (max_length,)
    word_to_vec_map -- dictionary mapping words to their vector representation
    word_to_index -- dictionary mapping words to their vocabulary indices
    num_classes -- number of output classes (default 7, the number of emotions)

    Returns:
    model -- an uncompiled Keras Model mapping index sequences to class probabilities
    """
    # Input: a batch of index sequences (integers, hence dtype 'int32').
    sentence_indices = Input(input_shape, dtype = 'int32')

    # Embedding layer pre-trained with the word vectors.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

    # Propagate sentence_indices through the embedding layer.
    embeddings = embedding_layer(sentence_indices)

    # First bidirectional LSTM (256 units per direction), returning the full sequence.
    # NOTE: an earlier 128-unit bidirectional LSTM was created here but its output
    # was immediately overwritten, so it never became part of the model; it is removed.
    X = Bidirectional(LSTM(256, return_sequences = True))(embeddings)

    # Dropout with a rate of 0.2.
    X = Dropout(0.2)(X)

    # Second bidirectional LSTM (256 units per direction), returning only the
    # final hidden state rather than the whole sequence.
    X = Bidirectional(LSTM(256, return_sequences = False))(X)

    # Output layer: one unit per class with softmax, so the outputs form a probability
    # distribution as expected by the categorical cross-entropy loss used for training.
    X = Dense(units = num_classes, activation = 'softmax')(X)

    # Model instance mapping sentence_indices to the class probabilities.
    model = Model(sentence_indices, X)

    return model

In [38]:
# Build the model for inputs of max_length word indices and print its layer summary.
model = Emojify((max_length,), word_to_vec_map, word_to_index)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 10, 200)           80000200  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 10, 512)           935936    
_________________________________________________________________
dropout (Dropout)            (None, 10, 512)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dense (Dense)                (None, 7)                 3591      
Total params: 82,514,639
Trainable params: 2,514,439
Non-trainable params: 80,000,200
_________________________________________

In [39]:
# Categorical cross-entropy over the one-hot labels, Adam with learning rate 1e-4,
# and accuracy as the reported metric.
model.compile(loss = 'categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001), metrics = ['accuracy'])

In [40]:
# Index-encode the training sentences into a zero-padded (m, max_length) matrix.
X_train_indices = sentences_to_indices(X_train, word_to_index, max_length)

In [41]:
# Convert the label lists returned by train_test_split to NumPy arrays, and
# index-encode the held-out sentences the same way as the training sentences.
Y_train = np.array(Y_train)
X_test_indices = sentences_to_indices(X_test, word_to_index, max_length = max_length)
Y_test = np.array(Y_test)

In [42]:
# Train for 25 epochs with mini-batches of 4 examples.
# NOTE(review): the per-epoch validation metrics are computed on the same split that
# is later passed to model.evaluate as the test set.
model.fit(X_train_indices, Y_train, epochs = 25, batch_size = 4, validation_data = (X_test_indices, Y_test))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f27805f7950>

In [43]:
# Loss and accuracy on the held-out split.
loss, acc = model.evaluate(X_test_indices, Y_test)
print("Test accuracy = ", acc)

Test accuracy =  0.671875


Try your examples here!

In [44]:
# Classify a custom sentence and print it with the predicted emoji.
# The encoded input gets its own name so that X_test_indices (the held-out test set)
# is not overwritten; otherwise re-running the evaluation cell after this one would
# pair a single input row with all of Y_test.
x_in = np.array(['feeling sad more than feeling happy'])
x_in_indices = sentences_to_indices(x_in, word_to_index, max_length)
print(x_in[0] +' '+  no_emoji_dict[np.argmax(model.predict(x_in_indices))])

feeling sad more than feeling happy 😢


In [45]:
 import time
# The file name is the current Unix time in whole seconds, with an .h5 extension.
saved_model_path = f"./{int(time.time())}.h5"

# Persist the trained model to that path.
model.save(saved_model_path)

In [None]:
!pip install tensorflowjs

In [47]:
!tensorflowjs_converter --input_format=keras {saved_model_path} ./Model

2021-05-24 11:25:34.059345: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [48]:
ls Model

group1-shard10of79.bin  group1-shard35of79.bin  group1-shard5of79.bin
group1-shard11of79.bin  group1-shard36of79.bin  group1-shard60of79.bin
group1-shard12of79.bin  group1-shard37of79.bin  group1-shard61of79.bin
group1-shard13of79.bin  group1-shard38of79.bin  group1-shard62of79.bin
group1-shard14of79.bin  group1-shard39of79.bin  group1-shard63of79.bin
group1-shard15of79.bin  group1-shard3of79.bin   group1-shard64of79.bin
group1-shard16of79.bin  group1-shard40of79.bin  group1-shard65of79.bin
group1-shard17of79.bin  group1-shard41of79.bin  group1-shard66of79.bin
group1-shard18of79.bin  group1-shard42of79.bin  group1-shard67of79.bin
group1-shard19of79.bin  group1-shard43of79.bin  group1-shard68of79.bin
group1-shard1of79.bin   group1-shard44of79.bin  group1-shard69of79.bin
group1-shard20of79.bin  group1-shard45of79.bin  group1-shard6of79.bin
group1-shard21of79.bin  group1-shard46of79.bin  group1-shard70of79.bin
group1-shard22of79.bin  group1-shard47of79.bin  group1-shard71of79.bin
group1-s