In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [2]:
import os
os.chdir('/content/gdrive/MyDrive/Colab Notebooks/Emojify')

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from model_utils import *

Read the dataset

In [2]:
data = pd.read_csv('./Dataset/emoji_data.csv', engine = 'python')

In [3]:
data.head()

Unnamed: 0,Text,Label
0,During the period of falling in love each tim...,joy
1,When I was involved in a traffic accident .,fear
2,When I was driving home after several days of ...,anger
3,When I lost the person who meant the most to me .,sadness
4,The time I knocked a deer down - the sight of ...,disgust


In [4]:
text = data['Text']
label = data['Label']

Unique labels

In [5]:
labels = label.unique()
print(labels)

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


C = Number of unqiue labels in the dataset

In [6]:
C = len(labels)

In [7]:
emoji_dict = {"joy" : "😂", "fear" : "😱", "anger" : "😠", "sadness" : "😢", "disgust" : "😒", "shame" : "😔", "guilt" : "😳"}
emoji_label = {"joy" : 0, "fear" : 1, "anger" : 2, "sadness" : 3, "disgust" : 4, "shame" : 5, "guilt" : 6}
no_emoji_dict = {0 : "😂", 1 : "😱", 2 : "😠", 3 : "😢", 4 : "😒", 5 : "😔", 6 : "😳"}

In [8]:
for i in range(len(label)):
  label[i] = emoji_label[label[i]]

In [9]:
label

0       0
1       1
2       2
3       3
4       4
       ..
7463    4
7464    5
7465    6
7466    0
7467    1
Name: Label, Length: 7468, dtype: object

Converting labels to one hot vector

In [11]:
output_label = convert_to_one_hot(label, C)

In [12]:
texts = np.array(text)

In [13]:
count = []
for text in texts:
    count.append(len(text))

In [14]:
se = pd.Series(np.array(count)).value_counts()
# se[70:110]

In [15]:
max_length = 50

Splitting data into training set and test set

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(texts, output_label, test_size = 0.01, shuffle = True)

In [17]:
print(X_train.shape)
print(X_test.shape)

(7393,)
(75,)


In [18]:
for i in range(5):
    print(X_train[i], no_emoji_dict[np.argmax(Y_train[i])])
    print()

When my new jeans split while I was with the youth group . 😔

My first real experience with a boy . We were alone . I was quite young and pretty naive and he was pretty crude and real  sort of  macho . It was dark  at night during the summer and we were talking . 😔

A certain night during initiation . 😱

When I was a child  I thought that I had to be ashamed when asking and doing certain forbidden things . 😔

When my brother died . 😢



read_glove_vecs returns:
* word_to_index: dictionary mapping from words to their indices in the vocabulary
(400,001 words, with the valid indices ranging from 0 to 400,000)
* index_to_word: dictionary mapping from indices to their corresponding words in the vocabulary
* word_to_vec_map: dictionary mapping words to their GloVe vector representation.

In [19]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('./glove.6B.200d.txt')

In [20]:
len(word_to_index)

400000

checking whether it works

In [21]:
word = 'hello'
print('Index of ', word, ' in the voucabulary is: ', word_to_index[word])

Index of  hello  in the voucabulary is:  176468


In [22]:
ind = 176468
print('word at ', ind, ' index is: ', index_to_word[ind])

word at  176468  index is:  hello


# MODEL
Let's build an LSTM model that takes word sequence as input
* Emojifier-V2 will continue to use pre-trained word embeddings to represent words.
* We will feed word embeddings into an Bidirectional LSTM.
* The LSTM will learn to predict the most appropriate emoji.

In [23]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [24]:
def sentences_to_indices(X, word_to_index, max_length):
    # number of training examples
    m = X.shape[0]
    X_indices = np.zeros((m, max_length))
    
    # loop over training examples
    for i in range(m):  
        sentence_words = X[i].lower().split()
        j = 0
        # Loop over the words of sentence_words
        for w in sentence_words:
            if j >= 50:
                break
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            if(w not in word_to_index):
                w = 'unk'
            X_indices[i, j] = word_to_index[w]
            j = j+1
    
    return X_indices

In [25]:
X_check = np.array(["interesting problem", "yummy", "the end game"])
X_check_indices = sentences_to_indices(X_check,word_to_index, max_length = 5)
print("X_check =", X_check)
print("X_check_indices =\n", X_check_indices)

X_check = ['interesting problem' 'yummy' 'the end game']
X_check_indices =
 [[191290. 292794.      0.      0.      0.]
 [394957.      0.      0.      0.      0.]
 [357266. 136979. 157049.      0.      0.]]


In [26]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    
    # adding 1 to fit Keras embedding (requirement)
    vocab_len = len(word_to_index) + 1
    
    # define dimensionality of GloVe word vectors (= 200)
    emb_dim = word_to_vec_map["hello"].shape[0]
    
    # Initialize the embedding matrix as a numpy array of zeros.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Set each row "idx" of the embedding matrix to be the word
    # vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Keras embedding layer with the correct input and output size and make it non-trainable.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer. 
    embedding_layer.build((None, ))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [27]:
embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

In [28]:
print("weights[0][1][2] =", embedding_layer.get_weights()[0][1][2])

weights[0][1][2] = -0.49917


In [29]:
np.array(embedding_layer.get_weights()).shape

(1, 400001, 200)

In [30]:
def Emojify(input_shape, word_to_vec_map, word_to_index):
    
    # Define sentence_indices as the input of the graph.
    # It should be of shape input_shape and dtype 'int32' (as it contains indices, which are integers).
    sentence_indices = Input(input_shape, dtype = 'int32')
  
    # Create the embedding layer pretrained with GloVe Vectors
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through the embedding layer
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # The returned output should be a batch of sequences.
    X = Bidirectional(LSTM(128, return_sequences = True))(embeddings)
    
    # Add dropout with a probability of 0.4
    X = Dropout(0.4)(X)
    
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # The returned output should be a single hidden state, not a batch of sequences.
    X =  Bidirectional(LSTM(128, return_sequences = False))(X)
    
    # Add dropout with a probability of 0.4
    X =  Dropout(0.4)(X)
    
    # Propagate X through a Dense layer with 5 units
    X = Dense(units = 7)(X)
    
    # Add a softmax activation
    X =  Activation('softmax')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(sentence_indices, X)
    
    return model

In [31]:
model = Emojify((max_length,), word_to_vec_map, word_to_index)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 200)           80000200  
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 256)           336896    
_________________________________________________________________
dropout (Dropout)            (None, 50, 256)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 7)                

In [32]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [33]:
X_train_indices = sentences_to_indices(X_train, word_to_index, max_length)

In [34]:
model.fit(X_train_indices, Y_train, epochs = 50, batch_size = 32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f561e04ee10>

In [35]:
X_test_indices = sentences_to_indices(X_test, word_to_index, max_length = max_length)
loss, acc = model.evaluate(X_test_indices, Y_test)
print("Test accuracy = ", acc)

Test accuracy =  0.6266666650772095


Try your examples here!

In [53]:
x_in = np.array(['feeling sad more than feeling happy'])
X_test_indices = sentences_to_indices(x_in, word_to_index, max_length)
print(x_in[0] +' '+  no_emoji_dict[np.argmax(model.predict(X_test_indices))])

feeling sad more than feeling happy 😢
