In [1]:
import numpy as np
from emo_utils import *
import emoji
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [2]:
!ls data 
!head -n 10 data/train_emoji.csv
X_train, Y_train = read_csv('data/train_emoji.csv')
X_test, Y_test = read_csv('data/tesss.csv')

emojify_data.csv tesss.csv        train_emoji.csv
glove.6B.50d.txt test_emoji.csv
never talk to me again,3,,
I am proud of your achievements,2,,
It is the worst day in my life,3,,
Miss you so much,0,, [0]
food is life,4,,
I love you mum,0,,
Stop saying bullshit,3,,
congratulations on your acceptance,2,,
The assignment is too long ,3,,
I want to go play,1,, [3]


In [3]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = X[i].lower().split()
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            X_indices[i, j] = word_to_index[w]
            # Increment j to j + 1
            j = j + 1
            
    ### END CODE HERE ###
    
    return X_indices

In [4]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    ### START CODE HERE ###
    # Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors = emb_dim)
    emb_matrix = np.zeros((vocab_len, emb_dim))
    print(emb_matrix.shape)
    print(len(word_to_index.items()))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    for word, index in word_to_index.items():
        try:
            emb_matrix[index, :] = word_to_vec_map[word]
        except KeyError:
            print(word)

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    ### END CODE HERE ###

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data

torch.manual_seed(1)

<torch._C.Generator at 0x1a1a1a79f0>

In [6]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [70]:
class EmojisDataset(Data.Dataset):
    """Emojiss dataset."""

    def __init__(self, csv_file, word_to_index, transform=None, one_hot=False):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        sentences, labels = read_csv(csv_file)
        max_len = len(max(sentences, key=len).split())
        print("max_len = {}".format(max_len))
        self.sentences = sentences_to_indices(sentences, word_to_index, max_len)
        self.labels = convert_to_one_hot(labels, C = 5) if one_hot else labels
        self.transform = transform

    def __len__(self):
        return len(self.sentences) if self.sentences is not None else 0

    def __getitem__(self, index):
        sample = self.sentences[index], self.labels[index]

        if self.transform:
            sample = self.transform(sample)

        return sample

In [71]:
train_data = EmojisDataset(
    csv_file='data/train_emoji.csv',
    word_to_index=word_to_index,
)
train_loader = Data.DataLoader(dataset=train_data, batch_size=32, shuffle=True, num_workers=1)

max_len = 10


In [72]:
test_data = EmojisDataset(
    csv_file='data/tesss.csv',
    word_to_index=word_to_index,
)
test_loader = Data.DataLoader(dataset=train_data, batch_size=32, shuffle=True, num_workers=0)

max_len = 8


In [73]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1
    embedding_dim = word_to_vec_map["cucumber"].shape[0]
    embedding_matrix = np.zeros((vocab_len, embedding_dim))
    for word, index in word_to_index.items():
        embedding_matrix[index, :] = word_to_vec_map[word]
        
    weights = torch.FloatTensor(embedding_matrix)
    print("len(weights) = ", len(weights))
    embedding_layer = nn.Embedding.from_pretrained(weights)
    return embedding_layer, embedding_dim

In [91]:
class EmojifyNN(nn.Module):
    
    def __init__(self, word_to_vec_map, word_to_index, hidden_size, num_layers, batch_size, out_features):
        super(EmojifyNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size

        self.hidden = self.init_hidden()
        self.embedding, embedding_dim = pretrained_embedding_layer(word_to_vec_map, word_to_index)
        self.lstm = nn.LSTM(embedding_dim, hidden_size=hidden_size, num_layers=self.num_layers, dropout=0.5)
        self.linear = nn.Linear(in_features=hidden_size, out_features=out_features)
        self.softmax = nn.Softmax(0)

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(self.num_layers, 10, self.hidden_size),
                torch.zeros(self.num_layers, 10, self.hidden_size))
        
    def forward(self, sentence, hidden):
        embeds = self.embedding(sentence)
#         print("Embedding shape = {}".format(embeds.shape))
        lstm_out, _ = self.lstm(embeds, hidden)
#         print("LSTM shape = {}".format(lstm_out.shape))
        lstm_out = lstm_out[:, -1, :]
        linear_out = self.linear(lstm_out)
#         print("Linear shape = {}".format(linear_out.shape))
        out = self.softmax(linear_out)
#         print("Softmax shape = {}\n".format(out.shape))
        return out

In [92]:
model = EmojifyNN(word_to_vec_map, word_to_index, hidden_size=128, num_layers=2, batch_size=32, out_features=5)

len(weights) =  400001


In [93]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-3
optimizer1 = torch.optim.Adam(model.lstm.parameters(), lr=learning_rate)
optimizer2 = torch.optim.Adam(model.linear.parameters(), lr=learning_rate)

In [94]:
for epoch in range(80):  # again, normally you would NOT do 300 epochs, it is toy data
    for step, data in enumerate(train_loader):
        # get the inputs
        inputs, labels = data
        inputs = inputs.long()
        labels = labels.long()
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        hidden = model.init_hidden()

        # Step 3. Run our forward pass.
        outputs = model(inputs, hidden)
        
        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer1.step()
        optimizer2.step()
        
#         if (epoch+1) % 5 == 0:
#             for step, data in enumerate(train_loader):
#                 test_sentences, test_labels = data
#                 test_sentences = test_sentences.long()
#                 test_labels = test_labels.long()
# #                 x_test = test_sentences # np.array(['not feeling happy', 'Holy shit', 'you are so pretty', 'let us play ball'])
# #                 X_test_indices = sentences_to_indices(x_test, word_to_index, 32)
# #                 X_test_indices = torch.from_numpy(X_test_indices)
# #                 X_test_indices = X_test_indices.long()
#                 hidden = model.init_hidden()
#                 prediction = model(test_sentences, hidden)
#                 for i in range(len(test_sentences)):
#                     pred = prediction.data.numpy()
#                     num = np.argmax(pred[i])
#                     print(' prediction: ', test_sentences[i], label_to_emoji(num))
        

In [95]:
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        sentences, labels = data
        sentences = sentences.long()
        labels = labels.long()
        
        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        hidden = model.init_hidden()
        outputs = model(sentences, hidden)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
print('Accuracy of the network: %d %%' % (100 * correct / total))

Accuracy of the network: 25 %
