In [88]:
# https://colab.research.google.com/drive/1uFJBO1pgsiFwCGIJwZlhUzaJ2srDbtw-#scrollTo=LjMMYJv85hVT
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [3]:
import pandas as pd
import numpy as np
import torch
import torchvision
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,TensorDataset
from sklearn.model_selection import train_test_split
import os
import unicodedata
import re
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import time

In [8]:
def load_data(loc = '../input/spa-eng/spa.txt', num_samples = 30000 ):
    file = open(loc, encoding='UTF-8').read().strip().split('\n')  
    original_word_pairs = [[w for w in l.split('\t')] for l in file[:num_samples]]
    data = pd.DataFrame(original_word_pairs, columns=["eng", "es"])
    return data

# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """
    Normalizes latin chars with accent to their canonical decomposition
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
    w = w.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w
    

In [9]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
    def __init__(self, lang):
        """ lang are the list of phrases from each language"""
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        
        self.create_index()
        
    def create_index(self):
        for phrase in self.lang:
            # update with individual tokens
            self.vocab.update(phrase.split(' '))
            
        # sort the vocab
        self.vocab = sorted(self.vocab)

        # add a padding token with index 0
        self.word2idx['<pad>'] = 0
        
        # word to index mapping
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1 # +1 because of pad token
        
        # index to word mapping
        for word, index in self.word2idx.items():
            self.idx2word[index] = word        

In [10]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [11]:
data = load_data()
data.head(10)

Unnamed: 0,eng,es
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
5,Run!,¡Corre!
6,Run.,Corred.
7,Who?,¿Quién?
8,Fire!,¡Fuego!
9,Fire!,¡Incendio!


In [12]:
# Now we do the preprocessing using pandas and lambdas
data["eng"] = data.eng.apply(lambda w: preprocess_sentence(w))
data["es"] = data.es.apply(lambda w: preprocess_sentence(w))
data.sample(10)

Unnamed: 0,eng,es
9766,<start> a bird has wings . <end>,<start> los pajaros tienen alas . <end>
6747,<start> that s for you . <end>,<start> es para ti . <end>
10094,<start> father came home . <end>,<start> papa volvio a casa . <end>
1626,<start> cook for me . <end>,<start> cocina para mi . <end>
8867,<start> somebody called . <end>,<start> alguien llamo . <end>
1483,<start> watch this . <end>,<start> mira esto . <end>
12576,<start> you re necessary . <end>,<start> eres necesario . <end>
20039,<start> i go to work by bus . <end>,<start> voy al trabajo en autobus . <end>
11061,<start> it hurts so much . <end>,<start> duele tanto . . . <end>
7027,<start> tom is spoiled . <end>,<start> tom esta malcriado . <end>


In [44]:
# index language using the class above
inp_lang = LanguageIndex(data["es"].values.tolist())
targ_lang = LanguageIndex(data["eng"].values.tolist())
# Vectorize the input and target languages
input_tensor = [[inp_lang.word2idx[s] for s in es.split(' ')]  for es in data["es"].values.tolist()]
target_tensor = [[targ_lang.word2idx[s] for s in eng.split(' ')]  for eng in data["eng"].values.tolist()]
input_tensor[:10]

[[5, 9090, 3, 4],
 [5, 9204, 3, 4],
 [5, 9082, 3, 4],
 [5, 9089, 3, 4],
 [5, 4702, 3, 4],
 [5, 2299, 1, 4],
 [5, 2304, 3, 4],
 [5, 9413, 7433, 6, 4],
 [5, 4270, 1, 4],
 [5, 4881, 1, 4]]

In [45]:
target_tensor[:10]

[[5, 1857, 3, 4],
 [5, 1857, 3, 4],
 [5, 1857, 3, 4],
 [5, 1857, 3, 4],
 [5, 2058, 3, 4],
 [5, 3655, 1, 4],
 [5, 3655, 3, 4],
 [5, 4815, 6, 4],
 [5, 1636, 1, 4],
 [5, 1636, 1, 4]]

In [46]:
# calculate the max_length of input and output tensor
max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
print("max_length_inp ",max_length_inp)
print("max_length_tar ",max_length_tar)

max_length_inp  16
max_length_tar  11


In [47]:
def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len: padded[:] = x[:max_len]
    else: padded[:len(x)] = x
    return padded

In [48]:
# inplace padding
input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
len(target_tensor)

30000

In [49]:
input_tensor[0]

array([   5, 9090,    3,    4,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0], dtype=int64)

In [50]:
print("max_length_inp :", max_length_inp)
print("max_length_tar :",max_length_tar)

max_length_inp : 16
max_length_tar : 11


In [51]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(24000, 24000, 6000, 6000)

In [76]:
# train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
# valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
# test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# train_dataset = MyData(input_tensor_train, target_tensor_train)
# val_dataset = MyData(input_tensor_val, target_tensor_val)

# dataset = DataLoader(train_dataset, batch_size = BATCH_SIZE, 
#                      drop_last=True,
#                      shuffle=True)

In [54]:
train_dataset = TensorDataset(torch.from_numpy(np.array(input_tensor_train)), torch.from_numpy(np.array(target_tensor_train)))
val_dataset = TensorDataset(torch.from_numpy(np.array(input_tensor_val)), torch.from_numpy(np.array(target_tensor_val)))

In [55]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

In [56]:
dataset = DataLoader(train_dataset, batch_size = BATCH_SIZE, 
                     drop_last=True,
                     shuffle=True)

In [117]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
#         self.gru = nn.GRU(hidden_size, hidden_size,batch_first=True)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        print(embedded.shape)
        output = embedded
        print(output.shape)
        output, hidden = self.lstm(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [118]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [119]:
### Testing Encoder part
# TODO: put whether GPU is available or not
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
encoder = EncoderRNN(vocab_inp_size, units)
encoder.to(device)


EncoderRNN(
  (embedding): Embedding(9414, 1024)
  (lstm): LSTM(1024, 1024, batch_first=True)
)

In [120]:
decoder = DecoderRNN(units,vocab_tar_size)
decoder = decoder.to(device)

In [121]:
it = iter(dataset)
x, y = next(it)

In [122]:
x.shape

torch.Size([64, 16])

In [123]:
enc_output, enc_hidden = encoder(x.to(device), units)

torch.Size([1, 1, 1048576])
torch.Size([1, 1, 1048576])


RuntimeError: input.size(-1) must be equal to input_size. Expected 1024, got 1048576

In [116]:
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

encoder.to(device)
# obtain one sample from the data iterator
it = iter(dataset)
x, y, x_len = next(it)

print("Input: ", x.shape)
print("Output: ", y.shape)

# sort the batch first to be able to use with pac_pack_sequence
xs, ys, lens = sort_batch(x, y, x_len)

enc_output, enc_hidden = encoder(xs.to(device), lens, device)
print("Encoder Output: ", enc_output.shape) # batch_size X max_length X enc_units
print("Encoder Hidden: ", enc_hidden.shape) # batch_size X enc_units (corresponds to the last state)

decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)
decoder = decoder.to(device)

#print(enc_hidden.squeeze(0).shape)

dec_hidden = enc_hidden#.squeeze(0)
dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
print("Decoder Input: ", dec_input.shape)
print("--------")

for t in range(1, y.size(1)):
    # enc_hidden: 1, batch_size, enc_units
    # output: max_length, batch_size, enc_units
    predictions, dec_hidden, _ = decoder(dec_input.to(device), 
                                         dec_hidden.to(device), 
                                         enc_output.to(device))
    
    print("Prediction: ", predictions.shape)
    print("Decoder Hidden: ", dec_hidden.shape)
    
    #loss += loss_function(y[:, t].to(device), predictions.to(device))
    
    dec_input = y[:, t].unsqueeze(1)
    print(dec_input.shape)
    break

NameError: name 'Encoder' is not defined

In [85]:
criterion = nn.CrossEntropyLoss()

def loss_function(real, pred):
    """ Only consider non-zero inputs in the loss; mask needed """
    #mask = 1 - np.equal(real, 0) # assign 0 to all above 0 and 1 to all 0s
    #print(mask)
    mask = real.ge(1).type(torch.cuda.FloatTensor)
    
    loss_ = criterion(pred, real) * mask 
    return torch.mean(loss_)

In [86]:
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## TODO: Combine the encoder and decoder into one class
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

encoder.to(device)
decoder.to(device)

optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), 
                       lr=0.001)

In [89]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    encoder.train()
    decoder.train()
    
    total_loss = 0
    
    for (batch, (inp, targ, inp_len)) in enumerate(dataset):
        loss = 0
        
        xs, ys, lens = sort_batch(inp, targ, inp_len)
        enc_output, enc_hidden = encoder(xs.to(device), lens, device)
        dec_hidden = enc_hidden
        
        # use teacher forcing - feeding the target as the next input (via dec_input)
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
        
        # run code below for every timestep in the ys batch
        for t in range(1, ys.size(1)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device), 
                                         dec_hidden.to(device), 
                                         enc_output.to(device))
            loss += loss_function(ys[:, t].to(device), predictions.to(device))
            #loss += loss_
            dec_input = ys[:, t].unsqueeze(1)
            
        
        batch_loss = (loss / int(ys.size(1)))
        total_loss += batch_loss
        
        optimizer.zero_grad()
        
        loss.backward()

        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.detach().item()))
        
        
    ### TODO: Save checkpoint for model
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
    
    
    
    

Epoch 1 Batch 0 Loss 4.6137
Epoch 1 Batch 100 Loss 1.5926
Epoch 1 Batch 200 Loss 1.2573
Epoch 1 Batch 300 Loss 1.1738
Epoch 1 Loss 1.4782
Time taken for 1 epoch 184.451566696167 sec

Epoch 2 Batch 0 Loss 0.8271
Epoch 2 Batch 100 Loss 0.6359
Epoch 2 Batch 200 Loss 0.7116
Epoch 2 Batch 300 Loss 0.7174
Epoch 2 Loss 0.7487
Time taken for 1 epoch 199.08265566825867 sec

Epoch 3 Batch 0 Loss 0.4353
Epoch 3 Batch 100 Loss 0.3932
Epoch 3 Batch 200 Loss 0.3396
Epoch 3 Batch 300 Loss 0.4074
Epoch 3 Loss 0.4208
Time taken for 1 epoch 359.9769802093506 sec

Epoch 4 Batch 0 Loss 0.2179
Epoch 4 Batch 100 Loss 0.2796
Epoch 4 Batch 200 Loss 0.2639
Epoch 4 Batch 300 Loss 0.2271
Epoch 4 Loss 0.2462
Time taken for 1 epoch 340.2648720741272 sec

Epoch 5 Batch 0 Loss 0.1401
Epoch 5 Batch 100 Loss 0.1557
Epoch 5 Batch 200 Loss 0.1737
Epoch 5 Batch 300 Loss 0.2161
Epoch 5 Loss 0.1589
Time taken for 1 epoch 323.82831716537476 sec

Epoch 6 Batch 0 Loss 0.0727
Epoch 6 Batch 100 Loss 0.1101
Epoch 6 Batch 200 Los