In [1]:
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import time

with open('./spa-eng/spa.txt') as f:
    spa_eng_list = f.read().strip().split('\n')

len(spa_eng_list)

123376

In [2]:
# sample size
num_examples = 10000

# creates lists containing each pair
original_word_pairs = [[w for w in l.split('\t')] for l in spa_eng_list[:num_examples]]

data = pd.DataFrame(original_word_pairs, columns=["eng", "es"])

data.sample(10)

Unnamed: 0,eng,es
7216,Tom called you.,Tom te llamó.
4510,I lost my hat.,Yo perdí mi sombrero.
180,I tried.,Lo intenté.
8463,I play baseball.,Juego al béisbol.
2254,Should I go?,¿Debería ir?
4799,I'm sensitive.,Soy sensible.
1687,Are you hot?,¿Tenéis calor?
8294,I forgot my key.,Olvidé mi llave.
418,Sit here.,Siéntate aquí.
2504,We're right.,Tenemos razón.


In [3]:
# utils.py

# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """
    Normalizes latin chars with accent to their canonical decomposition
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
    w = w.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

def max_length(tensor):
    return max(len(t) for t in tensor)

def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len:
        padded[:] = x[:max_len]
    else:
        padded[:len(x)] = x
    return padded

# sort batch function to be able to use with pad_packed_sequence
def sort_batch(X, y, lengths):
    lengths, indx = lengths.sort(dim=0, descending=True)
    X = X[indx]
    y = y[indx]
    return X.transpose(0,1), y, lengths # transpose (batch x seq) to (seq x batch)

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count  

In [4]:
# process the data
data["eng"] = data.eng.apply(lambda w: preprocess_sentence(w))
data["es"] = data.es.apply(lambda w: preprocess_sentence(w))
data.sample(10)

Unnamed: 0,eng,es
2234,<start> say goodbye . <end>,<start> deci adios . <end>
9263,<start> that man is tom . <end>,<start> aquel hombre es tomas . <end>
5746,<start> you re unfair . <end>,<start> eres injusto . <end>
3283,<start> life goes on . <end>,<start> la vida continua . <end>
560,<start> go get it . <end>,<start> vete a por ello . <end>
3530,<start> they re here . <end>,<start> aqui estan . <end>
7233,<start> tom died alone . <end>,<start> tom murio solo . <end>
7998,<start> give me a smile . <end>,<start> ponme una sonrisa . <end>
3199,<start> it was magic . <end>,<start> fue magico . <end>
5751,<start> you ve got me . <end>,<start> me han pescado . <end>


In [5]:
# build the vocabulary
class LanguageIndex():
    
    def __init__(self, lang):
        """ lang: the list of phrases from each language """
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        
        self.create_index()
        
    def create_index(self):
        for phrase in self.lang:
            # update with individual tokens
            self.vocab.update(phrase.split(' '))
            
        # sort the vocab
        self.vocab = sorted(self.vocab)

        # add a padding token with index 0
        self.word2idx['<pad>'] = 0
        
        # word to index mapping
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1 # +1 because of pad token
        
        # index to word mapping
        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [6]:
# index language using the class above
inp_lang = LanguageIndex(data["es"].values.tolist())
targ_lang = LanguageIndex(data["eng"].values.tolist())

# Vectorize the input and target languages
input_tensor = [[inp_lang.word2idx[s] for s in es.split(' ')]  for es in data["es"].values.tolist()]
target_tensor = [[targ_lang.word2idx[s] for s in eng.split(' ')]  for eng in data["eng"].values.tolist()]
print(input_tensor[:5])
print(target_tensor[:5])

# calculate the max_length of input and output tensor
max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
print('max_length_inp: %d'%max_length_inp)
print('max_length_tar: %d'%max_length_tar)

# inplace padding
input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
print('len(input_tensor): %d'%len(input_tensor))
print('len(target_tensor): %d'%len(target_tensor))

# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
print('len(input_tensor_train): %d\nlen(target_tensor_train): %d\nlen(input_tensor_val): %d\nlen(target_tensor_val): %d\n'
       % (len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)))

[[5, 4361, 3, 4], [5, 4425, 3, 4], [5, 4353, 3, 4], [5, 4360, 3, 4], [5, 2221, 3, 4]]
[[5, 843, 3, 4], [5, 843, 3, 4], [5, 843, 3, 4], [5, 843, 3, 4], [5, 944, 3, 4]]
max_length_inp: 12
max_length_tar: 8
len(input_tensor): 10000
len(target_tensor): 10000
len(input_tensor_train): 8000
len(target_tensor_train): 8000
len(input_tensor_val): 2000
len(target_tensor_val): 2000



In [7]:
len(targ_lang.vocab)

2285

In [8]:
# conver the data to tensors and pass to the Dataloader to create an batch iterator
class IterData(Dataset):
    
    def __init__(self, X, y):
        self.data = X
        self.target = y
        self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x_len = self.length[index]
        return x, y, x_len
    
    def __len__(self):
        return len(self.data)

In [9]:
# hyperparams
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 16
N_BATCH = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

train_dataset = IterData(input_tensor_train, target_tensor_train)
val_dataset = IterData(input_tensor_val, target_tensor_val)

In [10]:
# model.py

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)
        
    def forward(self, x, lens,  device):
        # x: (batch_size, max_length, embedding_dim)
        x = self.embedding(x) 
        x = pack_padded_sequence(x, lens)
        
        self.hidden = self.initialize_hidden_state(device)
        
        output, self.hidden = self.gru(x, self.hidden)
        output, _ = pad_packed_sequence(output)
        
        return output, self.hidden
        
    def initialize_hidden_state(self, device):
        return torch.zeros((1, self.batch_size, self.enc_units)).to(device)
    
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dec_units, enc_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim + self.enc_units,
                                         self.dec_units,
                                         batch_first=True)
        self.fc = nn.Linear(self.enc_units, self.vocab_size)
        
        # for attention
        self.W1 = nn.Linear(self.enc_units, self.dec_units)
        self.W2 = nn.Linear(self.enc_units, self.dec_units)
        self.V = nn.Linear(self.enc_units, 1)
        
    def forward(self, x, hidden, enc_output):
        # enc_output original: (max_length, batch_size, enc_units)
        # enc_output converted == (batch_size, max_length, hidden_size)
        enc_output = enc_output.permute(1, 0, 2)
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        hidden_with_time_axis = hidden.permute(1, 0, 2)
        
        # score: (batch_size, max_length, hidden_size)
        score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        
        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = torch.softmax(self.V(score), dim=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = torch.sum(context_vector, dim=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        # Looks like attention vector in diagram of source
        x = torch.cat((context_vector.unsqueeze(1), x), -1)
        
        # passing the concatenated vector to the GRU
        # output: (batch_size, 1, hidden_size)
        output, state = self.gru(x)
            
        # output shape == (batch_size * 1, hidden_size)
        output =  output.view(-1, output.size(2))
        
        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights
    
    def initialize_hidden_state(self):
        return torch.zeros((1, self.batch_sz, self.dec_units))

In [11]:
criterion = nn.CrossEntropyLoss()

def loss_fuction(real, pred):
    """ Only consider non-zero inputs in the loss; mask needed """
    #mask = 1 - np.equal(real, 0) # assign 0 to all above 0 and 1 to all 0s
    #print(mask)
    mask = real.ge(1).type(torch.FloatTensor)
#     mask = real.ge(1).type(torch.cuda.FloatTensor)
    
    loss_ = criterion(pred, real) * mask 
    return torch.mean(loss_)

In [12]:
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(' Train on ' + str(device).upper())

PRE_TRAINED = True

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

if PRE_TRAINED:
    encoder.load_state_dict(torch.load('eng2spa_encoder_params.pkl'))
    decoder.load_state_dict(torch.load('eng2spa_decoder_params.pkl'))

encoder.to(device)
decoder.to(device)

optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)

 Train on CPU


In [13]:
# train.py

def train(encoder, decoder, epoch):
    
    dataset_train = DataLoader(train_dataset, batch_size = BATCH_SIZE, 
                     drop_last=True,
                     shuffle=True)
    
    losses = AverageMeter()
    batch_time = AverageMeter()

    encoder.train()
    decoder.train()

    total_loss = 0

    for batch, (inp, targ, inp_len) in enumerate(dataset_train):
        loss = 0

        xs, ys, lens = sort_batch(inp, targ, inp_len)
        enc_output, enc_hidden = encoder(xs.to(device), lens, device)
        dec_hidden = enc_hidden
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
        
        end = time.time()

        for t in range(1, ys.size(1)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                                            dec_hidden.to(device),
                                                                            enc_output.to(device))
            loss += loss_fuction(ys[:, t].to(device), predictions.to(device))
            dec_input = ys[:, t].unsqueeze(1)

        batch_loss = (loss / int(ys.size(1)))
        losses.update(batch_loss, xs.size(0))
        
        total_loss += batch_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        batch_time.update(time.time() - end)
        end = time.time()

        if batch % PRINT_FREQ == 0:
                print('Epoch: [{0}] [{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      .format(
                       epoch, batch, len(dataset_train), batch_time=batch_time, loss=losses))
                
        if batch % 200 == 0:
            print('Saving Model...')
            torch.save(encoder.state_dict(), 'eng2spa_encoder_params.pkl')
            torch.save(decoder.state_dict(), 'eng2spa_decoder_params.pkl')
            print('Model Saved Successfully!')

In [14]:
def validate(encoder, decoder):
    
    dataset_val = DataLoader(val_dataset, batch_size = 1, 
                     drop_last=True,
                     shuffle=False)
    
    encoder.eval()
    decoder.eval()
    acc = 0
    for batch, (inp, targ, inp_len) in enumerate(dataset_val):
        xs, ys, lens = sort_batch(inp, targ, inp_len)
        raw_sen = xs.numpy()
#         print(raw_sen.shape)
        for raw_word in raw_sen:
#             print(raw_word)
            print(inp_lang.idx2word[int(raw_word)], end=' ')
        print()
        raw_sen1 = ys.numpy()[0]
#         print(raw_sen1.shape)
        for raw_word1 in raw_sen1:
#             print(raw_word)
            print(targ_lang.idx2word[int(raw_word1)], end=' ')
        print()
        input()
        enc_output, enc_hidden = encoder(xs.to(device), lens, device)
        dec_hidden = enc_hidden
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * 1)
        for t in range(1, ys.size(1)):
#             print(ys.size(1))
            predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                                            dec_hidden.to(device),
                                                                            enc_output.to(device))
#             print(predictions.shape)
#             print(ys[:,t])
            pred = torch.argmax(predictions)
#             print(predictions.detach().numpy())
#             print(predictions.detach().numpy()[0][3])
#             print(int(pred.numpy()))
            print(targ_lang.idx2word[int(pred.numpy())], end=' ')
        print()
#             input()
#         print(predictions)

In [15]:
def apply(encoder, decoder):
    encoder.eval()
    decoder.eval()
    x = input('Input an Spanish sentence to translate: ')
    

In [16]:
# model train

EPOCH = 20
PRINT_FREQ = 5

for epoch in range(EPOCH):
    train(encoder, decoder, epoch)
    print('Saving Model...')
    torch.save(encoder.state_dict(), 'eng2spa_encoder_params.pkl')
    torch.save(decoder.state_dict(), 'eng2spa_decoder_params.pkl')
    print('Model Saved Successfully!')

Epoch: [0] [0/500]	Time 0.584 (0.584)	Loss 0.1573 (0.1573)	
Saving Model...
Model Saved Successfully!
Epoch: [0] [5/500]	Time 0.526 (0.565)	Loss 0.2638 (0.1486)	
Epoch: [0] [10/500]	Time 0.547 (0.562)	Loss 0.0458 (0.1269)	
Epoch: [0] [15/500]	Time 0.560 (0.559)	Loss 0.0151 (0.1172)	
Epoch: [0] [20/500]	Time 0.540 (0.559)	Loss 0.0144 (0.1180)	
Epoch: [0] [25/500]	Time 0.540 (0.560)	Loss 0.1073 (0.1123)	
Epoch: [0] [30/500]	Time 0.540 (0.559)	Loss 0.0185 (0.1199)	
Epoch: [0] [35/500]	Time 0.574 (0.559)	Loss 0.0798 (0.1198)	
Epoch: [0] [40/500]	Time 0.873 (0.574)	Loss 0.0723 (0.1253)	
Epoch: [0] [45/500]	Time 0.560 (0.598)	Loss 0.2536 (0.1341)	
Epoch: [0] [50/500]	Time 0.584 (0.596)	Loss 0.0846 (0.1350)	
Epoch: [0] [55/500]	Time 0.544 (0.594)	Loss 0.1071 (0.1329)	
Epoch: [0] [60/500]	Time 0.597 (0.592)	Loss 0.3405 (0.1347)	
Epoch: [0] [65/500]	Time 0.530 (0.590)	Loss 0.1438 (0.1343)	
Epoch: [0] [70/500]	Time 0.531 (0.587)	Loss 0.2796 (0.1342)	
Epoch: [0] [75/500]	Time 0.532 (0.585)	Loss 0

Epoch: [1] [150/500]	Time 1.009 (0.590)	Loss 0.0508 (0.0486)	
Epoch: [1] [155/500]	Time 0.568 (0.589)	Loss 0.0977 (0.0495)	
Epoch: [1] [160/500]	Time 0.831 (0.591)	Loss 0.0142 (0.0493)	
Epoch: [1] [165/500]	Time 0.631 (0.591)	Loss 0.0108 (0.0492)	
Epoch: [1] [170/500]	Time 0.627 (0.590)	Loss 0.1188 (0.0495)	
Epoch: [1] [175/500]	Time 0.551 (0.590)	Loss 0.0079 (0.0494)	
Epoch: [1] [180/500]	Time 0.558 (0.589)	Loss 0.0851 (0.0508)	
Epoch: [1] [185/500]	Time 0.559 (0.588)	Loss 0.0663 (0.0505)	
Epoch: [1] [190/500]	Time 0.559 (0.588)	Loss 0.0952 (0.0508)	
Epoch: [1] [195/500]	Time 0.558 (0.587)	Loss 0.0645 (0.0507)	
Epoch: [1] [200/500]	Time 0.562 (0.587)	Loss 0.0715 (0.0511)	
Saving Model...
Model Saved Successfully!
Epoch: [1] [205/500]	Time 0.540 (0.586)	Loss 0.0272 (0.0509)	
Epoch: [1] [210/500]	Time 0.567 (0.586)	Loss 0.0343 (0.0506)	
Epoch: [1] [215/500]	Time 0.543 (0.585)	Loss 0.0268 (0.0505)	
Epoch: [1] [220/500]	Time 0.532 (0.585)	Loss 0.0705 (0.0512)	
Epoch: [1] [225/500]	Time 0.

Epoch: [2] [300/500]	Time 0.537 (0.570)	Loss 0.1028 (0.0413)	
Epoch: [2] [305/500]	Time 0.534 (0.570)	Loss 0.1393 (0.0417)	
Epoch: [2] [310/500]	Time 0.701 (0.570)	Loss 0.0520 (0.0419)	
Epoch: [2] [315/500]	Time 0.567 (0.570)	Loss 0.0042 (0.0416)	
Epoch: [2] [320/500]	Time 0.550 (0.570)	Loss 0.0172 (0.0415)	
Epoch: [2] [325/500]	Time 0.552 (0.570)	Loss 0.0312 (0.0411)	
Epoch: [2] [330/500]	Time 0.708 (0.570)	Loss 0.0313 (0.0411)	
Epoch: [2] [335/500]	Time 0.860 (0.572)	Loss 0.0626 (0.0412)	
Epoch: [2] [340/500]	Time 0.553 (0.574)	Loss 0.0036 (0.0411)	
Epoch: [2] [345/500]	Time 0.553 (0.573)	Loss 0.1123 (0.0413)	
Epoch: [2] [350/500]	Time 0.535 (0.573)	Loss 0.0295 (0.0413)	
Epoch: [2] [355/500]	Time 0.551 (0.573)	Loss 0.0180 (0.0412)	
Epoch: [2] [360/500]	Time 0.550 (0.573)	Loss 0.0642 (0.0416)	
Epoch: [2] [365/500]	Time 0.552 (0.573)	Loss 0.0921 (0.0416)	
Epoch: [2] [370/500]	Time 0.548 (0.573)	Loss 0.0165 (0.0414)	
Epoch: [2] [375/500]	Time 0.564 (0.573)	Loss 0.0028 (0.0414)	
Epoch: [

Epoch: [3] [450/500]	Time 0.546 (0.572)	Loss 0.0409 (0.0412)	
Epoch: [3] [455/500]	Time 0.530 (0.572)	Loss 0.0115 (0.0419)	
Epoch: [3] [460/500]	Time 0.546 (0.572)	Loss 0.0102 (0.0417)	
Epoch: [3] [465/500]	Time 0.533 (0.572)	Loss 0.0750 (0.0420)	
Epoch: [3] [470/500]	Time 0.546 (0.572)	Loss 0.0856 (0.0422)	
Epoch: [3] [475/500]	Time 0.553 (0.572)	Loss 0.0668 (0.0423)	
Epoch: [3] [480/500]	Time 0.663 (0.572)	Loss 0.0834 (0.0422)	
Epoch: [3] [485/500]	Time 0.539 (0.571)	Loss 0.0715 (0.0422)	
Epoch: [3] [490/500]	Time 0.555 (0.571)	Loss 0.0770 (0.0424)	
Epoch: [3] [495/500]	Time 0.546 (0.571)	Loss 0.0726 (0.0426)	
Saving Model...
Model Saved Successfully!
Epoch: [4] [0/500]	Time 0.602 (0.602)	Loss 0.0179 (0.0179)	
Saving Model...
Model Saved Successfully!
Epoch: [4] [5/500]	Time 0.543 (0.575)	Loss 0.0495 (0.0275)	
Epoch: [4] [10/500]	Time 0.559 (0.561)	Loss 0.0045 (0.0248)	
Epoch: [4] [15/500]	Time 0.558 (0.567)	Loss 0.0246 (0.0301)	
Epoch: [4] [20/500]	Time 0.558 (0.568)	Loss 0.0045 (0.

Epoch: [5] [95/500]	Time 0.574 (0.578)	Loss 0.0039 (0.0312)	
Epoch: [5] [100/500]	Time 0.588 (0.580)	Loss 0.0269 (0.0306)	
Epoch: [5] [105/500]	Time 0.546 (0.580)	Loss 0.0302 (0.0303)	
Epoch: [5] [110/500]	Time 0.588 (0.586)	Loss 0.0496 (0.0319)	
Epoch: [5] [115/500]	Time 1.021 (0.590)	Loss 0.0368 (0.0321)	
Epoch: [5] [120/500]	Time 0.676 (0.594)	Loss 0.0037 (0.0319)	
Epoch: [5] [125/500]	Time 0.557 (0.592)	Loss 0.0136 (0.0318)	
Epoch: [5] [130/500]	Time 0.587 (0.592)	Loss 0.0312 (0.0314)	
Epoch: [5] [135/500]	Time 0.541 (0.590)	Loss 0.0031 (0.0320)	
Epoch: [5] [140/500]	Time 0.541 (0.589)	Loss 0.0437 (0.0319)	
Epoch: [5] [145/500]	Time 0.535 (0.588)	Loss 0.0179 (0.0321)	
Epoch: [5] [150/500]	Time 0.551 (0.586)	Loss 0.0170 (0.0325)	
Epoch: [5] [155/500]	Time 0.661 (0.587)	Loss 0.0043 (0.0319)	
Epoch: [5] [160/500]	Time 0.556 (0.586)	Loss 0.0523 (0.0322)	
Epoch: [5] [165/500]	Time 0.538 (0.585)	Loss 0.0431 (0.0323)	
Epoch: [5] [170/500]	Time 0.540 (0.584)	Loss 0.0536 (0.0326)	
Epoch: [5

Epoch: [6] [245/500]	Time 0.726 (0.648)	Loss 0.0667 (0.0341)	
Epoch: [6] [250/500]	Time 0.704 (0.650)	Loss 0.0167 (0.0343)	
Epoch: [6] [255/500]	Time 0.757 (0.652)	Loss 0.0384 (0.0348)	
Epoch: [6] [260/500]	Time 0.839 (0.654)	Loss 0.0015 (0.0350)	
Epoch: [6] [265/500]	Time 0.739 (0.655)	Loss 0.0238 (0.0351)	
Epoch: [6] [270/500]	Time 0.758 (0.657)	Loss 0.0426 (0.0352)	
Epoch: [6] [275/500]	Time 0.731 (0.659)	Loss 0.0720 (0.0356)	
Epoch: [6] [280/500]	Time 0.803 (0.661)	Loss 0.0024 (0.0354)	
Epoch: [6] [285/500]	Time 0.986 (0.664)	Loss 0.0365 (0.0353)	
Epoch: [6] [290/500]	Time 0.909 (0.668)	Loss 0.1243 (0.0358)	
Epoch: [6] [295/500]	Time 0.971 (0.672)	Loss 0.0352 (0.0357)	
Epoch: [6] [300/500]	Time 0.827 (0.678)	Loss 0.0362 (0.0358)	
Epoch: [6] [305/500]	Time 0.789 (0.681)	Loss 0.0226 (0.0355)	
Epoch: [6] [310/500]	Time 0.942 (0.681)	Loss 0.1650 (0.0359)	
Epoch: [6] [315/500]	Time 0.568 (0.680)	Loss 0.0398 (0.0358)	
Epoch: [6] [320/500]	Time 0.853 (0.679)	Loss 0.0369 (0.0356)	
Epoch: [

Epoch: [7] [395/500]	Time 0.593 (0.563)	Loss 0.1301 (0.0400)	
Epoch: [7] [400/500]	Time 0.554 (0.563)	Loss 0.0579 (0.0401)	
Saving Model...
Model Saved Successfully!
Epoch: [7] [405/500]	Time 0.681 (0.563)	Loss 0.0619 (0.0401)	
Epoch: [7] [410/500]	Time 0.544 (0.563)	Loss 0.0432 (0.0402)	
Epoch: [7] [415/500]	Time 0.528 (0.563)	Loss 0.0342 (0.0402)	
Epoch: [7] [420/500]	Time 0.574 (0.563)	Loss 0.0511 (0.0404)	
Epoch: [7] [425/500]	Time 0.575 (0.563)	Loss 0.1174 (0.0404)	
Epoch: [7] [430/500]	Time 0.557 (0.563)	Loss 0.0125 (0.0406)	
Epoch: [7] [435/500]	Time 0.554 (0.563)	Loss 0.0450 (0.0408)	
Epoch: [7] [440/500]	Time 0.556 (0.563)	Loss 0.0037 (0.0410)	
Epoch: [7] [445/500]	Time 0.554 (0.563)	Loss 0.0027 (0.0409)	
Epoch: [7] [450/500]	Time 0.553 (0.563)	Loss 0.0758 (0.0413)	
Epoch: [7] [455/500]	Time 0.546 (0.563)	Loss 0.0425 (0.0414)	
Epoch: [7] [460/500]	Time 0.535 (0.563)	Loss 0.0818 (0.0415)	
Epoch: [7] [465/500]	Time 0.529 (0.562)	Loss 0.0258 (0.0416)	
Epoch: [7] [470/500]	Time 0.

Epoch: [9] [35/500]	Time 0.525 (0.547)	Loss 0.0509 (0.0356)	
Epoch: [9] [40/500]	Time 0.593 (0.549)	Loss 0.0089 (0.0356)	
Epoch: [9] [45/500]	Time 0.582 (0.549)	Loss 0.0177 (0.0355)	
Epoch: [9] [50/500]	Time 0.545 (0.550)	Loss 0.0580 (0.0363)	
Epoch: [9] [55/500]	Time 0.533 (0.549)	Loss 0.0446 (0.0389)	
Epoch: [9] [60/500]	Time 0.563 (0.549)	Loss 0.0560 (0.0386)	
Epoch: [9] [65/500]	Time 0.532 (0.549)	Loss 0.0184 (0.0391)	
Epoch: [9] [70/500]	Time 0.595 (0.549)	Loss 0.0641 (0.0397)	
Epoch: [9] [75/500]	Time 0.521 (0.549)	Loss 0.1356 (0.0419)	
Epoch: [9] [80/500]	Time 0.545 (0.549)	Loss 0.0478 (0.0420)	
Epoch: [9] [85/500]	Time 0.539 (0.550)	Loss 0.0636 (0.0416)	
Epoch: [9] [90/500]	Time 0.556 (0.550)	Loss 0.0213 (0.0409)	
Epoch: [9] [95/500]	Time 0.554 (0.550)	Loss 0.0543 (0.0408)	
Epoch: [9] [100/500]	Time 0.544 (0.550)	Loss 0.0275 (0.0415)	
Epoch: [9] [105/500]	Time 0.637 (0.552)	Loss 0.0829 (0.0417)	
Epoch: [9] [110/500]	Time 0.740 (0.555)	Loss 0.0646 (0.0417)	
Epoch: [9] [115/500]	

Epoch: [10] [185/500]	Time 0.529 (0.568)	Loss 0.0676 (0.0386)	
Epoch: [10] [190/500]	Time 0.554 (0.568)	Loss 0.0041 (0.0384)	
Epoch: [10] [195/500]	Time 0.555 (0.567)	Loss 0.0145 (0.0381)	
Epoch: [10] [200/500]	Time 0.545 (0.567)	Loss 0.1325 (0.0382)	
Saving Model...
Model Saved Successfully!
Epoch: [10] [205/500]	Time 0.555 (0.567)	Loss 0.0039 (0.0381)	
Epoch: [10] [210/500]	Time 0.563 (0.567)	Loss 0.0143 (0.0382)	
Epoch: [10] [215/500]	Time 0.555 (0.567)	Loss 0.0392 (0.0386)	
Epoch: [10] [220/500]	Time 0.560 (0.566)	Loss 0.0316 (0.0382)	
Epoch: [10] [225/500]	Time 0.577 (0.567)	Loss 0.1745 (0.0389)	
Epoch: [10] [230/500]	Time 0.558 (0.567)	Loss 0.1471 (0.0395)	
Epoch: [10] [235/500]	Time 0.542 (0.567)	Loss 0.0366 (0.0392)	
Epoch: [10] [240/500]	Time 0.557 (0.567)	Loss 0.0745 (0.0390)	
Epoch: [10] [245/500]	Time 0.538 (0.567)	Loss 0.0512 (0.0392)	
Epoch: [10] [250/500]	Time 0.534 (0.566)	Loss 0.0506 (0.0394)	
Epoch: [10] [255/500]	Time 0.574 (0.567)	Loss 0.0686 (0.0398)	
Epoch: [10] [

Epoch: [11] [325/500]	Time 0.534 (0.572)	Loss 0.0036 (0.0340)	
Epoch: [11] [330/500]	Time 0.548 (0.572)	Loss 0.0086 (0.0340)	
Epoch: [11] [335/500]	Time 0.551 (0.571)	Loss 0.0389 (0.0344)	
Epoch: [11] [340/500]	Time 0.551 (0.571)	Loss 0.0181 (0.0341)	
Epoch: [11] [345/500]	Time 0.548 (0.572)	Loss 0.0110 (0.0342)	
Epoch: [11] [350/500]	Time 0.549 (0.572)	Loss 0.0760 (0.0345)	
Epoch: [11] [355/500]	Time 0.546 (0.571)	Loss 0.0737 (0.0346)	
Epoch: [11] [360/500]	Time 0.575 (0.571)	Loss 0.0118 (0.0348)	
Epoch: [11] [365/500]	Time 0.541 (0.571)	Loss 0.0254 (0.0350)	
Epoch: [11] [370/500]	Time 0.560 (0.571)	Loss 0.1156 (0.0357)	
Epoch: [11] [375/500]	Time 0.543 (0.571)	Loss 0.0080 (0.0360)	
Epoch: [11] [380/500]	Time 0.588 (0.571)	Loss 0.0242 (0.0359)	
Epoch: [11] [385/500]	Time 0.556 (0.571)	Loss 0.0176 (0.0361)	
Epoch: [11] [390/500]	Time 0.588 (0.571)	Loss 0.0345 (0.0362)	
Epoch: [11] [395/500]	Time 0.558 (0.571)	Loss 0.0080 (0.0362)	
Epoch: [11] [400/500]	Time 0.542 (0.571)	Loss 0.0739 (0

Epoch: [12] [465/500]	Time 0.561 (0.566)	Loss 0.0207 (0.0369)	
Epoch: [12] [470/500]	Time 0.525 (0.566)	Loss 0.0364 (0.0369)	
Epoch: [12] [475/500]	Time 0.687 (0.566)	Loss 0.0007 (0.0369)	
Epoch: [12] [480/500]	Time 0.523 (0.566)	Loss 0.0321 (0.0369)	
Epoch: [12] [485/500]	Time 0.527 (0.566)	Loss 0.0079 (0.0371)	
Epoch: [12] [490/500]	Time 0.572 (0.566)	Loss 0.1371 (0.0376)	
Epoch: [12] [495/500]	Time 0.570 (0.566)	Loss 0.0490 (0.0378)	
Saving Model...
Model Saved Successfully!
Epoch: [13] [0/500]	Time 0.568 (0.568)	Loss 0.0064 (0.0064)	
Saving Model...
Model Saved Successfully!
Epoch: [13] [5/500]	Time 0.681 (0.745)	Loss 0.0008 (0.0201)	
Epoch: [13] [10/500]	Time 0.577 (0.755)	Loss 0.0385 (0.0174)	
Epoch: [13] [15/500]	Time 0.527 (0.688)	Loss 0.0268 (0.0184)	
Epoch: [13] [20/500]	Time 0.602 (0.680)	Loss 0.0063 (0.0186)	
Epoch: [13] [25/500]	Time 0.544 (0.666)	Loss 0.0116 (0.0195)	
Epoch: [13] [30/500]	Time 0.553 (0.647)	Loss 0.0217 (0.0201)	
Epoch: [13] [35/500]	Time 0.569 (0.635)	Los

Epoch: [14] [100/500]	Time 0.538 (0.549)	Loss 0.0020 (0.0261)	
Epoch: [14] [105/500]	Time 0.594 (0.549)	Loss 0.0011 (0.0260)	
Epoch: [14] [110/500]	Time 0.540 (0.550)	Loss 0.0011 (0.0254)	
Epoch: [14] [115/500]	Time 0.599 (0.551)	Loss 0.0279 (0.0250)	
Epoch: [14] [120/500]	Time 0.583 (0.551)	Loss 0.0223 (0.0250)	
Epoch: [14] [125/500]	Time 0.532 (0.552)	Loss 0.0222 (0.0251)	
Epoch: [14] [130/500]	Time 0.546 (0.552)	Loss 0.0544 (0.0251)	
Epoch: [14] [135/500]	Time 0.568 (0.552)	Loss 0.0215 (0.0249)	
Epoch: [14] [140/500]	Time 0.553 (0.551)	Loss 0.0421 (0.0252)	
Epoch: [14] [145/500]	Time 0.581 (0.552)	Loss 0.1078 (0.0257)	
Epoch: [14] [150/500]	Time 0.567 (0.552)	Loss 0.0427 (0.0260)	
Epoch: [14] [155/500]	Time 0.563 (0.552)	Loss 0.0109 (0.0262)	
Epoch: [14] [160/500]	Time 0.528 (0.552)	Loss 0.0062 (0.0265)	
Epoch: [14] [165/500]	Time 0.545 (0.552)	Loss 0.0258 (0.0267)	
Epoch: [14] [170/500]	Time 0.535 (0.552)	Loss 0.0623 (0.0270)	
Epoch: [14] [175/500]	Time 0.570 (0.552)	Loss 0.0453 (0

Epoch: [15] [240/500]	Time 0.851 (0.588)	Loss 0.1059 (0.0316)	
Epoch: [15] [245/500]	Time 0.574 (0.588)	Loss 0.0741 (0.0316)	
Epoch: [15] [250/500]	Time 0.557 (0.587)	Loss 0.0699 (0.0316)	
Epoch: [15] [255/500]	Time 0.567 (0.587)	Loss 0.0019 (0.0314)	
Epoch: [15] [260/500]	Time 0.575 (0.586)	Loss 0.0154 (0.0316)	
Epoch: [15] [265/500]	Time 0.559 (0.586)	Loss 0.0210 (0.0317)	
Epoch: [15] [270/500]	Time 0.558 (0.585)	Loss 0.0440 (0.0317)	
Epoch: [15] [275/500]	Time 0.577 (0.586)	Loss 0.0156 (0.0316)	
Epoch: [15] [280/500]	Time 0.546 (0.585)	Loss 0.0262 (0.0315)	
Epoch: [15] [285/500]	Time 0.571 (0.585)	Loss 0.0367 (0.0314)	
Epoch: [15] [290/500]	Time 0.572 (0.585)	Loss 0.0293 (0.0313)	
Epoch: [15] [295/500]	Time 0.555 (0.585)	Loss 0.0299 (0.0314)	
Epoch: [15] [300/500]	Time 0.565 (0.586)	Loss 0.0238 (0.0314)	
Epoch: [15] [305/500]	Time 1.042 (0.588)	Loss 0.1614 (0.0320)	
Epoch: [15] [310/500]	Time 0.561 (0.588)	Loss 0.0530 (0.0322)	
Epoch: [15] [315/500]	Time 0.540 (0.588)	Loss 0.0134 (0

Epoch: [16] [380/500]	Time 0.583 (0.582)	Loss 0.0173 (0.0329)	
Epoch: [16] [385/500]	Time 0.576 (0.582)	Loss 0.0475 (0.0327)	
Epoch: [16] [390/500]	Time 0.547 (0.581)	Loss 0.0245 (0.0326)	
Epoch: [16] [395/500]	Time 0.544 (0.581)	Loss 0.0037 (0.0328)	
Epoch: [16] [400/500]	Time 0.528 (0.581)	Loss 0.0612 (0.0327)	
Saving Model...
Model Saved Successfully!
Epoch: [16] [405/500]	Time 0.535 (0.581)	Loss 0.0364 (0.0328)	
Epoch: [16] [410/500]	Time 0.554 (0.580)	Loss 0.0267 (0.0328)	
Epoch: [16] [415/500]	Time 0.550 (0.580)	Loss 0.0386 (0.0328)	
Epoch: [16] [420/500]	Time 0.532 (0.580)	Loss 0.0231 (0.0330)	
Epoch: [16] [425/500]	Time 0.542 (0.580)	Loss 0.0370 (0.0333)	
Epoch: [16] [430/500]	Time 0.529 (0.579)	Loss 0.0018 (0.0333)	
Epoch: [16] [435/500]	Time 0.531 (0.579)	Loss 0.0448 (0.0335)	
Epoch: [16] [440/500]	Time 0.531 (0.579)	Loss 0.0503 (0.0336)	
Epoch: [16] [445/500]	Time 0.563 (0.578)	Loss 0.0558 (0.0336)	
Epoch: [16] [450/500]	Time 0.548 (0.578)	Loss 0.0173 (0.0336)	
Epoch: [16] [

Epoch: [18] [10/500]	Time 0.608 (0.580)	Loss 0.0022 (0.0298)	
Epoch: [18] [15/500]	Time 0.529 (0.573)	Loss 0.0005 (0.0266)	
Epoch: [18] [20/500]	Time 0.564 (0.570)	Loss 0.0406 (0.0289)	
Epoch: [18] [25/500]	Time 0.708 (0.577)	Loss 0.0200 (0.0295)	
Epoch: [18] [30/500]	Time 0.559 (0.582)	Loss 0.0009 (0.0279)	
Epoch: [18] [35/500]	Time 0.544 (0.590)	Loss 0.0107 (0.0275)	
Epoch: [18] [40/500]	Time 0.542 (0.588)	Loss 0.0105 (0.0258)	
Epoch: [18] [45/500]	Time 0.582 (0.585)	Loss 0.0272 (0.0265)	
Epoch: [18] [50/500]	Time 0.731 (0.594)	Loss 0.1038 (0.0287)	
Epoch: [18] [55/500]	Time 0.564 (0.597)	Loss 0.0097 (0.0280)	
Epoch: [18] [60/500]	Time 0.573 (0.594)	Loss 0.0335 (0.0278)	
Epoch: [18] [65/500]	Time 0.602 (0.598)	Loss 0.0535 (0.0274)	
Epoch: [18] [70/500]	Time 0.565 (0.598)	Loss 0.0038 (0.0268)	
Epoch: [18] [75/500]	Time 0.561 (0.595)	Loss 0.0313 (0.0271)	
Epoch: [18] [80/500]	Time 0.540 (0.595)	Loss 0.0109 (0.0264)	
Epoch: [18] [85/500]	Time 0.657 (0.601)	Loss 0.0275 (0.0260)	
Epoch: [

Epoch: [19] [155/500]	Time 0.569 (0.620)	Loss 0.0454 (0.0341)	
Epoch: [19] [160/500]	Time 0.552 (0.619)	Loss 0.0314 (0.0342)	
Epoch: [19] [165/500]	Time 0.560 (0.618)	Loss 0.0225 (0.0344)	
Epoch: [19] [170/500]	Time 0.546 (0.617)	Loss 0.0328 (0.0342)	
Epoch: [19] [175/500]	Time 0.545 (0.617)	Loss 0.0137 (0.0344)	
Epoch: [19] [180/500]	Time 0.556 (0.616)	Loss 0.0248 (0.0343)	
Epoch: [19] [185/500]	Time 0.612 (0.614)	Loss 0.0070 (0.0337)	
Epoch: [19] [190/500]	Time 0.554 (0.613)	Loss 0.2102 (0.0351)	
Epoch: [19] [195/500]	Time 0.556 (0.612)	Loss 0.0619 (0.0357)	
Epoch: [19] [200/500]	Time 0.562 (0.612)	Loss 0.0051 (0.0359)	
Saving Model...
Model Saved Successfully!
Epoch: [19] [205/500]	Time 0.562 (0.611)	Loss 0.0334 (0.0360)	
Epoch: [19] [210/500]	Time 0.550 (0.609)	Loss 0.0147 (0.0362)	
Epoch: [19] [215/500]	Time 0.549 (0.608)	Loss 0.1229 (0.0365)	
Epoch: [19] [220/500]	Time 0.655 (0.611)	Loss 0.1372 (0.0372)	
Epoch: [19] [225/500]	Time 0.718 (0.612)	Loss 0.0338 (0.0374)	
Epoch: [19] [

In [18]:
encoder_val = Encoder(vocab_inp_size, embedding_dim, units, 1)
decoder_val = Decoder(vocab_tar_size, embedding_dim, units, units, 1)

encoder_val.to(device)
decoder_val.to(device)

# print(encoder_val)

encoder_val.load_state_dict(torch.load('eng2spa_encoder_params.pkl'))
decoder_val.load_state_dict(torch.load('eng2spa_decoder_params.pkl'))

# validate(encoder_val, decoder_val)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])