In [1]:
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import time

with open('./spa-eng/spa.txt') as f:
    spa_eng_list = f.read().strip().split('\n')

len(spa_eng_list)

123376

In [2]:
# sample size
num_examples = 123300

# creates lists containing each pair
original_word_pairs = [[w for w in l.split('\t')] for l in spa_eng_list[:num_examples]]

data = pd.DataFrame(original_word_pairs, columns=["eng", "es"])

data.sample(10)

Unnamed: 0,eng,es
78315,Copper and silver are both metals.,El cobre y la plata son dos metales.
120197,You shouldn't say that kind of thing when chil...,No deberías decir esa clase de cosas cuando ha...
90698,You should be a little more tolerant.,Debería ser un poco más tolerante.
89613,Sumo is a traditional Japanese sport.,El Sumo es un deporte tradicional japonés.
86065,I will go tomorrow morning at seven.,Voy a ir mañana a las siete de la mañana.
48945,Why don't we just stay in?,¿Por qué no nos quedamos dentro?
2158,It's simple.,Es simple.
9221,Stay in the car.,¡Quedate en el auto!
54662,I have trouble with physics.,Tengo problemas con física.
111105,There's a time to speak and a time to be silent.,Hay un tiempo para hablar y un tiempo para cal...


In [3]:
# utils.py

# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """
    Normalizes latin chars with accent to their canonical decomposition
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
    w = w.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

def max_length(tensor):
    return max(len(t) for t in tensor)

def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len:
        padded[:] = x[:max_len]
    else:
        padded[:len(x)] = x
    return padded

# sort batch function to be able to use with pad_packed_sequence
def sort_batch(X, y, lengths):
    lengths, indx = lengths.sort(dim=0, descending=True)
    X = X[indx]
    y = y[indx]
    return X.transpose(0,1), y, lengths # transpose (batch x seq) to (seq x batch)

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count  

In [4]:
# process the data
data["eng"] = data.eng.apply(lambda w: preprocess_sentence(w))
data["es"] = data.es.apply(lambda w: preprocess_sentence(w))
data.sample(10)

Unnamed: 0,eng,es
122752,<start> tom took thousands of pictures during ...,<start> tom tomo miles de fotos durante sus va...
50229,<start> i didn t know it was there . <end>,<start> no sabia que estaba ahi . <end>
87084,<start> they were satisfied with the result . ...,<start> estaban satisfechos con el resultado ....
116767,<start> i want to wear the same kind of clothe...,<start> quiero usar el mismo tipo de ropa que ...
94011,<start> i don t want to cause any more trouble...,<start> no quiero causar mas problemas . <end>
19054,<start> we are against war . <end>,<start> estamos en contra de la guerra . <end>
62709,<start> he is able to play the guitar . <end>,<start> sabe tocar la guitarra . <end>
91711,<start> i was about to suggest the same thing ...,<start> estuve a punto de sugerir lo mismo . <...
53828,<start> france is in western europe . <end>,<start> francia esta en europa occidental . <end>
98684,<start> i had difficulty in solving this probl...,<start> tuve dificultad para resolver este pro...


In [5]:
# build the vocabulary
class LanguageIndex():
    
    def __init__(self, lang):
        """ lang: the list of phrases from each language """
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        
        self.create_index()
        
    def create_index(self):
        for phrase in self.lang:
            # update with individual tokens
            self.vocab.update(phrase.split(' '))
            
        # sort the vocab
        self.vocab = sorted(self.vocab)

        # add a padding token with index 0
        self.word2idx['<pad>'] = 0
        
        # word to index mapping
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1 # +1 because of pad token
        
        # index to word mapping
        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [6]:
# index language using the class above
inp_lang = LanguageIndex(data["es"].values.tolist())
targ_lang = LanguageIndex(data["eng"].values.tolist())

# Vectorize the input and target languages
input_tensor = [[inp_lang.word2idx[s] for s in es.split(' ')]  for es in data["es"].values.tolist()]
target_tensor = [[targ_lang.word2idx[s] for s in eng.split(' ')]  for eng in data["eng"].values.tolist()]
print(input_tensor[:5])
print(target_tensor[:5])

# calculate the max_length of input and output tensor
max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
print('max_length_inp: %d'%max_length_inp)
print('max_length_tar: %d'%max_length_tar)

# inplace padding
input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
print('len(input_tensor): %d'%len(input_tensor))
print('len(target_tensor): %d'%len(target_tensor))

# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
print('len(input_tensor_train): %d\nlen(target_tensor_train): %d\nlen(input_tensor_val): %d\nlen(target_tensor_val): %d\n'
       % (len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)))

[[5, 24440, 3, 4], [5, 24665, 3, 4], [5, 24432, 3, 4], [5, 24439, 3, 4], [5, 12839, 3, 4]]
[[5, 5001, 3, 4], [5, 5001, 3, 4], [5, 5001, 3, 4], [5, 5001, 3, 4], [5, 5482, 3, 4]]
max_length_inp: 38
max_length_tar: 33
len(input_tensor): 123300
len(target_tensor): 123300
len(input_tensor_train): 98640
len(target_tensor_train): 98640
len(input_tensor_val): 24660
len(target_tensor_val): 24660



In [7]:
len(targ_lang.vocab)

13008

In [8]:
# conver the data to tensors and pass to the Dataloader to create an batch iterator
class IterData(Dataset):
    
    def __init__(self, X, y):
        self.data = X
        self.target = y
        self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x_len = self.length[index]
        return x, y, x_len
    
    def __len__(self):
        return len(self.data)

In [9]:
# hyperparams
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 256
N_BATCH = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

train_dataset = IterData(input_tensor_train, target_tensor_train)
val_dataset = IterData(input_tensor_val, target_tensor_val)

In [10]:
# model.py

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)
        
    def forward(self, x, lens,  device):
        # x: (batch_size, max_length, embedding_dim)
        x = self.embedding(x) 
        x = pack_padded_sequence(x, lens)
        
        self.hidden = self.initialize_hidden_state(device)
        
        output, self.hidden = self.gru(x, self.hidden)
        output, _ = pad_packed_sequence(output)
        
        return output, self.hidden
        
    def initialize_hidden_state(self, device):
        return torch.zeros((1, self.batch_size, self.enc_units)).to(device)
    
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dec_units, enc_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim + self.enc_units,
                                         self.dec_units,
                                         batch_first=True)
        self.fc = nn.Linear(self.enc_units, self.vocab_size)
        
        # for attention
        self.W1 = nn.Linear(self.enc_units, self.dec_units)
        self.W2 = nn.Linear(self.enc_units, self.dec_units)
        self.V = nn.Linear(self.enc_units, 1)
        
    def forward(self, x, hidden, enc_output):
        enc_output = enc_output.permute(1, 0, 2)
        hidden_with_time_axis = hidden.permute(1, 0, 2)
        score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        
        attention_weights = torch.softmax(self.V(score), dim=1)
        
        context_vector = attention_weights * enc_output
        context_vector = torch.sum(context_vector, dim=1)
        
        x = self.embedding(x)
        x = torch.cat((context_vector.unsqueeze(1), x), -1)
        
        output, state = self.gru(x)
            
        output =  output.view(-1, output.size(2))
        
        x = self.fc(output)
        
        return x, state, attention_weights
    
    def initialize_hidden_state(self):
        return torch.zeros((1, self.batch_sz, self.dec_units))

In [11]:
criterion = nn.CrossEntropyLoss()

def loss_fuction(real, pred):
    """ Only consider non-zero inputs in the loss; mask needed """
    #mask = 1 - np.equal(real, 0) # assign 0 to all above 0 and 1 to all 0s
    #print(mask)
#    mask = real.ge(1).type(torch.FloatTensor)
    mask = real.ge(1).type(torch.cuda.FloatTensor)
    
    loss_ = criterion(pred, real) * mask 
    return torch.mean(loss_)

In [12]:
# Device
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(' Train on ' + str(device).upper())

PRE_TRAINED = True

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

if PRE_TRAINED:
    encoder.load_state_dict(torch.load('eng2spa_encoder_params.pkl'))
    decoder.load_state_dict(torch.load('eng2spa_decoder_params.pkl'))

encoder.to(device)
decoder.to(device)
print(encoder)
print(decoder)

optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)

 Train on CUDA:1
Encoder(
  (embedding): Embedding(25145, 256)
  (gru): GRU(256, 1024)
)
Decoder(
  (embedding): Embedding(13009, 256)
  (gru): GRU(1280, 1024, batch_first=True)
  (fc): Linear(in_features=1024, out_features=13009, bias=True)
  (W1): Linear(in_features=1024, out_features=1024, bias=True)
  (W2): Linear(in_features=1024, out_features=1024, bias=True)
  (V): Linear(in_features=1024, out_features=1, bias=True)
)


In [13]:
# train.py

def train(encoder, decoder, epoch):
    
    dataset_train = DataLoader(train_dataset, batch_size = BATCH_SIZE, 
                     drop_last=True,
                     shuffle=True)
    
    losses = AverageMeter()
    batch_time = AverageMeter()

    encoder.train()
    decoder.train()

    total_loss = 0

    for batch, (inp, targ, inp_len) in enumerate(dataset_train):
        loss = 0

        xs, ys, lens = sort_batch(inp, targ, inp_len)
        enc_output, enc_hidden = encoder(xs.to(device), lens, device)
        dec_hidden = enc_hidden
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
        
        end = time.time()

        for t in range(1, ys.size(1)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                dec_hidden.to(device),
                                                enc_output.to(device))
            loss += loss_fuction(ys[:, t].to(device), predictions.to(device))
            dec_input = ys[:, t].unsqueeze(1)

        batch_loss = (loss / int(ys.size(1)))
        losses.update(batch_loss, xs.size(0))
        
        total_loss += batch_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        batch_time.update(time.time() - end)
        end = time.time()

        if batch % PRINT_FREQ == 0:
                print('Epoch: [{0}] [{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      .format(
                       epoch, batch, len(dataset_train), batch_time=batch_time, loss=losses))
                
        if batch % 200 == 0:
            print('Saving Model...')
            torch.save(encoder.state_dict(), 'eng2spa_encoder_params.pkl')
            torch.save(decoder.state_dict(), 'eng2spa_decoder_params.pkl')
            print('Model Saved Successfully!')

In [21]:
def validate(encoder, decoder):
    
    dataset_val = DataLoader(val_dataset, batch_size = 1, 
                     drop_last=True,
                     shuffle=False)
    
    encoder.eval()
    decoder.eval()
    acc = 0
    for batch, (inp, targ, inp_len) in enumerate(dataset_val):
        xs, ys, lens = sort_batch(inp, targ, inp_len)
        raw_sen = xs.numpy()
#         print(raw_sen.shape)
        for raw_word in raw_sen:
#             print(raw_word)
            print(inp_lang.idx2word[int(raw_word)], end=' ')
        print()
        raw_sen1 = ys.numpy()[0]
#         print(raw_sen1.shape)
        for raw_word1 in raw_sen1:
#             print(raw_word)
            print(targ_lang.idx2word[int(raw_word1)], end=' ')
        print()
        input()
        enc_output, enc_hidden = encoder(xs.to(device), lens, device)
        dec_hidden = enc_hidden
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * 1)
        for t in range(1, ys.size(1)):
#             print(ys.size(1))
            predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                        dec_hidden.to(device),
                                        enc_output.to(device))
#             print(predictions.shape)
#             print(ys[:,t])
            pred = torch.argmax(predictions)
#             print(predictions.detach().numpy())
#             print(predictions.detach().numpy()[0][3])
#             print(int(pred.numpy()))
            print(targ_lang.idx2word[int(pred.cpu().numpy())], end=' ')
        print()
#             input()
#         print(predictions)

In [15]:
def apply(encoder, decoder):
    encoder.eval()
    decoder.eval()
    x = input('Input an Spanish sentence to translate: ')
    

In [16]:
# model train

EPOCH = 10
PRINT_FREQ = 5

for epoch in range(EPOCH):
    train(encoder, decoder, epoch)
    print('Saving Model...')
    torch.save(encoder.state_dict(), 'eng2spa_encoder_params.pkl')
    torch.save(decoder.state_dict(), 'eng2spa_decoder_params.pkl')
    print('Model Saved Successfully!')

Epoch: [0] [0/385]	Time 0.428 (0.428)	Loss 2.4471 (2.4471)	
Saving Model...
Model Saved Successfully!
Epoch: [0] [5/385]	Time 0.350 (0.376)	Loss 1.5620 (2.0568)	
Epoch: [0] [10/385]	Time 0.355 (0.382)	Loss 1.2576 (1.7140)	
Epoch: [0] [15/385]	Time 0.351 (0.385)	Loss 1.2350 (1.5664)	
Epoch: [0] [20/385]	Time 0.404 (0.388)	Loss 1.1673 (1.4713)	
Epoch: [0] [25/385]	Time 0.465 (0.392)	Loss 1.0750 (1.4062)	
Epoch: [0] [30/385]	Time 0.404 (0.394)	Loss 1.1318 (1.3585)	
Epoch: [0] [35/385]	Time 0.356 (0.390)	Loss 1.0679 (1.3211)	
Epoch: [0] [40/385]	Time 0.400 (0.390)	Loss 1.0220 (1.2886)	
Epoch: [0] [45/385]	Time 0.411 (0.394)	Loss 1.0551 (1.2603)	
Epoch: [0] [50/385]	Time 0.479 (0.396)	Loss 1.0399 (1.2381)	
Epoch: [0] [55/385]	Time 0.383 (0.397)	Loss 0.9931 (1.2148)	
Epoch: [0] [60/385]	Time 0.418 (0.395)	Loss 0.9583 (1.1947)	
Epoch: [0] [65/385]	Time 0.433 (0.395)	Loss 0.9830 (1.1757)	
Epoch: [0] [70/385]	Time 0.401 (0.396)	Loss 0.8860 (1.1589)	
Epoch: [0] [75/385]	Time 0.410 (0.398)	Loss 0

In [22]:
encoder_val = Encoder(vocab_inp_size, embedding_dim, units, 1)
decoder_val = Decoder(vocab_tar_size, embedding_dim, units, units, 1)

encoder_val.to(device)
decoder_val.to(device)

# print(encoder_val)

encoder_val.load_state_dict(torch.load('eng2spa_encoder_params.pkl'))
decoder_val.load_state_dict(torch.load('eng2spa_decoder_params.pkl'))

validate(encoder_val, decoder_val)

<start> ¿ podrias ensenarme el camino , por favor ? <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 
<start> will you please show me the way ? <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

could could you you you you you you you you you you you you you you you you you you you you you you you you you you you you you you 
<start> los prisioneros fugitivos siguen profugos . <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 
<start> the escaped prisoners are still on the run . <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

the the the the the the th

KeyboardInterrupt: 