In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

%matplotlib inline

In [16]:
# Pick the compute backend once for the whole notebook. Apple's Metal backend
# (the one this notebook was originally pinned to) is preferred, then CUDA, and
# finally the CPU so that the notebook still runs on machines without a GPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
# Reserved vocabulary indices. SOS_token fills the decoders' very first input,
# EOS_token is appended to every encoded sentence (tensorFromSentence) and makes
# evaluate() stop collecting words.
SOS_token = 0
EOS_token = 1

class Lang:
    """Vocabulary of a single language.

    Keeps three tables in sync: word -> index, index -> word and
    word -> occurrence count. Indices 0 and 1 are pre-assigned to the "SOS"
    and "EOS" markers, so the first word that is added receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = dict(enumerate(("SOS", "EOS")))
        # Next free index; starts right after the two reserved markers.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``; on first sight also assign it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

In [3]:
class EncoderRNN(nn.Module):
    """Encoder: embedding -> dropout -> single batch-first GRU."""

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        """
        Args:
            input_size: number of rows of the embedding table.
            hidden_size: width of the embeddings and of the GRU state.
            dropout_p: dropout probability applied to the embedded tokens.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Encode ``input`` and return the GRU's ``(output, hidden)`` pair."""
        token_vectors = self.embedding(input)
        regularized = self.dropout(token_vectors)
        return self.gru(regularized)

In [4]:
class DecoderRNN(nn.Module):
    """GRU decoder without attention.

    Always unrolls exactly ``MAX_LENGTH`` steps, starting from an ``SOS_token``
    input and the encoder's final hidden state.
    """

    def __init__(self, hidden_size, output_size):
        """
        Args:
            hidden_size: width of the embeddings and of the GRU state.
            output_size: number of embedding rows and of output logits per step.
        """
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``MAX_LENGTH`` steps.

        Args:
            encoder_outputs: only its first dimension (the batch size) is read.
            encoder_hidden: initial GRU hidden state.
            target_tensor: when given, column ``i`` is fed as the input of step
                ``i + 1`` (teacher forcing); otherwise the decoder feeds back
                its own top-1 prediction.

        Returns:
            ``(log_probs, hidden, None)``: per-step log-softmax scores
            concatenated along dim 1, the final hidden state, and ``None`` in
            the slot where the attention decoder returns its weights.
        """
        n_sequences = encoder_outputs.size(0)
        step_input = torch.full(
            (n_sequences, 1), SOS_token, dtype=torch.long, device=device
        )
        state = encoder_hidden
        teacher_forcing = target_tensor is not None
        step_logits = []

        for step in range(MAX_LENGTH):
            logits, state = self.forward_step(step_input, state)
            step_logits.append(logits)

            if teacher_forcing:
                # Next input is the ground-truth token of this step.
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Next input is the model's own best guess, cut from the graph.
                best_ids = logits.topk(1)[1]
                step_input = best_ids.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
        # `None` keeps the return arity aligned with AttnDecoderRNN.
        return log_probs, state, None

    def forward_step(self, input, hidden):
        """Run one step: embedding -> ReLU -> GRU -> linear projection."""
        activated = F.relu(self.embedding(input))
        gru_out, hidden = self.gru(activated, hidden)
        return self.out(gru_out), hidden

In [45]:
class BahdanauAttention(nn.Module):
    """Additive attention: ``score = Va(tanh(Wa(query) + Ua(keys)))``."""

    def __init__(self, hidden_size):
        """Create the three linear maps used to score query/key pairs."""
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Return ``(context, weights)`` for ``query`` attending over ``keys``.

        The weights are a softmax over the last dimension of the reshaped
        scores; the context is the batched matrix product of weights and keys.
        Assumes query is (batch, 1, hidden) and keys is (batch, seq, hidden),
        as AttnDecoderRNN.forward_step appears to pass them -- TODO confirm.
        """
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        energy = torch.tanh(projected_query + projected_keys)

        # Drop the trailing size-1 score dim, then add a dim at position 1 so
        # the weights can left-multiply `keys` in bmm.
        scores = self.Va(energy).squeeze(2).unsqueeze(1)
        weights = F.softmax(scores, dim=-1)

        return torch.bmm(weights, keys), weights

In [46]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Always unrolls exactly ``MAX_LENGTH`` steps, starting from an ``SOS_token``
    input and the encoder's final hidden state.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        """
        Args:
            hidden_size: width of the embeddings and of the GRU state.
            output_size: number of embedding rows and of output logits per step.
            dropout_p: dropout probability applied to the embedded input token.
        """
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # The GRU consumes the embedded token concatenated with the context.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``MAX_LENGTH`` steps.

        Args:
            encoder_outputs: attended over at every step; its first dimension
                is used as the batch size.
            encoder_hidden: initial GRU hidden state.
            target_tensor: when given, column ``i`` is fed as the input of step
                ``i + 1`` (teacher forcing); otherwise the decoder feeds back
                its own top-1 prediction.

        Returns:
            ``(log_probs, hidden, attentions)``: per-step log-softmax scores
            and per-step attention weights, each concatenated along dim 1,
            plus the final hidden state.
        """
        n_sequences = encoder_outputs.size(0)
        step_input = torch.full(
            (n_sequences, 1), SOS_token, dtype=torch.long, device=device
        )
        state = encoder_hidden
        teacher_forcing = target_tensor is not None
        step_logits, step_weights = [], []

        for step in range(MAX_LENGTH):
            logits, state, weights = self.forward_step(
                step_input, state, encoder_outputs
            )
            step_logits.append(logits)
            step_weights.append(weights)

            if teacher_forcing:
                # Next input is the ground-truth token of this step.
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Next input is the model's own best guess, cut from the graph.
                best_ids = logits.topk(1)[1]
                step_input = best_ids.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
        attentions = torch.cat(step_weights, dim=1)
        return log_probs, state, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one step and return ``(logits, hidden, attn_weights)``."""
        embedded = self.dropout(self.embedding(input))

        # Swap the first two dims of the hidden state to use it as the query.
        query = hidden.transpose(0, 1)
        context, attn_weights = self.attention(query, encoder_outputs)

        gru_in = torch.cat((embedded, context), dim=2)
        gru_out, hidden = self.gru(gru_in, hidden)

        return self.out(gru_out), hidden, attn_weights

In [5]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    """Translate one sentence greedily.

    Args:
        encoder, decoder: callables with the interfaces of the encoder/decoder
            modules defined in this notebook.
        sentence: space-separated source text; every token must already be in
            ``input_lang`` (it is looked up by tensorFromSentence).
        input_lang: vocabulary used to encode ``sentence``.
        output_lang: vocabulary whose ``index2word`` decodes the predictions.

    Returns:
        ``(decoded_words, decoder_attn)``: the predicted words, ending with
        '<EOS>' if the end marker was produced, and whatever the decoder
        returned as its third value.
    """
    with torch.no_grad():
        source_ids = tensorFromSentence(input_lang, sentence)

        enc_outputs, enc_hidden = encoder(source_ids)
        log_probs, _, attention = decoder(enc_outputs, enc_hidden)

        # Top-1 index per step, with all size-1 dims squeezed away.
        best_ids = log_probs.topk(1)[1].squeeze()

        words = []
        for token in best_ids:
            token_id = token.item()
            if token_id == EOS_token:
                words.append('<EOS>')
                break
            words.append(output_lang.index2word[token_id])
    return words, attention

In [6]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly chosen corpus pairs next to the model's translation.

    Reads the notebook-level ``pairs``, ``input_lang`` and ``output_lang``.
    For each sample it prints the source ('>'), the reference ('=') and the
    model output ('<'), followed by an empty line.
    """
    for _ in range(n):
        source, reference = random.choice(pairs)[:2]
        print(f"> {source}")
        print(f"= {reference}")
        predicted, _attn = evaluate(encoder, decoder, source, input_lang, output_lang)
        print('<', ' '.join(predicted))
        print('')

In [11]:
# Both decoders unroll exactly MAX_LENGTH steps, and filterPair keeps only pairs
# whose sentences have fewer than MAX_LENGTH space-separated tokens.
MAX_LENGTH = 10

def unicodeToAscii(s):
    """Strip combining marks (accents) from ``s``.

    The string is NFD-decomposed and every character of Unicode category 'Mn'
    is dropped. Characters without such a decomposition are left untouched, so
    the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lowercase, trim and reduce ``s`` to ASCII letters plus '!' and '?'.

    Accents are stripped via unicodeToAscii, '!' and '?' are separated from
    the preceding word by a space, and every other run of characters that are
    not ASCII letters (periods, digits, whitespace, ...) becomes one space.
    """
    text = unicodeToAscii(s.lower().strip())
    # Put a space in front of sentence punctuation: "hi!" -> "hi !".
    spaced = re.sub(r"([.!?])", r" \1", text)
    # Collapse everything outside [a-zA-Z!?] into single spaces, then trim.
    return re.sub(r"[^a-zA-Z!?]+", r" ", spaced).strip()

def filterPair(p):
    """Tell whether a sentence pair is short enough to be kept.

    Args:
        p: indexable whose first two items are the source and target strings.

    Returns:
        True when both sentences have fewer than MAX_LENGTH space-separated
        tokens, otherwise False. A malformed pair (missing item, non-string
        item, non-indexable value) is printed and rejected with False.
    """
    try:
        return (
            len(p[0].split(' ')) < MAX_LENGTH
            and len(p[1].split(' ')) < MAX_LENGTH
        )
    except (AttributeError, LookupError, TypeError):
        # Only the errors a malformed pair can cause are caught, so that
        # KeyboardInterrupt and real bugs are no longer swallowed. The pair is
        # still printed for inspection and explicitly filtered out.
        print(p)
        return False
        
def filterPairs(pairs):
    """Return a new list with the pairs for which filterPair is truthy."""
    return list(filter(filterPair, pairs))



In [14]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of ``sentence`` to its index.

    Looks tokens up in ``lang.word2index``; a token that is not in the
    vocabulary raises KeyError.
    """
    lookup = lang.word2index
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(lookup[token])
    return token_ids

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a ``(1, n_tokens + 1)`` long tensor on ``device``.

    The token indices come from indexesFromSentence and are followed by
    EOS_token; the leading dimension of size 1 is the batch dimension.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    return torch.tensor([token_ids], dtype=torch.long, device=device)

def tensorsFromPair(pair):
    """Encode a (source, target) pair with the notebook-level vocabularies.

    ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with
    ``output_lang``; returns ``(input_tensor, target_tensor)``.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

In [8]:
def prepareData(lang1, lang2, file):
    """Read a tab-separated parallel corpus and build both vocabularies.

    Each line of ``file`` is split on tabs and only the first two columns are
    kept and normalised with normalizeString; lines with fewer than two
    columns are dropped. The surviving pairs are passed through filterPairs
    and every remaining sentence is added to the matching vocabulary.

    Args:
        lang1: name given to the vocabulary of the first column.
        lang2: name given to the vocabulary of the second column.
        file: path of the UTF-8 encoded corpus.

    Returns:
        ``(input_lang, output_lang, pairs)`` where ``pairs`` is the filtered
        list of ``[first_column, second_column]`` sentence lists.
    """
    # The context manager closes the handle even if reading fails; the
    # original left the file open until garbage collection.
    with open(file, encoding='utf-8') as corpus:
        lines = corpus.read().split('\n')

    # Slice to two columns *before* normalising so extra columns are never
    # processed; the result is the same as normalising everything and slicing.
    candidates = [
        [normalizeString(field) for field in line.split('\t')[:2]]
        for line in lines
    ]
    candidates = [pair for pair in candidates if len(pair) == 2]

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    pairs = filterPairs(candidates)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [9]:
path = "./models/"
# SECURITY: these files hold whole pickled modules, and unpickling can execute
# arbitrary code -- only load checkpoints you trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects pickled modules; confirm against the installed version before
# opting out with weights_only=False.
# map_location puts the restored modules on `device`, the same device on which
# tensorFromSentence creates the input tensors.
encoder = torch.load(path+"translate_sp_en_encoder.pt", map_location=device)
decoder = torch.load(path+"translate_sp_en_decoder.pt", map_location=device)

In [12]:
# Rebuild the vocabularies and the filtered sentence pairs from the corpus.
# The first tab-separated column feeds `input_lang` (named 'eng'), the second
# feeds `output_lang` (named 'spa').
file = 'data/spa.txt'
input_lang, output_lang, pairs = prepareData('eng', 'spa', file)

Counted words:
eng 12105
spa 23411


In [17]:
# Switch both loaded modules to evaluation mode before sampling, then print
# ten random corpus pairs (the default n) next to the model's translation.
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> that wasn t my fault
= eso no fue mi culpa
< no fue mi culpa mia <EOS>

> please smile
= sonreid
< favor le puso gasolina en las armas <EOS>

> she s at a meeting
= ella esta en una reunion
< ella es un aliado en la reunion <EOS>

> you learn something new every day
= cada dia aprendes algo nuevo
< aprendiste un nuevo dia de ingles <EOS>

> this machine can print sixty pages a minute
= este aparato puede imprimir sesenta paginas por minuto
< este aparato puede imprimir sesenta paginas por minuto <EOS>

> is this price acceptable ?
= es aceptable el precio ?
< es aceptable el precio de la mia ? <EOS>

> why are you wearing my coat ?
= por que estas usando mi abrigo ?
< por que lleva usted mi abrigo ? <EOS>

> tom was just as scared as mary was
= tom estaba tan asustado como lo estaba mary
< tom estaba tan asustado como lo estaba mary <EOS>

> this was his one and only hope
= esta era su unica esperanza
< era su unica parque solo tiene una necesidad urgente <EOS>

> did you already do 

In [18]:
# Hand-written spot-check; the cell displays evaluate()'s (decoded words, attention) tuple.
evaluate(encoder, decoder, 'she is my sister', input_lang, output_lang)

(['ella', 'es', 'mi', 'hermana', 'mayor', '<EOS>'], None)

In [29]:
# Spot-check: present-progressive sentence.
evaluate(encoder, decoder, 'i am cleaning my house', input_lang, output_lang)

(['estoy', 'contento', 'de', 'mi', 'casa', 'es', 'verde', '<EOS>'], None)

In [21]:
# Spot-check: question; '?' is written as its own space-separated token.
evaluate(encoder, decoder, 'when is homework due ?', input_lang, output_lang)

(['cuando', 'puedo', 'hacer', 'los', 'documentos', '?', '<EOS>'], None)

In [22]:
# Spot-check: contraction written the way normalizeString renders it ("i'm" -> "i m").
evaluate(encoder, decoder, 'i m scared', input_lang, output_lang)

(['tengo', 'miedo', 'de', 'tener', 'miedo', '<EOS>'], None)

In [32]:
# First of four cells that differ only in the possessive pronoun: 'my'.
evaluate(encoder, decoder, 'what is my name ?', input_lang, output_lang)

(['como', 'se', 'llama', 'mi', '?', '<EOS>'], None)

In [24]:
# Possessive-pronoun variant: 'your'.
evaluate(encoder, decoder, 'what is your name ?', input_lang, output_lang)

(['cual', 'es', 'tu', 'nombre', '?', '<EOS>'], None)

In [25]:
# Possessive-pronoun variant: 'her'.
evaluate(encoder, decoder, 'what is her name ?', input_lang, output_lang)

(['como', 'se', 'llama', 'tu', 'nombre', '?', '<EOS>'], None)

In [27]:
# Possessive-pronoun variant: 'his'.
evaluate(encoder, decoder, 'what is his name ?', input_lang, output_lang)

(['como', 'se', 'llama', 'tu', 'nombre', '?', '<EOS>'], None)

In [28]:
# Spot-check: short phrase without a verb.
evaluate(encoder, decoder, 'many years later', input_lang, output_lang)

(['muchos', 'anos', 'mas', 'lejos', 'mucho', '<EOS>'], None)

In [33]:
# Spot-check: 'taking' used in the sense of attending a class.
evaluate(encoder, decoder, 'i m taking an english class', input_lang, output_lang)

(['estoy', 'bebiendo', 'un', 'vaso', 'de', 'ingles', '<EOS>'], None)

In [31]:
# Rephrasing of the previous sentence with 'studying in' instead of 'taking'.
evaluate(encoder, decoder, 'i m studying in an english class', input_lang, output_lang)

(['estoy', 'estudiando', 'ingles', 'en', 'el', 'estudio', '<EOS>'], None)

In [43]:
# Spot-check: 'play' used as a verb.
evaluate(encoder, decoder, 'they play soccer', input_lang, output_lang)

(['juegan', 'al', 'futbol', 'al', 'futbol', '<EOS>'], None)

In [None]:
# Note: "play" as a verb -> "jugar"
# Note: "play" as a noun -> "actividad de entretenimiento"

In [44]:
# Spot-check: nine-token sentence containing the phrasal verb 'showed up'.
evaluate(encoder, decoder, 'my brother never showed up in my party', input_lang, output_lang)

(['mi', 'hermano', 'nunca', 'se', 'fue', 'en', 'mi', 'fiesta', '<EOS>'], None)

In [47]:
path = "./models/"
# SECURITY: these files hold whole pickled modules, and unpickling can execute
# arbitrary code -- only load checkpoints you trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects pickled modules; confirm against the installed version before
# opting out with weights_only=False.
# map_location puts the restored modules on `device`, the same device on which
# tensorFromSentence creates the input tensors.
# .eval() is chained (it returns the module itself) so that any dropout layers
# in these models are disabled for the spot-checks that follow; the baseline
# models get the same treatment before they are evaluated.
encoder_attn = torch.load(path+"translate_sp_en_attn_encoder.pt", map_location=device).eval()
decoder_attn = torch.load(path+"translate_sp_en_attn_decoder.pt", map_location=device).eval()

In [52]:
# Same sentence as the first baseline spot-check, now through the attention
# models; [0] keeps only the decoded words and drops the attention value.
evaluate(encoder_attn, decoder_attn, 'she is my sister', input_lang, output_lang)[0]

['ella', 'es', 'mi', 'hermana', 'llamado', '<EOS>']