In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import codecs

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
def read_file(path):
    
    with open(path, encoding='utf-8', mode='r+') as f:
        data = f.read()
    
    return data.strip().split('\n')

def preprocess_word(word):
    
    s = re.sub(r'([.,?!])', r' \1', word)
    s = re.sub(r'[^a-zA-Z.\'!?]+', r' ', word)

    return s

def split_en_fra(line):
    
    en = []
    fra = []
    pairs = []
    
    for l in line:
        pair = l.lower().split('\t')
        en_word = pair[0]
        fra_word = pair[1]

        en.append(preprocess_word(pair[0]))
        fra.append(preprocess_word(pair[1]))
        pairs.append(pair)
        
    return en, fra, pairs

def filter_pairs(pairs, max_length=10):
    
    filtered_pairs = []
    
    eng_prefixes = (
        "i am", "i m ",
        "he is", "he s",
        "she is", "she s",
        "you are", "you re",
        "we are", "we re",
        "they are", "they re"
    )
    
    for p in pairs:
        pair1 = preprocess_word(p[0])
        pair2 = preprocess_word(p[1])
        if len(pair1.split(' ')) < max_length and len(pair2.split(' ')) < max_length and pair1.startswith(eng_prefixes):
            filtered_pairs.append([pair1, pair2])
            
    return filtered_pairs

In [15]:
line = read_file('./data/eng-fra.txt')

In [16]:
line[:5]

['Go.\tVa !',
 'Run!\tCours\u202f!',
 'Run!\tCourez\u202f!',
 'Wow!\tÇa alors\u202f!',
 'Fire!\tAu feu !']

In [17]:
en, fra, pairs = split_en_fra(line)
filtered_pairs = filter_pairs(pairs)

In [18]:
print(en[:5])
print(fra[:5])
print(pairs[:5])
print(filtered_pairs[:5])
print(len(pairs))
print(len(filtered_pairs))

['go.', 'run!', 'run!', 'wow!', 'fire!']
['va !', 'cours !', 'courez !', ' a alors !', 'au feu !']
[['go.', 'va !'], ['run!', 'cours\u202f!'], ['run!', 'courez\u202f!'], ['wow!', 'ça alors\u202f!'], ['fire!', 'au feu !']]
[['i am fat.', 'je suis gras.'], ['he is ill.', 'il est malade.'], ['he is old.', 'il est vieux.'], ['i am busy.', 'je suis occup .'], ['i am calm.', 'je suis calme.']]
135842
3775


In [19]:
def build_dict(eng_list, fra_list):
    
    start_token = 0
    end_token = 1
    
    en_idx = {}
    fra_idx = {}
    idx_en = {}
    idx_fra = {}
    
    en_idx['SOS'] = start_token
    en_idx['EOS'] = end_token
    fra_idx['SOS'] = start_token
    fra_idx['EOS'] = end_token
    
    idx = 2
    for e in eng_list:
        for word in e.split():
            if word not in en_idx:
                en_idx[word] = idx
                idx += 1
    idx = 2
    for f in fra_list:
        for word in f.split():
            if word not in fra_idx:
                fra_idx[word] = idx
                idx += 1
            
    idx_en[0] = 'SOS'
    idx_en[1] = 'EOS'
    idx_fra[0] = 'SOS'
    idx_fra[1] = 'EOS'
        
    for word, idx in en_idx.items():
        idx_en[idx] = word
    for word, idx in fra_idx.items():
        idx_fra[idx] = word
    
    return en_idx, fra_idx, idx_en, idx_fra

In [20]:
en_idx, fra_idx, idx_en, idx_fra = build_dict(en, fra)

In [21]:
len(en_idx) == len(idx_en)

True

In [74]:
def en_to_idx(sentence):
    return [en_idx[word] for word in sentence.split(' ')]

def fra_to_idx(sentence):
    return [fra_idx[word] for word in sentence.split(' ')]

def to_tensor(indexes):
    indexes.append(1)
    return torch.tensor(indexes, device=device).view(-1, 1)

def tensor_from_pairs(pair):
    input_tensor = to_tensor(en_to_idx(pair[0].strip()))
    target_tensor = to_tensor(fra_to_idx(pair[1].strip()))
    
    return input_tensor, target_tensor

In [79]:
rand_idx = random.randint(0, len(filtered_pairs))
print(filtered_pairs[rand_idx])
print(tensor_from_pairs(filtered_pairs[rand_idx]))

['he is a heroin addict.', 'il est h ro no d pendant.']
(tensor([[  66],
        [ 245],
        [ 110],
        [7011],
        [2851],
        [   1]], device='cuda:0'), tensor([[  71],
        [  73],
        [ 880],
        [1805],
        [3707],
        [  97],
        [7183],
        [   1]], device='cuda:0'))


In [76]:
# en_tensor = []
# fra_tensor = []

# for e in en:
#     en_tensor.append(to_tensor(en_to_idx(e.strip())))
# for f in fra:
#     fra_tensor.append(to_tensor(fra_to_idx(f.strip())))

In [None]:
class EncoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, inp, hidden):
        embed = self.embedding(inp).view(1, 1, -1)
        output = embed
        output, hidden = self.gru(output, hidden)
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [31]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Dense(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, inp, hidden):
        output = self.embedding(inp).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.ouut(output[0]))
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [32]:
class AttnDecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super(AttnDecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, inp, hidden, encoder_outputs):
        embed = self.embedding(inp).view(1, 1, -1)
        embed = self.dropout(embed)
        
        attn_weights = F.softmax(self.attn(torch.cat((embed[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        output = torch.cat((embed[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = F.log_softmax(self.out(output[0]), dim=1)
        
        return output, hidden, attn_weights
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [33]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, crit, max_length=10):
    
    encoder_hidden = encoder.init_hidden()
    
    encoder_opt.zero_grad()
    decoder_opt.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
    training_loss = 0
    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        encoder_outputs[i] = encoder_output[0,0]
        
    decoder_input = torch.tensor([[0]], device=device)
    
    decoder_hidden = encoder_hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            
            training_loss += crit(decoder_output, target_tensor[i])
            decoder_input = target_tensor[i]
            
    else:
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            
            training_loss += crit(decoder_output, target_tensor[i])
            
            if decoder_input.item() == 1:
                break
                
    training_loss.backward()
    encoder_opt.step()
    decoder_opt.step()
    
    return training_loss.item() / target_length

In [37]:
def train_iters(encoder, decoder, n_iters, print_every=10, plot_every=100, lr=0.01):
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    
    encoder_opt = optim.SGD(encoder.parameters(), lr=lr)
    decoder_opt = optim.SGD(decoder.parameters(), lr=lr)
#     training_pairs = [tensor_from_pairs(random.choice(filtered_pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()
    
    for i in range(n_iters):
        rand_pairs = random.choice(filtered_pairs)
        training_pairs = tensor_from_pairs(rand_pairs)
#         training_pair = training_pairs[i]
        input_tensor = training_pairs[0]
        target_tensor = training_pairs[1]
        
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        
        if i % print_every == 0:
            train_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print("Loss:", train_loss_avg)
            
#         if i % plot_every == 0:
#             plot_loss_avg = print_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_avg = 0
            

In [57]:
tensor_from_pairs(random.choice(filtered_pairs))

(tensor([[  66],
         [ 144],
         [2012],
         [   1]], device='cuda:0'),
 tensor([[ 71],
         [  6],
         [299],
         [284],
         [  1]], device='cuda:0'))

In [38]:
hidden_size = 256
encoder1 = EncoderRNN(len(en_idx), hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, len(fra_idx), dropout_p=0.1).to(device)

train_iters(encoder1, attn_decoder1, 75000, print_every=5000)

Loss: 0.0020491358947753906
Loss: 4.4502176980467025
Loss: 3.557711275358076
Loss: 2.9388878632730724
Loss: 2.400896590808083
Loss: 1.9453321599769453
Loss: 1.5924743188971298
Loss: 1.275214881206147
Loss: 1.0841896553254342
Loss: 0.8951137886225994
Loss: 0.7469192901023616
Loss: 0.6370441047694261
Loss: 0.5743444136838698
Loss: 0.527690010658919
Loss: 0.4842822133519212


In [51]:
def evaluate(encoder, decoder, sentence, max_length=10):
    
    with torch.no_grad():
        input_tensor = to_tensor(en_to_idx(sentence))
        input_length = input_tensor.size(0)
        encoder_hidden = encoder.init_hidden()
        
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        
        for i in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
            encoder_outputs[i] += encoder_output[0,0]
        
        decoder_input = torch.tensor([[0]], device=device)
        decoder_hidden = encoder_hidden
        
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)
        
        for i in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[i] = decoder_attention.data
            
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == 1:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(idx_fra[topi.item()])
                
            decoder_input = topi.squeeze().detach()
            
        return decoded_words, decoder_attention[:i + 1]
        

In [53]:
def evaluate_randomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(filtered_pairs)
        print('English:', pair[0])
        print('Actual French translation:', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('Model translation:', output_sentence)
        print('')

In [54]:
evaluate_randomly(encoder1, attn_decoder1)

English: he sometimes watches tv.
Actual French translation: il regarde parfois la t l .
Model translation: il regarde parfois la t l . <EOS>

English: you really don't have a clue do you?
Actual French translation: t'as vraiment pas l'ombre d'une id e si ?
Model translation: vous tes vraiment ignare non ? <EOS>

English: i am on holiday this week.
Actual French translation: je suis en cong cette semaine.
Model translation: je suis en cong cette semaine. cette semaine. <EOS>

English: he is unpopular for some reason.
Actual French translation: il est impopulaire pour une raison quelconque.
Model translation: il est impopulaire pour une raison quelconque. <EOS>

English: he is a bank clerk.
Actual French translation: il est employ de banque.
Model translation: il est employ de banque. <EOS>

English: he is having lunch now.
Actual French translation: il est en train de d jeuner l'heure actuelle.
Model translation: il est en train de d jeuner l'heure actuelle. <EOS>

English: i am yours 