In [73]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import codecs

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [91]:
def read_file(path):
    
    with open(path, encoding='utf-8', mode='r+') as f:
        data = f.read()
    
    return data.strip().split('\n')

def preprocess_word(word):
    
    s = re.sub(r'([.?!])', r' \1', word)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', word)

    return s

def split_en_fra(line):
    
    en = []
    fra = []
    pairs = []
    
    for l in line:
        pair = l.lower().split('\t')
        en_word = pair[0]
        fra_word = pair[1]

        en.append(preprocess_word(pair[0]))
        fra.append(preprocess_word(pair[1]))
        pairs.append(pair)
        
    return en, fra, pairs

def filter_pairs(pairs, max_length=10):
    
    filtered_pairs = []
    
    eng_prefixes = (
        "i am", "i m ",
        "he is", "he s",
        "she is", "she s",
        "you are", "you re",
        "we are", "we re",
        "they are", "they re"
    )
    
    for p in pairs:
        pair1 = preprocess_word(p[0])
        pair2 = preprocess_word(p[1])
        if len(pair1.split(' ')) < max_length and len(pair2.split(' ')) < max_length and pair1.startswith(eng_prefixes):
            filtered_pairs.append([pair1, pair2])
            
    return filtered_pairs

In [92]:
line = read_file('./data/eng-fra.txt')

In [93]:
line[:5]

['Go.\tVa !',
 'Run!\tCours\u202f!',
 'Run!\tCourez\u202f!',
 'Wow!\tÇa alors\u202f!',
 'Fire!\tAu feu !']

In [94]:
en, fra, pairs = split_en_fra(line)
filtered_pairs = filter_pairs(pairs)

In [95]:
print(en[:5])
print(fra[:5])
print(pairs[:5])
print(filtered_pairs[:5])
print(len(pairs))
print(len(filtered_pairs))

['go.', 'run!', 'run!', 'wow!', 'fire!']
['va !', 'cours !', 'courez !', ' a alors !', 'au feu !']
[['go.', 'va !'], ['run!', 'cours\u202f!'], ['run!', 'courez\u202f!'], ['wow!', 'ça alors\u202f!'], ['fire!', 'au feu !']]
[['i m .', 'j ai ans.'], ['i m ok.', 'je vais bien.'], ['i m ok.', ' a va.'], ['i m fat.', 'je suis gras.'], ['i m fat.', 'je suis gros.']]
135842
11927


In [96]:
def build_dict(eng_list, fra_list):
    
    start_token = 0
    end_token = 1
    
    en_idx = {}
    fra_idx = {}
    idx_en = {}
    idx_fra = {}
    
    en_idx['SOS'] = start_token
    en_idx['EOS'] = end_token
    fra_idx['SOS'] = start_token
    fra_idx['EOS'] = end_token
    
    idx = 2
    for e in eng_list:
        for word in e.split():
            if word not in en_idx:
                en_idx[word] = idx
                idx += 1
    idx = 2
    for f in fra_list:
        for word in f.split():
            if word not in fra_idx:
                fra_idx[word] = idx
                idx += 1
            
    idx_en[0] = 'SOS'
    idx_en[1] = 'EOS'
    idx_fra[0] = 'SOS'
    idx_fra[1] = 'EOS'
        
    for word, idx in en_idx.items():
        idx_en[idx] = word
    for word, idx in fra_idx.items():
        idx_fra[idx] = word
    
    return en_idx, fra_idx, idx_en, idx_fra

In [97]:
en_idx, fra_idx, idx_en, idx_fra = build_dict(en, fra)

In [98]:
len(en_idx) == len(idx_en)

True

In [121]:
def en_to_idx(sentence):
    return [en_idx[word] for word in sentence.split(' ')]

def fra_to_idx(sentence):
    return [fra_idx[word] for word in sentence.split(' ')]

def to_tensor(indexes):
    indexes.append(1)
    return torch.tensor(indexes, device=device).view(-1, 1)

def tensor_from_pairs(pair):
    input_tensor = to_tensor(en_to_idx(pair[0].strip()))
    target_tensor = to_tensor(fra_to_idx(pair[1].strip()))
    
    return input_tensor, target_tensor

In [122]:
rand_idx = random.randint(0, len(filtered_pairs))
print(filtered_pairs[rand_idx])
print(tensor_from_pairs(filtered_pairs[rand_idx]))

['he is sure of success.', 'il est s r de son succ s.']
(tensor([[  66],
        [ 243],
        [2175],
        [ 340],
        [6931],
        [   1]], device='cuda:0'), tensor([[  73],
        [  69],
        [ 303],
        [ 191],
        [  71],
        [3392],
        [3504],
        [ 624],
        [   1]], device='cuda:0'))


In [103]:
en_tensor = []
fra_tensor = []

for e in en:
    en_tensor.append(to_tensor(en_to_idx(e.strip())))
for f in fra:
    fra_tensor.append(to_tensor(fra_to_idx(f.strip())))

In [104]:
class EncoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, inp, hidden):
        embed = self.embedding(inp).view(1, 1, -1)
        output = embed
        output, hidden = self.gru(output, hidden)
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [105]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Dense(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, inp, hidden):
        output = self.embedding(inp).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.ouut(output[0]))
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [106]:
class AttnDecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super(AttnDecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, inp, hidden, encoder_outputs):
        embed = self.embedding(inp).view(1, 1, -1)
        embed = self.dropout(embed)
        
        attn_weights = F.softmax(self.attn(torch.cat((embed[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        output = torch.cat((embed[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = F.log_softmax(self.out(output[0]), dim=1)
        
        return output, hidden, attn_weights
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [107]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, crit, max_length=10):
    
    encoder_hidden = encoder.init_hidden()
    
    encoder_opt.zero_grad()
    decoder_opt.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
    training_loss = 0
    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        encoder_outputs[i] = encoder_output[0,0]
        
    decoder_input = torch.tensor([[0]], device=device)
    
    decoder_hidden = encoder_hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            
            training_loss += crit(decoder_output, target_tensor[i])
            decoder_input = target_tensor[i]
            
    else:
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            
            training_loss += crit(decoder_output, target_tensor[i])
            
            if decoder_input.item() == 1:
                break
                
    training_loss.backward()
    encoder_opt.step()
    decoder_opt.step()
    
    return training_loss.item() / target_length

In [129]:
def train_iters(encoder, decoder, n_iters, print_every=10, plot_every=100, lr=0.01):
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    
    encoder_opt = optim.SGD(encoder.parameters(), lr=lr)
    decoder_opt = optim.SGD(decoder.parameters(), lr=lr)
#     training_pairs = [tensor_from_pairs(random.choice(filtered_pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()
    
    for i in range(n_iters):
        rand_pairs = random.choice(filtered_pairs)
        training_pairs = tensor_from_pairs(rand_pairs)
#         training_pair = training_pairs[i]
        input_tensor = training_pairs[0]
        target_tensor = training_pairs[1]
        
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        
        if i % print_every == 0:
            train_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print("Loss:", train_loss_avg)
            
#         if i % plot_every == 0:
#             plot_loss_avg = print_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_avg = 0
            

In [130]:
hidden_size = 256
encoder1 = EncoderRNN(len(en_idx), hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, len(fra_idx), dropout_p=0.1).to(device)

train_iters(encoder1, attn_decoder1, 10000, print_every=100)

Loss: 0.10218090057373047
Loss: 6.131591665262267
Loss: 4.807781986403087
Loss: 5.109556426048279
Loss: 4.725137678297739
Loss: 4.956477164442576
Loss: 4.777958238298931
Loss: 4.847564830666498
Loss: 4.697543733676275
Loss: 4.700871934285241
Loss: 4.420794135846788
Loss: 4.408980877740042
Loss: 4.151803937851437
Loss: 3.962843799027185
Loss: 4.275886997518085
Loss: 4.234532964320409
Loss: 3.9615778729499334
Loss: 4.191663705383029
Loss: 3.963008569021075
Loss: 4.140494697264261
Loss: 4.028944581088567
Loss: 3.92569718529686
Loss: 3.9728033560609055
Loss: 3.6231213659483292
Loss: 3.86922737808833
Loss: 4.059077709932175
Loss: 3.9373146464294866
Loss: 4.022785940609281
Loss: 3.8412536865037588
Loss: 4.104906374079841
Loss: 4.076543157713752
Loss: 3.7580044850546206
Loss: 3.937291819228065
Loss: 4.154851230276956
Loss: 3.8179710206796242
Loss: 3.919900182226347
Loss: 3.6853385493490434
Loss: 3.7679762105639028
Loss: 3.7185018973388355
Loss: 3.60454008318129
Loss: 3.6838963870699435
Loss: 

In [131]:
def evaluate(encoder, decoder, sentence, max_length=10):
    
    with torch.no_grad():
        input_tensor = to_tensor(en_to_idx(sentence))
        input_length = input_tensor.size(0)
        encoder_hidden = encoder.init_hidden()
        
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        
        for i in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
            encoder_outputs[i] += encoder_output[0,0]
        
        decoder_input = torch.tensor([[0]], device=device)
        decoder_hidden = encoder_hidden
        
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)
        
        for i in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[i] = decoder_attention.data
            
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == 1:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(idx_fra[topi.item()])
                
            decoder_input = topi.squeeze().detach()
            
        return decoded_words, decoder_attention[:i + 1]
        

In [134]:
def evaluate_randomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(filtered_pairs)
        print('English:', pair[0])
        print('Actual France translation:', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('Model translation:', output_sentence)
        print('')

In [135]:
evaluate_randomly(encoder1, attn_decoder1)

English: you re very sophisticated.
Actual France translation: vous tes tr s l gante.
Model translation: tu es tr s <EOS>

English: i m not your brother.
Actual France translation: je ne suis pas ton fr re.
Model translation: je ne suis pas votre <EOS>

English: they re anxious for peace.
Actual France translation: ils s inqui tent pour la paix.
Model translation: elles sont en pour <EOS>

English: i m glad you re my friend.
Actual France translation: je me r jouis que tu sois mon ami.
Model translation: je suis r jouis que vous mon <EOS>

English: i m better looking than tom.
Actual France translation: je suis plus beau que tom.
Model translation: je suis en train de <EOS>

English: i m shorter than you.
Actual France translation: je suis plus petit que vous.
Model translation: je suis plus que que toi. <EOS>

English: i m sure that you ll succeed.
Actual France translation: je suis s r que tu vas r ussir.
Model translation: je suis heureux que vous vous <EOS>

English: he is crazy ab