In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import codecs

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def read_file(path):
    
    with open(path, encoding='utf-8', mode='r+') as f:
        data = f.read()
    
    return data.strip().split('\n')

def preprocess_word(word):
    
    s = re.sub(r'([.,?!])', r' \1', word)
    s = re.sub(r'[^a-zA-Z.\'!?]+', r' ', word)

    return s

def split_en_fra(line):
    
    en = []
    fra = []
    pairs = []
    
    for l in line:
        pair = l.lower().split('\t')
        en_word = pair[0]
        fra_word = pair[1]

        en.append(preprocess_word(pair[0]))
        fra.append(preprocess_word(pair[1]))
        pairs.append(pair)
        
    return en, fra, pairs

def filter_pairs(pairs, max_length=10):
    
    filtered_pairs = []
    
    eng_prefixes = (
        "i am", "i m ",
        "he is", "he s",
        "she is", "she s",
        "you are", "you re",
        "we are", "we re",
        "they are", "they re"
    )
    
    for p in pairs:
        pair1 = preprocess_word(p[0])
        pair2 = preprocess_word(p[1])
        if len(pair1.split(' ')) < max_length and len(pair2.split(' ')) < max_length and pair1.startswith(eng_prefixes):
            filtered_pairs.append([pair1, pair2])
            
    return filtered_pairs

In [5]:
line = read_file('./data/eng-fra.txt')

In [6]:
line[:5]

['Go.\tVa !',
 'Run!\tCours\u202f!',
 'Run!\tCourez\u202f!',
 'Wow!\tÇa alors\u202f!',
 'Fire!\tAu feu !']

In [7]:
en, fra, pairs = split_en_fra(line)
filtered_pairs = filter_pairs(pairs)

In [8]:
print(en[:5])
print(fra[:5])
print(pairs[:5])
print(filtered_pairs[:5])
print(len(pairs))
print(len(filtered_pairs))

['go.', 'run!', 'run!', 'wow!', 'fire!']
['va !', 'cours !', 'courez !', ' a alors !', 'au feu !']
[['go.', 'va !'], ['run!', 'cours\u202f!'], ['run!', 'courez\u202f!'], ['wow!', 'ça alors\u202f!'], ['fire!', 'au feu !']]
[['i am fat.', 'je suis gras.'], ['he is ill.', 'il est malade.'], ['he is old.', 'il est vieux.'], ['i am busy.', 'je suis occup .'], ['i am calm.', 'je suis calme.']]
135842
3775


In [9]:
def build_dict(eng_list, fra_list):
    
    start_token = 0
    end_token = 1
    
    en_idx = {}
    fra_idx = {}
    idx_en = {}
    idx_fra = {}
    
    en_idx['SOS'] = start_token
    en_idx['EOS'] = end_token
    fra_idx['SOS'] = start_token
    fra_idx['EOS'] = end_token
    
    idx = 2
    for e in eng_list:
        for word in e.split():
            if word not in en_idx:
                en_idx[word] = idx
                idx += 1
    idx = 2
    for f in fra_list:
        for word in f.split():
            if word not in fra_idx:
                fra_idx[word] = idx
                idx += 1
            
    idx_en[0] = 'SOS'
    idx_en[1] = 'EOS'
    idx_fra[0] = 'SOS'
    idx_fra[1] = 'EOS'
        
    for word, idx in en_idx.items():
        idx_en[idx] = word
    for word, idx in fra_idx.items():
        idx_fra[idx] = word
    
    return en_idx, fra_idx, idx_en, idx_fra

In [10]:
en_idx, fra_idx, idx_en, idx_fra = build_dict(en, fra)

In [11]:
len(en_idx) == len(idx_en)

True

In [12]:
def en_to_idx(sentence):
    return [en_idx[word] for word in sentence.split(' ')]

def fra_to_idx(sentence):
    return [fra_idx[word] for word in sentence.split(' ')]

def to_tensor(indexes):
    indexes.append(1)
    return torch.tensor(indexes, device=device).view(-1, 1)

def tensor_from_pairs(pair):
    input_tensor = to_tensor(en_to_idx(pair[0].strip()))
    target_tensor = to_tensor(fra_to_idx(pair[1].strip()))
    
    return input_tensor, target_tensor

In [13]:
rand_idx = random.randint(0, len(filtered_pairs))
print(filtered_pairs[rand_idx])
print(tensor_from_pairs(filtered_pairs[rand_idx]))

['she is a very intelligent young lady.', "c'est une jeune femme tr s intelligente."]
(tensor([[ 184],
        [ 245],
        [ 110],
        [1233],
        [9038],
        [7055],
        [5493],
        [   1]], device='cuda:0'), tensor([[  67],
        [ 753],
        [2400],
        [2320],
        [ 499],
        [ 313],
        [4012],
        [   1]], device='cuda:0'))


In [14]:
def generate_batch_tensor(batch_size, pairs):
    
    batch_data = []
    random.shuffle(pairs)
    
    for p in pairs:
        batch_data.append(tensor_from_pairs(p))
        
        if len(batch_data) == batch_size:

            yield batch_data

In [15]:
next(generate_batch_tensor(5, filtered_pairs))

[(tensor([[ 104],
          [2609],
          [  86],
          [ 345],
          [2198],
          [   1]], device='cuda:0'),
  tensor([[ 134],
          [ 156],
          [ 367],
          [9141],
          [2716],
          [   1]], device='cuda:0')),
 (tensor([[ 184],
          [1506],
          [ 249],
          [   1]], device='cuda:0'),
  tensor([[ 324],
          [3354],
          [ 443],
          [   1]], device='cuda:0')),
 (tensor([[  66],
          [ 245],
          [4833],
          [ 345],
          [ 609],
          [3362],
          [ 345],
          [1112],
          [   1]], device='cuda:0'),
  tensor([[  71],
          [1298],
          [1510],
          [ 389],
          [2512],
          [  69],
          [1897],
          [   1]], device='cuda:0')),
 (tensor([[ 104],
          [ 405],
          [1233],
          [1063],
          [   1]], device='cuda:0'),
  tensor([[ 134],
          [  46],
          [ 499],
          [ 313],
          [1813],
          [   1]],

In [16]:
class EncoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.input_size = input_size
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, inp, hidden):
        embed = self.embedding(inp).view(1, 1, -1)
        output = embed
        output, hidden = self.gru(output, hidden)
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [17]:
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Dense(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, inp, hidden):
        output = self.embedding(inp).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.ouut(output[0]))
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [18]:
class AttnDecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=10):
        super(AttnDecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, inp, hidden, encoder_outputs):
        embed = self.embedding(inp).view(1, 1, -1)
        embed = self.dropout(embed)
        
        attn_weights = F.softmax(self.attn(torch.cat((embed[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        output = torch.cat((embed[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        
        output = F.log_softmax(self.out(output[0]), dim=1)
        
        return output, hidden, attn_weights
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [19]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, crit, max_length=10):
    
    encoder_hidden = encoder.init_hidden()
    
    encoder_opt.zero_grad()
    decoder_opt.zero_grad()
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
    training_loss = 0
    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        encoder_outputs[i] = encoder_output[0,0]
        
    decoder_input = torch.tensor([[0]], device=device)
    
    decoder_hidden = encoder_hidden
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            
            training_loss += crit(decoder_output, target_tensor[i])
            decoder_input = target_tensor[i]
            
    else:
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            
            training_loss += crit(decoder_output, target_tensor[i])
            
            if decoder_input.item() == 1:
                break
                
    training_loss.backward()
    encoder_opt.step()
    decoder_opt.step()
    
    return training_loss.item() / target_length

In [20]:
def train_iters(encoder, decoder, n_iters, lr, print_every=10, plot_every=100):
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    
    encoder_opt = optim.SGD(encoder.parameters(), lr=lr)
    decoder_opt = optim.SGD(decoder.parameters(), lr=lr)
#     training_pairs = [tensor_from_pairs(random.choice(filtered_pairs)) for _ in range(n_iters)]
    criterion = nn.NLLLoss()
    
    for i in range(n_iters):
        rand_pairs = random.choice(filtered_pairs)
        training_pairs = tensor_from_pairs(rand_pairs)
#         training_pair = training_pairs[i]
        input_tensor = training_pairs[0]
        target_tensor = training_pairs[1]
        
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        
        if i % print_every == 0:
            train_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print("Epoch #{} Loss:{}".format(i // print_every, train_loss_avg))
            
#         if i % plot_every == 0:
#             plot_loss_avg = print_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_avg = 0
            

In [30]:
def train_batch(encoder, decoder, batch_size, n_epoch, lr):
    
    train_loss = 0
    
    encoder_opt = optim.Adam(encoder.parameters(), lr=lr)
    decoder_opt = optim.Adam(decoder.parameters(), lr=lr)
    criterion = nn.NLLLoss()
    
    for i in range(n_epoch):
        
        for batch_idx, batch in enumerate(next(generate_batch_tensor(batch_size, filtered_pairs))):
            input_tensor = batch[0]
            target_tensor = batch[1]
            
            loss = train(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, criterion)
            train_loss = train_loss + (1 / (batch_idx + 1)) * (loss - train_loss)
            
        print("Epoch #{} | Loss:{}".format(i+1, train_loss))
        
        train_loss = 0

In [33]:
hidden_size = 256
encoder1 = EncoderRNN(len(en_idx), hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, len(fra_idx), dropout_p=0.1).to(device)

train_batch(encoder1, attn_decoder1, batch_size=32, n_epoch=500, lr=0.001)

Epoch #1 | Loss:5.765310939377736
Epoch #2 | Loss:5.289547710286247
Epoch #3 | Loss:5.09117398801304
Epoch #4 | Loss:4.61146394687276
Epoch #5 | Loss:4.435002452844665
Epoch #6 | Loss:4.603814513307242
Epoch #7 | Loss:4.97554296401758
Epoch #8 | Loss:5.000761142798832
Epoch #9 | Loss:4.495552404532354
Epoch #10 | Loss:4.420061555552105
Epoch #11 | Loss:4.684738720858855
Epoch #12 | Loss:4.296890112970557
Epoch #13 | Loss:4.819211546438081
Epoch #14 | Loss:4.501241597438615
Epoch #15 | Loss:5.0883841070627405
Epoch #16 | Loss:5.048834842088676
Epoch #17 | Loss:4.947392651817156
Epoch #18 | Loss:4.449600359773824
Epoch #19 | Loss:4.526536537257453
Epoch #20 | Loss:4.68247251565021
Epoch #21 | Loss:4.335761953653798
Epoch #22 | Loss:4.909706726599307
Epoch #23 | Loss:4.510367498724234
Epoch #24 | Loss:4.474223959587868
Epoch #25 | Loss:4.7215478843166725
Epoch #26 | Loss:4.432063117233062
Epoch #27 | Loss:4.889441294305854
Epoch #28 | Loss:4.416071051808575
Epoch #29 | Loss:4.639354751223

Epoch #230 | Loss:2.8674852035318814
Epoch #231 | Loss:2.558637399983311
Epoch #232 | Loss:2.828566097160653
Epoch #233 | Loss:2.1341343788164
Epoch #234 | Loss:2.5368949079413023
Epoch #235 | Loss:2.4386786462413896
Epoch #236 | Loss:2.634414162942105
Epoch #237 | Loss:2.505672268380249
Epoch #238 | Loss:2.1846751513403087
Epoch #239 | Loss:2.2618623967947697
Epoch #240 | Loss:2.796358995766393
Epoch #241 | Loss:2.198122224377261
Epoch #242 | Loss:2.463168417461335
Epoch #243 | Loss:2.246712172983421
Epoch #244 | Loss:2.1831374128009116
Epoch #245 | Loss:2.3791690416752345
Epoch #246 | Loss:1.8733487445565444
Epoch #247 | Loss:2.490657889133408
Epoch #248 | Loss:2.3538042795016536
Epoch #249 | Loss:2.7527374226599925
Epoch #250 | Loss:2.172589051285906
Epoch #251 | Loss:1.8551387449992556
Epoch #252 | Loss:2.264040088221903
Epoch #253 | Loss:2.186012517365198
Epoch #254 | Loss:1.87166114457336
Epoch #255 | Loss:2.391056459719344
Epoch #256 | Loss:1.9963693753653575
Epoch #257 | Loss:2

Epoch #454 | Loss:1.2815807339262277
Epoch #455 | Loss:1.5307572811457613
Epoch #456 | Loss:1.5026352695411158
Epoch #457 | Loss:1.706661830990324
Epoch #458 | Loss:1.7894061149171892
Epoch #459 | Loss:1.239132509844023
Epoch #460 | Loss:1.8468101851376044
Epoch #461 | Loss:1.2855855588579466
Epoch #462 | Loss:1.4833502573375075
Epoch #463 | Loss:1.7309068955037563
Epoch #464 | Loss:1.5357577504619724
Epoch #465 | Loss:1.7158814287227062
Epoch #466 | Loss:1.3163869057345898
Epoch #467 | Loss:1.8360723153614098
Epoch #468 | Loss:1.3666342127550808
Epoch #469 | Loss:1.5645421119939003
Epoch #470 | Loss:1.27023015695491
Epoch #471 | Loss:1.7673519340787258
Epoch #472 | Loss:1.5725992594150797
Epoch #473 | Loss:1.3142451326089304
Epoch #474 | Loss:1.7691770136592881
Epoch #475 | Loss:1.5006061047249077
Epoch #476 | Loss:1.3616639980395873
Epoch #477 | Loss:2.0149100273699743
Epoch #478 | Loss:1.7023869943021543
Epoch #479 | Loss:1.8607709494759401
Epoch #480 | Loss:1.4375169459135375
Epoch

In [97]:
hidden_size = 256
encoder1 = EncoderRNN(len(en_idx), hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, len(fra_idx), dropout_p=0.1).to(device)

train_iters(encoder1, attn_decoder1, lr=0.01, n_iters=75000, print_every=5000)

Epoch #0 Loss:0.002042321341378348
Epoch #1 Loss:4.4049204833663556
Epoch #2 Loss:3.60808532378745
Epoch #3 Loss:3.00178563995259
Epoch #4 Loss:2.3930053956861737
Epoch #5 Loss:1.9028129336227715
Epoch #6 Loss:1.5236937234779502
Epoch #7 Loss:1.2476111472871252
Epoch #8 Loss:0.9903130174774107
Epoch #9 Loss:0.8452153684947403
Epoch #10 Loss:0.7152414842244694
Epoch #11 Loss:0.5991151942635896
Epoch #12 Loss:0.5270209830537718
Epoch #13 Loss:0.4735609930491003
Epoch #14 Loss:0.4108813001404079


In [98]:
def evaluate(encoder, decoder, sentence, max_length=10):
    
    with torch.no_grad():
        input_tensor = to_tensor(en_to_idx(sentence))
        input_length = input_tensor.size(0)
        encoder_hidden = encoder.init_hidden()
        
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        
        for i in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
            encoder_outputs[i] += encoder_output[0,0]
        
        decoder_input = torch.tensor([[0]], device=device)
        decoder_hidden = encoder_hidden
        
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)
        
        for i in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[i] = decoder_attention.data
            
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == 1:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(idx_fra[topi.item()])
                
            decoder_input = topi.squeeze().detach()
            
        return decoded_words, decoder_attention[:i + 1]
        

In [99]:
def evaluate_randomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(filtered_pairs)
        print('English:', pair[0])
        print('Actual French translation:', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('Model translation:', output_sentence)
        print('')

In [100]:
evaluate_randomly(encoder1, attn_decoder1)

English: she is a second year student.
Actual French translation: c'est une l ve de deuxi me ann e.
Model translation: c'est une l ve de deuxi me ann e. <EOS>

English: you really don't have a clue do you?
Actual French translation: t'as vraiment pas l'ombre d'une id e si ?
Model translation: vous tes vraiment ignare non ? <EOS>

English: i am a japanese.
Actual French translation: je suis japonais.
Model translation: je suis japonais. <EOS>

English: he seems to be rich.
Actual French translation: on dirait qu'il est riche.
Model translation: il semble tre riche. <EOS>

English: she is japanese.
Actual French translation: elle est japonaise.
Model translation: elle est japonaise. <EOS>

English: you are right in a way.
Actual French translation: en un sens vous avez raison.
Model translation: en un sens tu avez raison. <EOS>

English: he is reading.
Actual French translation: il lit.
Model translation: il est en train de lire. <EOS>

English: i am exhausted.
Actual French translation: