# Natural Machine Translation using RNNs and Attention 

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F



Preparing the datas
--------------
We are going to import the data base but we also need to process what it contains in order to be able to manipulate:
~~~~~~

-  Every word is a hot vector and we will inditify it as the index of the single one
-  Turning the Unicode to Ascii characters
-  Every word will be lowercased



In [2]:
Start_token = 0
End_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "Start", 1: "End"}
        self.n_words = 2  # Count Start and End

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [3]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

Description of the dataset
--------------
The dataset is text file where we have a huge amount of lines in every lines we may have many expression and their translation so an example would be:

Go.	Va ! 
Run!	Cours !
Run!	Courez !
Wow!	Ça alors !
Fire!	Au feu !
Help!	À l'aide !
Jump.	Saute.
Stop!	Ça suffit !
...

That is not even a full line so we can see that there are pairs of Word->Translated and there space between them so we are using this structutre.
In order to make it more usefull we are going to make it also able to reverse the process and translate from the original language

In [9]:
def readL(orig, trslt, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (orig, trslt) , encoding='utf_8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
               
    input_lang = Lang(orig)
    output_lang = Lang(trslt)

    # Reverse pairs, make Lang instances
    if reverse:
        # We need to change the pairs its always going to be processed this way :Word->Translated   
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(trslt)
        output_lang = Lang(orig)
     
                 
    return input_lang, output_lang, pairs
input_lang, output_lang, pairs= readL('eng', 'fra', True)


Reading lines...


In [11]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs= readL(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    ML=0

    for p in pairs :
        if ML>max(len(p[0]),len(p[1])):
            ML=ML
        else :
            ML=max(len(p[0]),len(p[1]))
        
    print(ML)
    return input_lang, output_lang, pairs,ML


input_lang, output_lang, pairs,ML = prepareData('eng', 'fra', True)
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Counting words...
Counted words:
fra 21334
eng 13043
348
['tu es fort elegant .', 'you re very sophisticated .']


The Seq2Seq Model
--------------
This model has many variaous names one of them is the Encoder/Decoder network because we have two partis the encoder which is the one reading the input sequence outputs a single vector and a hidden state, and the decoder that reads the resulted vector to produce an output sequence.

![seq2seq](https://user-images.githubusercontent.com/45148200/50037065-30378d80-000e-11e9-874d-01e4cca8a272.png)
As we can see on the graphic we have a first RNN(LSTM or GRU) as a decoder and another one as encoder but we can see a little difference between the two, there is no "memory" in the encoder its a flow of information (Output + hidden state)

However there is still an issue how are we going to make this network evolving which means how are we going to apply the back propagation that will modify our weights ?

### Embedding 

Embeddings are dense vector representations of the characters. The rationale behind using it is to convert an arbitrary discrete id, to a continuous representation.

The main advantage is that back-propagation is possible over continuous representations while it is not over discrete representation

In [12]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hdim):
        super(EncoderRNN, self).__init__()
        self.hdim = hdim
        self.embedding = nn.Embedding(input_size, hdim)
        self.gru = nn.GRU(hdim,hdim)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hdim)

Attention is all you need
--------------
Today we cannot talk about good NMT without going through this. 
<img width="1058" alt="seq2seqwthatt" src="https://user-images.githubusercontent.com/45148200/50037100-5ceba500-000e-11e9-9fcb-604bdbf65008.png">


Obviously what we need after a encoder would be a basic decoder, it works but when we think about compressing all the necessary information in vector can be a burden to handle. The decoder will have to cope with long sentences, especially those that are longer than the sentences in the training corpus.
Thus Dzmitry Bahdanau tought about a way to make the decoder filter by importance what he called " Align ", it identifies which parts of the input sequence are relevant to each word in the outputAttention allows the decoder network to “focus” on a different part of the encoder’s outputs for every step of the decoder’s own outputs.
### How does it work?
The Attention layer will receive the hidden state and the inputs, its output will be multiplied by the encoder outputs therefore it will show the decoder where to focus or emphasize on.
Furthemore, in order to handle the length issue we are adding to the Attention Layer a parameter Max_length. This layer return a weight vector for each word.

In [13]:


class AttnDecoderRNN(nn.Module):
    def __init__(self, hdim, output_size, dropout_p=0.1, max_length=ML):
        super(AttnDecoderRNN, self).__init__()
        self.hdim = hdim
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hdim)
        self.attn = nn.Linear(self.hdim * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hdim * 2, self.hdim)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hdim, self.hdim)
        self.out = nn.Linear(self.hdim, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hdim)



Training & Evaluation
--------------
The training is basically this 4 steps:

-  Set a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting


The evaluation is the same but instead of feeding the decoder targets we are feeding its own predictions back to itself

In [14]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(End_token)
    return torch.tensor(indexes, dtype=torch.long,).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [38]:

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=ML):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hdim)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[Start_token]])

    decoder_hidden = encoder_hidden

#Instead of using its own prediction like in classical RNN we use feed our decode the target

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])
        decoder_input = target_tensor[di]


    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

Setting a timer
--------------



In [27]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

Training 
--------------





In [43]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [46]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

Evaluate
--------------




In [45]:
def evaluate(encoder, decoder, sentence, max_length=ML):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hdim)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[Start_token]])  #

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == End_token:
                decoded_words.append('<End>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [41]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
hdim = 256
encoder1 = EncoderRNN(input_lang.n_words, hdim)
attn_decoder1 = AttnDecoderRNN(hdim ,output_lang.n_words, dropout_p=0.1)

trainIters(encoder1, attn_decoder1, 10000, print_every=5000)

37m 25s (- 37m 25s) (5000 50%) 4.5166


In [None]:
evaluateRandomly(encoder1, attn_decoder1)
