### Imports

In [55]:
import unicodedata
import numpy as np
import torch
from torch import nn
import re
import os

try:
    from tensorboardX import SummaryWriter
except ModuleNotFoundError:
    print("TensorboardX not available")
    pass

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'device = {device}')

device = cpu


## Translating text

How do you do this? There are many difficulties with different sentence lengths, different grammar or contextual information. In this notebook we will cover how to do this using sequence to sequence learning.

![](img/hello-lead.png)

## Sequence to sequence learning
We will use pytorch to translate short sentences from French to English and vice versa

In [35]:
# download the needed data
if not os.path.isfile('data.zip'):
    ! curl -o data.zip https://download.pytorch.org/tutorial/data.zip && unzip data.zip 

In [36]:
# Take a quick view of the data.
with open('data/eng-fra.txt') as f:
    f.seek(1000)
    print(f.read(200))

 de question !
Really?	Vraiment ?
Really?	Vrai ?
Really?	Ah bon ?
Thanks.	Merci !
We try.	On essaye.
We won.	Nous avons gagné.
We won.	Nous gagnâmes.
We won.	Nous l'avons emporté.
We won.	Nous l'empor


# Preparing the data I

* Create a Language class that maps indexes to words and words to indexes

**indexes to word**
```python
{0: SOS,
 1: EOS,
 2: The
 ...
 n: World
}
```

**words to indexes**
```python
{SOS: 0,
 EOS: 1,
 The: 2
 ...
 World: n
}
```

* Implement functions to convert the letters to ASCII and remove rare letters. (á, ò, ê -> a, o, e)

In [130]:
class Language:
    """ Utility class that serves as a language dictionary """
    def __init__(self, name):
        self.name = name
        # Count how often a word occurs in the language data.
        self.word2count = {}
        # Words are mapped to indices and vice versa
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2index = {v:k for k, v in self.index2word.items()}
        # Total word count
        self.n_words = 2  # Count SOS and EOS

    def add_sentence(self, sentence):
        """ Process words in a sentence string. """
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        """ Process a word (e.g. put it in vocabulary and count) """
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        elif word != 'SOS' and word != 'EOS':
            self.word2count[word] += 1
    
    def translate_indexes(self, idx):
        """ Takes in a vector of indices and returns the sentence. """
        return [self.index2word[i] for i in idx]
    
    def translate_words(self, words):
        """ Takes in a vector of indices and returns the sentence. """
        return [self.word2index[w] for w in words.split(' ')]
    
    
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicode2ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode2ascii(s.lower().strip())
    s = re.sub(r"\s?[.!?]", r" EOS", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def read_langs(lang1, lang2):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')
    
    # Split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    input_lang = Language(lang1)
    output_lang = Language(lang2)

    return input_lang, output_lang, pairs

# Preparing the data II
Since there are a lot of example sentences and we want to train something quickly, we'll trim the data set to only relatively short and simple sentences. 
Here the maximum length is 10 words (that includes ending punctuation) and we're filtering to sentences that translate to the form "I am" or "He is" etc. 
(accounting for apostrophes replaced earlier).


In [131]:
def filter_pairs(pairs):
    MAX_LENGTH = 10
    
    eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
    )
    
    def filter_pair(p):
        return len(p[0].split(' ')) < MAX_LENGTH and \
            len(p[1].split(' ')) < MAX_LENGTH \
            and p[0].startswith(eng_prefixes)
    return [pair for pair in pairs if filter_pair(pair)]

# Preparing the data III

Read the data from the text files, normalize the sentences, create the Language instances from the Language class and wrap the two languages in a Data class so we can shuffle the sentences and query them later.

In [166]:
np.random.seed(42)

class Data:
    def __init__(self, pairs, lang_1, lang_2):
        self.pairs = np.array(pairs)        
        np.random.shuffle(self.pairs)
        idx_1 = [[lang_1.word2index[word] for word in s.split(' ')] 
                               for s in self.pairs[:, 0]]
        idx_2 = [[lang_2.word2index[word] for word in s.split(' ')]
                               for s in self.pairs[:, 1]]
        self.idx_pairs = np.array(list(zip(idx_1, idx_2)))
        self.shuffle_idx = np.arange(len(pairs))
                
    def __str__(self):
        return(self.pairs)
    
    def shuffle(self):
        np.random.shuffle(self.shuffle_idx)
        self.pairs = self.pairs[self.shuffle_idx]
        self.idx_pairs = self.idx_pairs[self.shuffle_idx]      
    
def prepare_data(lang1, lang2):
    # read_langs initialized the Language objects (still empty) and returns the pair sentences.
    input_lang, output_lang, pairs = read_langs(lang1, lang2)
    print(f"Read {len(pairs)} sentence pairs")
    
    # Reduce data. We haven't got all day to train a model.
    pairs = filter_pairs(pairs) 
    print(f"Trimmed to {len(pairs)} sentence pairs")
    print("Counting words...")
    
    # Process the language pairs.
    for pair in pairs:
        input_lang.add_sentence(pair[0])
        output_lang.add_sentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, Data(pairs, input_lang, output_lang)


eng, fra, data = prepare_data('eng', 'fra')
print(f"First data pair: {data.pairs[0]}")

Reading lines...
Read 135842 sentence pairs
Trimmed to 10853 sentence pairs
Counting words...
Counted words:
eng 2922
fra 4486
First data pair: ['you re jealous EOS' 'tu es jalouse EOS']


# Sequence to sequence model overview

![](img/seq2seq.png)

## The Encoder

The encoder of a seq2seq network is a RNN that outputs some value for every word from the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state for the next input word. Every output could be seen as the context of the sentence up to that point.

<img src="img/training_seq2seq_many2may.svg" alt="drawing" style="height:300px;float: left;"/>

![](img/encoder-network.png)

In [141]:
class Encoder(nn.Module):
    def __init__(self, n_words, embedding_size, hidden_size, device=device):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        # The word embeddings will also be trained
        # To freeze them --> m.embedding.weight.requires_grad = False
        self.embedding = nn.Embedding(n_words, embedding_size)  
        self.rnn = nn.GRU(embedding_size, hidden_size)
        
        self.device = device
        if device == 'cuda':
            self.cuda()
                    
    def forward(self, x):
        print(f'(raw) x = {x.shape}')
        print(f'embedded x = {self.embedding(x).shape}')

        # shape (seq_length, batch_size, input_size)
        dense_vector = self.embedding(x).view(x.shape[0], 1, -1)
        print(f'dense_vector = {dense_vector.shape}')
        
        # init hidden layer at beginning of sequence --> SOS
        h = torch.zeros(1, 1, self.hidden_size, device=self.device)
        print(f'h init = {h.shape}')
        
        x, h = self.rnn(dense_vector, h)
        print(f'h = {h.shape}, x = {x.shape}')

        return x, h
        

m = Encoder(n_words=eng.n_words, 
            embedding_size=10, 
            hidden_size=2, 
            device=device)

eng_sentence = data.pairs[0][0]
print(f'Test sentence: {eng_sentence}')
sentence = torch.tensor(eng.translate_words(eng_sentence), device=device)
print(f'Test tensor: {sentence}')
enc_out, enc_hidden = m(sentence)
print(f'output shape: {enc_out.shape}')

Test sentence: we are even EOS
Test tensor: tensor([ 75, 123, 125,   1])
(raw) x = torch.Size([4])
embedded x = torch.Size([4, 10])
dense_vector = torch.Size([4, 1, 10])
h init = torch.Size([1, 1, 2])
h = torch.Size([1, 1, 2]), x = torch.Size([4, 1, 2])
output shape: torch.Size([4, 1, 2])


# Simple Decoder

In the simplest seq2seq decoder we use only last output of the encoder. This last output is sometimes called the context vector as it encodes context from the entire sequence. This context vector is used as the initial hidden state of the decoder.

At every step of decoding, the decoder is given an input token and hidden state. The initial input token is the start-of-string <SOS> token, and the first hidden state is the context vector (the encoder’s last hidden state).
    
![](img/decoder-network-adapted.png)
    

The power of this model lies in the fact that it can map sequences of different lengths to each other. As you can see the inputs and outputs are not correlated and their lengths can differ. This opens a whole new range of problems which can now be solved using such architecture.    
    
<img src="img/unfolded-encoder-decoder.png" alt="drawing" style="width:500px;float: left;"/>

In [150]:
class Decoder(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, device=device):
        super(Decoder, self).__init__()
        self.decoder = 'simple'
        self.hidden_size = hidden_size
        # Lookup table for the last word activation.
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size)
        self.out = nn.Sequential(
            nn.Linear(hidden_size, output_size),
            nn.LogSoftmax(dim=2)
        )
        self.device = device
        if device == 'cuda':
            self.cuda()
            
    def forward(self, word, h):
        """ Forward pass of the NN
        
        Parameters
        ----------
        word : torch.tensor
            Last word or start of sentence token.
        h : torch.tensor
            Hidden state or context tensor.
        """
        # Map from shape (seq_len, embedding_size) to (seq_len, batch, embedding_size)
        # Note: seq_len is the number of words in the sentence
        word_embedding = self.embedding(word).view(1, 1, -1)
        x, h = self.rnn(word_embedding, h)

        return self.out(x), h
    
m = Decoder(embedding_size=10, 
            hidden_size=20, 
            output_size=eng.n_words,
            device=device)
m.train(False)
out, hidden = m(torch.tensor([1]) ,torch.zeros(1, 1, 20))
out.size(), hidden.size()

(torch.Size([1, 1, 2922]), torch.Size([1, 1, 20]))

## What is wrong with the simple decoder?

![](img/seq2seq.png)
![](img/vanishing_context.png)

## Solution: Attention
![](img/attention-decoder-network-adapted.png)

In [157]:
class AttentionDecoder(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, dropout=0.1, max_length=10, device=device):
        super(AttentionDecoder, self).__init__()
        self.decoder = 'attention'
        self.max_length = max_length
        self.device = device
        self.embedding = nn.Sequential(
            nn.Embedding(output_size, embedding_size),
        )
        
        # Seperate neural network to learn the attention weights
        self.attention_weights = nn.Sequential(
            nn.Linear(embedding_size + hidden_size, max_length),
            nn.Softmax(2)
        )
        self.attention_combine = nn.Sequential(
            nn.Linear(hidden_size + embedding_size, hidden_size),
            nn.ReLU()
        )
        self.rnn = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Sequential(
            nn.Linear(hidden_size, output_size),
            nn.LogSoftmax(2)
        )
        
        if device == 'cuda':
            self.cuda()
        
    def forward(self, word, h, encoder_outputs):
        """
        :param word: (LongTensor) The word indices. This is the last activated word or 
        :param h: (tensor) The hidden state from the previous step. In the first step, the hidden state of the encoder.
        :param encoder_outputs: (tensor) Zero padded (max_length, shape, shape) outputs from the encoder.
        """
        # map from shape (seq_len, embedding_size) to (seq_len, batch, embedding_size) 
        # Note: seq length is the number of words in the sentence
        print("word", word)
        word_embedding = self.embedding(word).view(1, 1, -1)
        print("word embedding", word_embedding.shape)
        # Concatenate the word embedding and the last hidden state, so that attention weights can be determined.
        x = torch.cat([word_embedding, h], dim=2)
        
        # attention applied
        attention_weights = self.attention_weights(x)
        print("before bmm", x.shape)
        print("attention_weights", attention_weights.shape)
        
        x = torch.bmm(attention_weights, encoder_outputs.unsqueeze(0))  # could also be done with matmul
        print("after bmm", x.shape)
   
        # attention combined
        x = torch.cat((word_embedding, x), 2)
        x = self.attention_combine(x)
        
        x, h = self.rnn(x, h)
        x = self.out(x)

        return x, h
    
embedding_size = 256
hidden_size    = 256
max_length     = 10

m        = Encoder(eng.n_words, embedding_size, hidden_size, device=device)
sentence = torch.tensor([1, 23, 9], device=device)
out, h   = m(sentence)
print("out.shape:", out.shape)

# in case sentence is shorter than max_length, pad with zeros
encoder_outputs = torch.zeros(max_length, out.shape[-1], device=device)
encoder_outputs[:out.shape[0], :out.shape[-1]] = out.view(out.shape[0], -1)
print(f'encoder_outputs.shape: {encoder_outputs.shape}')

m = AttentionDecoder(embedding_size, hidden_size, output_size=2, device=device)
m(torch.tensor([1], device=device), h, encoder_outputs)[0].shape

(raw) x = torch.Size([3])
embedded x = torch.Size([3, 256])
dense_vector = torch.Size([3, 1, 256])
h init = torch.Size([1, 1, 256])
h = torch.Size([1, 1, 256]), x = torch.Size([3, 1, 256])
out.shape: torch.Size([3, 1, 256])
encoder_outputs.shape: torch.Size([10, 256])
word tensor([1])
word embedding torch.Size([1, 1, 256])
before bmm torch.Size([1, 1, 512])
attention_weights torch.Size([1, 1, 10])
after bmm torch.Size([1, 1, 256])


torch.Size([1, 1, 2])

## Utility function to run the decoder & calculate the loss

In [10]:
def run_decoder(decoder, criterion, sentence, h, teacher_forcing=False, encoder_outputs=None):
    loss = 0
    word = torch.tensor([0], device=device) # <SOS>
    for j in range(sentence.shape[0]):
        if decoder.decoder == 'attention':
            x, h = decoder(word, h, encoder_outputs)
        else:
            x, h = decoder(word, h)

        loss += criterion(x.view(1, -1), sentence[j].view(-1))
        if teacher_forcing:
            word = sentence[j]
        else:
            word = x.argmax().detach()
        if word.item() == 1: # <EOS>
            break
    return loss

## Training the model

In [11]:
epochs                = 10
teacher_forcing_ratio = 0.5
embedding_size        = 100
context_vector_size   = 256

encoder = Encoder(n_words=eng.n_words, 
                  embedding_size=embedding_size, 
                  hidden_size=context_vector_size)
decoder = AttentionDecoder(embedding_size=embedding_size, 
                           hidden_size=context_vector_size, 
                           output_size=fra.n_words)

if 'SummaryWriter' in globals():
    writer = SummaryWriter('tb/train-3')

In [None]:
# Zelf bouwen
def train(encoder, decoder):
    # Criterion
    criterion = nn.NLLLoss()
    
    # Optimizers
    optim_encoder = torch.optim.SGD(encoder.parameters(), lr=0.01)
    optim_decoder = torch.optim.SGD(decoder.parameters(), lr=0.01)  
    
    # Models
    encoder.train(True)
    decoder.train(True)

    # Train loop
    for epoch in range(epochs):
        data.shuffle()
        for i in range(data.pairs.shape[0]):
            optim_decoder.zero_grad()
            optim_encoder.zero_grad()
            
            pair = data.idx_pairs[i]
            eng_sentence = torch.tensor(pair[0], device=device)
            fra_sentence = torch.tensor(pair[1], device=device)

            # Encode the input language
            out, h = encoder(eng_sentence)        
            
            # pad encoder outputs with zeros
            encoder_outputs = torch.zeros(max_length, out.shape[-1], device=device)
            if decoder.decoder == 'attention':
                encoder_outputs[:out.shape[0], :out.shape[-1]] = out.view(out.shape[0], -1) # remove batch dim
            
            # implement teacher_forcing
            teacher_forcing = np.random.rand() < teacher_forcing_ratio
            loss = run_decoder(decoder, criterion, fra_sentence, h, teacher_forcing, encoder_outputs)
            loss.backward()
            
            if 'SummaryWriter' in globals():
                writer.add_scalar('loss', loss.cpu().item() / (len(fra_sentence)))

            optim_decoder.step()
            optim_encoder.step()

        print(f'epoch {epoch}')

train(encoder, decoder)

## Or load a pretrained model

In [None]:
# encoder = Encoder(eng.n_words, embedding_size, context_vector_size)
# encoder.load_state_dict(torch.load('models/encoder_10_epochs.pt', map_location=device))

# decoder = AttentionDecoder(embedding_size, context_vector_size, fra.n_words)
# decoder.load_state_dict(torch.load('models/decoder_10_epochs.pt', map_location=device))

## Start translating some sentences from English to French

In [None]:
def translate(start, end):
    for i in range(start, end):
        pair = data.idx_pairs[i]
        eng_sentence = torch.tensor(pair[0], device=device)
        fra_sentence = torch.tensor(pair[1], device=device)

        print('English sentence:\t', ' '.join([eng.index2word[i.item()] for i in eng_sentence[:-1]]))
        print('French sentence:\t', ' '.join([fra.index2word[i.item()] for i in fra_sentence[:-1]]))

        # Encode the input language
        out, h = encoder(eng_sentence)        
        encoder_outputs = torch.zeros(max_length, out.shape[-1], device=device)
        encoder_outputs[:out.shape[0], :out.shape[-1]] = out.view(out.shape[0], -1)
        
        word = torch.tensor([0], device=device) # <SOS>
  
        translation = []
        for j in range(eng_sentence.shape[0]):
            x, h = decoder(word, h, encoder_outputs=encoder_outputs)
  
            word = x.argmax().detach()
            translation.append(word.cpu().data.tolist())

            if word.item() == 1: # <EOS>
                break
        print('\nModel translation:\t', ' '.join([fra.index2word[i] for i in translation][:-1]), '\n' + '-'*50)
        
translate(0, 60)