# Seq2Seq with attention

Translation German into English

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

import spacy

import random
import math
import time


from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import torch.nn.functional as F

In [10]:
seed = 43

random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [11]:
! python -m spacy download en
! python -m spacy download de


spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [12]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]



SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [13]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),  fields = (SRC, TRG))

In [14]:
labels = ['train', 'validation', 'test']
dataloaders = [train_data, valid_data, test_data]
for d, l in zip(dataloaders, labels):
    print("Number of sentences in {} : {}".format(l, len(d.examples)))


Number of sentences in train : 29000
Number of sentences in validation : 1014
Number of sentences in test : 1000


In [15]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)
print("Number of words in source vocabulary", len(SRC.vocab))
print("Number of words in target vocabulary", len(TRG.vocab))

Number of words in source vocabulary 7855
Number of words in target vocabulary 5893


## Encoder

Encoder is based on biLSTM


In [16]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):

        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
 
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, bidirectional = True)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):

        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        outputs, (hidden, cell) = self.rnn(embedded)
        
        hidden = torch.cat(tuple([hidden[i:i+2] for i in range(0, hidden.shape[0], 2)]), 2)
        cell = torch.cat(tuple([cell[i:i+2] for i in range(0, cell.shape[0], 2)]), 2) 

        return outputs, hidden, cell


## Decoder
Decoder is based on multi-layer LSTM with Attention machanism (both Bahdanau and Luong are implemented)



In [17]:
class Attention(nn.Module):
    def __init__(self, layers, embedd_size, hidden_size, approach = 'Luong', method="concat"):
        super(Attention, self).__init__()

        self.approach = approach
        self.method = method
        self.hidden_size = hidden_size

        if self.approach == 'Bahdanau':

            self.ln1 = nn.Linear(hidden_size, hidden_size)
            self.ln2 = nn.Linear(hidden_size, hidden_size)
            self.ln3 = nn.Linear(embedd_size, hidden_size)
            self.ln4 = nn.Linear(hidden_size, layers)
            self.ln5 = nn.Linear(hidden_size+embedd_size, hidden_size)

        if self.approach == 'Luong':

            if method == "one-layer-net":
                 self.ln = nn.Linear(hidden_size, hidden_size)

            elif method == "concat":
                 self.ln1 = nn.Linear(hidden_size, hidden_size)
                 self.ln2 = nn.Linear(hidden_size, 1)

    def forward(self, embedded, last_hidden, encoder_outputs, seq_len=None):
            
            if self.approach == 'Bahdanau':

                x = torch.tanh(self.ln1(last_hidden[0]) + self.ln2(encoder_outputs) + self.ln3(embedded))

                alignment_scores = self.ln4(x)
                attn_weights = F.softmax(alignment_scores, dim=0)

                context_vector = torch.bmm(encoder_outputs.permute(1,2,0), attn_weights.permute(1,0,2)).permute(2,0,1)
                    
                context_vector = torch.cat((torch.cat(context_vector.shape[0]*[embedded]), context_vector), 2)
                new_hidden = self.ln5(context_vector)

                return new_hidden

            if self.approach == 'Luong':

                if self.method == "one-layer-net":       

                    out = self.ln(last_hidden)
                    alignment_scores = torch.bmm(encoder_outputs.permute(1,0,2), out.permute(1,2,0))               
                    attn_weights = F.softmax(alignment_scores, dim=1)
                    context_vector = torch.bmm(encoder_outputs.permute(1,2,0), attn_weights).view(-1, self.hidden_size)
                    output = torch.cat((last_hidden.view(-1, self.hidden_size), context_vector), 1)

                    return output

                elif self.method == "dot":

                    alignment_scores = torch.bmm(encoder_outputs.permute(1,0,2), last_hidden.permute(1,2,0))               
                    attn_weights = F.softmax(alignment_scores, dim=1)
                    context_vector = torch.bmm(encoder_outputs.permute(1,2,0), attn_weights).view(-1, self.hidden_size)
                    output = torch.cat((last_hidden.view(-1, self.hidden_size), context_vector), 1)

                    return output      

                elif self.method == "concat":

                    alignment_scores = self.ln2(torch.tanh(self.ln1(last_hidden + encoder_outputs)).permute(1,0,2))
                    attn_weights = F.softmax(alignment_scores, dim=1)
                    context_vector = torch.bmm(encoder_outputs.permute(1,2,0), attn_weights).view(-1, self.hidden_size)
                    output = torch.cat((last_hidden.view(-1, self.hidden_size), context_vector), 1)

                    return output     



class DecoderAttn(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, attention, dropout=0.1):
        super(DecoderAttn, self).__init__()
        
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        
        self.attn = attention

        self.embedding = nn.Embedding(self.output_dim, self.emb_dim)
        
        self.rnn = nn.LSTM(self.emb_dim, self.hid_dim, self.n_layers) #(lstm embd, hid, layers, dropout)

        if self.attn.approach == 'Bahdanau':
            self.out = nn.Linear(self.hid_dim, self.output_dim)  # Projection :hid_dim x output_dim

        elif self.attn.approach == 'Luong':
            self.out = nn.Linear(self.hid_dim*2, self.output_dim)

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input_, hidden, cell, encoder_outputs):
        
        if self.attn.approach == 'Bahdanau':

            input_ = input_.unsqueeze(0)
            embedded = self.embedding(input_)
            embedded = self.dropout(embedded)
            hidden_attention = self.attn.forward(embedded, hidden, encoder_outputs)

            output, (hidden, cell) = self.rnn(embedded, (hidden_attention, cell))
            prediction = self.out(output.squeeze(0))
            
            return prediction, hidden, cell

        elif self.attn.approach == 'Luong':

            input_ = input_.unsqueeze(0)
            embedded = self.embedding(input_)
            embedded = self.dropout(embedded)
            
            output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
            output = self.attn.forward(embedded, output, encoder_outputs)
            prediction = self.out(output)
            
            return prediction, hidden, cell

## Seq2Seq itself


In [18]:
BOS_IDX = TRG.vocab.stoi['<sos>']

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self._init_weights() 
        self.max_len=30
    
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):

        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        enc_out, hidden, cell = self.encoder.forward(src)
        input = trg[BOS_IDX] 
        
        for t in range(1, max_len):
            output, hidden, cell = self.decoder(input, hidden, cell, enc_out)
  
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = (trg[t] if teacher_force else top1)
        
        return outputs
    
    def translate(self, src):
        trg_vocab_size = self.decoder.output_dim

        outputs = []
        
        src = torch.as_tensor(src).view(-1,1).to(self.device)

        enc_out, hidden, cell = self.encoder.forward(src) 

        input = torch.as_tensor(BOS_IDX).view(-1).to(self.device)     

        for t in range(1, self.max_len):
            
            output, hidden, cell = self.decoder(input, hidden, cell, enc_out) 
            top1 = output.max(1)[1]
            outputs.append(top1)
            input = (top1)
        
        return outputs
    
    def _init_weights(self):
        p = 0.08
        for name, param in self.named_parameters():
            nn.init.uniform_(param.data, -p, p)
        


In [37]:
input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)
src_embd_dim =  tgt_embd_dim = 256
num_layers_enc =  2
num_layers_dec =  2
dropout_prob = 0.5
hidden_dim_enc = 256
hidden_dim_dec = 512

batch_size = 128
PAD_IDX = TRG.vocab.stoi['<pad>']

iterators = BucketIterator.splits((train_data, valid_data, test_data),
                                  batch_size = batch_size, device = device)
train_iterator, valid_iterator, test_iterator = iterators




encoder = Encoder(input_dim, src_embd_dim, hidden_dim_enc, num_layers_enc, dropout_prob)
attention = Attention(num_layers_dec, tgt_embd_dim, hidden_dim_dec, approach = 'Bahdanau', method="dot")
decoder = DecoderAttn(output_dim, tgt_embd_dim, hidden_dim_dec, num_layers_dec, attention, dropout_prob)


model = Seq2Seq(encoder, decoder, device).to(device)


In [38]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 256, num_layers=2, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): DecoderAttn(
    (attn): Attention(
      (ln1): Linear(in_features=512, out_features=512, bias=True)
      (ln2): Linear(in_features=512, out_features=512, bias=True)
      (ln3): Linear(in_features=256, out_features=512, bias=True)
      (ln4): Linear(in_features=512, out_features=2, bias=True)
      (ln5): Linear(in_features=768, out_features=512, bias=True)
    )
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2)
    (out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [39]:
def train(model, iterator, optimizer, criterion, teacher, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg, teacher)
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [40]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing !!
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)


            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [41]:
EOS_IDX = TRG.vocab.stoi['<eos>']

def translate(sentence):

    model.eval()
    
    with torch.no_grad():
        sentence = sentence.lower()
        sent_vec = [SRC.vocab.stoi[token] for token in sentence.split()]

        translation_idx = model.translate(torch.tensor(sent_vec))
        for t in translation_idx:
            if t[0] != EOS_IDX:
                print(TRG.vocab.itos[t[0]], end=' ')
            else:
                break


# Training and BLEu estimations


In [42]:
max_epochs = 20
CLIP = 1
teacher = 0.2


optimizer = optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.7)


best_valid_loss = float('inf')
for epoch in range(1, max_epochs+1):
    
    
    train_loss = round(train(model, train_iterator, optimizer, criterion, teacher, CLIP), 5)
    valid_loss = round(evaluate(model, valid_iterator, criterion), 5)
    scheduler.step()
    
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    
    print('\nEpoch: {} \nTrain Loss {}  Val loss {}:'.format(epoch, train_loss, valid_loss))
    print('Train Perplexity {}  Val Perplexity {}:'.format(np.exp(train_loss), np.exp(valid_loss)))
    print('Teacher forcing:', teacher)

    translate("Ich habe einen Hund")


Epoch: 1 
Train Loss 5.02642  Val loss 4.57103:
Train Perplexity 152.3864913670454  Val Perplexity 96.643601435321:
Teacher forcing: 0.2
a . 
Epoch: 2 
Train Loss 4.36643  Val loss 4.26193:
Train Perplexity 78.76194903711755  Val Perplexity 70.94677868728083:
Teacher forcing: 0.2
is . 
Epoch: 3 
Train Loss 3.99227  Val loss 3.96867:
Train Perplexity 54.17773333734162  Val Perplexity 52.914108255059034:
Teacher forcing: 0.2
is . . 
Epoch: 4 
Train Loss 3.61246  Val loss 3.63381:
Train Perplexity 37.05710124840456  Val Perplexity 37.85677651795954:
Teacher forcing: 0.2
is . 
Epoch: 5 
Train Loss 3.32383  Val loss 3.41635:
Train Perplexity 27.766492833863992  Val Perplexity 30.458040039214634:
Teacher forcing: 0.2
is . . 
Epoch: 6 
Train Loss 3.07818  Val loss 3.34271:
Train Perplexity 21.718838118249995  Val Perplexity 28.295704254493184:
Teacher forcing: 0.2
is to . . 
Epoch: 7 
Train Loss 2.92735  Val loss 3.18328:
Train Perplexity 18.67806797520208  Val Perplexity 24.125756397553193:

In [43]:
model.load_state_dict(torch.load('model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print('| Test Loss: {} Test PPL:{}|'.format(test_loss, np.exp(test_loss)))

| Test Loss: 2.920739710330963 Test PPL:18.55500771551206|


In [44]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


def translate_2(sentence):
    """
    function that uses .translate() method of the model to translate german sentence into english
    params:
        sentence: tokenized gernam sentence
    """
    model.eval()
    
    res = []
    with torch.no_grad():

        sent_vec = [SRC.vocab.stoi[token] for token in sentence]
        translation_idx = model.translate(torch.tensor(sent_vec))
        for t in translation_idx:
            if t[0] != EOS_IDX:
                res.append(TRG.vocab.itos[t[0]])
            else:
                break
    return res

In [45]:
translations = []
ground_truth = []
german = []

for i in test_data:
  translations.append(translate_2(i.src[1:])[:-1])
  ground_truth.append(i.trg[:-1])
  german.append(i.src[1:])

for i in valid_data:
  translations.append(translate_2(i.src[1:])[:-1])
  ground_truth.append(i.trg[:-1])
  german.append(i.src[1:])

print(len(translations), len(ground_truth), len(german))

2014 2014 2014


In [46]:
smoothie = SmoothingFunction().method5
corpus_bleu([[ref] for ref in ground_truth], translations, smoothing_function=smoothie) * 100

26.199896941988065

In [47]:
for k in range(10):
    i = np.random.randint(0,len(translations))

    print('GERMAN:'," ".join(german[i]))
    print('TRANSLATION:'," ".join(translations[i]))
    print('ENGLISH:'," ".join(ground_truth[i])+'\n')

GERMAN: ipod seinem auf musik hört und bank einer auf sitzt mann ein
TRANSLATION: man sitting on a bench bench listening listening to music
ENGLISH: a man sits on a bench listing to his

GERMAN: freien im megafon einem mit stehen hemden blauen in personen zwei
TRANSLATION: people in blue shirts are standing a a .
ENGLISH: two people in blue shirts are outside with a bullhorn

GERMAN: fangen zu ball den um , laufen footballspieler die
TRANSLATION: players players running the football running running to the ball
ENGLISH: the football players are running to get the ball

GERMAN: gartenschlauch einem mit spielt t-shirt roten einem und schnuller einem mit junge kleiner ein
TRANSLATION: little boy with a and and red shirt shirt shirt plays with a a .
ENGLISH: a toddler boy with a pacifier and a red shirt plays with a garden hose

GERMAN: hintergrund im bäumen mit straße eine gehen menschen aussehende gruftis wie vier
TRANSLATION: five people with people people are walking down a a street
ENG