##Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation



In [1]:
%%bash
python -m spacy download en
python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List
import torch

In [3]:
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [4]:
# Create source and target language tokenizer. Make sure to install the dependencies.

token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en')

In [5]:
token_transform[TGT_LANGUAGE]('This is my notebook')

['This', 'is', 'my', 'notebook']

In [6]:
# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

In [7]:
#yield_tokens(train_iter, TGT_LANGUAGE)

In [8]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

## Building SRC & TRG language Vocabulary

In [9]:
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  # Training data Iterator 
  train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
  # Create torchtext's Vocab object 
  vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

In [10]:
# Set UNK_IDX as the default index. This index is returned when the token is not found. 
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary. 
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [11]:
vocab_transform[TGT_LANGUAGE](['this', 'is'])

[595, 11]

In [12]:
vocab_transform[SRC_LANGUAGE](['es', 'de'])

[302, 7281]

In [13]:
# Set UNK_IDX as the default index. This index is returned when the token is not found. 
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary. 
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

## Can also Use
```
src_vocab_pipeline = lambda x: vocab_transform[SRC_LANGUAGE](x)
tgt_vocab_pipeline = lambda x: vocab_transform[TGT_LANGUAGE](x)
```

In [14]:
vocab_transform[TGT_LANGUAGE](['this', 'is', 'my', 'game'])

[595, 11, 2228, 138]

In [15]:
# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]), 
                      torch.tensor(token_ids), 
                      torch.tensor([EOS_IDX])))

In [16]:
tensor_transform(vocab_transform[TGT_LANGUAGE](['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '\n']))

tensor([   2,   20,   26,   16, 1170,  809,   18,   58,   85,  337, 1340,    6,
           5,    3])

## Test collate func

In [17]:
tensor_transform(vocab_transform[TGT_LANGUAGE](['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '\n']))

tensor([   2,   20,   26,   16, 1170,  809,   18,   58,   85,  337, 1340,    6,
           5,    3])

In [18]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    tgt_batch, src_batch = [], []
    for _src, _tgt in batch:
      _tgt = token_transform[TGT_LANGUAGE](_tgt)
      _src = token_transform[SRC_LANGUAGE](_src)

      _tgt_tok = vocab_transform[TGT_LANGUAGE](_tgt)
      _src_tok = vocab_transform[SRC_LANGUAGE](_src)

      _tgt_tok_app = tensor_transform(_tgt_tok)
      _src_tok_app = tensor_transform(_src_tok)

      tgt_batch.append(_tgt_tok_app)
      src_batch.append(_src_tok_app)

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)

    return src_batch, tgt_batch

## Test the data loader

In [19]:
from torch.utils.data import DataLoader

BATCH_SIZE = 5

train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

for src, tgt in train_dataloader:
  print(src.shape)
  print(tgt.shape)
  break
  #tgt = tgt.to(device)

torch.Size([18, 5])
torch.Size([18, 5])


In [20]:
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor
import torch

In [21]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        
        self.embedding = nn.Embedding(input_dim, emb_dim) #no dropout as only one layer!
        
        self.rnn = nn.GRU(emb_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded) #no cell state!
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden

In [22]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, context):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #context = [n layers * n directions, batch size, hid dim]
        
        #n layers and n directions in the decoder will both always be 1, therefore:
        #hidden = [1, batch size, hid dim]
        #context = [1, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        emb_con = torch.cat((embedded, context), dim = 2)
            
        #emb_con = [1, batch size, emb dim + hid dim]
            
        output, hidden = self.rnn(emb_con, hidden)
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        
        #seq len, n layers and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        output = torch.cat((embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)), 
                           dim = 1)
        
        #output = [batch size, emb dim + hid dim * 2]
        
        prediction = self.fc_out(output)
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden

In [23]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is the context
        context = self.encoder(src)
        
        #context also used as the initial hidden state of the decoder
        hidden = context
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden state and the context state
            #receive output tensor (predictions) and new hidden state
            output, hidden = self.decoder(input, hidden, context)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1

        return outputs

In [24]:
INPUT_DIM = len(vocab_transform[SRC_LANGUAGE])
OUTPUT_DIM = len(vocab_transform[TGT_LANGUAGE])
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

BATCH_SIZE = 128

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Seq2Seq(enc, dec, device).to(device)

In [25]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(19206, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(10840, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=10840, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [26]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 24,729,688 trainable parameters


In [27]:
optimizer = optim.Adam(model.parameters())

In [28]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [29]:
######################################################################
# Let's define training and evaluation loop that will be called for each 
# epoch.
#

from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    
    for src, tgt in train_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)


        optimizer.zero_grad()
        output = model(src, tgt)
        output = output[1:].view(-1, output.shape[-1])
        tgt = tgt[1:].view(-1)
        loss = criterion(output, tgt)
        loss.backward()
        clip = 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        output = model(src, tgt)
        output = output[1:].view(-1, output.shape[-1])
        tgt = tgt[1:].view(-1)
        loss = criterion(output, tgt)
        losses += loss.item()

    return losses / len(val_dataloader)


In [30]:
from timeit import default_timer as timer
NUM_EPOCHS = 15

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, optimizer)
    end_time = timer()
    val_loss = evaluate(model)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


Epoch: 1, Train loss: 5.006, Val loss: 4.427, Epoch time = 65.833s
Epoch: 2, Train loss: 4.219, Val loss: 4.282, Epoch time = 65.519s
Epoch: 3, Train loss: 3.915, Val loss: 3.913, Epoch time = 65.578s
Epoch: 4, Train loss: 3.643, Val loss: 3.584, Epoch time = 65.683s
Epoch: 5, Train loss: 3.300, Val loss: 3.470, Epoch time = 65.688s
Epoch: 6, Train loss: 3.033, Val loss: 3.258, Epoch time = 65.692s
Epoch: 7, Train loss: 2.749, Val loss: 3.258, Epoch time = 65.665s
Epoch: 8, Train loss: 2.528, Val loss: 3.177, Epoch time = 65.551s
Epoch: 9, Train loss: 2.294, Val loss: 3.159, Epoch time = 65.658s
Epoch: 10, Train loss: 2.103, Val loss: 3.074, Epoch time = 65.681s
Epoch: 11, Train loss: 1.978, Val loss: 3.075, Epoch time = 65.679s
Epoch: 12, Train loss: 1.827, Val loss: 3.135, Epoch time = 65.701s
Epoch: 13, Train loss: 1.700, Val loss: 3.108, Epoch time = 65.677s
Epoch: 14, Train loss: 1.611, Val loss: 3.020, Epoch time = 65.612s
Epoch: 15, Train loss: 1.509, Val loss: 3.148, Epoch time

### Making Inference

In [31]:
# take a list and create a subset from until '<eos>'
def create_sentence(l):
  len_sentence = 0

  for i in range(len(l)):
    if l[i] == '<eos>':
      break

    len_sentence+=1

  sentence = ' '.join(l[:len_sentence])

  return sentence



In [32]:
model.eval()
losses = 0

val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

for src, tgt in val_dataloader:
  src = src.to(device)
  tgt = tgt.to(device)

  output = model(src, tgt)
  output = output[1:]
  tgt = tgt[1:]

  v = output.argmax(2)

  for k in range(10):
    j = random.randint(0,BATCH_SIZE-1)

    result = vocab_transform[TGT_LANGUAGE].lookup_tokens(v[:,j].tolist())
    sourcetxt = vocab_transform[SRC_LANGUAGE].lookup_tokens(src[1:,j].tolist())
    acttgt = vocab_transform[TGT_LANGUAGE].lookup_tokens(tgt[:,j].tolist())

    print('Input Text: ', create_sentence(sourcetxt).rstrip('\n'))
    print('Predicted Text  : ', create_sentence(result).rstrip('\n'))
    print('Actual Output Text: ', create_sentence(acttgt).rstrip('\n'))
    print('*'*40)

  break

Input Text:  Baby sieht sich die Blätter am Zweig eines Baumes an . 
Predicted Text  :  Baby looking at the top at a tree . 
Actual Output Text:  Baby looking at the leaves on a branch of a tree . 
****************************************
Input Text:  Eine Person in einem roten <unk> liegt auf sehr <unk> Art auf einer Mauer vor einem Laternenpfahl . 
Predicted Text  :  A person in a red - is lying on a a on a a of of a of a . . . a . 
Actual Output Text:  A person wearing a red long - sleeved shirt is lying down on a wall in front of a lamp post in a very unusual manner . 
****************************************
Input Text:  Bauarbeiter stehen auf einer Maschine 
Predicted Text  :  Construction workers standing on a of a machine . 
Actual Output Text:  Construction workers standing on top of a piece of machinery . 
****************************************
Input Text:  Ein asiatisches Mädchen mit grünem Hut und Schürze serviert Getränke auf einem Tablett . 
Predicted Text  :  An older 