# 2-Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import time

In [2]:
# Fix every source of randomness up front so repeated runs are comparable.
SEED = 1234

random.seed(SEED)                          # Python RNG (drives the teacher-forcing coin flips)
torch.manual_seed(SEED)                    # PyTorch RNG (weight init, dropout)
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic kernels

In [3]:
import os

# Restrict this process to a single GPU. These variables are read when CUDA is
# first initialised, so they are set before any tensor is moved to the GPU.
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',  # number GPUs by PCI bus position
    'CUDA_VISIBLE_DEVICES': '0',        # expose only the first GPU
})

# Cap the number of intra-op CPU threads PyTorch may use.
torch.set_num_threads(4)

In [4]:
# Load the German and English spaCy pipelines used for tokenization.
# The full package names are used instead of the 'de' / 'en' shortcut links:
# shortcuts were removed in spaCy 3, while package names load on 2.x and 3.x.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [5]:
def tokenize_de(text):
    """Tokenize a German string and return its tokens as a list of strings."""
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """Tokenize an English string and return its tokens as a list of strings."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [6]:
# Source (German) field: spaCy tokens, lower-cased, wrapped in <sos> ... <eos>.
SRC = Field(
    tokenize=tokenize_de,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

# Target (English) field: same preprocessing with the English tokenizer.
TRG = Field(
    tokenize=tokenize_en,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

In [7]:
# Multi30k splits; the '.de' files are processed by SRC and the '.en' files by TRG.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

In [8]:
# Sanity check: show the tokenized 'src' / 'trg' pair of the first training example.
print(vars(train_data.examples[0]))

{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [9]:
# Build both vocabularies from the training split only, so that no information
# from the validation/test data leaks in. min_freq=2 keeps only tokens that
# occur at least twice in the training data.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
BATCH_SIZE = 128

# One bucketing iterator per split; batches are placed on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)


In [12]:
class Encoder(nn.Module):
    """Single-layer GRU encoder that summarises a source sequence as one vector.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Dimensionality of the token embeddings.
        hid_dim: Dimensionality of the GRU hidden state.
        dropout: Dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim)
        # `self.dropout` is the dropout *module*; the probability itself is
        # available as `self.dropout.p`.
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: Tensor of token indices, shape [src_len, batch_size]
                (the GRU is not batch-first).

        Returns:
            The final GRU hidden state, shape [1, batch_size, hid_dim],
            which serves as the context vector for the decoder.
        """
        embedded = self.dropout(self.embedding(src))
        # The per-step outputs are not needed, only the final hidden state.
        _, hidden = self.rnn(embedded)
        return hidden

## Decoder

The decoder is where this implementation differs significantly from the previous model: it alleviates some of the information compression.
Instead of the decoder GRU taking only the current target token $y_t$ and the previous hidden state $s_{t-1}$ as inputs, it also takes the context vector $z$ at every time step. The same $z$ is additionally passed to the output linear layer, together with the token embedding and the new hidden state.

In [13]:
class Decoder(nn.Module):
    """Single-step GRU decoder that sees the encoder context at every step.

    The context vector is concatenated with the token embedding as GRU input,
    and again with the embedding and the new hidden state as input to the
    output projection.

    Args:
        output_dim: Size of the target vocabulary (number of output logits).
        emb_dim: Dimensionality of the token embeddings.
        hid_dim: Dimensionality of the GRU hidden state (and of the context).
        dropout: Dropout probability applied to the embedded token.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # GRU input = [embedding ; context]
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)
        # Projection input = [embedding ; hidden ; context]
        self.out = nn.Linear(emb_dim + hid_dim * 2, output_dim)
        # `self.dropout` is the dropout *module*; the probability itself is
        # available as `self.dropout.p`.
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        """Decode one time step for a whole batch.

        Args:
            input: Current target token indices, shape [batch_size].
            hidden: Previous decoder hidden state, shape [1, batch_size, hid_dim].
            context: Encoder context vector, shape [1, batch_size, hid_dim].

        Returns:
            A tuple ``(prediction, hidden)`` where ``prediction`` holds the
            logits over the target vocabulary, shape [batch_size, output_dim],
            and ``hidden`` is the updated state, shape [1, batch_size, hid_dim].
        """
        # Add a sequence dimension of length 1: [batch] -> [1, batch].
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))

        emb_con = torch.cat((embedded, context), dim=2)
        # With a single step the GRU output equals the new hidden state,
        # so only the hidden state is kept.
        _, hidden = self.rnn(emb_con, hidden)

        # Drop the length-1 leading dimension and join along the feature axis.
        features = torch.cat(
            (embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)),
            dim=1,
        )
        prediction = self.out(features)

        return prediction, hidden

In [14]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module mapping ``src`` to a context vector; must expose ``hid_dim``.
        decoder: Module called as ``decoder(input, hidden, context)`` returning
            ``(logits, hidden)``; must expose ``hid_dim`` and ``output_dim``.
        device: Device on which the output buffer is allocated.

    Raises:
        AssertionError: If encoder and decoder hidden sizes differ.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        # The encoder's final state initialises the decoder, so sizes must match.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimension of encoder and decoder must be equal!"
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ration=0.5):
        """Run the model over one batch.

        Args:
            src: Source token indices, passed to the encoder unchanged.
            trg: Target token indices, shape [trg_len, batch_size]; row 0 is
                used as the first decoder input.
            teacher_forcing_ration: Probability, per time step, of feeding the
                ground-truth token instead of the model's own prediction as the
                next decoder input. (The spelling of this keyword is kept for
                backward compatibility with existing callers.)

        Returns:
            Tensor of logits, shape [trg_len, batch_size, output_dim].
            Position 0 is never written and stays all zeros.
        """
        max_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate directly on the target device instead of CPU-then-copy.
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size,
                              device=self.device)

        # The context vector is both the initial decoder state and an extra
        # input to every decoding step.
        context = self.encoder(src)
        hidden = context

        dec_input = trg[0]

        for t in range(1, max_len):
            output, hidden = self.decoder(dec_input, hidden, context)
            outputs[t] = output

            # One coin flip per step decides between ground truth and prediction.
            teacher_force = random.random() < teacher_forcing_ration
            top1 = output.argmax(1)
            dec_input = trg[t] if teacher_force else top1

        return outputs

In [16]:
# Model hyper-parameters. Vocabulary sizes come from the fields built above.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

# `device` is the one already selected for the data iterators; reusing it
# (rather than recomputing it here) guarantees model and batches agree.
model = Seq2Seq(enc, dec, device).to(device)

In [17]:
def init_weights(m):
    """Re-initialise, in place, every parameter of ``m`` from N(0, 0.01^2).

    The parameter iteration is recursive, so parameters of nested sub-modules
    are included. When this function is passed to ``Module.apply`` (which
    itself visits every sub-module), nested parameters are therefore redrawn
    once per enclosing module; the final values still follow the same
    distribution.
    """
    for weight in m.parameters():
        nn.init.normal_(weight.data, mean=0, std=0.01)
# Module.apply calls init_weights on every sub-module and on the model itself,
# then returns the model, whose repr is shown as this cell's output.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(768, 512)
    (out): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of scalar entries in the trainable parameters of ``model``.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size; the ':,' format spec inserts thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')