# **Implement Seq2Seq from scratch**

---
Model: Seq2Seq Bi-GRU without Attention

Dataset: Hugging Face's `mt_eng_vietnamese` (configuration `iwslt2015-vi-en`)


### Create model

In [None]:
import torch.nn as nn
import torch
import random

In [None]:
class RNNEncoder(nn.Module):
    """GRU encoder (bidirectional by default) that summarises a source batch.

    The encoder embeds the source token ids, applies dropout, runs them
    through a multi-layer GRU and returns only the final hidden states.
    When the GRU is bidirectional, each direction gets ``hidden_size // 2``
    units and the two directions are concatenated per layer.

    Args:
        source_vocab_size: Number of rows in the embedding table.
        emb_size: Embedding dimension.
        hidden_size: Width of the hidden state returned per layer (split
            across the two directions when ``bidirectional`` is true).
        num_layers: Number of stacked GRU layers.
        dropout_ratio: Dropout probability used on the embeddings and
            between GRU layers.
        bidirectional: Whether the GRU reads the sequence in both directions.
    """

    def __init__(self, source_vocab_size, emb_size=300, hidden_size=1024, num_layers=2, dropout_ratio=0.2, bidirectional=True):
        super().__init__()
        self.src_vocab_size = source_vocab_size
        if bidirectional:
            # Each direction gets half the units so the concatenation in
            # forward() is (for an even hidden_size) hidden_size wide again.
            self.hidden_size = hidden_size // 2
        else:
            self.hidden_size = hidden_size
        self.n_layers = num_layers
        self.n_directions = 2 if bidirectional else 1

        self.dropout = nn.Dropout(p=dropout_ratio)
        self.embedding = nn.Embedding(num_embeddings=source_vocab_size, embedding_dim=emb_size)
        self.gru = nn.GRU(
            input_size=emb_size,
            hidden_size=self.hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout_ratio,
        )

    def forward(self, inputs):
        """Encode a batch and return the final hidden state of every layer.

        Args:
            inputs: Token ids, shape [max_input_length, bs].

        Returns:
            Tensor of shape [n_layers, bs, n_directions * self.hidden_size].
        """
        embedded = self.dropout(self.embedding(inputs))
        # The per-step outputs are not used; only the final states matter.
        _, final_states = self.gru(embedded)

        if self.n_directions != 2:
            return final_states

        # Separate the direction axis from the layer axis, then glue the
        # forward and backward states of each layer side by side.
        per_layer = final_states.view(self.n_layers, self.n_directions, -1, self.hidden_size)
        forward_states = per_layer[:, 0, :, :]
        backward_states = per_layer[:, 1, :, :]
        return torch.cat((forward_states, backward_states), dim=2)

    def load_pretrained_embedding(self):
        """Placeholder: loading pretrained embeddings is not implemented."""
        pass

In [None]:
class RNNDecoder(nn.Module):
    """Unidirectional GRU decoder that predicts one target token per call.

    Args:
        target_vocab_size: Size of the target vocabulary; also the width of
            the prediction returned by ``forward``.
        emb_size: Embedding dimension.
        hidden_size: GRU hidden size.
        num_layers: Number of stacked GRU layers.
        dropout_ratio: Dropout probability used on the embeddings and
            between GRU layers.
    """

    def __init__(self, target_vocab_size, emb_size=300, hidden_size=1024, num_layers=2, dropout_ratio=0.2):
        super().__init__()
        self.trg_vocab_size = target_vocab_size
        self.hidden_size = hidden_size
        self.n_layers = num_layers

        self.dropout = nn.Dropout(p=dropout_ratio)
        self.embedding = nn.Embedding(target_vocab_size, emb_size)
        self.gru = nn.GRU(emb_size, hidden_size, num_layers, dropout=dropout_ratio)
        self.fc = nn.Linear(hidden_size, target_vocab_size)

    def forward(self, input, hidden):
        """Run a single decoding step.

        Args:
            input: Token ids for the current step, one per batch element.
            hidden: Hidden state passed to the GRU as its initial state.

        Returns:
            ``(pred, hid)``: ``pred`` has shape [bs, target_vocab_size] and
            ``hid`` is the GRU's updated hidden state.
        """
        # Add a leading length-1 time axis: [bs] -> [1, bs].
        step_tokens = input.unsqueeze(0)
        embedded = self.embedding(step_tokens)
        out, hid = self.gru(self.dropout(embedded), hidden)

        # Drop the time axis again before projecting onto the vocabulary.
        return self.fc(out.squeeze(0)), hid

    def load_pretrained_embedding(self):
        """Placeholder: loading pretrained embeddings is not implemented."""
        pass

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module whose call returns the decoder's initial hidden state
            and which exposes ``n_directions``.
        decoder: Module called as ``decoder(tokens, hidden)`` returning
            ``(pred, hidden)`` and exposing ``trg_vocab_size``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.bidirectional_encoder = encoder.n_directions == 2

    def forward(self, src, trg, teacher_forcing_ratio=0):
        """Decode ``trg.shape[0] - 1`` steps and collect the predictions.

        Args:
            src: Source token ids, shape [max_input_length, bs].
            trg: Target token ids, shape [max_output_length, bs]. Row 0 is
                used as the first decoder input.
            teacher_forcing_ratio: Probability, drawn independently at each
                step, of feeding the ground-truth token ``trg[t]`` as the next
                input instead of the decoder's own argmax prediction.

        Returns:
            Tensor of shape [max_output_length, bs, trg_vocab_size], on the
            same device as ``trg``. Row 0 is never written and stays zero.
        """
        max_output_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.trg_vocab_size

        # Allocate the output buffer directly on the device of the inputs
        # instead of relying on a notebook-level DEVICE global.
        preds = torch.zeros(max_output_len, batch_size, trg_vocab_size, device=trg.device)

        # Last hidden state of the encoder is the decoder's initial hidden state.
        hidden = self.encoder(src)

        # First input to the decoder is the first row of the target.
        step_input = trg[0]

        for t in range(1, max_output_len):
            pred, hidden = self.decoder(step_input, hidden)
            preds[t] = pred
            teacher_force = random.random() < teacher_forcing_ratio
            step_input = trg[t] if teacher_force else pred.argmax(1)

        return preds

### Prepare data

Load dataset

In [None]:
# Install the Hugging Face `datasets` package. The leading `!` is an IPython
# shell escape, so this line only works when run inside a notebook.
!pip install datasets
from datasets import load_dataset
# Load the 'iwslt2015-vi-en' configuration of 'mt_eng_vietnamese'. Later cells
# read its 'train', 'validation' and 'test' splits and the 'en'/'vi' entries
# of each example's 'translation' field.
hf_dataset = load_dataset('mt_eng_vietnamese', 'iwslt2015-vi-en')



Reusing dataset mt_eng_vietnamese (/root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-vi-en/1.0.0/87223258c122f5f4a9bee0428064f4b49a9463fad8177a3b03c0615e4f3122b7)


Preprocess data

In [None]:
# Install Hugging Face `transformers` (IPython shell escape, notebook only)
# and load the pretrained 'bert-base-cased' tokenizer.
# This single tokenizer is applied to both the English source text and the
# Vietnamese target text in make_iterator.
!pip install transformers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Collecting transformers
  Using cached https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl
Processing /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45/sacremoses-0.0.43-cp37-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
  Using cached https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.43 tokenizers-0.10.1 transformers-4.3.3


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [None]:
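# Build the batches for one pass over a dataset, converting text into tensors.
# Returns a list of batches; each batch is {'src': [max_input_length, bs], 'trg': [max_output_length, bs]}.
# Examples are shuffled first, and a trailing partial batch (fewer than batch_size examples) is dropped.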

def make_iterator(dataset, batch_size):
    """Shuffle a dataset and tokenize it into a list of fixed-size batches.

    Relies on the module-level ``tokenizer``, ``MAX_INPUT_LENGTH`` and
    ``MAX_OUTPUT_LENGTH``.

    Args:
        dataset: Sequence of examples, each a mapping with an ``'en'`` (source)
            and a ``'vi'`` (target) string. It is not modified.
        batch_size: Number of examples per batch.

    Returns:
        List of ``{'src': tensor, 'trg': tensor}`` dicts with token ids of
        shape [max_input_length, bs] and [max_output_length, bs]. Only full
        batches are produced: the trailing ``len(dataset) % batch_size``
        examples (after shuffling) are dropped.
    """
    # Shuffle a private copy so the caller's sequence keeps its order. This
    # consumes the random generator exactly as shuffling in place would.
    examples = list(dataset)
    random.shuffle(examples)

    iterator = []
    for i in range(len(examples) // batch_size):
        batch = examples[batch_size * i:batch_size * (i + 1)]
        src_texts = [example['en'] for example in batch]
        trg_texts = [example['vi'] for example in batch]
        # The tokenizer returns [bs, length]; permute to the sequence-first
        # layout the model expects.
        src_tensors = tokenizer(src_texts, padding='max_length', max_length=MAX_INPUT_LENGTH, truncation=True, return_tensors='pt')['input_ids'].permute(1, 0)
        trg_tensors = tokenizer(trg_texts, padding='max_length', max_length=MAX_OUTPUT_LENGTH, truncation=True, return_tensors='pt')['input_ids'].permute(1, 0)
        iterator.append({'src': src_tensors, 'trg': trg_tensors})
    return iterator

### Training

In [None]:
def train(model, iterator, criterion, optimizer):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Reads the module-level ``DEVICE`` and ``TEACHER_FORCING_RATIO``.

    Args:
        model: Called as ``model(src, trg, TEACHER_FORCING_RATIO)``; expected
            to return [max_output_length, bs, trg_vocab_size].
        iterator: Sized iterable of batches with ``'src'`` and ``'trg'``
            tensors ([max_input_length, bs] and [max_output_length, bs]).
        criterion: Loss called as ``criterion(output, target)``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    for batch in iterator:
        src = batch['src'].to(DEVICE)
        trg = batch['trg'].to(DEVICE)

        optimizer.zero_grad()

        logits = model(src, trg, TEACHER_FORCING_RATIO)

        # Skip position 0 (the start token, for which the model emits no
        # prediction) and flatten time and batch into a single axis:
        #   flat_logits:  [(trg len - 1) * batch size, trg_vocab_size]
        #   flat_targets: [(trg len - 1) * batch size]
        flat_logits = logits[1:].reshape(-1, logits.shape[-1])
        flat_targets = trg[1:].reshape(-1)

        batch_loss = criterion(flat_logits, flat_targets)
        batch_loss.backward()

        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [None]:
@torch.no_grad()
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without gradient tracking.

    Reads the module-level ``DEVICE``. The model is called as
    ``model(src, trg)``, i.e. with its default teacher-forcing setting.

    Args:
        model: Expected to return [max_output_length, bs, trg_vocab_size].
        iterator: Sized iterable of batches with ``'src'`` and ``'trg'``
            tensors ([max_input_length, bs] and [max_output_length, bs]).
        criterion: Loss called as ``criterion(output, target)``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    for batch in iterator:
        src = batch['src'].to(DEVICE)
        trg = batch['trg'].to(DEVICE)

        logits = model(src, trg)

        # Skip position 0 and flatten time and batch into a single axis:
        #   flat_logits:  [(trg len - 1) * batch size, trg_vocab_size]
        #   flat_targets: [(trg len - 1) * batch size]
        flat_logits = logits[1:].reshape(-1, logits.shape[-1])
        flat_targets = trg[1:].reshape(-1)

        running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(iterator)

In [None]:
import time
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as ints, both truncated toward zero;
        ``seconds`` is what remains after removing the whole minutes.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# Training hyperparams
BATCH_SIZE = 128
MAX_INPUT_LENGTH = 32       # source sequences are padded/truncated to this many tokens
MAX_OUTPUT_LENGTH = 64      # target sequences are padded/truncated to this many tokens
NUM_EPOCHS = 3
LEARNING_RATE = 0.001
TEACHER_FORCING_RATIO = 0.1  # per-step probability of feeding the ground-truth token

# Model hyperparams
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Source and target share one tokenizer, so both models use the same vocabulary size.
VOCAB_SIZE = tokenizer.vocab_size
encoder = RNNEncoder(VOCAB_SIZE).to(DEVICE)
decoder = RNNDecoder(VOCAB_SIZE).to(DEVICE)
seq2seq = Seq2Seq(encoder, decoder).to(DEVICE)

# Padding positions must not contribute to the loss. Take the padding id from
# the tokenizer that produces the padding instead of hard-coding it, so the
# mask stays correct if the tokenizer is swapped.
loss_function = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)     # ignore [PAD] token
optim = torch.optim.Adam(seq2seq.parameters(), lr=LEARNING_RATE)

In [None]:
# Continue training: overrides the values set in the hyperparameter cell
# (one more epoch, a 10x smaller learning rate, teacher forcing switched off).
# Uncomment the next line to restore previously saved weights first.
# seq2seq.load_state_dict(torch.load('vanilla-seq2seq.pt'))
NUM_EPOCHS = 4
LEARNING_RATE = 0.0001
# With a ratio of 0 the decoder is always fed its own previous prediction.
TEACHER_FORCING_RATIO = 0
# Rebuild the optimizer so the new learning rate is used; this also discards
# the optimizer state of the previous Adam instance.
optim = torch.optim.Adam(seq2seq.parameters(), lr=LEARNING_RATE)

In [None]:
# Train model and save the best checkpoint.
# When this cell is re-run to continue training, keep the best validation loss
# reached so far; on a fresh kernel start from +inf so the first epoch always
# writes a checkpoint (previously the name was undefined -> NameError).
try:
    best_valid_loss
except NameError:
    best_valid_loss = float('inf')

for epoch in range(NUM_EPOCHS):

    # Generate train_iterator, valid_iterator (re-shuffled and re-tokenized
    # every epoch; this preparation is not included in the reported time).
    train_iterator = make_iterator(hf_dataset['train'][:]['translation'], BATCH_SIZE)
    valid_iterator = make_iterator(hf_dataset['validation'][:]['translation'], BATCH_SIZE)

    start_time = time.time()

    train_loss = train(seq2seq, train_iterator, loss_function, optim)
    valid_loss = evaluate(seq2seq, valid_iterator, loss_function)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(seq2seq.state_dict(), 'vanilla-seq2seq-bi.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

Epoch: 01 | Time: 29m 8s
	Train Loss: 3.834
	 Val. Loss: 3.921
Epoch: 02 | Time: 29m 11s
	Train Loss: 3.812
	 Val. Loss: 3.915
Epoch: 03 | Time: 29m 10s
	Train Loss: 3.800
	 Val. Loss: 3.920
Epoch: 04 | Time: 29m 10s
	Train Loss: 3.790
	 Val. Loss: 3.900


### Inference

In [None]:
# Load model at best checkpoint: the file the training loop writes whenever
# the validation loss improves. map_location lets a checkpoint saved on GPU
# load on whichever device this session uses.
seq2seq.load_state_dict(torch.load('vanilla-seq2seq-bi.pt', map_location=DEVICE))

<All keys matched successfully>

In [None]:
# Compute loss on test set.
# make_iterator shuffles the examples and keeps only full batches of
# BATCH_SIZE, so a trailing partial batch of test examples is not scored.
test_iterator = make_iterator(hf_dataset['test'][:]['translation'], BATCH_SIZE)
# evaluate() calls the model with its default teacher_forcing_ratio of 0, so
# the decoder is fed its own predictions at every step.
test_loss = evaluate(seq2seq, test_iterator, loss_function)
print(f'\tTest Loss: {test_loss:.3f}')

	Test Loss: 3.900
