<a href="https://colab.research.google.com/github/ahsanabbas123/NLP-From-Scratch/blob/master/Seq2Seq_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [0]:
# Fix one seed for every random number generator used in this notebook
# so that repeated runs draw the same numbers.
SEED = 1234

# PyTorch: request deterministic cuDNN behaviour, then seed the CPU and CUDA generators.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# NumPy and the Python standard library.
np.random.seed(SEED)
random.seed(SEED)

In [0]:
# Download the spaCy English ('en') and German ('de') models that are loaded
# with spacy.load further down in the notebook.
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 1.2MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=88f90dc8c3665a027d064c49d4fafb1cdbd93ebaf420d87a753f680fdae17b95
  Stored in directory: /tmp/pip-ephem-wheel-cache-ea9417yu/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

In [0]:
# Load the spaCy pipelines for German and English.
# Only their `.tokenizer` is used, by tokenize_de / tokenize_en.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [0]:
def tokenize_de(text):
    """
    Split a German string into tokens with the spaCy German tokenizer and
    return the token texts as a list in reversed order (last token first).
    """
    doc = spacy_de.tokenizer(text)
    forward = [token.text for token in doc]
    return list(reversed(forward))

def tokenize_en(text):
    """
    Split an English string into tokens with the spaCy English tokenizer and
    return the token texts as a list, in their original order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [0]:
# Define the source (German) and target (English) fields.
# Both are configured with lower-casing and with '<sos>' / '<eos>' as the
# init and end-of-sequence tokens; they differ only in the tokenizer
# (tokenize_de also reverses the token order).
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

In [0]:
# Load the Multi30k train / validation / test splits.
# The '.de' side is processed with SRC and the '.en' side with TRG.
train_data, val_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields=(SRC, TRG))

training.tar.gz:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 4.88MB/s]
validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.44MB/s]

downloading validation.tar.gz
downloading mmt_task1_test2016.tar.gz



mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.43MB/s]


In [0]:
# Report how many examples each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(val_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# Show the first training example; the 'src' tokens appear in reversed order
# because tokenize_de reverses them.
print(vars(train_data.examples[0]))

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000
{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [0]:
# Build the vocabularies from the training split only (validation and test
# data are not passed in).
# min_freq=2 is forwarded to torchtext; presumably tokens seen fewer than
# twice are left out of the vocabulary — confirm against the torchtext docs.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [0]:
# Report the vocabulary sizes; the same values are later used as INPUT_DIM and OUTPUT_DIM.
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5893


In [0]:
# Get iterators using BucketIterator

# Use the GPU when one is available, otherwise fall back to the CPU.
# The same device is handed to the iterators and, later, to the model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128
# One iterator per split, all with the same batch size.
# NOTE(review): BucketIterator is expected to batch examples of similar length together — confirm against the torchtext docs.
train_iterator, val_iterator, test_iterator = BucketIterator.splits((train_data, val_data, test_data), batch_size=BATCH_SIZE, device=device)

In [0]:
'''
Encoder :

input_dim is the size/dimensionality of the one-hot vectors that will be input to the encoder. This is equal to the input (source) vocabulary size.
emb_dim is the dimensionality of the embedding layer. This layer converts the one-hot vectors into dense vectors with emb_dim dimensions.
hid_dim is the dimensionality of the hidden and cell states.
n_layers is the number of layers in the RNN.
dropout is the dropout probability to use. This is a regularization parameter to prevent overfitting; see the PyTorch nn.Dropout documentation for more details about dropout.

The RNN returns: outputs (the top-layer hidden state for each time-step), hidden (the final hidden state for each layer, 
$h_T$, stacked on top of each other) and cell (the final cell state for each layer, $c_T$, stacked on top of each other).
As we only need the final hidden and cell states (to make our context vector), forward only returns hidden and cell.
'''


class Encoder(nn.Module):
  """Embeds a batch of source token indices and encodes it with a stacked LSTM.

  Args:
    input_dim: number of rows in the embedding table (source vocabulary size).
    emb_dim: dimensionality of the token embeddings.
    hid_dim: dimensionality of the LSTM hidden and cell states.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability applied to the embeddings and, when
      n_layers > 1, between the LSTM layers.
  """
  def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.1):
    super().__init__()
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(input_dim, emb_dim)
    # nn.LSTM applies dropout only between stacked layers. With a single layer
    # a non-zero value does nothing and raises a UserWarning, so pass 0 then.
    rnn_dropout = dropout if n_layers > 1 else 0.0
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = rnn_dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    """Encode `src` and return the final (hidden, cell) states of every layer.

    The per-time-step outputs of the LSTM are computed but not returned.
    """
    # src is [src len, batch_size]
    embedded = self.dropout(self.embedding(src))
    # embedded is [src len, batch_size, emb_dim]
    output, (hidden, cell) = self.rnn(embedded)
    # output is [src len, batch_size, hid_dim]
    # hidden, cell are [n_layers, batch_size, hid_dim]

    return hidden, cell


'''
Decoder :
- Within the forward method, we accept a batch of input tokens, previous hidden states and previous cell states. 
- As we are only decoding one token at a time, the input tokens will always have a sequence length of 1.
- We unsqueeze the input tokens to add a sentence length dimension of 1. 
- Then, similar to the encoder, we pass through an embedding layer and apply dropout. 
- This batch of embedded tokens is then passed into the RNN with the previous hidden and cell states. 
- This produces an output (hidden state from the top layer of the RNN), a new hidden state (one for each layer, stacked on top of each other)
  and a new cell state (also one per layer, stacked on top of each other). 
- We then pass the output (after getting rid of the sentence length dimension) through the linear layer to receive our prediction. 
- We then return the prediction, the new hidden state and the new cell state.
'''

class Decoder(nn.Module):
  """Decodes one target time step at a time with a stacked LSTM.

  Args:
    output_dim: target vocabulary size; used for both the embedding table
      and the width of the prediction layer.
    emb_dim: dimensionality of the token embeddings.
    hid_dim: dimensionality of the LSTM hidden and cell states.
    n_layers: number of stacked LSTM layers.
    dropout: dropout probability applied to the embeddings and, when
      n_layers > 1, between the LSTM layers.
  """
  def __init__(self, output_dim, emb_dim, hid_dim, n_layers=2, dropout=0.1):
    super().__init__()
    self.hid_dim = hid_dim
    self.output_dim = output_dim
    self.n_layers = n_layers

    self.embedding = nn.Embedding(output_dim, emb_dim)
    # nn.LSTM applies dropout only between stacked layers. With a single layer
    # a non-zero value does nothing and raises a UserWarning, so pass 0 then.
    rnn_dropout = dropout if n_layers > 1 else 0.0
    self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = rnn_dropout)
    self.fc = nn.Linear(hid_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input, hidden, cell):
    """Run a single decoding step.

    Returns the un-normalised scores over the target vocabulary for the next
    token, together with the updated hidden and cell states.
    """
    # input is [batch_size]
    # hidden and cell are [n_layers, batch_size, hid_dim]
    input = input.unsqueeze(0)
    # input is [1, batch_size]
    embedded = self.dropout(self.embedding(input))
    # embedded is [1, batch_size, emb_dim]
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    # output is [1 (seq len), batch_size, hid_dim]
    # hidden and cell are [n_layers, batch_size, hid_dim]
    prediction = self.fc(output.squeeze(0))
    # prediction is [batch_size, output_dim]

    return prediction, hidden, cell


In [0]:
'''
Seq2Seq :
For the final part of the implementation, we'll implement the seq2seq model. This will handle:

- receiving the input/source sentence
- using the encoder to produce the context vectors
- using the decoder to produce the predicted output/target sentence
'''
class Seq2Seq(nn.Module):
  """Ties an encoder and a decoder together into one translation model.

  The encoder's final hidden and cell states seed the decoder, which is then
  unrolled one target position at a time.
  """
  def __init__(self, encoder, decoder, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

    # The encoder states are fed straight into the decoder LSTM, so both
    # networks must agree on state size and depth.
    assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
    assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

  def forward(self, src, trg, teacher_forcing_ratio=0.5):
    """Return decoder scores for every target position.

    `trg` is indexed as [trg len, batch size]. The result has shape
    [trg len, batch size, decoder.output_dim]; row 0 is never written and
    stays all zeros. After each step the next decoder input is the
    ground-truth token with probability `teacher_forcing_ratio`, otherwise
    the decoder's own highest-scoring token.
    """
    n_examples = trg.shape[1]
    max_len = trg.shape[0]
    vocab_size = self.decoder.output_dim

    # Buffer collecting the decoder scores of every step.
    outputs = torch.zeros(max_len, n_examples, vocab_size).to(self.device)

    # The encoder's final states become the decoder's initial states.
    hidden, cell = self.encoder(src)

    # Decoding starts from the first row of the target batch.
    decoder_input = trg[0,:]

    for step in range(1, max_len):
      scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
      outputs[step] = scores

      # Draw once per step (for the whole batch) whether to teacher-force.
      use_teacher = random.random() < teacher_forcing_ratio
      greedy_token = scores.argmax(1)

      if use_teacher:
        decoder_input = trg[step]
      else:
        decoder_input = greedy_token

    return outputs


In [0]:
''' Training the model '''
# The vocabulary sizes set the encoder's embedding table size and the
# decoder's embedding table / prediction layer size.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# Seq2Seq asserts that encoder and decoder share hid_dim and n_layers,
# so a single value of each is used for both networks.
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Wrap both networks and move the whole model to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

In [0]:
'''
When using apply, the init_weights function will be called on every module and sub-module within our model.
For each module we loop through all of the parameters 
and sample them from a uniform distribution with nn.init.uniform_.
'''

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
# Run init_weights over the model and every one of its sub-modules.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [0]:
def count_parameters(model):
    """Return the total number of elements in the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Print the count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 13,899,013 trainable parameters


In [0]:
# Adam over all model parameters; no learning rate or other options are
# passed, so the optimizer's defaults apply.
optimizer = optim.Adam(model.parameters())

In [0]:
# Index of the padding token in the target vocabulary.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Cross-entropy over the vocabulary; target positions equal to the padding
# index are passed as ignore_index so they do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [0]:
'''
At each iteration:

- get the source and target sentences from the batch, $X$ and $Y$
- zero the gradients calculated from the last batch
- feed the source and target into the model to get the output, $\hat{Y}$
- as the loss function only works on 2d inputs with 1d targets we need to flatten each of them with .view
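- we slice off the first time step (index 0 along the sequence dimension) of the output and target tensors: position 0 of the target holds the <sos> token, and the model never writes a prediction into position 0 of the output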
- calculate the gradients with loss.backward()
- clip the gradients to prevent them from exploding (a common issue in RNNs)
- update the parameters of our model by doing an optimizer step
- sum the loss value to a running total
- Finally, we return the loss that is averaged over all batches.

'''
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: module called as `model(src, trg)`; its result is indexed as
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        optimizer: optimizer that updates the model's parameters.
        criterion: loss called as `criterion(scores_2d, targets_1d)`.
        clip: maximum gradient norm handed to `clip_grad_norm_`.

    Returns:
        The sum of the batch losses divided by `len(iterator)`.

    Raises:
        ZeroDivisionError: if `iterator` is empty.
    """
    model.train()

    epoch_loss = 0

    for batch in iterator:

        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        output = model(src, trg)

        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]

        output_dim = output.shape[-1]

        # Drop time step 0 and flatten time and batch into one axis.
        # reshape (rather than view) also accepts non-contiguous tensors,
        # copying only when it has to.
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)

        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        loss.backward()

        # Rescale the gradients so their global norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    The model is switched to eval mode and called as `model(src, trg, 0)`,
    i.e. with a teacher forcing ratio of 0, inside `torch.no_grad()`.

    Args:
        model: module whose result is indexed as
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        criterion: loss called as `criterion(scores_2d, targets_1d)`.

    Returns:
        The sum of the batch losses divided by `len(iterator)`.

    Raises:
        ZeroDivisionError: if `iterator` is empty.
    """
    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for batch in iterator:

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]

            # Drop time step 0 and flatten time and batch into one axis.
            # reshape (rather than view) also accepts non-contiguous tensors,
            # copying only when it has to.
            output = output[1:].reshape(-1, output_dim)
            trg = trg[1:].reshape(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [0]:
    def epoch_time(start_time, end_time):
      """Split the span between two timestamps (in seconds) into whole minutes
      and the remaining whole seconds, both truncated toward zero."""
      duration = end_time - start_time
      whole_minutes = int(duration / 60)
      leftover_seconds = int(duration - 60 * whole_minutes)
      return whole_minutes, leftover_seconds

In [0]:
N_EPOCHS = 10
# Maximum gradient norm passed to train() for gradient clipping.
CLIP = 1

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # The timed span covers both the training pass and the validation pass.
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves, so the file on disk
    # always holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    # PPL is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 30s
	Train Loss: 5.042 | Train PPL: 154.783
	 Val. Loss: 5.048 |  Val. PPL: 155.697
Epoch: 02 | Time: 0m 30s
	Train Loss: 4.450 | Train PPL:  85.598
	 Val. Loss: 4.690 |  Val. PPL: 108.820
Epoch: 03 | Time: 0m 30s
	Train Loss: 4.158 | Train PPL:  63.928
	 Val. Loss: 4.560 |  Val. PPL:  95.557
Epoch: 04 | Time: 0m 30s
	Train Loss: 3.955 | Train PPL:  52.188
	 Val. Loss: 4.470 |  Val. PPL:  87.371
Epoch: 05 | Time: 0m 30s
	Train Loss: 3.781 | Train PPL:  43.844
	 Val. Loss: 4.327 |  Val. PPL:  75.737
Epoch: 06 | Time: 0m 30s
	Train Loss: 3.638 | Train PPL:  38.019
	 Val. Loss: 4.211 |  Val. PPL:  67.414
Epoch: 07 | Time: 0m 30s
	Train Loss: 3.506 | Train PPL:  33.305
	 Val. Loss: 4.083 |  Val. PPL:  59.302
Epoch: 08 | Time: 0m 30s
	Train Loss: 3.364 | Train PPL:  28.915
	 Val. Loss: 3.999 |  Val. PPL:  54.549
Epoch: 09 | Time: 0m 30s
	Train Loss: 3.232 | Train PPL:  25.324
	 Val. Loss: 3.943 |  Val. PPL:  51.573
Epoch: 10 | Time: 0m 30s
	Train Loss: 3.143 | Train PPL

In [0]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss). map_location places the stored tensors on the
# current `device`, so a checkpoint saved on a GPU runtime also loads on a
# CPU-only one.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

# Score the restored model on the held-out test split.
test_loss = evaluate(model, test_iterator, criterion)

# PPL is reported as exp(loss).
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.879 | Test PPL:  48.378 |
