<a href="https://colab.research.google.com/github/akshat-suwalka/NLP/blob/main/Transformer_for_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Library

In [1]:
!pip install torchtext==0.8.1



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torch.utils.tensorboard import SummaryWriter
import torchtext
import sys 
from torchtext.datasets import Multi30k
from torchtext.data import BucketIterator, Field

In [3]:
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


# Additional utils function

In [4]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    # Load german tokenizer
    spacy_ger = spacy.load("de")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [english.vocab.stoi["<sos>"]]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

# Data Preprocessing

In [5]:
spacy_eng = spacy.load('en')
spacy_ger = spacy.load('de')

# Tokenize any sentence into words
# "My name is akshat" --> ['My', 'name', 'is', 'akshat']
def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

# Help in data preprocessing
german = Field(tokenize = tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(tokenize = tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")

train_data, valid_data, test_data = Multi30k.splits(
    exts = (".de", ".en"), fields = (german,english)
)

german.build_vocab(train_data, max_size = 10000, min_freq = 2)
english.build_vocab(train_data, max_size = 10000, min_freq = 2)



#Model

## Transformer

In [6]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoding_layers,
        num_decoding_layers,
        forward_expansion,
        dropout,
        max_len,
        device
    ):
        super(Transformer, self).__init__()

        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
        self.device = device

        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoding_layers,
            num_decoding_layers,
            forward_expansion,
            dropout
        )

        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        # src shape : src_len, N
        src_mask = src.transpose(0,1) == self.src_pad_idx
        # (N, src_len)
        return src_mask

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        src_positions = torch.arange(0, src_seq_length).unsqueeze(1).expand(src_seq_length, N).to(self.device)
        trg_positions = torch.arange(0, trg_seq_length).unsqueeze(1).expand(trg_seq_length, N).to(self.device)

        embed_src = self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        embed_src = self.dropout(embed_src)

        embed_trg = self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        embed_trg = self.dropout(embed_trg)

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask = src_padding_mask,
            tgt_mask = trg_mask,
        )

        out = self.fc_out(out)

        return out



# Training Part

## Initialize training phase

In [7]:
# Setup Training phase
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True

## Training hyperparameters

In [8]:
num_epochs = 5
learning_rate = 3e-4
batch_size = 32

## Model hyperparameters

In [9]:
src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100                # number of words in a sentence and used for positional encoding
forward_expansion = 4
src_pad_idx = english.vocab.stoi["<pad>"]

## Tensorboard for nice plots

In [10]:
writer = SummaryWriter("runs/loss_plot")
step = 0

## Training batches

In [11]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,    # This will help to make same length sentence in a same batch to reduce padding computation
    sort_key = lambda x: len(x.src),
    device = device
)



## Start

In [12]:
# initializing transformer model
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device
).to(device)

In [13]:
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [14]:
# When computing cost function then it will help to avoid the computation on padding output part
pad_idx = english.vocab.stoi["<pads>"] 
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [15]:
# if model is already available
if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

In [16]:
# example sentence
sentence = "ein pferd geht unter einer brücke neben einem boot."

In [18]:
for epoch in range(num_epochs):
    print(f"[Epoch {epoch+1} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict" : model.state_dict(),
            "optimizer" : optimizer.state_dict()
        }

        save_checkpoint(checkpoint)

    model.eval()
    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length = 100
    )

    print(f"Translated example sentence \n {translated_sentence}")
    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward propagation

        # traget is right shifted so that output of transformer will try match with target first element which earlier was second element before right shifting
        output = model(inp_data, target[:-1]) 

        # we want there to be a shift of 1 between the output and the target and 
        # we want the output to be one ahead of the target and
        # we're going to do reshape and
        # we're gonna do minus 1 since this is just going to be a single long vector for every target word that we want the index for every target word
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)
        optimizer.zero_grad()

        loss = criterion(output, target)
        loss.backward()

        # Avoid exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)

        optimizer.step()

        writer.add_scalar("Training loss", loss, global_step = step)

        step += 1


[Epoch0 / 5]
=> Saving checkpoint
Translated example sentence 
 ['excitement', 'assisting', 'military', 'steel', 'assisting', 'assisting', 'assisting', 'assisting', 'skimpy', 'hurt', 'assisting', 'excitement', 'assisting', 'skimpy', 'hurt', 'cleaned', 'assisting', 'cleaned', 'assisting', 'assisting', 'military', 'excitement', 'hurt', 'assisting', 'excitement', 'hurt', 'assisting', 'excitement', 'military', 'effort', 'hurt', 'assisting', 'military', 'excitement', 'participant', 'assisting', 'hurt', 'assisting', 'excitement', 'blankly', 'men', 'hurt', 'hurt', 'cleaned', 'assisting', 'excitement', 'taxis', 'whole', 'hurt', 'cleaned', 'hurt', 'assisting', 'military', 'excitement', 'assisting', 'effort', 'assisting', 'men', 'swung', 'whole', 'hurt', 'hurt', 'hurt', 'assisting', 'military', 'assisting', 'men', 'coconuts', 'military', 'military', 'herself', 'hurt', 'reaction', 'effort', 'cleaned', 'assisting', 'sharp', 'beers', 'whole', 'taxis', 'assisting', 'military', 'excitement', 'excitem



[Epoch1 / 5]
=> Saving checkpoint
Translated example sentence 
 ['a', 'horse', 'is', 'walking', 'under', 'a', 'boat', 'next', 'to', 'a', 'boat', '.', '<eos>']
[Epoch2 / 5]
=> Saving checkpoint
Translated example sentence 
 ['a', 'horse', 'walks', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
[Epoch3 / 5]
=> Saving checkpoint
Translated example sentence 
 ['a', 'horse', 'walks', 'is', 'walking', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
[Epoch4 / 5]
=> Saving checkpoint
Translated example sentence 
 ['a', 'horse', 'is', 'walking', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']


# Evaluating with BLeu Score

In [19]:
# if model is already available
load_model = True  # Yes, we trained it 

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

=> Loading checkpoint


In [None]:
score = bleu(test_data, model, german, english, device)
print(f"Bleu score {score*100:.2f}")