<a href="https://colab.research.google.com/github/Bhavnicksm/marathi-neural-machine-translation/blob/main/nmt_transformer_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Before running this notebook, make sure that `data.csv` is available in the working directory.

In [None]:
#!pip install torchtext==0.8.0

In [None]:
#!python -m spacy download en

## Hyperparameter declaration

In [None]:
from argparse import Namespace

In [None]:
# Run-level hyperparameters, kept in one Namespace so they can be read as attributes.
hype = Namespace(
    BATCH_SIZE = 128,   # batch size handed to the training DataLoader
    NUM_EPOCHS = 10,    # number of passes made by the training loop
    CLIP = 1,           # max gradient norm passed to clip_grad_norm_ in train()
    DEVICE = None,      # filled in later with "cuda" or "cpu"
    
    save_checkpoint =True,   # write transformer_checkpoint.pth after every epoch
    load_checkpoint = False  # load transformer_checkpoint.pth before training
)

In [None]:
# Container for the training checkpoint; the training loop overwrites these
# fields after each epoch and saves the whole Namespace with torch.save.
checkpoint = Namespace(
    epoch_num = 0,        # set to the loop's epoch index (0-based) when saving
    model_params = None,  # set from the model in the training loop
    optim_params = None,  # set from the optimizer in the training loop
    loss=0,               # set to the value returned by train() for that epoch
)

In [None]:
# Example usage: each hyperparameter is read as an attribute of the Namespace.
hype.BATCH_SIZE

128

In [None]:
# vars() returns the Namespace's attributes as a plain dict.
vars(hype)

{'BATCH_SIZE': 128,
 'CLIP': 1,
 'DEVICE': None,
 'NUM_EPOCHS': 10,
 'load_checkpoint': False,
 'save_checkpoint': True}

## Data Processing

### Loading the data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Read the parallel corpus. header=None means the first row of the file is
# treated as data, not as column names.
# NOTE(review): assumes data.csv has exactly two columns in the order
# (english, marathi) and no header row — confirm against the actual file.
data = pd.read_csv('data.csv', header=None)
data.columns = ['english', 'marathi']
# Show the last rows as a quick sanity check of the loaded sentence pairs.
data.tail()

Unnamed: 0,english,marathi
40746,Just saying you don't like fish because of the...,हड्डींमुळे मासे आवडत नाही असं म्हणणं हे काय मा...
40747,The Japanese Parliament today officially elect...,आज जपानी संसदेने अधिकृतरित्या र्‍यौतारौ हाशिमो...
40748,Tom tried to sell his old VCR instead of throw...,टॉमने त्याचा जुना व्ही.सी.आर फेकून टाकण्याऐवजी...
40749,You can't view Flash content on an iPad. Howev...,आयपॅडवर फ्लॅश आशय बघता येत नाही. पण तुम्ही त्य...
40750,"In 1969, Roger Miller recorded a song called ""...","१९६९मध्ये रॉजर मिलरने ""यू डोन्ट वॉन्ट माय लव्ह..."


### Building tokenizers

In [None]:
import re
import string
import spacy

In [None]:
#tokenizers for both

def tokenize_mar(text):
  """Split a Marathi sentence into word and punctuation tokens.

  Every ASCII punctuation character (``string.punctuation``) is padded with
  spaces so that it becomes a token of its own, then the text is split on
  whitespace.

  Args:
    text: the raw sentence.

  Returns:
    A list of non-empty token strings; an empty list for empty or
    whitespace-only input.
  """
  for punc in string.punctuation:
    text = text.replace(punc, " "+punc+" " )
  # str.split() with no argument splits on any run of whitespace and never
  # yields empty strings. Splitting on a literal ' ' and stripping afterwards
  # turned tabs/newlines between spaces into '' tokens.
  return text.split()

# Load the English spaCy pipeline; only its tokenizer is used below.
# NOTE(review): the 'en' shortcut name depends on the installed spaCy version
# (the setup cell downloads it with `python -m spacy download en`) — confirm.
spacy_en = spacy.load('en')

def tokenize_eng(text):
  """Tokenize an English sentence with the spaCy tokenizer.

  Args:
    text: the raw sentence.

  Returns:
    A list with the text of each token produced by ``spacy_en.tokenizer``.
  """
  tokens = []
  for token in spacy_en.tokenizer(text):
    tokens.append(token.text)
  return tokens

In [None]:
# Tokenize one sentence pair (row 40000) with each tokenizer to eyeball the result.
ex_tok_mar = tokenize_mar(data['marathi'][40000])
print(ex_tok_mar)

ex_tok_eng = tokenize_eng(data['english'][40000])
print(ex_tok_eng)

['उद्याची', 'मीटिंग', 'कुठे', 'असणार', 'आहे', 'हे', 'तुम्हाला', 'माहीत', 'आहे', 'का', '?']
['Do', 'you', 'know', 'where', 'tomorrow', "'s", 'meeting', 'is', 'going', 'to', 'be', '?']


### Building Vocabularies

In [None]:
import torchtext
from torchtext.vocab import Vocab
from collections import Counter

# Show the installed torchtext version (the setup cell pins 0.8.0).
print(torchtext.__version__)

0.8.0


In [None]:
def build_vocab(data, tokenizer):
  """Build a torchtext ``Vocab`` from an iterable of sentences.

  Args:
    data: iterable of raw sentences.
    tokenizer: callable mapping a sentence to a list of tokens.

  Returns:
    A ``Vocab`` built from the token frequencies, constructed with
    ``max_size=10000`` and the special tokens ``<unk>``, ``<pad>``,
    ``<sos>`` and ``<eos>``.
  """
  # Count every token of every sentence in one pass.
  token_counts = Counter(tok for sentence in data for tok in tokenizer(sentence))
  return Vocab(
      token_counts,
      max_size=10000,
      specials=('<unk>','<pad>','<sos>','<eos>'),
  )

In [None]:
# One vocabulary per language, each built over the full column with its own tokenizer.
mar_vocab = build_vocab(data['marathi'],tokenize_mar)
eng_vocab = build_vocab(data['english'],tokenize_eng)

In [None]:
# Report the vocabulary sizes; these are later used as the model's vocab sizes.
print(f'Length of Marathi vocab: {len(mar_vocab)}')
print(f'Length of English vocab: {len(eng_vocab)}')

Length of Marathi vocab: 10004
Length of English vocab: 6400


### Converting the dataset to token indices

In [None]:
import torch

In [None]:
def data_process(data, src_tokenizer, tar_tokenizer, src_vocab, tar_vocab):
  """Turn the sentence pairs into pairs of token-index tensors.

  The source side is read from ``data['marathi']`` and the target side from
  ``data['english']``; the two columns are walked in parallel.

  Args:
    data: mapping/DataFrame with 'marathi' and 'english' columns.
    src_tokenizer: tokenizer applied to each Marathi sentence.
    tar_tokenizer: tokenizer applied to each English sentence.
    src_vocab: token -> index lookup (indexed with ``[]``) for the source.
    tar_vocab: token -> index lookup (indexed with ``[]``) for the target.

  Returns:
    A list of ``(src_tensor, tar_tensor)`` tuples of 1-D ``torch.long``
    tensors, one tuple per sentence pair.
  """
  def encode(sentence, tokenizer, vocab):
    # Tokenize, look every token up, and pack the ids into a long tensor.
    ids = [vocab[tok] for tok in tokenizer(sentence)]
    return torch.tensor(ids, dtype=torch.long)

  return [
      (encode(mar, src_tokenizer, src_vocab), encode(eng, tar_tokenizer, tar_vocab))
      for mar, eng in zip(data['marathi'], data['english'])
  ]

In [None]:
# Marathi is the source language and English the target; each item is a
# (src_tensor, tar_tensor) pair of token indices.
dataset = data_process(data, tokenize_mar, tokenize_eng, mar_vocab, eng_vocab)

### DataLoaders

In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
hype.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
hype.DEVICE

'cpu'

In [None]:
# Special-token indices are looked up in the Marathi vocabulary only, yet
# batch_proc uses them for both source and target batches.
# NOTE(review): this relies on both vocabularies giving the special tokens the
# same indices (both are built by build_vocab with the same specials tuple) —
# verify if the vocabulary construction ever differs per language.
PAD_IDX = mar_vocab["<pad>"]
SOS_IDX = mar_vocab["<sos>"]
EOS_IDX = mar_vocab["<eos>"]

print(f"pad index: {PAD_IDX}")
print(f"sos index: {SOS_IDX}")
print(f"eos index: {EOS_IDX}")

pad index: 1
sos index: 2
eos index: 3


In [None]:
def batch_proc(data_batch):
  """Collate function: frame each sequence with <sos>/<eos> and pad the batch.

  Args:
    data_batch: iterable of ``(src_tensor, tar_tensor)`` pairs of 1-D index
      tensors.

  Returns:
    ``(src_batch, tar_batch)``: the results of ``pad_sequence`` over the
    framed source and target sequences, padded with ``PAD_IDX``.
  """
  sos = torch.tensor([SOS_IDX])
  eos = torch.tensor([EOS_IDX])

  def frame(seq):
    # <sos> + tokens + <eos>
    return torch.cat([sos, seq, eos], dim=0)

  framed = [(frame(src_item), frame(tar_item)) for src_item, tar_item in data_batch]

  src_batch = pad_sequence([src for src, _ in framed], padding_value= PAD_IDX)
  tar_batch = pad_sequence([tar for _, tar in framed], padding_value= PAD_IDX)

  return src_batch, tar_batch

In [None]:
# Shuffled training batches; batch_proc adds <sos>/<eos> and pads each batch.
train_iter = DataLoader(dataset, batch_size= hype.BATCH_SIZE , shuffle=True, collate_fn= batch_proc)

In [None]:
# a = next(iter(train_iter))
# print(a)

## Modelling

In [None]:
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder Transformer with learned word and position embeddings.

  Wraps ``nn.Transformer`` and adds source/target token embeddings, learned
  positional embeddings and a linear projection onto the target vocabulary.
  All inputs and outputs are sequence-first: ``(seq_len, batch)``.

  Args:
    embedding_size: model width (``d_model``).
    src_vocab_size: number of rows in the source embedding table.
    trg_vocab_size: number of rows in the target embedding table and size of
      the output projection.
    src_pad_idx: index of the padding token in the source batches.
    num_heads: number of attention heads.
    num_encoder_layers: number of encoder layers.
    num_decoder_layers: number of decoder layers.
    forward_expansion: multiplier for the feed-forward width; each
      feed-forward block has ``forward_expansion * embedding_size`` units.
    dropout: dropout probability used on the embeddings and inside
      ``nn.Transformer``.
    max_len: number of positions in the positional embedding tables, i.e. the
      longest sequence (including <sos>/<eos>) the model accepts.
    device: kept as ``self.device`` for callers that read it; tensors created
      in ``forward`` follow the device of the inputs.
  """

  def __init__(self,
               embedding_size: int,
               src_vocab_size: int,
               trg_vocab_size: int,
               src_pad_idx: int,
               num_heads: int,
               num_encoder_layers: int,
               num_decoder_layers: int,
               forward_expansion: int,
               dropout: float,
               max_len: int, 
               device: str,
               ):
    
    super().__init__()

    self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
    self.src_position_embedding = nn.Embedding(max_len, embedding_size)
    self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
    self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

    self.device = device
    self.max_len = max_len

    # nn.Transformer's fifth argument is dim_feedforward (an absolute width),
    # not an expansion factor. Passing forward_expansion straight through gave
    # feed-forward layers only `forward_expansion` units wide.
    self.transformer = nn.Transformer(
        d_model=embedding_size,
        nhead=num_heads,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=forward_expansion * embedding_size,
        dropout=dropout,
    )

    self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.src_pad_idx = src_pad_idx


  def make_src_mask(self, src):
    """Return a ``(N, src_len)`` bool mask that is True at padding positions."""
    # The comparison result already lives on the same device as `src`.
    return src.transpose(0, 1) == self.src_pad_idx

  def forward(self, src, trg):
    """Compute target-vocabulary logits for every target position.

    Args:
      src: ``(src_len, N)`` tensor of source token indices.
      trg: ``(trg_len, N)`` tensor of target token indices fed to the decoder.

    Returns:
      ``(trg_len, N, trg_vocab_size)`` tensor of unnormalised logits.

    Raises:
      IndexError: if either sequence is longer than ``max_len``.
    """
    src_seq_length, N = src.shape
    trg_seq_length, N = trg.shape

    if max(src_seq_length, trg_seq_length) > self.max_len:
      # Fail with a readable message instead of an opaque embedding lookup
      # error (or a device-side assert on CUDA).
      raise IndexError(
          f"sequence length {max(src_seq_length, trg_seq_length)} exceeds "
          f"max_len={self.max_len} of the positional embeddings"
      )

    # Position ids are created directly on the inputs' device.
    src_positions = torch.arange(src_seq_length, device=src.device)
    trg_positions = torch.arange(trg_seq_length, device=trg.device)

    # (len, E) -> (len, 1, E) so the positional term broadcasts over the batch.
    embed_src = self.dropout(
        self.src_word_embedding(src)
        + self.src_position_embedding(src_positions).unsqueeze(1)
    )
    embed_trg = self.dropout(
        self.trg_word_embedding(trg)
        + self.trg_position_embedding(trg_positions).unsqueeze(1)
    )

    src_padding_mask = self.make_src_mask(src)
    # Causal mask: target position t may only attend to positions <= t.
    trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
        trg.device
    )

    out = self.transformer(
        embed_src,
        embed_trg,
        tgt_mask=trg_mask,
        src_key_padding_mask=src_padding_mask,
        # Also hide source padding from the decoder's cross-attention;
        # otherwise it attends to encoder outputs at padded positions.
        memory_key_padding_mask=src_padding_mask,
    )
    out = self.fc_out(out)
    return out


## Training

In [None]:
import tqdm
from tqdm import notebook

import time

In [None]:
# Architecture hyperparameters; the attribute names match the parameter names
# of the Transformer wrapper's constructor.
model_hype = Namespace(
    src_vocab_size = len(mar_vocab),   # source (Marathi) embedding rows
    trg_vocab_size = len(eng_vocab),   # target (English) embedding rows / output classes
    embedding_size = 512,
    num_heads = 8,
    num_encoder_layers = 3,
    num_decoder_layers = 3,
    dropout = 0.10,
    max_len = 100,                     # size of the positional embedding tables
    forward_expansion = 4,
    src_pad_idx = mar_vocab["<pad>"]   # used to build the source padding mask
)

model_hype.src_pad_idx

1

In [None]:
# creating the final model object
model = Transformer(
    model_hype.embedding_size,
    model_hype.src_vocab_size,
    model_hype.trg_vocab_size,
    model_hype.src_pad_idx,
    model_hype.num_heads,
    model_hype.num_encoder_layers,
    model_hype.num_decoder_layers,
    model_hype.forward_expansion,
    model_hype.dropout,
    model_hype.max_len,
    hype.DEVICE,
).to(hype.DEVICE)

In [None]:
def init_model(model: nn.Module):
  """Initialise weights with N(0, 0.01) and all other parameters with zeros.

  LayerNorm modules are left at their defaults (scale 1, shift 0): their
  parameter is also called ``weight``, and drawing it from N(0, 0.01) would
  shrink every normalised activation to almost zero.

  Works both when called directly on a model and when used through
  ``model.apply(init_model)``.

  Args:
    model: the module whose parameters (including those of its submodules)
      are re-initialised in place.
  """
  for module in model.modules():
    if isinstance(module, nn.LayerNorm):
      continue
    # recurse=False: each module handles only the parameters it owns, so a
    # parent never re-initialises a LayerNorm that lives further down.
    for name, param in module.named_parameters(recurse=False):
      if 'weight' in name:
        nn.init.normal_(param.data, mean=0., std = 0.01)
      else:
        nn.init.constant_(param.data, 0.)

# Module.apply calls init_model on every submodule and on the model itself;
# as the last expression of the cell it also displays the model structure.
model.apply(init_model)

Transformer(
  (src_word_embedding): Embedding(10004, 512)
  (src_position_embedding): Embedding(100, 512)
  (trg_word_embedding): Embedding(6400, 512)
  (trg_position_embedding): Embedding(100, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=4, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=4, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAtt

In [None]:
def count_params (model: nn.Module):
  """Return the total number of elements over all trainable parameters.

  Parameters with ``requires_grad`` set to False are not counted.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

# Report the number of trainable parameters with thousands separators.
print(f"There are {count_params(model):,} parameters in the model.")

There are 21,285,144 parameters in the model.


In [None]:
# Adam with its default settings (no learning rate or other options are given).
optimizer = optim.Adam(model.parameters())

# Target positions holding the English <pad> index are excluded from the loss.
tar_PAD_IDX = eng_vocab['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=tar_PAD_IDX)

In [None]:
def train(model: nn.Module, iterator: torch.utils.data.DataLoader, optimizer: optim.Optimizer, criteria: nn.Module , clip: float, device: str):
  """Run one training epoch with teacher forcing and return the mean batch loss.

  Args:
    model: sequence-to-sequence model called as ``model(src, decoder_input)``
      and returning logits of shape ``(trg_len, N, vocab)``.
    iterator: yields ``(src, tar)`` batches of sequence-first index tensors;
      must support ``len()``.
    optimizer: optimizer stepping the model's parameters.
    criteria: loss taking ``(logits, targets)`` flattened over time and batch.
    clip: maximum gradient norm for ``clip_grad_norm_``.
    device: device the batches are moved to.

  Returns:
    The sum of the per-batch losses divided by the number of batches.
  """
  model.train()

  epoch_loss = 0

  for src, tar in iterator:
    src, tar = src.to(device), tar.to(device)

    optimizer.zero_grad()

    # Teacher forcing with a one-step shift: the decoder is fed tokens
    # 0..T-2 and has to predict tokens 1..T-1. Feeding the full target and
    # scoring output[1:] against tar[1:] lets position t see token t, so the
    # model can simply copy its own input.
    output = model(src, tar[:-1])

    # reshape (not view) so non-contiguous slices are handled as well
    output = output.reshape(-1, output.shape[-1])
    target = tar[1:].reshape(-1)

    loss = criteria(output, target)

    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()

    epoch_loss += loss.item()
  
  return epoch_loss/len(iterator)

In [None]:
if hype.load_checkpoint:
  # map_location lets a checkpoint written on a GPU be restored on a
  # CPU-only machine (and vice versa).
  checkpoint = torch.load('transformer_checkpoint.pth', map_location=hype.DEVICE)

  # Loading the file alone does not change the model: the saved state has to
  # be copied into the live model and optimizer.
  model_state = checkpoint.model_params
  optim_state = checkpoint.optim_params
  # Checkpoints written by earlier runs of this notebook stored the bound
  # `state_dict` methods instead of their results; call them to get the dicts.
  if callable(model_state):
    model_state = model_state()
  if callable(optim_state):
    optim_state = optim_state()

  model.load_state_dict(model_state)
  optimizer.load_state_dict(optim_state)

In [None]:
# Progress bar over epochs; its description is updated with the latest loss.
epoch_t = notebook.trange(hype.NUM_EPOCHS, desc='loss')

for epoch in epoch_t:

  train_loss = train(model,train_iter, optimizer, criterion, hype.CLIP, hype.DEVICE)

  if hype.save_checkpoint:
    checkpoint.epoch_num = epoch
    # state_dict has to be *called*: storing the bound method would pickle
    # the whole model/optimizer object instead of a snapshot of the weights.
    checkpoint.model_params = model.state_dict()
    checkpoint.optim_params = optimizer.state_dict()
    checkpoint.loss = train_loss
    torch.save(checkpoint,'transformer_checkpoint.pth')

  epoch_t.set_description(f"loss: {train_loss: .5f}")
  epoch_t.refresh()
  time.sleep(0.1)

HBox(children=(FloatProgress(value=0.0, description='loss', max=10.0, style=ProgressStyle(description_width='i…

KeyboardInterrupt: ignored