<a href="https://colab.research.google.com/github/arnav39/CS779_Machine_Translation/blob/main/main_batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CS779 Machine Translation

- hindi to english 

- trying to process a batch at once

## Importing libraries

In [1]:
!pip install indic-nlp-library --quiet
!python -m spacy download en_core_web_sm --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.1/121.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h2023-04-12 08:05:40.066516: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-12 08:05:42.895133: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-

In [2]:
import torch
import torch.nn as nn 
import torch.optim as optim 
import numpy as np
import torch.nn.functional as F
import spacy
import os
from tqdm.notebook import tqdm
import sys
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize import indic_normalize
import pickle
import random
from torch.utils.data import Dataset, DataLoader

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device = {device}")

device = cuda


- loading the data

In [6]:
with open('/content/drive/MyDrive/wiki.hi/inp_sent.pkl', 'rb') as f: 
  output_sent_list = pickle.load(f) # output is english

with open('/content/drive/MyDrive/wiki.hi/out_sent.pkl', 'rb') as f: 
  input_sent_list = pickle.load(f) # input is hindi

print(type(input_sent_list))
print(len(input_sent_list))

print(type(output_sent_list))
print(len(output_sent_list))

<class 'list'>
140000
<class 'list'>
140000


## Vocab classes

In [7]:
class Lang():

  def __init__(self, name, spacy_tokenizer):
    self.name = name
    self.word2index = {"<SOS>":0, '<EOS>': 1, "<UNK>": 2, '<PAD>': 3}
    self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<UNK>", 3: '<PAD>'}
    self.word2count = {}
    self.n_words = 4
    self.tokenizer = spacy_tokenizer

  def add_word(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word
      self.n_words += 1

    else:
      self.word2count[word] += 1

  def add_sentence(self, sentence):
    tokens = self.tokenize_sentence(sentence)
    for token in tokens: 
      self.add_word(token)

  def tokenize_sentence(self, sentence):
    tokens = [token.text for token in self.tokenizer(sentence.lower())]
    return tokens

  def __len__(self):
    return self.n_words

In [8]:
class Hindi_lang():

  def __init__(self, name):
    self.name = name
    self.word2index = {"<SOS>":0, '<EOS>': 1, "<UNK>": 2, '<PAD>': 3}
    self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<UNK>", 3: '<PAD>'}
    self.word2count = {}
    self.n_words = 4
    self.normalizer = indic_normalize.DevanagariNormalizer(lang='hi', remove_nuktas=True)

  def add_word(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word
      self.n_words += 1

    else:
      self.word2count[word] += 1

  def add_sentence(self, sentence):
    tokens = self.tokenize_sentence(sentence)
    for token in tokens: 
      self.add_word(token)

  def tokenize_sentence(self, sentence):
    # first normalize the sentence, then tokenize
    norm_sent = self.normalizer.normalize(sentence)
    tokens = indic_tokenize.trivial_tokenize(norm_sent)
    return tokens

  def __len__(self):
    return self.n_words
  

## Building, saving and loading the vocabs

- building the enlgish vocab (output_lang_vocab)

In [53]:
nlp_english = spacy.load("en_core_web_sm")
english_output_vocab = Lang("english", nlp_english)

for my_sent in tqdm(output_sent_list):
  english_output_vocab.add_sentence(my_sent)

print(len(english_output_vocab))

# saving the english vocab

with open('/content/drive/MyDrive/wiki.hi/english_output_vocab.pkl', 'wb') as f: 
  pickle.dump(english_output_vocab, f)

  0%|          | 0/140000 [00:00<?, ?it/s]

24261


- building the hindi vocab (input_lang_vocab)

In [54]:
hindi_input_vocab = Hindi_lang("hindi")

for my_sent in tqdm(input_sent_list):
  hindi_input_vocab.add_sentence(my_sent)

print(len(hindi_input_vocab))

with open('/content/drive/MyDrive/wiki.hi/hindi_input_vocab.pkl', 'wb') as f: 
  pickle.dump(hindi_input_vocab, f)

  0%|          | 0/140000 [00:00<?, ?it/s]

27939


- loading the english and hindi vocab

In [9]:
with open('/content/drive/MyDrive/wiki.hi/english_output_vocab.pkl', 'rb') as f: 
  english_output_vocab = pickle.load(f)

with open('/content/drive/MyDrive/wiki.hi/hindi_input_vocab.pkl', 'rb') as f: 
  hindi_input_vocab = pickle.load(f)

In [10]:
SOS_TOKEN_INDEX = english_output_vocab.word2index['<SOS>']
print(SOS_TOKEN_INDEX)

0


In [11]:
EOS_TOKEN_INDEX = english_output_vocab.word2index['<EOS>']
print(EOS_TOKEN_INDEX)

1


## Preparing the data and making the Dataloader

- preparing the data in the form of list of tuple to feed into the model

In [45]:
MAX_LENGTH = 64 # this will be length of each sentence
BATCH_SIZE = 128

In [13]:
training_data = []

for i in range(len(input_sent_list)):
  pair = (input_sent_list[i], output_sent_list[i])
  training_data.append(pair)

print(len(training_data))
print(training_data[0])

140000
('और अपनी रहमत से हमें इन काफ़िर लोगों (के नीचे) से नजात दे', "and deliver us by Thy mercy from the people of the unbelievers. '")


In [14]:
def tensorFromSentence(lang_vocab, sentence, max_length=MAX_LENGTH):

  tokens = lang_vocab.tokenize_sentence(sentence)
  indexes = [lang_vocab.word2index[token] for token in tokens]
  indexes.append(EOS_TOKEN_INDEX)

  pad_index = lang_vocab.word2index['<PAD>']

  if len(indexes) < max_length:
    indexes += [pad_index] * (max_length - len(indexes))
  else:
    indexes = indexes[:max_length]
    indexes[-1] = EOS_TOKEN_INDEX

  return torch.tensor(indexes, dtype=torch.long)
  # (max_length,)

In [15]:
class Seq2SeqDataset(Dataset):
  
  def __init__(self, pairs):
    self.pairs = pairs

  def __len__(self):
    return len(self.pairs)

  def __getitem__(self, index):
    input_sentence, target_sentence = self.pairs[index]
    input_tensor = tensorFromSentence(hindi_input_vocab, input_sentence)
    target_tensor = tensorFromSentence(english_output_vocab, target_sentence)
    return input_tensor, target_tensor

In [16]:
dataset = Seq2SeqDataset(training_data)

- let's look how does the elements of the dataset look like

In [97]:
inp_t, targ_t = dataset[0]
print(inp_t.shape) # (max_length)
print(targ_t.shape)

torch.Size([64])
torch.Size([64])


In [46]:
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
print(len(dataloader))

1094


- let's look at how a batch looks like

In [47]:
batch = next(iter(dataloader))
print(type(batch))
print(len(batch))

<class 'list'>
2


In [48]:
print(type(batch[0]), type(batch[1]))

<class 'torch.Tensor'> <class 'torch.Tensor'>


In [49]:
print(batch[0].shape) # (batch_size, max_length)
print(batch[1].shape) 

torch.Size([128, 64])
torch.Size([128, 64])


## Classes and functions required

In [21]:
class EncoderGRU(nn.Module):

  def __init__(self, input_size, hidden_size, num_layers=1):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.embedding = nn.Embedding(input_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size, num_layers)

  def forward(self, input_seqs, hidden):

    # input_seqs.shape : (batch_size, )

    embedded = self.embedding(input_seqs.unsqueeze(0))
    # embedded.shape = (1, batch_size, hidden_size)

    output, hidden = self.gru(embedded, hidden)
    # output.shape = (1, batch_size, hidden_size) , (seq_len, batch_size, hidden_size)
    # hidde.shape = (num_layers, batch_size, hidden_size)

    return output, hidden 

  def init_hidden(self, batch_size):
    hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size) # (num_layers, batch_size, hidden_dim)
    return hidden

In [22]:
class DecoderGRU(nn.Module):

  def __init__(self, hidden_size, output_size, num_layers=1):
    super().__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.num_layers = num_layers
    self.embedding = nn.Embedding(output_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size, num_layers)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, input, hidden):
    # input shape = (batch_size, )
    output = self.embedding(input.unsqueeze(0)) # (1, batch_size, hidden_size)
    output = F.relu(output)
    output, hidden = self.gru(output, hidden) 
    # output : (1, batch_size, hidden_size)
    # hidden : (num_layers, batch_size, hidden_size)
    output = self.out(output[0]) # (batch_size, output_size)
    return output, hidden

  def init_hidden(self, batch_size):
    hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size) # (num_layers, batch_size, hidden_size)
    return hidden

In [26]:
def train(input_tensors: torch.Tensor,
          target_tensors: torch.Tensor,
          encoder: EncoderGRU,
          decoder: DecoderGRU,
          encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio=0.5):
  
  # input_tensors.shape = (batch_size, max_length)
  # Target_tensors.shape = (batch_size, max_length)

  batch_size = input_tensors.size(0)

  input_tensors = input_tensors.transpose(0, 1).to(device) # transpose to (max_len, batch_size), then push to device
  target_tensors = target_tensors.transpose(0, 1).to(device) # transpose to (max_len, batch_size), then push to device

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  encoder_hidden = encoder.init_hidden(batch_size).to(device)
  encoder_outputs = torch.zeros(MAX_LENGTH, batch_size, encoder.hidden_size).to(device)

  loss = 0

  for ei in range(MAX_LENGTH):
    encoder_output, encoder_hidden = encoder(input_tensors[ei], encoder_hidden) 
    # encoder is taking input of shape of (batch_size, )
    encoder_outputs[ei] = encoder_output[0, :, :]


  decoder_input = torch.tensor([SOS_TOKEN_INDEX] * batch_size, device=device, dtype=torch.long)
  decoder_hidden = encoder_hidden

  use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

  if use_teacher_forcing:

    # feed the target as the next input
    for di in range(MAX_LENGTH): 
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      loss += criterion(decoder_output, target_tensors[di])
      decoder_input = target_tensors[di]

  else:

    # without teacher forcing: use it's own predictions as input in the next step
    for di in range(MAX_LENGTH):
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      # decoder_output : (batch_size, output_size)
      topv, topi = decoder_output.topk(k=1) 

      # topv, topi : (batch_size, 1)

      decoder_input = topi.detach().squeeze(-1) # decoder needs 1d input of shape (batch_size, )
      loss += criterion(decoder_output, target_tensors[di])

      if (decoder_input == EOS_TOKEN_INDEX).all():
        break

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  ans = loss.detach().cpu().item()/MAX_LENGTH
  return ans

## Actual code

- trying to train the model

In [42]:
hidden_size = 100
learning_rate = 0.001
max_epochs = 5
num_layers = 2

In [43]:
encoder = EncoderGRU(len(hindi_input_vocab), hidden_size, num_layers).to(device)
decoder = DecoderGRU(hidden_size, len(english_output_vocab), num_layers).to(device)

print(encoder)
print(decoder)

criterion = nn.CrossEntropyLoss()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

EncoderGRU(
  (embedding): Embedding(27939, 100)
  (gru): GRU(100, 100, num_layers=2)
)
DecoderGRU(
  (embedding): Embedding(24261, 100)
  (gru): GRU(100, 100, num_layers=2)
  (out): Linear(in_features=100, out_features=24261, bias=True)
)


In [None]:
for epoch in tqdm(range(max_epochs)):

  epoch_loss = 0

  for batch in tqdm(dataloader):

    input_tensors = batch[0].to(device)
    target_tensors = batch[1].to(device)

    loss = train(input_tensors, target_tensors, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

    epoch_loss += loss

  print(f"epoch = {epoch}/{max_epochs}, LOSS = {epoch_loss/len(dataloader)}")

  torch.save(encoder.state_dict(), '/content/drive/MyDrive/wiki.hi/encoder.params')
  torch.save(decoder.state_dict(), '/content/drive/MyDrive/wiki.hi/decoder.params')

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1094 [00:00<?, ?it/s]

# random stuff

In [118]:
a = torch.tensor([SOS_TOKEN_INDEX]*4, dtype=torch.long)
print(a)

tensor([0, 0, 0, 0])


In [88]:
a = torch.randn(32, 64)
print(a.shape)

b = a.transpose(0, 1)
print(b.shape)

torch.Size([32, 64])
torch.Size([64, 32])


In [10]:
a = torch.randn(4, 10)
b = a[0:0+1, :]

print(f"a.shape = {a.shape}, b.shape = {b.shape}")

a.shape = torch.Size([4, 10]), b.shape = torch.Size([1, 10])


In [11]:
a = torch.rand(1, 4, 10)
b = a[0, :, :]
print(b.shape)

torch.Size([4, 10])


In [23]:
a = torch.randn(3, 5) # (batch_size, output_size)
print(a)

tensor([[ 0.8925, -0.1343,  0.8933,  1.3184, -0.7888],
        [ 0.8849,  1.4044, -0.1104, -1.2036, -0.5046],
        [-1.2304,  0.2524,  1.1802, -0.5002,  1.6134]])


In [24]:
topv, topi = a.topk(1)

In [26]:
print(topv.shape) # (batch_size, 1)

torch.Size([3, 1])


In [30]:
print(topi.shape) # (batch_size, 1)

torch.Size([3, 1])


In [31]:
topi.squeeze(-1)

tensor([3, 1, 4])

In [33]:
a = torch.randint(20, size=(5,))
print(a)

tensor([ 9, 14,  1,  4,  8])


In [37]:
if (a == 1).all():
  print("false")

else:
  print("true")

true
