<a href="https://colab.research.google.com/github/arnav39/CS779_Machine_Translation/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CS779 Machine Translation

- Hindi to English translation

- processing a single sentence pair at a time (batch size 1)

## Importing libraries

In [None]:
!pip install indic-nlp-library --quiet
!python -m spacy download fr_core_news_sm --quiet
!python -m spacy download en_core_web_sm --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.1/121.1 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h2023-04-11 22:56:40.697121: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-11 22:56:43.155881: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-

In [None]:
import torch
import torch.nn as nn 
import torch.optim as optim 
import numpy as np
import torch.nn.functional as F
import spacy
from tqdm.notebook import tqdm
import sys
import gensim
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize import indic_normalize
import pickle

In [None]:
# Mount Google Drive inside the Colab VM. Later cells read and write their
# pickles and model checkpoints under /content/drive/MyDrive/wiki.hi/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"device = {device}")

device = cuda


- loading the data

In [None]:
# Load the parallel corpus from Drive as two pickled Python lists.
#
# NOTE(review): the file names look swapped relative to the variable names:
# inp_sent.pkl is bound to output_sent_list (English) and out_sent.pkl to
# input_sent_list (Hindi). The sample pair printed further down shows
# input_sent_list holding Hindi text, so the binding appears intentional for
# the Hindi -> English direction -- confirm before renaming anything.
#
# pickle.load can execute arbitrary code embedded in the file; only load
# pickles that you produced yourself.
with open('/content/drive/MyDrive/wiki.hi/inp_sent.pkl', 'rb') as f: 
  output_sent_list = pickle.load(f) # output is english

with open('/content/drive/MyDrive/wiki.hi/out_sent.pkl', 'rb') as f: 
  input_sent_list = pickle.load(f) # input is hindi

# Sanity check: container type and number of sentences on each side.
print(type(input_sent_list))
print(len(input_sent_list))

print(type(output_sent_list))
print(len(output_sent_list))

<class 'list'>
140000
<class 'list'>
140000


## Classes and functions required

In [None]:
class EncoderGRU(nn.Module):
  """Single-layer GRU encoder that consumes one source token per call.

  Args:
    input_size: number of rows in the embedding table (source vocabulary size).
    hidden_size: width of both the token embeddings and the GRU hidden state.
  """

  def __init__(self, input_size, hidden_size):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)

  def forward(self, input, hidden):
    """Advance the GRU by one token.

    Args:
      input: index tensor holding a single token id.
      hidden: previous hidden state, shape (num_layers=1, batch=1, hidden_size).

    Returns:
      Tuple (output, hidden):
        output: shape (seq_len=1, batch=1, hidden_size)
        hidden: shape (num_layers=1, batch=1, hidden_size)
    """
    # Reshape the looked-up embedding to (seq_len=1, batch=1, hidden_size),
    # the layout nn.GRU expects with batch_first=False.
    embedded = self.embedding(input).view(1, 1, -1)

    output, hidden = self.gru(embedded, hidden)
    return output, hidden

  def init_hidden(self):
    """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

    The tensor is allocated with the same device and dtype as the module's
    embedding weights, so it can be fed to forward() directly even after the
    module has been moved with .to(device) or cast with .double()/.half().
    """
    return self.embedding.weight.new_zeros(1, 1, self.hidden_size)

In [None]:
class DecoderGRU(nn.Module):
  """Single-layer GRU decoder that emits scores for one target token per call.

  Args:
    hidden_size: width of both the token embeddings and the GRU hidden state.
    output_size: number of rows in the embedding table and number of output
      scores (target vocabulary size).
  """

  def __init__(self, hidden_size, output_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.embedding = nn.Embedding(output_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, input, hidden):
    """Advance the decoder by one token.

    Args:
      input: index tensor holding a single target-token id (the previous token).
      hidden: previous hidden state, shape (1, 1, hidden_size).

    Returns:
      Tuple (output, hidden):
        output: unnormalised scores of shape (1, output_size); no softmax is
          applied here.
        hidden: updated hidden state, shape (1, 1, hidden_size).
    """
    # (seq_len=1, batch=1, hidden_size), passed through ReLU before the GRU.
    embedded = F.relu(self.embedding(input).view(1, 1, -1))
    gru_output, hidden = self.gru(embedded, hidden)
    # Drop the seq_len axis so the projection yields (batch=1, output_size).
    output = self.out(gru_output[0])
    return output, hidden

  def init_hidden(self):
    """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

    The tensor is allocated with the same device and dtype as the module's
    embedding weights, so it can be fed to forward() directly even after the
    module has been moved with .to(device) or cast with .double()/.half().
    """
    return self.embedding.weight.new_zeros(1, 1, self.hidden_size)

In [None]:
def train(input_tensor: torch.Tensor, target_tensor: torch.Tensor, encoder: "EncoderGRU", decoder: "DecoderGRU", encoder_optimizer, decoder_optimizer, criterion):
  """Run one teacher-forced optimisation step on a single sentence pair.

  The source tokens are fed to the encoder one at a time; the final encoder
  hidden state seeds the decoder. At every decoder step the ground-truth
  previous token is fed in (teacher forcing) and the per-token losses are
  summed before a single backward pass.

  Args:
    input_tensor: 1-D tensor of source token indices.
    target_tensor: 1-D tensor of target token indices.
    encoder: module providing init_hidden() and a (token, hidden) forward.
    decoder: module with a (token, hidden) forward returning (scores, hidden).
    encoder_optimizer: optimizer over the encoder's parameters.
    decoder_optimizer: optimizer over the decoder's parameters.
    criterion: loss called as criterion(scores, target_index_of_shape_(1,)).

  Returns:
    The summed loss divided by the number of target tokens, as a Python float.
    Returns 0.0 without touching the models when the target is empty.
  """
  target_length = target_tensor.size(0)
  if target_length == 0:
    # Nothing to predict: there is no loss to back-propagate and the
    # per-token mean would divide by zero.
    return 0.0

  # Use the device the encoder actually lives on rather than relying on a
  # notebook-level global defined in another cell.
  model_device = next(encoder.parameters()).device
  input_tensor = input_tensor.to(model_device)
  target_tensor = target_tensor.to(model_device)

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  # Encode the source sentence token by token; only the final hidden state is
  # used downstream (this model has no attention over encoder outputs).
  encoder_hidden = encoder.init_hidden().to(model_device)
  for ei in range(input_tensor.size(0)):
    _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

  # Index 0 is the "<SOS>" entry in the vocabularies' word2index tables.
  decoder_input = torch.tensor([0], dtype=torch.long, device=model_device)
  decoder_hidden = encoder_hidden

  loss = 0
  for di in range(target_length):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    loss += criterion(decoder_output, target_tensor[di].view(1))
    # Teacher forcing: the next input is the true token, not the prediction.
    decoder_input = target_tensor[di]

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length

In [None]:
class Lang():
  """Vocabulary for a language tokenized by a spaCy-style callable.

  Indices 0, 1 and 2 are reserved for "<SOS>", "<EOS>" and "<UNK>".

  Args:
    name: label for the language.
    spacy_tokenizer: callable that takes a string and returns an iterable of
      objects exposing a `.text` attribute.
  """

  def __init__(self, name, spacy_tokenizer):
    self.name = name
    self.word2index = {"<SOS>":0, '<EOS>': 1, "<UNK>": 2}
    self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<UNK>"}
    self.word2count = {}
    self.n_words = 3
    self.tokenizer = spacy_tokenizer

  def add_word(self, word):
    """Register one occurrence of `word`, assigning the next free index if new."""
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.index2word[self.n_words] = word
      self.n_words += 1

    # The reserved tokens are in word2index but start without a count entry,
    # so use .get() rather than `+= 1` to avoid a KeyError if one of them
    # ever shows up as a token in the corpus.
    self.word2count[word] = self.word2count.get(word, 0) + 1

  def add_sentence(self, sentence):
    """Tokenize `sentence` and add every token to the vocabulary."""
    for token in self.tokenize_sentence(sentence):
      self.add_word(token)

  def tokenize_sentence(self, sentence):
    """Lower-case `sentence`, run the tokenizer, and return the token strings."""
    return [token.text for token in self.tokenizer(sentence.lower())]

  def __len__(self):
    """Number of entries in the vocabulary, reserved tokens included."""
    return self.n_words

In [None]:
class Hindi_lang():
  """Vocabulary for Hindi text, normalized and tokenized with indic-nlp-library.

  Indices 0, 1 and 2 are reserved for "<SOS>", "<EOS>" and "<UNK>".

  Args:
    name: label for the language.
  """

  def __init__(self, name):
    self.name = name
    self.word2index = {"<SOS>":0, '<EOS>': 1, "<UNK>": 2}
    self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<UNK>"}
    self.word2count = {}
    self.n_words = 3
    self.normalizer = indic_normalize.DevanagariNormalizer(lang='hi', remove_nuktas=True)

  def add_word(self, word):
    """Register one occurrence of `word`, assigning the next free index if new."""
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.index2word[self.n_words] = word
      self.n_words += 1

    # The reserved tokens are in word2index but start without a count entry,
    # so use .get() rather than `+= 1` to avoid a KeyError if one of them
    # ever shows up as a token in the corpus.
    self.word2count[word] = self.word2count.get(word, 0) + 1

  def add_sentence(self, sentence):
    """Tokenize `sentence` and add every token to the vocabulary."""
    for token in self.tokenize_sentence(sentence):
      self.add_word(token)

  def tokenize_sentence(self, sentence):
    """Normalize `sentence` with the Devanagari normalizer, then tokenize it."""
    # first normalize the sentence, then tokenize
    norm_sent = self.normalizer.normalize(sentence)
    tokens = indic_tokenize.trivial_tokenize(norm_sent)
    return tokens

  def __len__(self):
    """Number of entries in the vocabulary, reserved tokens included."""
    return self.n_words
  

## Actual code

- building the English vocab (output_lang_vocab)

In [None]:
# Build the English (target-side) vocabulary from every English sentence.
#
# The loaded spaCy pipeline object itself is handed to Lang as the tokenizer,
# so each add_sentence() call invokes nlp_english(...) on the sentence.
# NOTE(review): presumably this also runs the pipeline's non-tokenizer
# components on every sentence, which would make this loop much slower than
# tokenization alone -- confirm, and consider passing a tokenizer-only callable.
nlp_english = spacy.load("en_core_web_sm")
english_output_vocab = Lang("english", nlp_english)

for my_sent in tqdm(output_sent_list):
  english_output_vocab.add_sentence(my_sent)

  0%|          | 0/140000 [00:00<?, ?it/s]

In [None]:
# Vocabulary size: the three reserved tokens plus every distinct token added.
print(len(english_output_vocab))

24260


In [None]:
# saving the english vocab
# The whole Lang object is pickled, including its `tokenizer` attribute (the
# spaCy pipeline passed in when the vocabulary was created).

with open('/content/drive/MyDrive/wiki.hi/english_output_vocab.pkl', 'wb') as f: 
  pickle.dump(english_output_vocab, f)

- building the hindi vocab (input_lang_vocab)

In [None]:
# Build the Hindi (source-side) vocabulary from every Hindi sentence, then
# persist it to Drive so the pass over the corpus need not be repeated.
hindi_input_vocab = Hindi_lang("hindi")

for my_sent in tqdm(input_sent_list):
  hindi_input_vocab.add_sentence(my_sent)

# The whole Hindi_lang object is pickled, including its normalizer attribute.
with open('/content/drive/MyDrive/wiki.hi/hindi_input_vocab.pkl', 'wb') as f: 
  pickle.dump(hindi_input_vocab, f)

  0%|          | 0/140000 [00:00<?, ?it/s]

- loading the english and hindi vocab

In [None]:
# Reload both vocabularies from Drive. This rebinds english_output_vocab and
# hindi_input_vocab, so the two vocabulary-building cells can be skipped on
# later runs once the pickles exist.
#
# Unpickling needs the Lang and Hindi_lang classes to be defined already, and
# pickle.load can execute arbitrary code embedded in the file -- only load
# pickles that you produced yourself.
with open('/content/drive/MyDrive/wiki.hi/english_output_vocab.pkl', 'rb') as f: 
  english_output_vocab = pickle.load(f)

with open('/content/drive/MyDrive/wiki.hi/hindi_input_vocab.pkl', 'rb') as f: 
  hindi_input_vocab = pickle.load(f)

- preparing the data in the form of list of tuple to feed into the model

In [None]:
# Pair each Hindi source sentence with its English translation as a
# (source, target) tuple.
#
# The two lists must be aligned index-by-index. Check that explicitly: an
# index loop over one list would raise IndexError, or silently drop trailing
# sentences, if the lists ever differed in length.
assert len(input_sent_list) == len(output_sent_list), (
  f"source/target size mismatch: {len(input_sent_list)} vs {len(output_sent_list)}"
)

training_data = list(zip(input_sent_list, output_sent_list))

print(len(training_data))
print(training_data[0])

140000
('और अपनी रहमत से हमें इन काफ़िर लोगों (के नीचे) से नजात दे', "and deliver us by Thy mercy from the people of the unbelievers. '")


- trying to train the model

In [None]:
# Hyperparameters for the training run below.
# hidden_size is passed to both EncoderGRU and DecoderGRU, where it sets the
# embedding width as well as the GRU hidden-state width.
hidden_size = 100
# Learning rate given to both SGD optimizers.
learning_rate = 0.01
# Number of full passes over training_data.
max_epochs = 10

In [None]:
# Instantiate the encoder and decoder, sized by the two vocabularies, and move
# them to the selected device.
encoder = EncoderGRU(len(hindi_input_vocab), hidden_size).to(device)
decoder = DecoderGRU(hidden_size, len(english_output_vocab)).to(device)

print(encoder)
print(decoder)

# The decoder returns raw (unnormalised) scores; CrossEntropyLoss applies
# log-softmax internally.
criterion = nn.CrossEntropyLoss()

# One plain SGD optimizer per module, sharing the same learning rate.
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

EncoderGRU(
  (embedding): Embedding(27938, 100)
  (gru): GRU(100, 100)
)
DecoderGRU(
  (embedding): Embedding(24260, 100)
  (gru): GRU(100, 100)
  (out): Linear(in_features=100, out_features=24260, bias=True)
)


In [None]:
def sentence_to_indices(vocab, sentence, append_eos=False):
  """Tokenize `sentence` with `vocab` and map each token to its index.

  Tokens missing from the vocabulary map to the reserved "<UNK>" index instead
  of raising KeyError. When `append_eos` is true the reserved "<EOS>" index is
  appended to the end of the sequence.
  """
  unk_index = vocab.word2index["<UNK>"]
  indices = [vocab.word2index.get(token, unk_index) for token in vocab.tokenize_sentence(sentence)]
  if append_eos:
    indices.append(vocab.word2index["<EOS>"])
  return indices


# Tokenization does not change between epochs, so encode the corpus once up
# front instead of re-tokenizing every sentence in every epoch.
# Targets end with <EOS> so the decoder is trained to emit an end-of-sentence
# token; this also guarantees that no target sequence is empty.
encoded_pairs = [
  (
    sentence_to_indices(hindi_input_vocab, pair[0]),
    sentence_to_indices(english_output_vocab, pair[1], append_eos=True),
  )
  for pair in tqdm(training_data)
]

for epoch in tqdm(range(max_epochs)):

  epoch_loss = 0

  for input_indices, target_indices in tqdm(encoded_pairs):

    # dtype is given explicitly so that an empty index list still yields an
    # integer tensor rather than torch's default float tensor.
    input_tensor = torch.tensor(input_indices, dtype=torch.long).to(device)
    target_tensor = torch.tensor(target_indices, dtype=torch.long).to(device)

    loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

    epoch_loss += loss

  # Mean of the per-sentence (per-token-averaged) losses over the epoch.
  print(f"epoch = {epoch}/{max_epochs}, LOSS = {epoch_loss/len(training_data)}")

  # Checkpoint after every epoch; each save overwrites the previous one.
  torch.save(encoder.state_dict(), '/content/drive/MyDrive/wiki.hi/encoder.params')
  torch.save(decoder.state_dict(), '/content/drive/MyDrive/wiki.hi/decoder.params')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/140000 [00:00<?, ?it/s]

epoch = 0/10, LOSS = 4.560584468340916


  0%|          | 0/140000 [00:00<?, ?it/s]