# CS779 Machine Translation

- hindi to english 

- trying to process a batch at once

## Importing libraries

In [2]:
!pip install indic-nlp-library --quiet
!python -m spacy download en_core_web_sm --quiet

[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import torch
import torch.nn as nn 
import torch.optim as optim 
import numpy as np
import torch.nn.functional as F
import spacy
import os
from tqdm.notebook import tqdm
import sys
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize import indic_normalize
import pickle
import random
from torch.utils.data import Dataset, DataLoader

In [35]:
import pandas as pd

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device = {device}")

device = cuda


In [5]:
with open('/kaggle/input/cs779-mt-hindi-2-english/inp_sent.pkl', 'rb') as f: 
  output_sent_list = pickle.load(f) # output is english

with open('/kaggle/input/cs779-mt-hindi-2-english/out_sent.pkl', 'rb') as f: 
  input_sent_list = pickle.load(f) # input is hindi

print(type(input_sent_list))
print(len(input_sent_list))

print(type(output_sent_list))
print(len(output_sent_list))

<class 'list'>
140000
<class 'list'>
140000


## Vocab classes

In [6]:
class Lang():

  def __init__(self, name, spacy_tokenizer):
    self.name = name
    self.word2index = {"<SOS>":0, '<EOS>': 1, "<UNK>": 2, '<PAD>': 3}
    self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<UNK>", 3: '<PAD>'}
    self.word2count = {}
    self.n_words = 4
    self.tokenizer = spacy_tokenizer

  def add_word(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word
      self.n_words += 1

    else:
      self.word2count[word] += 1

  def add_sentence(self, sentence):
    tokens = self.tokenize_sentence(sentence)
    for token in tokens: 
      self.add_word(token)

  def tokenize_sentence(self, sentence):
    tokens = [token.text for token in self.tokenizer(sentence.lower())]
    return tokens

  def __len__(self):
    return self.n_words

In [7]:
class Hindi_lang():

  def __init__(self, name):
    self.name = name
    self.word2index = {"<SOS>":0, '<EOS>': 1, "<UNK>": 2, '<PAD>': 3}
    self.index2word = {0: "<SOS>", 1: "<EOS>", 2: "<UNK>", 3: '<PAD>'}
    self.word2count = {}
    self.n_words = 4
    self.normalizer = indic_normalize.DevanagariNormalizer(lang='hi', remove_nuktas=True)

  def add_word(self, word):
    if word not in self.word2index:
      self.word2index[word] = self.n_words
      self.word2count[word] = 1
      self.index2word[self.n_words] = word
      self.n_words += 1

    else:
      self.word2count[word] += 1

  def add_sentence(self, sentence):
    tokens = self.tokenize_sentence(sentence)
    for token in tokens: 
      self.add_word(token)

  def tokenize_sentence(self, sentence):
    # first normalize the sentence, then tokenize
    norm_sent = self.normalizer.normalize(sentence)
    tokens = indic_tokenize.trivial_tokenize(norm_sent)
    return tokens

  def __len__(self):
    return self.n_words

## Building, saving and loading the vocabs

In [10]:
nlp_english = spacy.load("en_core_web_sm")
english_output_vocab = Lang("english", nlp_english)

for my_sent in tqdm(output_sent_list):
  english_output_vocab.add_sentence(my_sent)

print(len(english_output_vocab))

#saving the english vocab

with open('/kaggle/working/english_output_vocab.pkl', 'wb') as f: 
  pickle.dump(english_output_vocab, f)

  0%|          | 0/140000 [00:00<?, ?it/s]

24261


In [9]:
hindi_input_vocab = Hindi_lang("hindi")

for my_sent in tqdm(input_sent_list):
  hindi_input_vocab.add_sentence(my_sent)

print(len(hindi_input_vocab))

with open('/kaggle/working/hindi_input_vocab.pkl', 'wb') as f: 
  pickle.dump(hindi_input_vocab, f)

  0%|          | 0/140000 [00:00<?, ?it/s]

27939


In [8]:
with open('/kaggle/input/somecs779/english_output_vocab.pkl', 'rb') as f:
    english_output_vocab = pickle.load(f)
    
with open('/kaggle/input/somecs779/hindi_input_vocab.pkl', 'rb') as f:
    hindi_input_vocab = pickle.load(f)

In [9]:
SOS_TOKEN_INDEX = english_output_vocab.word2index['<SOS>']
print(SOS_TOKEN_INDEX)

EOS_TOKEN_INDEX = english_output_vocab.word2index['<EOS>']
print(EOS_TOKEN_INDEX)

0
1


## Preparing the data and making the Dataloader

In [10]:
MAX_LENGTH = 64 # this will be length of each sentence
BATCH_SIZE = 128

In [11]:
training_data = []

for i in range(len(input_sent_list)):
  pair = (input_sent_list[i], output_sent_list[i])
  training_data.append(pair)

print(len(training_data))
print(training_data[0])

140000
('और अपनी रहमत से हमें इन काफ़िर लोगों (के नीचे) से नजात दे', "and deliver us by Thy mercy from the people of the unbelievers. '")


In [12]:
def tensorFromSentence(lang_vocab, sentence, max_length=MAX_LENGTH):

  tokens = lang_vocab.tokenize_sentence(sentence)
  indexes = [lang_vocab.word2index[token] for token in tokens]
  indexes.append(EOS_TOKEN_INDEX)

  pad_index = lang_vocab.word2index['<PAD>']

  if len(indexes) < max_length:
    indexes += [pad_index] * (max_length - len(indexes))
  else:
    indexes = indexes[:max_length]
    indexes[-1] = EOS_TOKEN_INDEX

  return torch.tensor(indexes, dtype=torch.long)
  # (max_length,)

In [13]:
class Seq2SeqDataset(Dataset):
  
  def __init__(self, pairs):
    self.pairs = pairs

  def __len__(self):
    return len(self.pairs)

  def __getitem__(self, index):
    input_sentence, target_sentence = self.pairs[index]
    input_tensor = tensorFromSentence(hindi_input_vocab, input_sentence)
    target_tensor = tensorFromSentence(english_output_vocab, target_sentence)
    return input_tensor, target_tensor

In [14]:
dataset = Seq2SeqDataset(training_data)

In [15]:
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
print(len(dataloader))

1094


## Classes and functions required

In [16]:
class EncoderGRU(nn.Module):

  def __init__(self, input_size, hidden_size, num_layers=1):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.embedding = nn.Embedding(input_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size, num_layers)

  def forward(self, input_seqs, hidden):

    # input_seqs.shape : (batch_size, )

    embedded = self.embedding(input_seqs.unsqueeze(0))
    # embedded.shape = (1, batch_size, hidden_size)

    output, hidden = self.gru(embedded, hidden)
    # output.shape = (1, batch_size, hidden_size) , (seq_len, batch_size, hidden_size)
    # hidde.shape = (num_layers, batch_size, hidden_size)

    return output, hidden 

  def init_hidden(self, batch_size):
    hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size) # (num_layers, batch_size, hidden_dim)
    return hidden

In [17]:
class DecoderGRU(nn.Module):

  def __init__(self, hidden_size, output_size, num_layers=1):
    super().__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.num_layers = num_layers
    self.embedding = nn.Embedding(output_size, hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size, num_layers)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, input, hidden):
    # input shape = (batch_size, )
    output = self.embedding(input.unsqueeze(0)) # (1, batch_size, hidden_size)
    output = F.relu(output)
    output, hidden = self.gru(output, hidden) 
    # output : (1, batch_size, hidden_size)
    # hidden : (num_layers, batch_size, hidden_size)
    output = self.out(output[0]) # (batch_size, output_size)
    return output, hidden

  def init_hidden(self, batch_size):
    hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size) # (num_layers, batch_size, hidden_size)
    return hidden

In [18]:
def train(input_tensors: torch.Tensor,
          target_tensors: torch.Tensor,
          encoder: EncoderGRU,
          decoder: DecoderGRU,
          encoder_optimizer, decoder_optimizer, criterion, teacher_forcing_ratio=0.5):
  
  # input_tensors.shape = (batch_size, max_length)
  # Target_tensors.shape = (batch_size, max_length)

  batch_size = input_tensors.size(0)

  input_tensors = input_tensors.transpose(0, 1).to(device) # transpose to (max_len, batch_size), then push to device
  target_tensors = target_tensors.transpose(0, 1).to(device) # transpose to (max_len, batch_size), then push to device

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  encoder_hidden = encoder.init_hidden(batch_size).to(device)
  encoder_outputs = torch.zeros(MAX_LENGTH, batch_size, encoder.hidden_size).to(device)

  loss = 0

  for ei in range(MAX_LENGTH):
    encoder_output, encoder_hidden = encoder(input_tensors[ei], encoder_hidden) 
    # encoder is taking input of shape of (batch_size, )
    encoder_outputs[ei] = encoder_output[0, :, :]


  decoder_input = torch.tensor([SOS_TOKEN_INDEX] * batch_size, device=device, dtype=torch.long)
  decoder_hidden = encoder_hidden

  use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

  if use_teacher_forcing:

    # feed the target as the next input
    for di in range(MAX_LENGTH): 
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      loss += criterion(decoder_output, target_tensors[di])
      decoder_input = target_tensors[di]

  else:

    # without teacher forcing: use it's own predictions as input in the next step
    for di in range(MAX_LENGTH):
      decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
      # decoder_output : (batch_size, output_size)
      topv, topi = decoder_output.topk(k=1) 

      # topv, topi : (batch_size, 1)

      decoder_input = topi.detach().squeeze(-1) # decoder needs 1d input of shape (batch_size, )
      loss += criterion(decoder_output, target_tensors[di])

      if (decoder_input == EOS_TOKEN_INDEX).all():
        break

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  ans = loss.detach().cpu().item()/MAX_LENGTH
  return ans

## Actual code

In [19]:
hidden_size = 100
learning_rate = 0.001
max_epochs = 10
num_layers = 6

In [21]:
encoder = EncoderGRU(len(hindi_input_vocab), hidden_size, num_layers).to(device)
decoder = DecoderGRU(hidden_size, len(english_output_vocab), num_layers).to(device)

encoder.load_state_dict(torch.load('/kaggle/input/somecs779/encoder.params'))
decoder.load_state_dict(torch.load('/kaggle/input/somecs779/decoder.params'))

print(encoder)
print(decoder)

criterion = nn.CrossEntropyLoss()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

EncoderGRU(
  (embedding): Embedding(27939, 100)
  (gru): GRU(100, 100, num_layers=6)
)
DecoderGRU(
  (embedding): Embedding(24261, 100)
  (gru): GRU(100, 100, num_layers=6)
  (out): Linear(in_features=100, out_features=24261, bias=True)
)


In [22]:
for epoch in tqdm(range(max_epochs)):

  epoch_loss = 0

  for batch in tqdm(dataloader):

    input_tensors = batch[0].to(device)
    target_tensors = batch[1].to(device)

    loss = train(input_tensors, target_tensors, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

    epoch_loss += loss

  print(f"epoch = {epoch}/{max_epochs}, LOSS = {epoch_loss/len(dataloader)}")

  with open('/kaggle/working/training_loss.txt', "a") as f: 
        my_dict = {"epoch": epoch, "max_epochs": max_epochs, "epoch_loss": epoch_loss/len(dataloader)}
        f.write(f"{my_dict}\n")

  torch.save(encoder.state_dict(), '/kaggle/working/encoder.params')
  torch.save(decoder.state_dict(), '/kaggle/working/decoder.params')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 0/10, LOSS = 1.7596835223804663


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 1/10, LOSS = 1.659582788809126


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 2/10, LOSS = 1.605336882732468


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 3/10, LOSS = 1.5629786416425129


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 4/10, LOSS = 1.5289457392431047


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 5/10, LOSS = 1.4780008552606188


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 6/10, LOSS = 1.4660549174170172


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 7/10, LOSS = 1.4271439889231374


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 8/10, LOSS = 1.4103795755824833


  0%|          | 0/1094 [00:00<?, ?it/s]

epoch = 9/10, LOSS = 1.3730219078456245


# Evaluation

In [39]:
encoder.cpu()
encoder.eval()

decoder.eval()
decoder.cpu()

DecoderGRU(
  (embedding): Embedding(24261, 100)
  (gru): GRU(100, 100, num_layers=6)
  (out): Linear(in_features=100, out_features=24261, bias=True)
)

In [43]:
def tensorFromSentenceEval(lang_vocab, sent, max_length=MAX_LENGTH):
    
    tokens = lang_vocab.tokenize_sentence(sent)
    existing_tokens = lang_vocab.word2index.keys()
    
    indexes = []
    for token in tokens:
        if token in existing_tokens:
            index = lang_vocab.word2index[token] 
        else:
            index = lang_vocab.word2index['<UNK>']
        indexes.append(index)

    pad_index = lang_vocab.word2index['<PAD>']

    if len(indexes) < max_length:
        indexes += [pad_index] * (max_length - len(indexes))

    else:
        indexes = indexes[:max_length]

    return torch.tensor(indexes, dtype=torch.long)
    # (max_length,)

In [54]:
class Test_Set(Dataset):
    
    def __init__(self, test_sentences_list):
        self.sent_list = test_sentences_list
        
    def __len__(self):
        return len(self.sent_list)
    
    def __getitem__(self, index):
        input_sentence = self.sent_list[index]
        input_tensor = tensorFromSentenceEval(hindi_input_vocab, input_sentence)
        return input_tensor

In [70]:
def evaluate(encoder, decoder, input_tensors, max_length=MAX_LENGTH):
    with torch.no_grad():
        
        batch_size = input_tensors.size(1)    
        
        # Initialize the encoder hidden state
        encoder_hidden = encoder.init_hidden(batch_size)
        
        for i in range(max_length):
            _, encoder_hidden = encoder(input_tensors[i], encoder_hidden)

        # Initialize the decoder input as a tensor of SOS tokens
        decoder_input = torch.tensor([SOS_TOKEN_INDEX] * batch_size, dtype=torch.long)

        # Initialize the decoder hidden state with the final hidden state of the encoder
        decoder_hidden = encoder_hidden

        # Initialize the decoded output sequence as a list of empty tensors
        decoded_outputs = [torch.tensor([], dtype=torch.long)] * batch_size

        # Loop over each time step in the output sequence
        for timestep in range(max_length):
            # Pass the decoder input and hidden state through the decoder
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

            # Choose the token with the highest score as the next input to the decoder
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze(-1).detach()

            # Concatenate the decoded output tensor for this time step to the decoded_outputs list
            for i in range(batch_size):
                decoded_outputs[i] = torch.cat((decoded_outputs[i], topi[i].unsqueeze(0)))

        # Return the decoded output sequences as a tensor of shape (max_length, batch_size)
        return torch.stack(decoded_outputs).transpose(0, 1)


In [74]:
def IndexesToSent(indexes, output_vocab):
    
    words = []
    for idx in indexes:
        if idx == EOS_TOKEN_INDEX:
            break
        if idx != output_vocab.word2index['<PAD>']:
            words.append(output_vocab.index2word[idx])
            
    return ' '.join(words)

In [45]:
test_set = pd.read_csv("/kaggle/input/mt-dev-set/eng_Hindi_data_dev_X.csv", header=None)
print(test_set.shape)

(40000, 1)


In [37]:
test_set.head()

Unnamed: 0,0
0,और अनुसर्ण करो उस सर्वोत्तम चीज़ का जो तुम्हार...
1,एक क़ाफ़िला आया। फिर उसने पनिहारा को भेजा। उसन...
2,जो कोई सुचरित लेकर आया उसको उससे भी अच्छा प्रा...
3,raviratlami @aol. inEMAIL OF TRANSLATORS
4,घटना/कार्य/बैठक संपादक में RSVP क्षेत्र दिखायें


In [38]:
test_sentences = test_set.iloc[:, 0].tolist()
print(type(test_sentences))
print(len(test_sentences))

<class 'list'>
40000


In [72]:
test_dataset = Test_Set(test_sentences)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [56]:
batch = next(iter(test_loader))
print(type(batch))
print(batch.shape)

<class 'torch.Tensor'>
torch.Size([128, 64])


In [75]:
output_list = []

for batch in tqdm(test_loader):
    
    temp = evaluate(encoder, decoder, batch.transpose(0, 1)).squeeze()
    
    for sent in temp:
        index_list = sent.tolist()
        conv_sent = IndexesToSent(index_list, english_output_vocab)
        output_list.append(conv_sent)

In [76]:
print(output_list[:5])

['and and whoever % show and he these then then invalid evolution & save to disable say move he open is save % the and and the could say the light there " " st and say and error i or set unable can " toggles they or construct it and say click the _ unable failed the the you select and the the and can is allah audio how do but and click and they show and they so _ disc this then you move he the they _ and if and use in odessa and and then it % and start error say comma no they he the those kde they but and % use does and select error this except click it and show they', 'the the will 1 the is said are we they opaque invitations & project allah use : tab said in he the s is the many is not , revelation rain is allah i _ when : when server thou they here to not o whether they they a is the : here the s to to quot the can the we good display we not he has disc many you those i here when they the it have lord _ disc is he can tab said is call save the you those the the is they the he is s t

In [87]:
print(output_list[-1])




In [88]:
with open('/kaggle/working/answer.txt', "a") as f: 
    for out in output_list:
        f.write(f"{out.encode('utf-8')}\n")

# Random

In [58]:
a = [torch.tensor([], dtype=torch.long)] * 3
print(a)

[tensor([], dtype=torch.int64), tensor([], dtype=torch.int64), tensor([], dtype=torch.int64)]


In [27]:
a = {1: "hell", 2: "bye"}
b = 'gandu'

if b in a.values():
    print("yes")
else:
    print("no")

no


In [69]:
a = torch.rand(3, 4)

for i in a:
    print(i.tolist())

[0.22617703676223755, 0.4085897207260132, 0.9911841154098511, 0.33149629831314087]
[0.05995970964431763, 0.7917261123657227, 0.5551609992980957, 0.08077502250671387]
[0.11251026391983032, 0.07683908939361572, 0.26595205068588257, 0.06419819593429565]
