### Before running the notebook, create a directory named "Colab_Notebooks" in Google Drive
Then save the data files "train.csv" and "testhindistatements.csv" in that "Colab_Notebooks" directory. The output file "answer.txt", the ".pth" checkpoint files, and the other pre-processed files will be saved in the same "Colab_Notebooks" folder.

## Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive; every data/output path used later in
# this notebook lives under /content/drive/MyDrive/Colab_Notebooks.
from google.colab import drive
drive.mount('/content/drive')
# Disabled: pip command that would install the torch_xla wheel (TPU support).
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

## Installing indicnlp module for tokenizing hindi sentences

In [None]:
# Fetch the Indic NLP library and its resource files into /content and install
# Morfessor alongside them (notebook shell magics, not plain Python).
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
# Locations of the two checkouts created above.
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"
import sys
# The library is used straight from its checkout, so the checkout directory
# has to be on sys.path before `indicnlp` can be imported.
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
# The resources path must be registered before loader.load() is called.
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()

## Importing Python dependencies

In [None]:
import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"   # Used for debugging cuda device errors. It forces synchronus execution b/n host & device.
import csv
import numpy as np
import  re
import string
from unicodedata import normalize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

# For tokenizing english sentence
import spacy
# For tokenizing hindi sentence  
from indicnlp.tokenize import indic_tokenize

import random

## Setting up the device based on availability of GPU
The name of GPU assigned is printed after this cell

In [None]:
# Select the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
  # Report which GPU was assigned to this session.
  print('The GPU allocated is:',torch.cuda.get_device_name(0))
# device = xm.xla_device()

## Load default spacy english-core-web model

In [None]:
# Load the small English spaCy pipeline (only its tokenizer is used below).
# The bare 'en' shortcut is not available in spaCy 3, so the full package name
# is tried first; the legacy shortcut is kept as a fallback for spaCy 2.x
# environments where only the 'en' link exists.
try:
  spacy_english = spacy.load('en_core_web_sm')
except OSError:
  spacy_english = spacy.load('en')

## Function that returns the list of tokens in an English sentence using the spaCy tokenizer

In [None]:
def spacy_english_tokenizer(sentence):
  """Split an English sentence into a list of token strings.

  Only the tokenizer of the module-level `spacy_english` pipeline is run;
  the text of each produced token is collected in order.
  """
  return [tok.text for tok in spacy_english.tokenizer(sentence)]

## Function that returns list of tokens in a hindi sentence using indicnlp tokenizer

In [None]:
def indicnlp_hindi_tokenizer(sentence):
  """Return the tokens of `sentence` as produced by indicnlp's trivial tokenizer."""
  return indic_tokenize.trivial_tokenize(sentence)

## Testing english tokenizer on a sample sentence

In [None]:
# Sample English sentence for a quick tokenizer check.
sentence = 'hello world! Here i come'
# The spaCy call is disabled; the indicnlp tokenizer is what actually runs on
# the English sample here (the vocabulary cells below also tokenize English
# text with indicnlp_hindi_tokenizer).
# spacy_english_tokenizer(sentence)
indicnlp_hindi_tokenizer(sentence)

## Testing hindi tokenizer on a sample sentence

In [None]:
# Sample Hindi sentence (ends with a danda) for a quick tokenizer check; the
# token list is the cell's displayed value.
sentence = 'वहाँ पहुँचने में कितना समय लगेगा।'
indicnlp_hindi_tokenizer(sentence)

## Maximum length of sentences we will be considering

In [None]:
# Sentence-length limit shared by the whole notebook: it is the per-sentence
# word cap when filtering training pairs, the fixed length every index
# sequence is truncated/padded to, the size of the positional-embedding table
# (via max_len), and the decoding step limit at translation time.
maximum_length = 15

## Function that cleans Train data and saves the clean train data to a csv file

In [None]:
def clean_train_data(train_path='/content/drive/MyDrive/Colab_Notebooks/train.csv',
                     clean_path='/content/drive/MyDrive/Colab_Notebooks/clean_data.csv',
                     max_words=None):
  """Clean the raw parallel corpus and write the result to a CSV file.

  Each data row of `train_path` is expected to hold the Hindi sentence in
  column 1 and the English sentence in column 2. A pair is dropped when either
  side is empty, when one side is more than 1.5x the other in characters, or
  when either side has more than `max_words` whitespace-separated words.
  Surviving English text is reduced to ASCII, stripped of punctuation,
  lower-cased and limited to alphabetic words; surviving Hindi text has the
  musical symbols and ASCII punctuation removed. Both sides are wrapped in
  'sos' ... 'eos' and written to `clean_path` under a ['hindi', 'english']
  header.

  Args:
    train_path: raw training CSV (first row is a header and is skipped).
    clean_path: destination CSV for the cleaned pairs.
    max_words: per-sentence word limit; defaults to the notebook-level
      `maximum_length`.

  Returns:
    (average, maximum) number of whitespace-separated words in the raw Hindi
    column over all well-formed rows read (including rows later filtered
    out); (0, 0) when the file has no data rows.
  """
  if max_words is None:
    max_words = maximum_length
  table = str.maketrans('', '', string.punctuation)
  num_lines = 0
  total_words = 0
  maxi_length = 0
  # Explicit UTF-8 keeps the Devanagari text independent of the platform
  # default encoding; newline='' is what the csv module expects.
  with open(clean_path, 'w', encoding='utf-8', newline='') as clean_csv_file, \
       open(train_path, 'r', encoding='utf-8', newline='') as csv_file:
    csv_writer = csv.writer(clean_csv_file)
    csv_writer.writerow(['hindi','english'])
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header; tolerate a completely empty file
    for line in csv_reader:
      if len(line) < 3:
        continue  # malformed row without both sentence columns
      hindi_sen, english_sen = line[1], line[2]

      # Statistics are taken on the raw Hindi column, before any filtering.
      raw_word_count = len(hindi_sen.split())
      num_lines += 1
      total_words += raw_word_count
      maxi_length = max(maxi_length, raw_word_count)

      if len(hindi_sen) == 0 or len(english_sen) == 0:
        continue
      # Character lengths that differ by more than 1.5x suggest a bad pair.
      if len(hindi_sen) > 1.5 * len(english_sen) or len(english_sen) > 1.5 * len(hindi_sen):
        continue
      if len(english_sen.split()) > max_words or raw_word_count > max_words:
        continue

      # English: NFD-decompose, drop everything non-ASCII, strip punctuation,
      # lower-case, and keep purely alphabetic words only.
      english_ascii = normalize('NFD', english_sen.strip()).encode('ascii', 'ignore').decode('UTF-8')
      english_words = [word.translate(table).lower() for word in english_ascii.split()]
      english_words = [word for word in english_words if word.isalpha()]
      english_clean = ' '.join(['sos'] + english_words + ['eos'])

      # Hindi: str.replace returns a new string, so its result must be kept
      # for the musical symbols to actually be removed.
      hindi_text = hindi_sen.strip().replace('♪', '').replace('♫', '')
      hindi_text = normalize('NFD', hindi_text)
      hindi_words = [word.translate(table).lower() for word in hindi_text.split()]
      # Punctuation-only tokens become empty after translate(); drop them.
      hindi_words = [word for word in hindi_words if word]
      hindi_clean = ' '.join(['sos'] + hindi_words + ['eos'])

      csv_writer.writerow([hindi_clean, english_clean])

  average = total_words // num_lines if num_lines else 0
  return average, maxi_length

## Cleaning the Training data and printing train data set statistics

In [None]:
# Run the cleaning pass once and report the Hindi sentence-length statistics
# it returns.
train_hindi_sen_avg_length, train_hindi_sen_max_length = clean_train_data()
print(f'traindata_hindi_sentence_avg_length: {train_hindi_sen_avg_length}')
print(f'traindata_hindi_sentence_max_length: {train_hindi_sen_max_length}')

## Function that cleans test data and saves it to csv file

In [None]:
def clean_test_data(test_path='/content/drive/MyDrive/Colab_Notebooks/testhindistatements.csv',
                    clean_path='/content/drive/MyDrive/Colab_Notebooks/clean_test.csv'):
  """Clean the Hindi test sentences and write them to a one-column CSV.

  Each data row of `test_path` is expected to hold the Hindi sentence in
  column 2. Musical symbols and ASCII punctuation are removed, the text is
  NFD-normalised, and the remaining words are written to `clean_path` under
  a ['hindi'] header. No 'sos'/'eos' markers are added here.

  Args:
    test_path: raw test CSV (first row is a header and is skipped).
    clean_path: destination CSV for the cleaned sentences.

  Returns:
    (average, maximum) number of whitespace-separated words per sentence,
    measured after symbol removal but before punctuation stripping; (0, 0)
    when the file has no data rows.
  """
  table = str.maketrans('', '', string.punctuation)
  num_lines = 0
  total_words = 0
  maxi_length = 0
  # Explicit UTF-8 keeps the Devanagari text independent of the platform
  # default encoding; newline='' is what the csv module expects.
  with open(clean_path, 'w', encoding='utf-8', newline='') as clean_csv_file, \
       open(test_path, 'r', encoding='utf-8', newline='') as csv_file:
    csv_writer = csv.writer(clean_csv_file)
    csv_writer.writerow(['hindi'])
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header; tolerate a completely empty file
    for line in csv_reader:
      if len(line) < 3:
        continue  # malformed row without the sentence column
      num_lines += 1
      # str.replace returns a new string, so its result must be kept for the
      # musical symbols to actually be removed.
      hindi_text = line[2].strip().replace('♪', '').replace('♫', '')
      hindi_text = normalize('NFD', hindi_text)
      words = hindi_text.split()
      total_words += len(words)
      maxi_length = max(maxi_length, len(words))
      words = [word.translate(table).lower() for word in words]
      # Punctuation-only tokens become empty after translate(); drop them.
      words = [word for word in words if word]
      csv_writer.writerow([' '.join(words)])

  average = total_words // num_lines if num_lines else 0
  return average, maxi_length

## Cleaning the Test data and printing test data statistics

In [None]:
# Clean the test sentences and report the length statistics that are returned.
avg_hindi_sen_test_len, max_hindi_sen_test_len = clean_test_data()
print(f'avg_hindi_sen_test_len: {avg_hindi_sen_test_len}')
print(f'max_hindi_sen_test_len: {max_hindi_sen_test_len}')

## Creating dictionaries for hindi and english vocabulary

In [None]:
# Seed both vocabularies with the three reserved tokens at indices 0, 1 and 2.
# Four independent dict objects are created so that growing one vocabulary
# never touches another.
hindi_tok_to_ind_dict = dict(zip(('pad', 'sos', 'eos'), range(3)))     # Hindi token -> index
hindi_ind_to_tok_dict = dict(enumerate(('pad', 'sos', 'eos')))         # Hindi index -> token
english_tok_to_ind_dict = dict(zip(('pad', 'sos', 'eos'), range(3)))   # English token -> index
english_ind_to_tok_dict = dict(enumerate(('pad', 'sos', 'eos')))       # English index -> token

## Creating Hindi vocabulary

In [None]:
def generate_hindi_vocab():
  """Extend the Hindi vocabulary dictionaries from the cleaned training data.

  Reads column 0 of clean_data.csv, tokenizes every sentence with
  indicnlp_hindi_tokenizer, and assigns consecutive indices (starting at 3,
  after the reserved tokens) to tokens in first-seen order. Both
  hindi_tok_to_ind_dict and hindi_ind_to_tok_dict are updated in place.
  """
  next_index = 3
  with open('/content/drive/MyDrive/Colab_Notebooks/clean_data.csv','r') as csv_file:
    rows = csv.reader(csv_file, delimiter=',')
    next(rows)  # header row
    for row in rows:
      for tok in indicnlp_hindi_tokenizer(row[0]):
        if tok in hindi_tok_to_ind_dict:
          continue
        hindi_tok_to_ind_dict[tok] = next_index
        hindi_ind_to_tok_dict[next_index] = tok
        next_index += 1

In [None]:
# Build the Hindi vocabulary, then print the size of both mappings (the two
# numbers are expected to match).
generate_hindi_vocab()
print(f'size of hindi vocab: {len(hindi_ind_to_tok_dict)}')
print(len(hindi_tok_to_ind_dict))

## Generating English Vocabulary

In [None]:
def generate_english_vocab():
  """Extend the English vocabulary dictionaries from the cleaned training data.

  Reads column 1 of clean_data.csv and assigns consecutive indices (starting
  at 3, after the reserved tokens) to tokens in first-seen order. Both
  english_tok_to_ind_dict and english_ind_to_tok_dict are updated in place.
  Note that the English text is tokenized with indicnlp_hindi_tokenizer, the
  same tokenizer numeric_clean_data applies to it later.
  """
  next_index = 3
  with open('/content/drive/MyDrive/Colab_Notebooks/clean_data.csv','r') as csv_file:
    rows = csv.reader(csv_file, delimiter=',')
    next(rows)  # header row
    for row in rows:
      for tok in indicnlp_hindi_tokenizer(row[1]):
        if tok in english_tok_to_ind_dict:
          continue
        english_tok_to_ind_dict[tok] = next_index
        english_ind_to_tok_dict[next_index] = tok
        next_index += 1

In [None]:
# Build the English vocabulary, then print the size of both mappings (the two
# numbers are expected to match).
generate_english_vocab()
print(f'Size of English Vocabulary: {len(english_ind_to_tok_dict)}')
print(len(english_tok_to_ind_dict))

## Replacing tokens in train data with their corresponding indices from Vocab

In [None]:
def numeric_clean_data():
  """Convert the cleaned sentence pairs into fixed-length index sequences.

  Every row of clean_data.csv is tokenized (both columns with
  indicnlp_hindi_tokenizer), mapped through the matching token-to-index
  dictionary, then cut to `maximum_length` entries or right-padded with 0
  up to that length. Each pair is written to numeric_clean_data.csv and also
  collected in memory.

  Returns:
    A list of [hindi_indices, english_indices] pairs, one per input row.
  """
  def fixed_length_ids(sentence, vocab):
    # Look up every token, then truncate or zero-pad to exactly maximum_length.
    ids = [vocab[tok] for tok in indicnlp_hindi_tokenizer(sentence)]
    shortfall = maximum_length - len(ids)
    return ids[:maximum_length] + [0] * shortfall

  pairs = []
  with open('/content/drive/MyDrive/Colab_Notebooks/numeric_clean_data.csv','w') as numeric_csv_file:
    out = csv.writer(numeric_csv_file)
    out.writerow(['hindi','english'])
    with open('/content/drive/MyDrive/Colab_Notebooks/clean_data.csv','r') as csv_file:
      rows = csv.reader(csv_file, delimiter=',')
      next(rows)  # header row
      for row in rows:
        hindi_ids = fixed_length_ids(row[0], hindi_tok_to_ind_dict)
        english_ids = fixed_length_ids(row[1], english_tok_to_ind_dict)
        pairs.append([hindi_ids, english_ids])
        out.writerow([hindi_ids, english_ids])
  return pairs

In [None]:
# Index-encode the cleaned training pairs, then stack them into one tensor.
# Each pair holds two index lists of length maximum_length, so the nested
# list is rectangular.
clean_numeric = numeric_clean_data()
clean_np = torch.tensor(clean_numeric)

In [None]:
# Quick inspection: number of training pairs, then the English index
# sequence of the first pair (only the last expression is displayed).
clean_np.shape[0]
clean_np[0,1]

## Making iterable data set using Dataset for Dataloader

In [None]:
class hindi_english_train_dataset(Dataset):
  """Map-style dataset of (hindi_indices, english_indices) tensor pairs.

  Args:
    pairs: nested sequence laid out as [pair][language][position], language 0
      being Hindi and 1 English. When omitted, the notebook-level
      `clean_numeric` list is used, which is what a no-argument construction
      has always done.

  Raises:
    ValueError: if `pairs` does not form a 3-D array with exactly two
      sequences per pair.
  """
  def __init__(self, pairs=None):
    if pairs is None:
      pairs = clean_numeric
    data = torch.tensor(pairs)
    if data.dim() != 3 or data.shape[1] != 2:
      raise ValueError(
          'expected data shaped (num_pairs, 2, sequence_length), got '
          + str(tuple(data.shape)))
    self.num_samples = data.shape[0]
    self.hindi_sen = data[:,0]
    self.english_sen = data[:,1]

  def __getitem__(self,index):
      # Returns the Hindi and English index tensors of one pair.
      return self.hindi_sen[index], self.english_sen[index]

  def __len__(self):
      return self.num_samples

In [None]:
# Instantiate the dataset and peek at the Hindi index sequence of the
# second training pair as a sanity check.
train_data = hindi_english_train_dataset()
hindi_sen,english_sen = train_data[1]
print(hindi_sen)

## Setting Model Hyperparameters

In [None]:
# input size to encoder
hindi_one_hot_len = len(hindi_tok_to_ind_dict)
# input size and output size to decoder (both should be same as output of decoder is fed to decoder)
english_one_hot_len = len(english_tok_to_ind_dict)

num_heads = 8
num_encoder_layers = 2
num_decoder_layers = 2

# number of epochs to be trained
num_epochs = 70

# learning rate
learning_rate = 0.001

# batch size
batch_size = 128

#drop out rates for encoder & decoder
dropout = 0.1

max_len = maximum_length
forward_expansion = 4
pad_index = hindi_tok_to_ind_dict["pad"]

# embedding size
embedding_size = 512

## Creating iterator for train data

In [None]:
# Shuffled mini-batch iterator over the training pairs; each batch is a
# (hindi, english) pair of index tensors with the batch dimension first.
train_iterator = DataLoader(dataset=train_data,batch_size=batch_size,shuffle=True)

## Function that generates Sinusoidal Positional Embeddings

In [None]:
def create_sinusoidal_Embeddings(maximum_sequence_length,embedding_size):
    """Build a fixed sinusoidal position table on the notebook's device.

    Entry (position, dim) starts as position / 10000**(2*(dim//2)/embedding_size);
    even columns then take its sine and odd columns its cosine.

    Returns:
        Float tensor shaped (maximum_sequence_length, embedding_size).
    """
    # One divisor per embedding dimension, shared by every position.
    divisors = [np.power(10000, 2 * (dim // 2) / embedding_size) for dim in range(embedding_size)]
    angles = np.array([[position / divisor for divisor in divisors]
                       for position in range(maximum_sequence_length)])

    even_cols = slice(0, None, 2)
    odd_cols = slice(1, None, 2)
    table = torch.zeros(angles.shape)
    table[:, even_cols] = torch.FloatTensor(np.sin(angles[:, even_cols]))
    table[:, odd_cols] = torch.FloatTensor(np.cos(angles[:, odd_cols]))
    return table.to(device)

## Module that adds learned positional embeddings (nn.Embedding) to the token embeddings

In [None]:
class create_Embeddings(nn.Module):
  """Token embedding plus a learned positional embedding.

  Two lookup tables are held: one indexed by vocabulary id and one indexed
  by position id; `forward` returns their element-wise sum.
  """
  def __init__(self,vocab_size,embedding_size,maximum_sequence_length):
    super().__init__()
    self.embedding_size = embedding_size
    self.token_embedding = nn.Embedding(vocab_size,embedding_size)
    self.position_embedding = nn.Embedding(maximum_sequence_length,embedding_size)

  def forward(self,source,source_sequence_ids):
    # Learned positions are used. A disabled alternative would instead add
    # the fixed table from create_sinusoidal_Embeddings to the token vectors.
    token_vectors = self.token_embedding(source)
    position_vectors = self.position_embedding(source_sequence_ids)
    return token_vectors + position_vectors

## Function that generates Target Mask

In [None]:
def generate_target_mask(target_sequence_length):
  """Return the square causal mask for the decoder, on the notebook's device.

  Positions on or below the diagonal hold 0.0 and positions above it hold
  -inf, so a target position cannot attend to later positions.
  """
  square = torch.ones(target_sequence_length, target_sequence_length)
  visible = torch.tril(square) == 1
  additive = visible.float()
  additive = additive.masked_fill(~visible, float('-inf'))
  additive = additive.masked_fill(visible, float(0.0))
  return additive.to(device)


## Function that generates source mask

In [None]:
def generate_source_mask(hindi_batch,pad_index):
  """Return a boolean padding mask for the source batch, on the notebook's device.

  The first two dimensions of `hindi_batch` are swapped and every entry equal
  to `pad_index` is marked True.
  """
  swapped = hindi_batch.transpose(0, 1)
  is_padding = swapped == pad_index
  return is_padding.to(device)

## Transformer Model

In [None]:
class Transformer_NMT(nn.Module):
    """Hindi-to-English translation model built around nn.Transformer.

    Source and target indices are embedded (token + learned position), passed
    through nn.Transformer with a source padding mask and a causal target
    mask, and projected onto the English vocabulary.

    Args:
        hindi_vocab_size: number of entries in the source embedding table.
        english_vocab_size: number of entries in the target embedding table
            and width of the output projection.
        embedding_size: model width (d_model).
        pad_index: source index treated as padding when building the
            encoder key-padding mask.
        num_heads: attention heads.
        num_encoder_layers: encoder depth.
        num_decoder_layers: decoder depth.
        forward_expansion: passed to nn.Transformer as dim_feedforward.
        dropout_p: dropout probability.
        maximum_sequence_length: size of the positional-embedding tables;
            longer inputs cannot be embedded.
    """
    def __init__(self, hindi_vocab_size, english_vocab_size,embedding_size,pad_index,num_heads,num_encoder_layers,
                 num_decoder_layers,forward_expansion,dropout_p,maximum_sequence_length):
        super(Transformer_NMT, self).__init__()
        # Keep the padding index handed to the constructor so forward() uses
        # it instead of silently reading a module-level global.
        self.pad_index = pad_index
        self.hindi_embedding = create_Embeddings(hindi_vocab_size, embedding_size,maximum_sequence_length)
        self.english_embedding = create_Embeddings(english_vocab_size, embedding_size,maximum_sequence_length)
        # NOTE(review): forward_expansion is used directly as dim_feedforward
        # (the absolute feed-forward width), not as a multiple of
        # embedding_size - confirm this is intended before changing it.
        self.transformer = nn.Transformer(d_model=embedding_size,nhead=num_heads,num_encoder_layers=num_encoder_layers,num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=forward_expansion,dropout=dropout_p) #activation='relu'  activation='gelu'

        self.fc_out = nn.Linear(embedding_size, english_vocab_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, hindi_batch, english_batch):
        """Return vocabulary logits for every target position.

        Both inputs are index tensors laid out as (sequence_length, batch).
        The result has one vector of English-vocabulary logits per target
        position and batch element.
        """
        hindi_sequence_length, N = hindi_batch.shape
        english_sequence_length, N = english_batch.shape
        # Position ids 0..length-1, repeated for every batch element.
        hindi_sequence_positions = torch.arange(0, hindi_sequence_length).unsqueeze(1).expand(hindi_sequence_length, N).to(device)
        english_sequence_positions = torch.arange(0, english_sequence_length).unsqueeze(1).expand(english_sequence_length, N).to(device)

        hindi_batch_embedding = self.hindi_embedding(hindi_batch,hindi_sequence_positions)
        english_batch_embedding = self.english_embedding(english_batch,english_sequence_positions)

        try:
            source_pad_index = self.pad_index
        except AttributeError:
            # Instances unpickled from checkpoints saved before pad_index was
            # stored on the model fall back to the notebook-level value.
            source_pad_index = pad_index

        hindi_sen_pad_mask = generate_source_mask(hindi_batch,source_pad_index).to(device)
        english_sentence_mask = generate_target_mask(english_sequence_length).to(device)

        prediction = self.transformer(hindi_batch_embedding,english_batch_embedding,src_key_padding_mask=hindi_sen_pad_mask,tgt_mask=english_sentence_mask)
        prediction = self.fc_out(prediction)
        return prediction

## Model Weight Initialization

In [None]:
def init_weights(model):
  """Overwrite every parameter of `model` in place with samples from U(-1, 1)."""
  for _, weights in model.named_parameters():
    nn.init.uniform_(weights.data, a=-1, b=1)

## Creating the Model

In [None]:

# Build a freshly initialised model from the hyperparameters above and move
# it to the selected device.
model = Transformer_NMT(hindi_one_hot_len,english_one_hot_len,embedding_size,pad_index,num_heads,num_encoder_layers,
                        num_decoder_layers,forward_expansion,dropout,max_len).to(device)

# Disabled alternatives: resume from a whole-model checkpoint saved earlier.
# model = torch.load('/content/drive/MyDrive/Colab_Notebooks/after35epochs_final.pth')
# model = torch.load('/content/drive/MyDrive/Colab_Notebooks/checkpoint_eachepoch_W5_1.pth')
# model = torch.load('/content/drive/MyDrive/Colab_Notebooks/after70epochs_final.pth')

# Cross-entropy over the English vocabulary; targets equal to pad_index do
# not contribute to the loss.
criterian = nn.CrossEntropyLoss(ignore_index=pad_index)
# Disabled: custom uniform re-initialisation of all weights.
# init_weights(model)

## Using Adam Optimizer and Scheduler

In [None]:
# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(model.parameters(),lr = learning_rate)
# Step scheduler (step size 1.0, decay factor 0.95). It is created here, but
# the only scheduler.step() call in the training cell is commented out, so
# the learning rate stays constant unless that call is re-enabled.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

## Clipping gradient function to avoid exploding gradient problem

In [None]:
def clip_gradients(model):
  """Rescale the gradients of `model` in place so their total norm is at most 1."""
  params = model.parameters()
  torch.nn.utils.clip_grad_norm_(parameters=params, max_norm=1)

## Function to insert start and end tokens at the beginning and end of a token list

In [None]:
def insert_sos_eos(tokens):
  """Wrap `tokens` in 'sos' ... 'eos' markers.

  The list is modified in place and the same list object is returned.
  """
  tokens[:0] = ['sos']
  tokens += ['eos']
  return tokens

## Function that outputs indices for a list of tokens from the Hindi vocabulary

In [None]:
def token_to_index(tokens):
  """Map Hindi tokens to their vocabulary indices.

  Tokens missing from hindi_tok_to_ind_dict are silently skipped, so the
  result can be shorter than the input.
  """
  return [hindi_tok_to_ind_dict[tok] for tok in tokens if tok in hindi_tok_to_ind_dict]

## Function that outputs tokens for a list of indices from the English vocabulary

In [None]:
def index_to_token(indices):
  """Map English vocabulary indices back to their tokens, preserving order.

  Raises KeyError for an index that is absent from english_ind_to_tok_dict.
  """
  return [english_ind_to_tok_dict[idx] for idx in indices]

## Function that translates hindi test sentences to English sentences

In [None]:
def hindi_to_english(model, text,device, max_length=15):
    """Greedily translate one Hindi sentence into a list of English tokens.

    The sentence is tokenized, wrapped in 'sos'/'eos', converted to indices
    (tokens outside the Hindi vocabulary are dropped by token_to_index) and
    cut to `max_length` entries. The decoder starts from 'sos' and repeatedly
    takes the highest-scoring next token until 'eos' is produced or
    `max_length` tokens have been generated.

    Args:
        model: translation model called as model(source, target_so_far),
            both laid out as (sequence_length, 1).
        text: Hindi sentence (already cleaned).
        device: device the input tensors are moved to.
        max_length: limit applied both to the source length and to the
            number of decoding steps.

    Returns:
        The generated English tokens without the leading 'sos' and without a
        trailing 'eos'; empty when nothing was generated.
    """
    tokens = insert_sos_eos(indicnlp_hindi_tokenizer(text))
    # Honour the max_length argument here as well, rather than a module-level
    # constant, so the whole function is governed by one limit.
    source_indices = token_to_index(tokens)[:max_length]
    hindi_batch = torch.LongTensor(source_indices).unsqueeze(1).to(device)

    eos_index = english_tok_to_ind_dict["eos"]
    decoder_output = [english_tok_to_ind_dict["sos"]]

    for _ in range(max_length):
        last_inp_decoder = torch.LongTensor(decoder_output).unsqueeze(1).to(device)

        with torch.no_grad():
            current_output = model(hindi_batch,last_inp_decoder)

        # Prediction for the last target position; computed once per step.
        next_index = current_output.argmax(2)[-1,:].item()
        decoder_output.append(next_index)
        if next_index == eos_index:
            break

    # Skip the initial 'sos'; guard against an empty result before peeking
    # at the last token.
    english_sentence = index_to_token(decoder_output[1:])
    if english_sentence and english_sentence[-1]=='eos':
        english_sentence.pop()
    return english_sentence

## Training and Checking the loss on Train data set

In [None]:
# Training: teacher-forced next-token prediction over the training pairs.
training_loss = []
sentence = 'वहाँ पहुँचने में कितना समय लगेगा'
min_loss = 10000
for epoch in range(num_epochs):
  batch_num = 1
  # torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/checkpoint_eachepoch.pth')
  model.train(True)
  training_loss = []  # per-batch losses of the current epoch, as plain floats
  for batch_index,(hindi_batch,english_batch) in enumerate(train_iterator):
    # The loader yields batch-first tensors; the model expects sequence-first.
    hindi_batch = torch.transpose(hindi_batch, 0, 1).to(device)
    english_batch = torch.transpose(english_batch, 0, 1).to(device)

    optimizer.zero_grad()
    # Decoder input is the target without its last position; the expected
    # output is the target shifted left by one.
    output = model(hindi_batch,english_batch[:-1, :])
    output = output.reshape(-1,output.shape[2])
    english_batch = english_batch[1:].reshape(-1)  # flatten targets to a vector

    loss = criterian(output,english_batch)
    # Keep only the Python float: storing the loss tensor itself would keep
    # every batch's autograd graph alive until the end of the epoch.
    batch_loss = loss.item()
    training_loss.append(batch_loss)
    if batch_loss < min_loss:
      min_loss = batch_loss
      # torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/checkpoint_with_leastloss.pth')
    loss.backward()
    clip_gradients(model)
    optimizer.step()
    # scheduler.step()
    batch_num+=1

  T_loss = sum(training_loss)
  print(f'Epoch [{epoch+1}/{num_epochs}], training loss: {T_loss/len(train_iterator)}')
  # if loss < 0.0005:
    # break

In [None]:
# torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/after70epochs_final.pth')

In [None]:
# model = torch.load('/content/drive/MyDrive/Colab_Notebooks/checkpoint_eachepoch_W4.pth')

## Opening a text file to store translated sentences for test data

In [None]:
# Output file for the translated test sentences (one per line). The handle
# stays open across cells; the translation cell below writes to it and
# closes it.
f = open('/content/drive/MyDrive/Colab_Notebooks/answer.txt',"w")

## Translating the test data

In [None]:
# Translate every cleaned test sentence and write one translation per line.
model.eval()
i = 1
try:
  with open('/content/drive/MyDrive/Colab_Notebooks/clean_test.csv','r',encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)  # header row
    for line in csv_reader:
      sentence = line[0]
      english_sentence = hindi_to_english(model,sentence,device,max_length=maximum_length)
      output_sen = ' '.join(map(str, english_sentence))
      f.write(output_sen + '\n')
      print(i)  # progress: number of sentences translated so far
      i=i+1
finally:
  # Close the answer file even if a translation fails part-way, so the lines
  # already produced are flushed to disk and the handle is not leaked.
  f.close()