# For running the notebook, create a directory with name: "Colab_Notebooks" in Google Drive
# Then save the data files "train.csv" & "hindistatements.csv" in the Colab_Notebooks directory

## Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive so the data files stored under
# MyDrive/Colab_Notebooks (read and written by later cells) are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

## Installing torchtext used for pre-processing, loading, batching the data

In [None]:
# Pinned to 0.4: later cells import Field, TabularDataset and BucketIterator from torchtext.data.
!pip install torchtext==0.4

## Installing indicnlp module for tokenizing hindi sentences

In [None]:
# Fetch the indicnlp library and its resource files into /content, plus the Morfessor dependency.
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
# Locations of the two clones made above.
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"
import sys
# The library is used straight from the clone, so its directory must be on sys.path
# before `indicnlp` can be imported.
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
# Point indicnlp at the cloned resources directory.
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
# Initialise the library (must run after the resources path is set).
loader.load()

In [None]:
import csv
import pandas as pd
import numpy as np
import  re
import string
from unicodedata import normalize

import torch
# import torch_xla
# import torch_xla.core.xla_model as xm

import torch.nn as nn
import torch.optim as optim
# Field is used for pre-processing, TabularDataset for loading the data, and BucketIterator for building iterators that do batching & padding
from torchtext.data import Field, TabularDataset, BucketIterator


# For tokenizing english sentence
import spacy
# For tokenizing hindi sentence  
from indicnlp.tokenize import indic_tokenize
# For train, test split
from sklearn.model_selection import train_test_split
import random

## Setting up the device based on availability of a GPU

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = xm.xla_device()

## Function that cleans Train data and saves the clean train data to a csv file

In [None]:
def clean_train_data(input_path='/content/drive/MyDrive/Colab_Notebooks/train.csv',
                     output_path='/content/drive/MyDrive/Colab_Notebooks/clean_data.csv',
                     max_len=80, max_len_ratio=1.5):
  """Clean the parallel Hindi/English training corpus and write it to a CSV file.

  Steps, in order:
    1. Drop rows where either sentence is missing or empty, rows whose Hindi
       and English text are identical, and fully duplicated rows.
    2. Strip surrounding whitespace and record the whitespace-token count of
       each sentence in the `hindi_len` / `english_len` columns.
    3. Keep only pairs where both sides have fewer than `max_len` tokens and
       neither side is `max_len_ratio` (or more) times longer than the other.
    4. Normalise the text: English is NFD-normalised, reduced to ASCII,
       stripped of ASCII punctuation, lower-cased and limited to purely
       alphabetic words; Hindi is NFD-normalised, stripped of ASCII
       punctuation and lower-cased.

  Args:
    input_path: CSV file containing at least the columns `hindi` and `english`.
    output_path: destination CSV (written without the index).
    max_len: exclusive upper bound on the token count of each sentence.
    max_len_ratio: exclusive upper bound on the length ratio between the two sides.

  Returns:
    None. The cleaned frame is only written to `output_path`.
  """
  punctuation_table = str.maketrans('', '', string.punctuation)

  def _clean_english(text):
    # Decompose accented characters, then drop everything that is not ASCII.
    ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
    words = (word.translate(punctuation_table).lower() for word in ascii_text.split())
    # isalpha() also discards words that became empty after punctuation removal.
    return ' '.join(word for word in words if word.isalpha())

  def _clean_hindi(text):
    normalized = normalize('NFD', text).encode('UTF-8', 'ignore').decode('UTF-8')
    return ' '.join(word.translate(punctuation_table).lower() for word in normalized.split())

  raw_data_df = pd.read_csv(input_path, delimiter=',')

  # Assign the result back to the column: calling replace(..., inplace=True) on
  # `df[col]` operates on a temporary and is a no-op under pandas Copy-on-Write.
  for column in ('hindi', 'english'):
    raw_data_df[column] = raw_data_df[column].replace('', np.nan)
  raw_data_df = raw_data_df.dropna(subset=['hindi', 'english'])

  # Untranslated pairs (source identical to target) carry no training signal.
  raw_data_df = raw_data_df[raw_data_df['hindi'] != raw_data_df['english']]
  raw_data_df = raw_data_df.drop_duplicates(keep='first').copy()

  # Lengths are measured on the stripped but otherwise un-normalised text.
  for column in ('hindi', 'english'):
    raw_data_df[column] = raw_data_df[column].astype(str).str.strip()
    raw_data_df[column + '_len'] = raw_data_df[column].str.split().str.len()

  # Remove lines whose translations are not of comparable size.
  hindi_len = raw_data_df['hindi_len']
  english_len = raw_data_df['english_len']
  comparable = (
      (hindi_len < max_len) & (english_len < max_len)
      & (hindi_len < english_len * max_len_ratio)
      & (english_len < max_len_ratio * hindi_len)
  )
  raw_data_df = raw_data_df[comparable].copy()

  raw_data_df['english'] = raw_data_df['english'].map(_clean_english)
  raw_data_df['hindi'] = raw_data_df['hindi'].map(_clean_hindi)

  raw_data_df.to_csv(output_path, index=False)
  

## Function that cleans test data and saves it to csv file

In [None]:
def clean_test_data(input_path='/content/drive/MyDrive/Colab_Notebooks/hindistatements.csv',
                    output_path='/content/drive/MyDrive/Colab_Notebooks/test_clean_data.csv',
                    empty_placeholder='/ ?', placeholder_rows=(4710,)):
  """Clean the Hindi test sentences and write them to a CSV file.

  Each sentence is stripped, NFD-normalised, stripped of ASCII punctuation and
  lower-cased. The first two columns of the input file are then dropped and
  the result is written without the index.

  Args:
    input_path: CSV file with a `hindi` column that is not one of its first
      two columns (those are dropped by position).
    output_path: destination CSV.
    empty_placeholder: text stored for sentences that are empty after cleaning,
      so that no empty cell (read back as NaN) reaches the translation step.
    placeholder_rows: row labels that are always set to `empty_placeholder`.
      Labels that do not exist in the file are ignored.
      NOTE(review): the default reproduces the row that was hard-coded in the
      original notebook; presumably that sentence is empty after cleaning -
      confirm against hindistatements.csv.

  Returns:
    None. The cleaned frame is only written to `output_path`.
  """
  punctuation_table = str.maketrans('', '', string.punctuation)

  def _clean_hindi(text):
    normalized = normalize('NFD', text).encode('UTF-8', 'ignore').decode('UTF-8')
    return ' '.join(word.translate(punctuation_table).lower() for word in normalized.split())

  raw_data_df = pd.read_csv(input_path, delimiter=',')
  # Missing cells are treated as empty sentences instead of crashing normalize().
  raw_data_df['hindi'] = raw_data_df['hindi'].fillna('').astype(str).str.strip().map(_clean_hindi)

  # Only touch rows that exist: assigning through .at with an unknown label
  # would silently append a new row to the frame.
  needs_placeholder = (raw_data_df['hindi'] == '') | raw_data_df.index.isin(list(placeholder_rows))
  raw_data_df.loc[needs_placeholder, 'hindi'] = empty_placeholder

  raw_data_df = raw_data_df.drop(raw_data_df.columns[[0,1]], axis = 1)
  raw_data_df.to_csv(output_path, index=False)

## Cleaning the Training data

In [None]:
# Reads train.csv from Drive and writes clean_data.csv next to it.
clean_train_data()

## Cleaning the Test data

In [None]:
# Reads hindistatements.csv from Drive and writes test_clean_data.csv next to it.
clean_test_data()

## Load default spacy english-core-web model

In [None]:
# The 'en' shortcut link only exists in spaCy 2.x; spaCy 3 requires the full
# package name. Try the package name first and fall back to the shortcut so
# the cell works on either version.
try:
  spacy_english = spacy.load('en_core_web_sm')
except OSError:
  spacy_english = spacy.load('en')

## Function that returns the list of tokens in an English sentence using the spaCy tokenizer

In [None]:
def spacy_english_tokenizer(sentence):
  """Return the text of every token the spaCy English tokenizer finds in `sentence`."""
  return [token.text for token in spacy_english.tokenizer(sentence)]

## Function that returns list of tokens in a hindi sentence using indicnlp tokenizer

In [None]:
def indicnlp_hindi_tokenizer(sentence):
  """Return the tokens of a Hindi `sentence` as produced by indicnlp's trivial tokenizer."""
  return indic_tokenize.trivial_tokenize(sentence)

## Testing english tokenizer on a sample sentence

In [None]:
# Smoke test: the cell output shows the token list produced for the sample sentence.
sentence = 'hello world!'
spacy_english_tokenizer(sentence)

## Testing hindi tokenizer on a sample sentence

In [None]:
# Smoke test: the cell output shows the token list produced for the sample sentence.
sentence = 'वहाँ पहुँचने में कितना समय लगेगा।'
indicnlp_hindi_tokenizer(sentence)

### Defining Preprocessing pipelines for both hindi and english sentences using Field class in torchtext.data

In [None]:
# One Field per language: each tokenizes with the matching tokenizer defined above and
# wraps every sequence in the start/end markers 'sos' and 'eos'.
# `english` is the target side (is_target=True); `hindi` additionally lists the danda '।'
# as a stop word.
# NOTE(review): 'sos' and 'eos' are plain strings, so a real word spelled the same way would
# share their vocabulary entry; hindi_to_english() looks these exact strings up - change both together.
english = Field(sequential=True ,init_token='sos', eos_token='eos', tokenize = spacy_english_tokenizer,is_target=True)
hindi = Field(sequential=True ,init_token='sos',eos_token='eos',tokenize = indicnlp_hindi_tokenizer,stop_words=['।'])

## Making a train, validation data split (20% data used for validation)
## Saving train.csv, valid.csv to drive

In [None]:
# Load the cleaned corpus written by clean_train_data().
clean_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/clean_data.csv',delimiter=',')
# 80/20 split; the fixed random_state makes the split reproducible across runs.
train, valid = train_test_split(clean_df, test_size = 0.2,random_state=123)
# Persist both parts as CSV because TabularDataset (next cell) loads from files.
train.to_csv('/content/drive/MyDrive/Colab_Notebooks/clean_train.csv',index=False)
valid.to_csv('/content/drive/MyDrive/Colab_Notebooks/clean_valid.csv',index=False)

## Loading the train and validation data

In [None]:
# Load the two CSV files written by the previous cell. The validation file is passed through
# the `test=` argument, so the second returned dataset is the validation set.
# `fields` maps each CSV column name to (attribute name on the example, Field used to process it).
train_data, valid_data = TabularDataset.splits(
        path='/content/drive/MyDrive/Colab_Notebooks/', train='clean_train.csv',
        test='clean_valid.csv', format='csv',
        fields={'hindi':('hindi',hindi),'english':('english',english)})

## Building Hindi and English Vocabulary with maximum limit set to 18000 tokens

In [None]:
# Build the vocabularies from the training split only, capped at max_size=18000 tokens
# for both English and Hindi.
english.build_vocab(train_data,max_size=18000)
hindi.build_vocab(train_data,max_size=18000)

## Checking the indexes assigned to various tokens in english and hindi

In [None]:
# Spot checks of the token <-> index mappings. Only the value of the last expression is
# displayed as the cell output; the first two lines are evaluated but not shown.
# NOTE(review): index 10003 presumes the Hindi vocabulary has more than 10003 entries - confirm.
english.vocab.stoi['<unk>']
hindi.vocab.itos[10003]
english.vocab.itos[7]

## Encoder

In [None]:
class Encoder(nn.Module):
  """Bidirectional LSTM encoder for the source (Hindi) sentence.

  Token indices are embedded, passed through dropout and fed to a bidirectional
  LSTM. The final forward and backward states of every layer are averaged so
  that the returned state can initialise a unidirectional LSTM with the same
  `hidden_size` and `num_layers`.

  Args:
    input_size: size of the source vocabulary (number of embedding rows).
    embedding_size: dimension of the token embeddings.
    hidden_size: LSTM hidden size per direction.
    num_layers: number of stacked LSTM layers.
    dropout_p: dropout probability applied to the embeddings and, when
      `num_layers > 1`, between the LSTM layers.
  """
  def __init__(self,input_size,embedding_size,hidden_size,num_layers,dropout_p=0.1):
    super(Encoder,self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = nn.Dropout(dropout_p)

    self.embedding = nn.Embedding(input_size,embedding_size)
    # nn.LSTM applies dropout only between stacked layers; passing a non-zero
    # value with a single layer has no effect and only raises a UserWarning.
    lstm_dropout = dropout_p if num_layers > 1 else 0.0
    self.rnn = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=lstm_dropout,bidirectional=True)

  def _merge_directions(self,state):
    """Average the forward and backward final states of each layer.

    `state` is the LSTM's h_n or c_n, whose first dimension is ordered
    (layer 0 fwd, layer 0 bwd, layer 1 fwd, ...). The result has
    `num_layers` entries along the first dimension.
    """
    per_layer = state.reshape(self.num_layers, 2, *state.shape[1:])
    return 0.5 * (per_layer[:, 0] + per_layer[:, 1])

  def forward(self,sample):
    """Encode a batch of source sentences.

    Args:
      sample: LongTensor of token indices, laid out (seq_len, batch) as the
        LSTM is not batch-first.

    Returns:
      (encoder_outputs, hidden, cell): the per-step LSTM outputs (both
      directions concatenated) and the direction-averaged final hidden and
      cell states, each with `num_layers` entries along the first dimension.
    """
    embedding_withdropout = self.dropout(self.embedding(sample))
    encoder_outputs,(hidden,cell) = self.rnn(embedding_withdropout)
    hidden = self._merge_directions(hidden)
    cell = self._merge_directions(cell)
    return encoder_outputs,hidden,cell

## Decoder

In [None]:
class Decoder(nn.Module):
  """Unidirectional LSTM decoder that predicts one target token per call.

  Args:
    input_size: size of the target vocabulary fed into the embedding.
    embedding_size: dimension of the token embeddings.
    hidden_size: LSTM hidden size.
    output_size: number of scores produced per step (target vocabulary size).
    num_layers: number of stacked LSTM layers.
    dropout_p: dropout probability applied to the embeddings and, when
      `num_layers > 1`, between the LSTM layers.
  """
  def __init__(self,input_size,embedding_size,hidden_size,output_size,num_layers,dropout_p=0.1):
    super(Decoder,self).__init__()
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = nn.Dropout(dropout_p)
    self.embedding_size= embedding_size

    self.embedding = nn.Embedding(self.input_size,self.embedding_size)
    # nn.LSTM applies dropout only between stacked layers; passing a non-zero
    # value with a single layer has no effect and only raises a UserWarning.
    lstm_dropout = dropout_p if num_layers > 1 else 0.0
    self.rnn = nn.LSTM(self.embedding_size,hidden_size,num_layers,dropout=lstm_dropout)
    self.fc = nn.Linear(self.hidden_size,self.output_size)

  def forward(self,x,hidden,cell):
    """Run a single decoding step.

    Args:
      x: LongTensor holding the previous target token of every sentence in
        the batch; a length-1 sequence dimension is added in front.
      hidden: LSTM hidden state from the encoder or the previous step.
      cell: LSTM cell state from the encoder or the previous step.

    Returns:
      (y_hats, hidden, cell): unnormalised scores over the `output_size`
      classes with the sequence dimension removed again, and the updated
      LSTM states.
    """
    x = x.unsqueeze(0)
    embedding_dropout = self.dropout(self.embedding(x))
    outputs, (hidden,cell) = self.rnn(embedding_dropout,(hidden,cell))
    y_hats = self.fc(outputs).squeeze(0)
    return y_hats,hidden,cell

## Seq2Seq Model

In [None]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target sequence step by step.

  Args:
    encoder: module called as `encoder(hindi_batch)` and returning
      `(encoder_outputs, hidden, cell)`.
    decoder: module called as `decoder(tokens, hidden, cell)` and returning
      `(scores, hidden, cell)`; it must expose `output_size`, the number of
      scores it produces per token.
  """
  def __init__(self,encoder,decoder):
    super(Seq2Seq,self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self,hindi_batch,english_batch,teacher_force_ratio = 0.5):
    """Compute decoder scores for every target position.

    Args:
      hindi_batch: source token indices, (src_len, batch).
      english_batch: target token indices, (trg_len, batch); row 0 is used as
        the first decoder input.
      teacher_force_ratio: probability, drawn once per time step for the whole
        batch, of feeding the ground-truth token rather than the decoder's own
        prediction as the next input.

    Returns:
      Tensor of shape (trg_len, batch, decoder.output_size). Row 0 is left as
      zeros because nothing is predicted for the start token.
    """
    batch_size = hindi_batch.shape[1]
    english_batch_len = english_batch.shape[0]
    # Take the sizes and device from the modules/inputs instead of notebook
    # globals so the model does not depend on the cell that defined them.
    english_vocab_size = self.decoder.output_size

    outputs = torch.zeros(english_batch_len,batch_size,english_vocab_size,device=hindi_batch.device)
    encoder_outputs,hidden,cell = self.encoder(hindi_batch)

    sample=english_batch[0]

    for token in range(1,english_batch_len):
      predict,hidden,cell = self.decoder(sample,hidden,cell)
      outputs[token] = predict
      probable_token = predict.argmax(1)
      sample = english_batch[token] if random.random() < teacher_force_ratio else probable_token
    return outputs


## Setting Seq to Seq Model Hyperparameters

In [None]:
# input size to encoder
hindi_one_hot_len = len(hindi.vocab)
# input size and output size to decoder (both should be same as output of decoder is fed to decoder)
english_one_hot_len = len(english.vocab)
output_size = len(english.vocab)

# hidden size
hidden_size = 1024
# number of epochs to be trained
num_epochs = 60
# num_epochs = 5
# learning rate
learning_rate = 0.001
# batch size
batch_size = 128
#drop out rates for encoder & decoder
dropout_p = 0.1

# number of layers for encode & decoder
num_layers = 1

# encoder embedding size
encoder_embedding_size = 300
# decoder embedding size
decoder_embedding_size = 300

## Bucket iterator from torchtext that batches examples of similar lengths together

In [None]:
# Build the train/validation iterators. Examples are compared by the number of Hindi tokens
# (sort_key) and each batch is sorted by that key (sort_within_batch=True); the batches are
# created directly on `device`.
train_iterator,valid_iterator = BucketIterator.splits((train_data, valid_data),batch_size = batch_size,
                                                     sort_within_batch=True, sort_key=lambda x: len(x.hindi),device = device)

## Model Weight Initialization

In [None]:
def init_weights(model):
  """Re-initialise every parameter of `model` in place with values drawn uniformly from [-1, 1]."""
  lower, upper = -1, 1
  for param in model.parameters():
    # Alternative that was tried: nn.init.xavier_uniform_(param.data, gain=1.0)
    nn.init.uniform_(param.data, lower, upper)

## Creating the Model

In [None]:
# Encoder over the Hindi vocabulary, placed on the selected device.
encoder = Encoder(hindi_one_hot_len,encoder_embedding_size,
                      hidden_size,num_layers,dropout_p).to(device)

# Decoder over the English vocabulary; input and output sizes are both the English vocabulary size.
decoder = Decoder(english_one_hot_len,decoder_embedding_size,
                      hidden_size,output_size,num_layers,dropout_p).to(device)

model = Seq2Seq(encoder,decoder).to(device)

# Padding positions in the target must not contribute to the loss.
pad_index = english.vocab.stoi['<pad>']
criterian = nn.CrossEntropyLoss(ignore_index=pad_index)

# Custom initialisation is disabled; the modules keep PyTorch's default initialisation.
# init_weights(model)

## Using Adam optimizer

In [None]:
# Adam over all encoder and decoder parameters, using the learning rate from the hyperparameter cell.
optimizer = optim.Adam(model.parameters(),lr = learning_rate)

## Clipping gradient function to avoid exploding gradient problem

In [None]:
def clip_gradients(model):
  """Rescale the gradients of `model` in place so that their total norm is at most 1."""
  max_grad_norm = 1
  nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

## Function to insert start and end tokens at the beginning and end of a token list

In [None]:
def insert_sos_eos(tokens):
  """Wrap `tokens` in the Hindi field's start and end markers.

  The list is modified in place and the same list object is returned.
  """
  tokens[:0] = [hindi.init_token]
  tokens.extend([hindi.eos_token])
  return tokens

## Function that outputs the indices for a list of tokens from the Hindi Vocabulary

In [None]:
def token_to_index(tokens):
  """Map each token to its index in the Hindi vocabulary and return the indices as a list."""
  lookup = hindi.vocab.stoi
  return [lookup[token] for token in tokens]

## Function that outputs the tokens for a list of indices from the English Vocabulary

In [None]:
def index_to_token(indices):
  """Map each index to its token in the English vocabulary and return the tokens as a list."""
  lookup = english.vocab.itos
  return [lookup[index] for index in indices]

In [None]:
def hindi_to_english(model, text, hindi, english, device, max_length=15):
    """Greedily translate one Hindi sentence into a list of English tokens.

    Args:
        model: object exposing `encoder(batch)` -> (outputs, hidden, cell) and
            `decoder(tokens, hidden, cell)` -> (scores, hidden, cell).
        text: the Hindi sentence as a string.
        hindi: source field; provides `preprocess`, `init_token`, `eos_token`
            and `vocab.stoi`.
        english: target field; provides `init_token`, `eos_token`,
            `vocab.stoi` and `vocab.itos`.
        device: device on which the input tensors are created.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated English tokens, without the start marker and without
        the end marker. The list is empty when `max_length` is 0.
    """
    # Run the sentence through the field's own preprocessing (tokenizer, stop
    # words, ...) so that inference sees the same token stream as training.
    tokens = [hindi.init_token] + list(hindi.preprocess(text)) + [hindi.eos_token]
    indices_for_tokens = [hindi.vocab.stoi[token] for token in tokens]
    # unsqueeze the index list of the input sentence to get a batch of a single sentence
    hindi_batch = torch.LongTensor(indices_for_tokens).unsqueeze(1).to(device)

    eos_index = english.vocab.stoi[english.eos_token]
    previous_index = english.vocab.stoi[english.init_token]
    generated = []

    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(hindi_batch)

        for _ in range(max_length):
            last_inp_decoder = torch.LongTensor([previous_index]).to(device)
            current_output, hidden, cell = model.decoder(last_inp_decoder, hidden, cell)

            previous_index = current_output.argmax(1).item()
            if previous_index == eos_index:
                break
            generated.append(previous_index)

    return [english.vocab.itos[index] for index in generated]

## Loading the saved model

In [None]:
# model = torch.load('/content/drive/MyDrive/Colab_Notebooks/checkpoint_apr03_sub2.pth')

## Training and Checking the loss on Validation data set

In [None]:
# Train for `num_epochs` epochs; after every epoch evaluate on the validation set,
# write a checkpoint, and keep a separate copy of the model with the lowest validation loss.
training_loss = []
validation_loss = []
min_loss = float('inf')
for epoch in range(num_epochs):
  model.train(True)
  training_loss = []; validation_loss = []
  for batch in train_iterator:
    hindi_batch = batch.hindi.to(device)
    english_batch = batch.english.to(device)

    output = model(hindi_batch,english_batch)

    # Position 0 is the start token, for which nothing is predicted; flatten the rest
    # to (tokens, vocab) / (tokens,) as expected by CrossEntropyLoss.
    output = output[1:].reshape(-1,output.shape[2])
    english_batch = english_batch[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterian(output,english_batch)
    # Store a plain float: keeping the tensor would keep every batch's autograd graph
    # alive until the end of the epoch.
    training_loss.append(loss.item())
    loss.backward()
    clip_gradients(model)
    optimizer.step()

  ######## running on validation set ########
  model.eval()
  with torch.no_grad():
    for batch in valid_iterator:
      hindi_batch = batch.hindi.to(device)
      english_batch = batch.english.to(device)
      # No teacher forcing during evaluation, so the validation loss is deterministic
      # and measures the model decoding from its own predictions.
      output = model(hindi_batch,english_batch,teacher_force_ratio=0)
      output = output[1:].reshape(-1,output.shape[2])
      english_batch = english_batch[1:].reshape(-1)
      loss = criterian(output,english_batch)
      validation_loss.append(loss.item())
  T_loss = sum(training_loss); V_loss = sum(validation_loss)
  print(f'Epoch [{epoch+1}/{num_epochs}], training loss: {T_loss/len(train_iterator)}, validation_loss: {V_loss/len(valid_iterator)}')

  # Checkpoint after the epoch's updates so the file always holds the latest weights
  # (including those of the final epoch).
  torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/checkpoint_eachepoch_W2.pth')
  # Select the "best" model on the epoch's mean validation loss rather than on the loss
  # of a single training batch.
  if V_loss/len(valid_iterator) < min_loss:
    min_loss = V_loss/len(valid_iterator)
    torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/checkpoint_with_leastloss_W2.pth')

In [None]:
# torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/afterepochs_apr03_sub2.pth')

In [None]:
# FILE = 'checkpointV01.pth'
# torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/checkpointV01.pth')

In [None]:
# Load the cleaned test sentences written by clean_test_data().
test_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/test_clean_data.csv')

In [None]:
# test_df.at[4994,'hindi']
print(test_df.shape)
f = open('/content/drive/MyDrive/Colab_Notebooks/answer.txt',"w")

In [None]:
# Translate every test sentence and write one translation per line to the answer file
# opened in the previous cell.
model.eval()
try:
    for i in test_df.index:
        sentence = str(test_df.at[i, 'hindi'])
        english_sentence = hindi_to_english(model,sentence,hindi,english,device,max_length=40)
        f.write(' '.join(map(str, english_sentence)) + '\n')
        # Periodic progress marker instead of one printed line per sentence.
        if i % 500 == 0:
            print(i)
finally:
    # Close (and flush) the file even if a translation fails part-way through.
    f.close()