# For running the notebook, create a directory with name: "Colab_Notebooks" in Google Drive
# Then save the data files "train.csv" & "testhindistatements.csv" (the file names the code below opens) in the Colab_Notebooks directory

## Model: Sequence to Sequence With Attention (Encoder: Bidirectional LSTM), (Decoder: LSTM)

## Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive; every CSV path used below lives under
# /content/drive/MyDrive/Colab_Notebooks.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl

## Installing indicnlp module for tokenizing hindi sentences

In [None]:
# Clone the Indic NLP library and its resource files, and install the Morfessor package.
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"
import sys
# The library is imported straight from the cloned checkout, so add it to the import path.
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
# Point the library at the cloned resources directory before loading it.
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()

In [None]:
import csv
import numpy as np
import  re
import string
from unicodedata import normalize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

# For tokenizing english sentence
import spacy
# For tokenizing hindi sentence  
from indicnlp.tokenize import indic_tokenize

import random

## Setting up the device based on availability of a GPU

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = xm.xla_device()

## Load default spacy english-core-web model

In [None]:
# The 'en' shortcut link no longer exists in spaCy 3 (spacy.load('en') raises there);
# the full package name is accepted by both spaCy 2 and spaCy 3.
spacy_english = spacy.load('en_core_web_sm')

## Function that returns the list of tokens in an English sentence using the spaCy tokenizer

In [None]:
def spacy_english_tokenizer(sentence):
  """Return the text of every token the spaCy tokenizer finds in `sentence`."""
  return [tok.text for tok in spacy_english.tokenizer(sentence)]

## Function that returns list of tokens in a hindi sentence using indicnlp tokenizer

In [None]:
def indicnlp_hindi_tokenizer(sentence):
  """Return the tokens that indicnlp's trivial tokenizer produces for `sentence`."""
  return indic_tokenize.trivial_tokenize(sentence)

## Testing the indicnlp tokenizer on a sample English sentence (the spaCy tokenizer call is left commented out)

In [None]:
sentence = 'hello world! Here i come'
# Disabled alternative: spacy_english_tokenizer(sentence)
# The English sample is run through the indicnlp tokenizer, which is also the tokenizer
# the vocabulary-building code below applies to the English column.
indicnlp_hindi_tokenizer(sentence)

## Testing hindi tokenizer on a sample sentence

In [None]:
# Sample Hindi sentence; the tokenizer's result is the cell's value.
sentence = 'वहाँ पहुँचने में कितना समय लगेगा।'
indicnlp_hindi_tokenizer(sentence)

In [None]:
# Used twice below: as the whitespace-word limit when filtering training pairs, and as the
# fixed length every index sequence is padded or truncated to.
maximum_length = 15

## Function that cleans Train data and saves the clean train data to a csv file

In [None]:
def _clean_english_words(sentence, table):
  """Return the lower-cased, purely alphabetic ASCII words of an English sentence."""
  # NFD + ASCII encoding with 'ignore' strips accents and drops every non-ASCII character.
  ascii_text = normalize('NFD', sentence.strip()).encode('ascii', 'ignore').decode('UTF-8')
  words = (word.translate(table).lower() for word in ascii_text.split())
  return [word for word in words if word.isalpha()]


def _clean_hindi_words(sentence, table):
  """Return the words of a Hindi sentence with ASCII punctuation and '♪' removed."""
  # str.replace returns a new string, so its result has to be kept.
  text = sentence.strip().replace('♪', '')
  text = normalize('NFD', text).encode('UTF-8', 'ignore').decode('UTF-8')
  words = (word.translate(table).lower() for word in text.split())
  # A word made only of punctuation becomes '' after translate; drop those.
  return [word for word in words if word]


def clean_train_data(source_path='/content/drive/MyDrive/Colab_Notebooks/train.csv',
                     target_path='/content/drive/MyDrive/Colab_Notebooks/clean_data.csv',
                     max_words=None):
  """Clean the raw training pairs and write them to a two-column CSV.

  The source CSV is read with its first row skipped; column 1 is taken as the
  Hindi sentence and column 2 as the English sentence. A pair is dropped when
  either sentence is empty, when one sentence has more than 1.5 times the
  characters of the other, or when either has more than `max_words`
  whitespace-separated words. Surviving sentences are cleaned, wrapped in
  'sos' / 'eos' and written as a (hindi, english) row under a header row.

  Args:
    source_path: CSV file with the raw pairs.
    target_path: CSV file to (over)write with the cleaned pairs.
    max_words: word limit per sentence; defaults to the module-level
      `maximum_length`.
  """
  if max_words is None:
    max_words = maximum_length
  table = str.maketrans('', '', string.punctuation)
  # newline='' is what the csv module expects for both reading and writing.
  with open(source_path, 'r', encoding='utf-8', newline='') as csv_file:
    with open(target_path, 'w', encoding='utf-8', newline='') as clean_csv_file:
      csv_reader = csv.reader(csv_file, delimiter=',')
      csv_writer = csv.writer(clean_csv_file)
      csv_writer.writerow(['hindi','english'])
      next(csv_reader, None)  # skip the header row
      for line in csv_reader:
        if len(line) < 3:
          # Blank or short rows cannot hold a sentence pair.
          continue
        hindi_sen, english_sen = line[1], line[2]
        if not hindi_sen or not english_sen:
          continue
        if len(hindi_sen) > 1.5*len(english_sen) or len(english_sen) > 1.5*len(hindi_sen):
          continue
        if len(english_sen.split()) > max_words or len(hindi_sen.split()) > max_words:
          continue

        english_words = ['sos'] + _clean_english_words(english_sen, table) + ['eos']
        hindi_words = ['sos'] + _clean_hindi_words(hindi_sen, table) + ['eos']
        csv_writer.writerow([' '.join(hindi_words), ' '.join(english_words)])

## Cleaning the Training data

In [None]:
# Writes clean_data.csv, which the vocabulary and index-conversion cells below read.
clean_train_data()

## Function that cleans test data and saves it to csv file

In [None]:
def clean_test_data(source_path='/content/drive/MyDrive/Colab_Notebooks/testhindistatements.csv',
                    target_path='/content/drive/MyDrive/Colab_Notebooks/clean_test.csv'):
  """Clean the Hindi test sentences and write them to a one-column CSV.

  The source CSV is read with its first row skipped and column 2 taken as the
  Hindi sentence. Each sentence is stripped, NFD-normalised, split on
  whitespace, stripped of ASCII punctuation and lower-cased. Exactly one
  output row is written per input row, so outputs stay aligned with inputs.

  Args:
    source_path: CSV file with the raw test sentences.
    target_path: CSV file to (over)write with the cleaned sentences.
  """
  table = str.maketrans('', '', string.punctuation)
  # newline='' is what the csv module expects for both reading and writing.
  with open(source_path, 'r', encoding='utf-8', newline='') as csv_file:
    with open(target_path, 'w', encoding='utf-8', newline='') as clean_csv_file:
      csv_reader = csv.reader(csv_file, delimiter=',')
      csv_writer = csv.writer(clean_csv_file)
      csv_writer.writerow(['hindi'])
      next(csv_reader, None)  # skip the header row
      for line in csv_reader:
        hindi_sen = line[2].strip()
        hindi_sen = normalize('NFD',hindi_sen).encode('UTF-8', 'ignore').decode('UTF-8')
        words = (word.translate(table).lower() for word in hindi_sen.split())
        # A word made only of punctuation becomes '' after translate; dropping it
        # avoids doubled or trailing spaces in the joined sentence.
        csv_writer.writerow([' '.join(word for word in words if word)])

## Cleaning the Test data

In [None]:
# Writes clean_test.csv, which the inference cell at the end of the notebook reads.
clean_test_data()

In [None]:
# Each language gets its own token->index and index->token dictionaries, all seeded
# with the same three reserved entries: pad=0, sos=1, eos=2.
hindi_tok_to_ind_dict = dict(pad=0, sos=1, eos=2)
hindi_ind_to_tok_dict = {index: token for token, index in hindi_tok_to_ind_dict.items()}
english_tok_to_ind_dict = dict(pad=0, sos=1, eos=2)
english_ind_to_tok_dict = {index: token for token, index in english_tok_to_ind_dict.items()}

In [None]:
def generate_hindi_vocab():
  """Add every Hindi token found in clean_data.csv to the Hindi vocabulary.

  Reads column 0 of each data row, tokenizes it with the indicnlp tokenizer
  and registers unseen tokens in both `hindi_tok_to_ind_dict` and
  `hindi_ind_to_tok_dict` (mutated in place). Returns nothing.
  """
  with open('/content/drive/MyDrive/Colab_Notebooks/clean_data.csv','r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)  # skip the header row
    for line in csv_reader:
      for token in indicnlp_hindi_tokenizer(line[0]):
        if token not in hindi_tok_to_ind_dict:
          # Number new tokens from the current vocabulary size instead of a counter
          # restarted at 3, so running this again on changed data cannot hand out an
          # index that is already taken.
          index = len(hindi_tok_to_ind_dict)
          hindi_tok_to_ind_dict[token] = index
          hindi_ind_to_tok_dict[index] = token

In [None]:
# Build the Hindi vocabulary and print the size of both lookup dictionaries.
generate_hindi_vocab()
print(len(hindi_ind_to_tok_dict))
print(len(hindi_tok_to_ind_dict))

In [None]:
def generate_english_vocab():
  """Add every English token found in clean_data.csv to the English vocabulary.

  Reads column 1 of each data row and registers unseen tokens in both
  `english_tok_to_ind_dict` and `english_ind_to_tok_dict` (mutated in place).
  Returns nothing.
  """
  with open('/content/drive/MyDrive/Colab_Notebooks/clean_data.csv','r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)  # skip the header row
    for line in csv_reader:
      # The indicnlp tokenizer is applied to English too; numeric_clean_data does the
      # same, and the two must agree for its vocabulary lookups to succeed.
      for token in indicnlp_hindi_tokenizer(line[1]):
        if token not in english_tok_to_ind_dict:
          # Number new tokens from the current vocabulary size instead of a counter
          # restarted at 3, so running this again on changed data cannot hand out an
          # index that is already taken.
          index = len(english_tok_to_ind_dict)
          english_tok_to_ind_dict[token] = index
          english_ind_to_tok_dict[index] = token

In [None]:
# Build the English vocabulary and print the size of both lookup dictionaries.
generate_english_vocab()
print(len(english_ind_to_tok_dict))
print(len(english_tok_to_ind_dict))

In [None]:
def _fit_to_length(indices, length, pad_index, eos_index):
  """Return a copy of `indices` padded or truncated to exactly `length` entries."""
  fitted = indices[:length]
  if len(indices) > length and fitted:
    # Truncation would otherwise cut off the closing 'eos'; keep the sequence terminated.
    fitted[-1] = eos_index
  fitted.extend([pad_index] * (length - len(fitted)))
  return fitted


def numeric_clean_data():
  """Convert the cleaned sentence pairs to fixed-length index sequences.

  Reads clean_data.csv, tokenizes both columns with the indicnlp tokenizer,
  maps tokens through the vocabulary dictionaries and pads/truncates every
  sequence to `maximum_length`. Each pair is also written to
  numeric_clean_data.csv.

  Returns:
    A list with one `[hindi_indices, english_indices]` entry per data row.

  Raises:
    KeyError: if a token is missing from the corresponding vocabulary.
  """
  list_of_lists = []
  with open('/content/drive/MyDrive/Colab_Notebooks/clean_data.csv','r', encoding='utf-8', newline='') as csv_file:
    with open('/content/drive/MyDrive/Colab_Notebooks/numeric_clean_data.csv','w', encoding='utf-8', newline='') as numeric_csv_file:
      csv_writer = csv.writer(numeric_csv_file)
      csv_writer.writerow(['hindi','english'])
      csv_reader = csv.reader(csv_file, delimiter=',')
      next(csv_reader)  # skip the header row
      for line in csv_reader:
        hindi_ind_ls = _fit_to_length(
            [hindi_tok_to_ind_dict[token] for token in indicnlp_hindi_tokenizer(line[0])],
            maximum_length, hindi_tok_to_ind_dict['pad'], hindi_tok_to_ind_dict['eos'])
        # Same tokenizer as generate_english_vocab, so every token has an index.
        english_ind_ls = _fit_to_length(
            [english_tok_to_ind_dict[token] for token in indicnlp_hindi_tokenizer(line[1])],
            maximum_length, english_tok_to_ind_dict['pad'], english_tok_to_ind_dict['eos'])
        list_of_lists.append([hindi_ind_ls,english_ind_ls])
        csv_writer.writerow([hindi_ind_ls,english_ind_ls])
  return list_of_lists

In [None]:
# clean_numeric holds one [hindi_indices, english_indices] entry per training pair;
# clean_np is the same data as a single tensor.
clean_numeric = numeric_clean_data()
clean_np = torch.tensor(clean_numeric)

In [None]:
# Quick inspection: number of pairs, then the English index sequence of the first pair.
clean_np.shape[0]
clean_np[0,1]

## Creating enumerable data set using Dataset class for DataLoader

In [None]:
class hindi_english_train_dataset(Dataset):
  """Dataset of (hindi_indices, english_indices) tensor pairs for the DataLoader."""

  def __init__(self, pairs=None):
    """Store the index sequences as tensors.

    Args:
      pairs: nested list with one `[hindi_indices, english_indices]` entry per
        sample, all sequences of equal length. Defaults to the module-level
        `clean_numeric`, which keeps the original no-argument construction working.
    """
    if pairs is None:
      pairs = clean_numeric
    data = torch.tensor(pairs)
    self.num_samples = data.shape[0]
    # Axis 1 separates the two languages: 0 = Hindi, 1 = English.
    self.hindi_sen = data[:,0]
    self.english_sen = data[:,1]

  def __getitem__(self,index):
    """Return the (hindi, english) index tensors of sample `index`."""
    return self.hindi_sen[index], self.english_sen[index]

  def __len__(self):
    """Return the number of sentence pairs."""
    return self.num_samples

## Sample Testing the iterable data set created in above cell

In [None]:
# Build the dataset and print the Hindi index tensor of the second sample as a sanity check.
train_data = hindi_english_train_dataset()
hindi_sen,english_sen = train_data[1]
print(hindi_sen)

## Setting Seq to Seq Model Hyperparameters

In [None]:
# input size to encoder (size of the Hindi vocabulary)
hindi_one_hot_len = len(hindi_tok_to_ind_dict)
# input size and output size of the decoder (identical, since the decoder's output is fed back in as its next input)
english_one_hot_len = len(english_tok_to_ind_dict)
output_size = len(english_tok_to_ind_dict)

# hidden size
hidden_size = 1024
# number of epochs to be trained
num_epochs = 70
# learning rate
learning_rate = 0.001
# batch size
batch_size = 128
# dropout rate for encoder & decoder
dropout_p = 0

# number of layers for encoder & decoder
num_layers = 1

# encoder embedding size
encoder_embedding_size = 300
# decoder embedding size
decoder_embedding_size = 300

## Creating iterator for training data using DataLoader

In [None]:
# Shuffled mini-batches of (hindi, english) index tensors drawn from train_data.
train_iterator = DataLoader(dataset=train_data,batch_size=batch_size,shuffle=True)

## Encoder

In [None]:
class Encoder(nn.Module):
  """Bidirectional LSTM encoder over embedded source tokens.

  Args:
    input_size: size of the source vocabulary (number of embedding rows).
    embedding_size: dimension of the token embeddings.
    hidden_size: hidden size of each LSTM direction.
    num_layers: number of stacked LSTM layers.
    dropout_p: dropout probability applied to the embeddings and, when there
      is more than one layer, between LSTM layers.
  """

  def __init__(self,input_size,embedding_size,hidden_size,num_layers,dropout_p=0.1):
    super(Encoder,self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = nn.Dropout(dropout_p)

    self.embedding = nn.Embedding(input_size,embedding_size)
    # nn.LSTM only applies dropout between stacked layers and warns when asked for it
    # with a single layer, so pass 0 in that case.
    lstm_dropout = dropout_p if num_layers > 1 else 0
    self.rnn = nn.LSTM(embedding_size,hidden_size,num_layers,dropout=lstm_dropout,bidirectional=True)

  def _merge_directions(self,state):
    """Average the forward and backward state of every layer.

    `state` comes from the LSTM as (num_layers * 2, batch, hidden); the result
    is (num_layers, batch, hidden).
    """
    per_layer = state.reshape(self.num_layers, 2, state.shape[1], self.hidden_size)
    return 0.5*(per_layer[:,0] + per_layer[:,1])

  def forward(self,sample):
    """Encode a batch of index sequences.

    Args:
      sample: LongTensor of token indices, (seq_len, batch) since the LSTM is
        not batch-first.

    Returns:
      encoder_outputs: (seq_len, batch, 2 * hidden_size) per-step outputs.
      hidden, cell: final states with both directions averaged,
        (num_layers, batch, hidden_size) each.
    """
    embedding_withdropout = self.dropout(self.embedding(sample))
    encoder_outputs,(hidden,cell) = self.rnn(embedding_withdropout)
    # Averaging per layer (instead of only rows 0 and 1) keeps the states usable as the
    # initial state of a decoder with the same number of layers.
    hidden = self._merge_directions(hidden)
    cell = self._merge_directions(cell)
    return encoder_outputs,hidden,cell

## Attention mechanism

In [None]:
def attention_mech(encoder_outputs,dec_prev_hidden):
  """Dot-product attention of the decoder state over the encoder outputs.

  Args:
    encoder_outputs: (src_len, batch, 2 * hidden) outputs of the bidirectional encoder.
    dec_prev_hidden: (num_layers, batch, hidden) previous decoder hidden state;
      the last layer is used as the query.

  Returns:
    context_vector: (1, batch, 2 * hidden) attention-weighted sum of the encoder outputs.
    attention_weights: (src_len, batch, 1) weights that sum to 1 over the source positions.
  """
  # The query is duplicated along the feature axis to match the 2 * hidden width of the
  # bidirectional encoder outputs.
  query = dec_prev_hidden[-1:]
  query = torch.cat((query,query),dim=2)
  attention_score = torch.sum(query*encoder_outputs,dim=-1)  # (src_len, batch)
  # Normalise over the source positions (dim 0). A softmax over a trailing size-1 axis
  # would make every weight exactly 1 and reduce attention to a plain sum.
  attention_weights = torch.softmax(attention_score,dim=0).unsqueeze(2)
  context_vector = torch.sum(attention_weights*encoder_outputs,dim=0,keepdim=True)
  return context_vector,attention_weights

## Decoder

In [None]:
class Decoder(nn.Module):
  """Single-step LSTM decoder that consumes an attention context with every input token."""

  def __init__(self,input_size,embedding_size,hidden_size,output_size,num_layers,dropout_p=0.1):
    super().__init__()
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = nn.Dropout(dropout_p)
    self.embedding_size= embedding_size

    self.embedding = nn.Embedding(input_size,embedding_size)
    # The LSTM input is the token embedding concatenated with the context vector,
    # whose width is 2 * hidden_size.
    lstm_input_size = embedding_size + 2*hidden_size
    self.rnn = nn.LSTM(lstm_input_size,hidden_size,num_layers,dropout=dropout_p)
    self.fc = nn.Linear(hidden_size,output_size)

  def forward(self,x,hidden,cell,encoder_outputs):
    """Run one decoding step.

    Args:
      x: indices of the current input token for each batch element.
      hidden, cell: LSTM state carried over from the previous step.
      encoder_outputs: encoder outputs the attention attends over.

    Returns:
      The scores produced by the output layer (leading step axis removed) and
      the updated hidden and cell states.
    """
    step_tokens = x.unsqueeze(0)
    embedded = self.dropout(self.embedding(step_tokens))
    context, _ = attention_mech(encoder_outputs,hidden)

    lstm_in = torch.cat((context,embedded),dim=2)
    lstm_out, (hidden,cell) = self.rnn(lstm_in,(hidden,cell))
    scores = self.fc(lstm_out).squeeze(0)
    return scores,hidden,cell

## Seq2Seq Model

In [None]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes a whole target sequence with teacher forcing."""

  def __init__(self,encoder,decoder):
    super(Seq2Seq,self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self,hindi_batch,english_batch,teacher_force_ratio = 0.5):
    """Decode `english_batch` step by step and return the per-step scores.

    Args:
      hindi_batch: source indices, (src_len, batch).
      english_batch: target indices, (tgt_len, batch); row 0 is the first decoder input.
      teacher_force_ratio: probability of feeding the ground-truth token rather
        than the model's own prediction as the next decoder input.

    Returns:
      Tensor (tgt_len, batch, vocab) of decoder scores; row 0 is left at zero
      because no prediction is made for the first position.
    """
    batch_size = hindi_batch.shape[1]
    english_batch_len = english_batch.shape[0]
    # Take the vocabulary size and device from the model's own parts and inputs rather
    # than from notebook globals, so the module also works outside this notebook.
    english_vocab_size = self.decoder.output_size

    outputs = torch.zeros(english_batch_len,batch_size,english_vocab_size,
                          device=english_batch.device)
    encoder_outputs,hidden,cell = self.encoder(hindi_batch)

    sample = english_batch[0]

    for token in range(1,english_batch_len):
      predict,hidden,cell = self.decoder(sample,hidden,cell,encoder_outputs)
      outputs[token] = predict
      probable_token = predict.argmax(1)
      sample = english_batch[token] if random.random() < teacher_force_ratio else probable_token

    return outputs


## Model Weight Initialization

In [None]:
def init_weights(model):
  """Overwrite every parameter of `model` in place with samples from U(-0.5, 0.5)."""
  for _, weights in model.named_parameters():
    nn.init.uniform_(weights.data, a=-0.5, b=0.5)

## Creating the Model

In [None]:
# Build the encoder and decoder from the hyperparameters above and move them to `device`.
encoder = Encoder(hindi_one_hot_len,encoder_embedding_size,
                      hidden_size,num_layers,dropout_p).to(device)

decoder = Decoder(english_one_hot_len,decoder_embedding_size,
                      hidden_size,output_size,num_layers,dropout_p).to(device)

model = Seq2Seq(encoder,decoder).to(device)

# Target positions holding the 'pad' index are ignored by the loss.
pad_index = english_tok_to_ind_dict['pad']
criterian = nn.CrossEntropyLoss(ignore_index=pad_index)
# Custom initialisation is disabled, so the modules keep their default initialisation.
# init_weights(model)

## Using Adam optimizer

In [None]:
# Adam over all parameters of the Seq2Seq model (encoder and decoder together).
optimizer = optim.Adam(model.parameters(),lr = learning_rate)

## Clipping gradient function to avoid exploding gradient problem

In [None]:
def clip_gradients(model):
  """Clip the gradients of all parameters of `model` in place to a total norm of 1."""
  trainable = model.parameters()
  torch.nn.utils.clip_grad_norm_(trainable, max_norm=1)

## Function to insert start and end tokens at the beginning and end of a token list

In [None]:
def insert_sos_eos(tokens):
  """Put 'sos' in front of and 'eos' behind `tokens`, in place, and return the same list."""
  tokens[:0] = ['sos']
  tokens.extend(['eos'])
  return tokens

## Function that outputs indices for a list of tokens from the Hindi vocabulary

In [None]:
def token_to_index(tokens):
  """Map tokens to Hindi vocabulary indices; tokens missing from the vocabulary are left out."""
  return [hindi_tok_to_ind_dict[tok] for tok in tokens if tok in hindi_tok_to_ind_dict]

## Function that outputs tokens for a list of indices from the English vocabulary

In [None]:
def index_to_token(indices):
  """Look up the English token of every index in `indices` (KeyError for an unknown index)."""
  return [english_ind_to_tok_dict[idx] for idx in indices]

In [None]:
def hindi_to_english(model, text,device, max_length=15):
    """Translate one Hindi sentence by greedy decoding.

    Args:
        model: Seq2Seq model exposing `encoder` and `decoder`.
        text: Hindi sentence; it is tokenized, wrapped in 'sos'/'eos' and mapped to
            vocabulary indices (tokens outside the vocabulary are dropped).
        device: device the input tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        List of English tokens without the leading 'sos' and without the closing
        'eos'; an empty list when `max_length` is 0.
    """
    tokens = insert_sos_eos(indicnlp_hindi_tokenizer(text))
    indices_for_tokens = token_to_index(tokens)
    ## unsqueeze the index list of input sentence to get a batch of single sentence
    hindi_batch = torch.LongTensor(indices_for_tokens).unsqueeze(1).to(device)

    eos_index = english_tok_to_ind_dict["eos"]
    previous_index = english_tok_to_ind_dict["sos"]
    decoder_output = []

    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(hindi_batch)

        for _ in range(max_length):
            last_inp_decoder = torch.LongTensor([previous_index]).to(device)
            current_output, hidden, cell = model.decoder(last_inp_decoder, hidden, cell, encoder_outputs)

            # Read the prediction back once per step; it is needed both as the next
            # input and for the stop test.
            previous_index = current_output.argmax(1).item()
            if previous_index == eos_index:
                break
            decoder_output.append(previous_index)

    # Only predicted tokens were collected, so there is no 'sos'/'eos' to strip and
    # nothing to index into when no step was run.
    return index_to_token(decoder_output)

## Training the model and reporting the average training loss per epoch

In [None]:
training_loss = []
sentence = 'वहाँ पहुँचने में कितना समय लगेगा'
min_loss = 10000
for epoch in range(num_epochs):
  # torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/checkpoint_eachepoch_W3_2.pth')
  model.train(True)
  # Per-batch losses of this epoch, stored as plain floats: appending the loss tensors
  # themselves would keep every batch's autograd history referenced until the epoch ends.
  training_loss = []
  for batch_index,(hindi_batch,english_batch) in enumerate(train_iterator):
    # The DataLoader yields (batch, seq_len); the model expects (seq_len, batch).
    hindi_batch = torch.transpose(hindi_batch, 0, 1).to(device)
    english_batch = torch.transpose(english_batch, 0, 1).to(device)

    output = model(hindi_batch,english_batch)

    # Position 0 is never predicted by the model, so it is excluded from the loss.
    output = output[1:].reshape(-1,output.shape[2])
    english_batch = english_batch[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterian(output,english_batch)
    loss_value = loss.item()
    training_loss.append(loss_value)
    if loss_value < min_loss:
      min_loss = loss_value
      # torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/checkpoint_with_leastloss_W3.pth')
    loss.backward()
    clip_gradients(model)
    optimizer.step()
  T_loss = sum(training_loss)
  print(f'Epoch [{epoch+1}/{num_epochs}], training loss: {T_loss/len(train_iterator)}')

In [None]:
# torch.save(model,'/content/drive/MyDrive/Colab_Notebooks/afterepochs_apr03_sub2.pth')

## Loading the saved model

In [None]:
# model = torch.load('/content/drive/MyDrive/Colab_Notebooks/checkpoint_eachepoch_W3.pth')

In [None]:
# Output file for the translations; it stays open across cells until the f.close()
# that follows the inference loop.
f = open('/content/drive/MyDrive/Colab_Notebooks/answer_test_week3.txt',"w")

In [None]:
# Translate every cleaned test sentence and write one translation per line to `f`.
model.eval()
try:
  with open('/content/drive/MyDrive/Colab_Notebooks/clean_test.csv','r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)  # skip the header row
    for i, line in enumerate(csv_reader, start=1):
      sentence = line[0]
      english_sentence = hindi_to_english(model,sentence,device,max_length=40)
      output_sen = ' '.join(map(str, english_sentence)) + '\n'
      f.write(output_sen)
      print(i)  # progress: number of sentences translated so far
finally:
  # Close (and flush) the answer file even if a translation fails part-way through.
  f.close()