In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , mean_squared_error
import matplotlib.colors
import math
from sklearn.datasets import make_blobs
from sklearn.compose import ColumnTransformer
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import optim
import time
sns.set()
import torchvision.models as models
import copy
import torchvision
from torchvision.transforms import transforms
import os
import sys
import string
import re
import xml.etree.ElementTree as ET
from torch.utils.data import dataset

#importing essential libraries

## English and Hindi Dictionaries

In [2]:
char_eng = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
pad_eng = '<pad>'
eng_dict = {}
eng_dict[pad_eng] = 0

for i,letter in enumerate(char_eng):
  eng_dict[letter] = i+1

print(eng_dict)

{'<pad>': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26}


In [3]:
hindi_dict = {}
hindi_dict['<pad>'] = 0

char_hindi = ''
for i in range(2304, 2432):
  char_hindi += chr(i)

for i, letter in enumerate(char_hindi):
  hindi_dict[letter] = i+1

print(hindi_dict)

{'<pad>': 0, 'ऀ': 1, 'ँ': 2, 'ं': 3, 'ः': 4, 'ऄ': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ऌ': 13, 'ऍ': 14, 'ऎ': 15, 'ए': 16, 'ऐ': 17, 'ऑ': 18, 'ऒ': 19, 'ओ': 20, 'औ': 21, 'क': 22, 'ख': 23, 'ग': 24, 'घ': 25, 'ङ': 26, 'च': 27, 'छ': 28, 'ज': 29, 'झ': 30, 'ञ': 31, 'ट': 32, 'ठ': 33, 'ड': 34, 'ढ': 35, 'ण': 36, 'त': 37, 'थ': 38, 'द': 39, 'ध': 40, 'न': 41, 'ऩ': 42, 'प': 43, 'फ': 44, 'ब': 45, 'भ': 46, 'म': 47, 'य': 48, 'र': 49, 'ऱ': 50, 'ल': 51, 'ळ': 52, 'ऴ': 53, 'व': 54, 'श': 55, 'ष': 56, 'स': 57, 'ह': 58, 'ऺ': 59, 'ऻ': 60, '़': 61, 'ऽ': 62, 'ा': 63, 'ि': 64, 'ी': 65, 'ु': 66, 'ू': 67, 'ृ': 68, 'ॄ': 69, 'ॅ': 70, 'ॆ': 71, 'े': 72, 'ै': 73, 'ॉ': 74, 'ॊ': 75, 'ो': 76, 'ौ': 77, '्': 78, 'ॎ': 79, 'ॏ': 80, 'ॐ': 81, '॑': 82, '॒': 83, '॓': 84, '॔': 85, 'ॕ': 86, 'ॖ': 87, 'ॗ': 88, 'क़': 89, 'ख़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, 'फ़': 95, 'य़': 96, 'ॠ': 97, 'ॡ': 98, 'ॢ': 99, 'ॣ': 100, '।': 101, '॥': 102, '०': 103, '१': 104, '२': 105, '३': 106, '४': 107, '५': 108, '६': 109, '७': 

## Text Preprocessing helper functions

In [4]:
def split_hindi_words(hindi_list):
  new_hindi_list = []
  for hind_name in hindi_list:
    hind_name = hind_name.replace(',',' ').replace('_',' ').replace('.',' ').replace("'",' ').replace('-',' ').replace('/',' ').replace('\u200d',' ').replace('(',' ').replace(')',' ').replace('?',' ')
    hind_name = hind_name.split()
    new_hindi_list.append(hind_name)
  return new_hindi_list

def split_english_words(english_list):
  new_english_list = []
  regex = re.compile('[^a-zA-Z]')
  for eng_name in english_list:
    eng_name = eng_name.upper()
    eng_name = eng_name.replace("'",'').replace('/',' ')
    eng_name = regex.sub(' ', eng_name)
    eng_name = eng_name.split()
    new_english_list.append(eng_name)
  return new_english_list

def clean_english_list(eng_list):
  regex = re.compile('[^a-zA-Z]')
  new_english_names = []
  for word in eng_list:
    new_english_names.append(regex.sub('', word))
  return new_english_names

def clean_hindi_list(hindi_list):
  new_hindi_names = []
  for word in hindi_list:
    word = word.replace(',','').replace('_','').replace('.','').replace("'",'')
    new_hindi_names.append(word)
  return new_hindi_names

## Encoding

In [5]:
def convert_eng_to_encoded(X_train):
  list_to_return = []
  for word in X_train:
    onehotstart = torch.zeros([len(word)+1,1])
    onehotstart[len(word)][0] = 1
    for i,letter in enumerate(word):
      index = eng_dict[letter]
      onehotstart[i][0] = index
    list_to_return.append(onehotstart)
  
  return list_to_return 

def convert_hindi_to_encoded(Y_train):
  list_to_return = []
  for word in Y_train:
    onehotstart = torch.zeros([len(word)+1,1])
    onehotstart[len(word)][0] = 0
    for i,letter in enumerate(word):
      index = hindi_dict[letter]
      onehotstart[i][0] = index
    list_to_return.append(onehotstart)
  
  return list_to_return

## Class to process the data

In [6]:
class EncoderDecoderData():

  def __init__(self,filename):
    self.final_eng_list, self.final_hindi_list = self.create_data_from_XML(filename)

 
  def create_data_from_XML(self,filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    english_names = []
    hindi_names = []
    for elem in root.iter():
      if elem.tag == 'SourceName':
        english_names.append(elem.text)
      if elem.tag == 'TargetName' and elem.attrib['ID'] == '1':
        hindi_names.append(elem.text)
    
    new_english_names = split_english_words(english_names)
    new_hindi_names = split_hindi_words(hindi_names)

    final_hindi_data = []
    final_english_data = []
    
    for eng_word, hindi_word in zip(new_english_names, new_hindi_names):
      if len(eng_word) != len(hindi_word):
        print('Skipping:', eng_word, '-', hindi_word)
      else:
        for eng_word_part, hindi_word_part in zip(eng_word, hindi_word):
          final_hindi_data.append(hindi_word_part)
          final_english_data.append(eng_word_part)
    
    final_hindi_data = clean_hindi_list(final_hindi_data)
    final_english_data = clean_english_list(final_english_data)
    self.final_eng_list = final_english_data
    self.final_hindi_list = final_hindi_data
    return self.final_eng_list, self.final_hindi_list


  def generate_random_sample(self):
    index = np.random.randint(len(self.final_eng_list))
    return self.final_eng_list[index], self.final_hindi_list[index]
  
  def generate_random_batch(self,batch_size):
    index = np.random.randint(len(self.final_eng_list))
    batch_list_english = []
    batch_list_hindi = []
    for i in range(index,index+batch_size,1):
      if i >= len(self.final_eng_list):
        batch_list_english.append(self.final_eng_list[i-len(self.final_eng_list)])
        batch_list_hindi.append(self.final_hindi_list[i-len(self.final_eng_list)])
      else:
        batch_list_english.append(self.final_eng_list[i])
        batch_list_hindi.append(self.final_hindi_list[i])
    return batch_list_english, batch_list_hindi
  



## Encoder Architecture

In [7]:
class Encoder(nn.Module):
  def __init__(self,input_size,emb_size,hidden_size):
    super().__init__()  
    self.input_size  = input_size
    self.emb_size = emb_size
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(self.input_size,self.emb_size)

    self.rnn = nn.LSTM(self.emb_size,self.hidden_size)
  
  def forward(self,input_word):
    input_to_rnn = self.embedding(input_word.long())

    output, (h_state,c_state) = self.rnn(input_to_rnn)

    return h_state,c_state

In [8]:
class Encoder_pre_attention(nn.Module):
  def __init__(self, input_size, emb_size, hidden_size):
    super().__init__()

    self.input_size = input_size
    self.emb_size = emb_size
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(input_size,emb_size)
    self.rnn = nn.GRU(emb_size,hidden_size)
  
  def forward(self,input_word):
    input_to_rnn = self.embedding(input_word.long())
    output, h_state = self.rnn(input_to_rnn)

    return h_state

## Decoder Architecture

In [9]:
class Decoder(nn.Module):
  def __init__(self,output_size,emb_size,hidden_size):
    super().__init__()

    self.output_size = output_size
    self.emb_size = emb_size
    self.hidden_size = hidden_size
    
    self.embedding = nn.Embedding(output_size,emb_size)
    self.rnn = nn.LSTM(emb_size, hidden_size)
    self.fc = nn.Linear(hidden_size,output_size)
  
  def forward(self,input,h_state,c_state):

    input = input.unsqueeze(0)

    input_to_rnn = self.embedding(input.long())

    output,(h_state,c_state) = self.rnn(input_to_rnn,(h_state,c_state))

    prediction = self.fc(output.squeeze(0))

    return prediction, h_state, c_state

In [46]:
class Decoder_pre_attention(nn.Module):
  def __init__(self,output_size,emb_size,hidden_size):
    super().__init__()
    
    self.output_size = output_size
    self.emb_size = emb_size
    self.hidden_size = hidden_size

    self.embedding = nn.Embedding(output_size,emb_size)
    self.rnn = nn.GRU(hidden_size + emb_size,hidden_size)
    self.fc = nn.Linear(emb_size + hidden_size*2,output_size)
  
  def forward(self,input,h_state, context):
    
    input = input.unsqueeze(0)

    input_to_rnn = self.embedding(input.long())

    input_to_rnn_joined = torch.cat((input_to_rnn,context),2)

    output, h_state = self.rnn(input_to_rnn_joined,h_state)

    output = torch.cat((input_to_rnn,output,context),2)

    prediction = self.fc(output.squeeze(0))

    return prediction, h_state

## Seq-2-Seq Architecture

In [47]:
class seq2seq(nn.Module):
  def __init__(self,encoder,decoder):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
  
  def forward(self,input, target,teacher_forcing_ratio = 0.5):

    target_length = target.shape[0]
    batch_size = target.shape[1]
    output_size = self.decoder.output_size

    outputs_to_return = torch.zeros(target_length,batch_size,output_size)

    h_state,c_state = self.encoder(input)

    input = torch.zeros(batch_size)

    for i in range(target_length):
      output,h_state,c_state = self.decoder(input,h_state,c_state)

      outputs_to_return[i] = output

      teacher_force = np.random.random() < teacher_forcing_ratio

      top1 = output.argmax(1)

      input = target[i] if teacher_force else top1
    
    return outputs_to_return


In [48]:
class seq2seq_pre_attention(nn.Module):
  def __init__(self,encoder,decoder):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
  
  def forward(self,input,target,teacher_forcing_ratio = 0.5):
    
    batch_size = target.shape[1]
    target_length = target.shape[0]
    output_size = self.decoder.output_size

    outputs_to_return = torch.zeros(target_length,batch_size,output_size)

    context = self.encoder(input)

    h_state = context

    input = torch.zeros(batch_size)
    
    for i in range(target_length):
      output,h_state = self.decoder(input,h_state,context)

      outputs_to_return[i] = output

      teacher_force = np.random.random() < teacher_forcing_ratio

      top1 = output.argmax(1)

      input = target[i] if teacher_force else top1
    
    return outputs_to_return


## Setting up the training process

In [49]:
input_size = 27
output_size = 129
embedding_size = 256
hidden_size = 512

encoder = Encoder_pre_attention(input_size,embedding_size,hidden_size)
decoder = Decoder_pre_attention(output_size,embedding_size,hidden_size)
model = seq2seq_pre_attention(encoder,decoder)

### Weight initialization

In [50]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

seq2seq_pre_attention(
  (encoder): Encoder_pre_attention(
    (embedding): Embedding(27, 256)
    (rnn): GRU(256, 512)
  )
  (decoder): Decoder_pre_attention(
    (embedding): Embedding(129, 256)
    (rnn): GRU(768, 512)
    (fc): Linear(in_features=1280, out_features=129, bias=True)
  )
)

### Loss function and optimizer

In [51]:
opt = optim.Adam(model.parameters(),lr=0.001)

In [52]:
loss_func = nn.CrossEntropyLoss(ignore_index=0)

### Training Helper functions

In [53]:
def generate_batch(data_loader,batch_size=16):
  X_train,Y_train = data_loader.generate_random_batch(5000)
  X_train, Y_train = convert_eng_to_encoded(X_train), convert_hindi_to_encoded(Y_train)

  ## Forming X_train and Y_train with given preprocessing class and functions above

  X_batch = []
  Y_batch = []
  sequence_length = np.random.randint(3,10)
  for xpoint, ypoint in zip(X_train,Y_train):
    if len(X_batch) == batch_size:
      break
    else:
      if ypoint.shape[0] == sequence_length:
        X_batch.append(xpoint)
        Y_batch.append(ypoint)
  
  # We first select a random batch length and then look for all hindi words having the same length
  # A list of X_batch and Y_batch is then formed whose length is batch_size

  max_input_length = 0
  for xpoint in X_batch:
    if xpoint.shape[0] > max_input_length:
      max_input_length = xpoint.shape[0]
  
  # We find the maximum length of the words that are stored in X_batch
  
  X_final_batch = torch.zeros(max_input_length,batch_size)
  Y_final_batch = torch.zeros(sequence_length,batch_size)
  for i in range(batch_size):
    actual = X_batch[i].shape[0]
    X_final_batch[:actual,i] = X_batch[i].view(-1)
  for i in range(batch_size):
    Y_final_batch[:,i] = Y_batch[i].view(-1)
  
  # Finally we form arrays, one X_final_batch having shape (max_length_X, batch_size) and one Y_final_batch having shape (batch_length, batch_size)
  # Values from X_batch and Y_batch are appropriately placed here
  
  return X_final_batch,Y_final_batch



In [54]:
def train(model, data_loader, loss_func, optimizer,no_of_batches):
  
  model.train()
  total_loss = 0
  optimizer.zero_grad()
  for i in range(no_of_batches):
    X_final_batch, Y_final_batch = generate_batch(data_loader,16)
    
       
    input = X_final_batch
    target = Y_final_batch
    
    

    output = model(input,target)

    output_size = output.shape[-1]

    loss = loss_func(output.view(-1,output_size),target.view(-1).long())
    total_loss += loss.item()
    loss.backward(retain_graph=True)

  torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

  optimizer.step()

  

  return total_loss/no_of_batches



### Evaluation helper function

In [55]:
def evaluate(model, test_data_loader, loss_func):
  
  model.eval()
  X_test,Y_test = test_data_loader.generate_random_batch(32)
  X_test_new = convert_eng_to_encoded(X_test)
  Y_test_new = convert_hindi_to_encoded(Y_test)
  
  total_loss = 0

  for i in range(len(X_test_new)):
    input = X_test_new[i]
    target = Y_test_new[i]


    output = model(input,target)

    output_size = output.shape[-1]

    loss = loss_func(output.view(-1,output_size),target.view(-1).long())


    total_loss += loss.item()

  return total_loss/32



## Starting the training process

In [56]:
data_store = EncoderDecoderData('training.xml')

Skipping: ['MAHARANI', 'PADMINI'] - ['महारानी', 'पद्', 'मिनी']
Skipping: ['STATE', 'MUSEUM', 'OF', 'THE', 'VERMONT', 'HISTORICAL', 'SOCIETY'] - ['स्टेट', 'म्युज़ियम', 'ऑफ', 'द', 'वरमाउंट', 'हिस्टॉरिकल', 'सोसायट', 'ी']
Skipping: ['I', 'DUKAANT'] - ['इंदुकांत']
Skipping: ['EFFIE', 'AWARDS'] - ['एफी', 'अवार्ड्', 'स']
Skipping: ['LAURENCE', 'OLIVIER', 'AWARDS'] - ['लॉरेंस', 'ओलिवर', 'अवार्ड्', 'स']
Skipping: ['ETTA'] - ['एट्', 'टा']
Skipping: ['COLLEGE', 'FOOTBALL', 'AWARDS'] - ['कॉलेज', 'फुटबॉल', 'अवार्ड्', 'स']
Skipping: ['STEVE', 'RHODES'] - ['स्टीव', 'रोड्', 'स']
Skipping: ['WINDHAM', 'COUNTY', 'HISTORICAL', 'MUSEUM'] - ['व', 'िंडहैम', 'काउंट', 'ी', 'ह', 'िस्टॉर', 'िकल', 'म्युज़ियम']
Skipping: ['PLAZA'] - ['प्लाज़ा', '66']
Skipping: ['ADVAKI'] - ['अद्', 'वाकी']
Skipping: ['BHAALACHAN', 'DR'] - ['भालचन्द्र']
Skipping: ['BARHARWA', 'JUNCTION'] - ['बरहरवा']
Skipping: ['STATE', 'BNK', 'TR'] - ['स्टेट', 'बैंक', 'ऑफ', 'त्रावणकोर']
Skipping: ['IN', 'DRAJEET'] - ['इन्द्रजीत']
Skipping: ['WOODSTO

In [57]:
data_tester = EncoderDecoderData('testing.xml')

Skipping: ['W', 'TTEMBERG'] - ['यूटमबर्ग']


In [58]:
epochs = 100

for i in tqdm_notebook(range(epochs)):
  print('Training Loss:',train(model,data_store,loss_func,opt,20))
  print('validation Loss:',evaluate(model,data_store,loss_func))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Training Loss: 4.85427622795105
validation Loss: 4.618210569024086
Training Loss: 4.5607970476150514
validation Loss: 4.323454365134239
Training Loss: 4.241516661643982
validation Loss: 3.987589880824089
Training Loss: 3.8402589678764345
validation Loss: 3.576748311519623
Training Loss: 3.7195451855659485
validation Loss: 3.5410033464431763
Training Loss: 3.6192405581474305
validation Loss: 3.7381883561611176
Training Loss: 3.6278109431266783
validation Loss: 3.4831403121352196
Training Loss: 3.6286044836044313
validation Loss: 3.6207800954580307
Training Loss: 3.4905601859092714
validation Loss: 3.5406046956777573
Training Loss: 3.482581579685211
validation Loss: 3.422429397702217
Training Loss: 3.4594224095344543
validation Loss: 3.542326919734478
Training Loss: 3.4694266080856324
validation Loss: 3.4094421938061714
Training Loss: 3.519970381259918
validation Loss: 3.4220425188541412
Training Loss: 3.410076451301575
validation Loss: 3.422138050198555
Training Loss: 3.3707704305648805

In [None]:
epochs = 100

for i in tqdm_notebook(range(epochs)):
  print('Training Loss:',train(model,data_store,loss_func,opt,20))
  print('validation Loss:',evaluate(model,data_store,loss_func))

In [None]:
# torch.save(model.state_dict(),'Trained_model_2.pth')
# torch.save(opt.state_dict(),'Trained_Optimizer_2')
# # state_dict = torch.load('Trained_model.pth')
# # net.load_state_dict(state_dict)
# # accuracy_on_model(net,len(X_test_new),1,X_test_new,Y_test_new,X_test,Y_test)