<a href="https://colab.research.google.com/github/abhiraman/Capstone_Project/blob/main/M_C_Transliteration_padded_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Dependencies

In [1]:
import xml.etree.ElementTree as ET
from torch.utils.data import Dataset,DataLoader
import string,re
import torch
from torch.nn.utils.rnn import pack_padded_sequence
import torch.nn as nn
import torch.optim as optim
from IPython.display import clear_output
import matplotlib.pyplot as plt

# Load Data from GitHub

In [2]:
# GitHub no longer serves the unauthenticated git:// protocol, so clone over HTTPS instead.
!git clone -l -s https://github.com/GokulNC/NLP-Exercises cloned-repo

Cloning into 'cloned-repo'...
remote: Enumerating objects: 72, done.[K
remote: Total 72 (delta 0), reused 0 (delta 0), pack-reused 72[K
Receiving objects: 100% (72/72), 2.39 MiB | 5.42 MiB/s, done.
Resolving deltas: 100% (28/28), done.


In [3]:
%cd cloned-repo

/content/cloned-repo


In [4]:
%cd Transliteration-Indian-Languages/Original-NEWS2012-data/Training
!ls

/content/cloned-repo/Transliteration-Indian-Languages/Original-NEWS2012-data/Training
NEWS2012-Training-EnBa-14623.xml  NEWS2012-Training-EnKa-11955.xml
NEWS2012-Training-EnHe-11501.xml  NEWS2012-Training-EnMa-9000.xml
NEWS2012-Training-EnHi-13937.xml  NEWS2012-Training-EnTa-11957.xml


In [5]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
MyDevice = 'cuda' if torch.cuda.is_available() else 'cpu'
print(MyDevice)

cuda


Getting all Hindi & English letters

In [6]:
## Build the character -> index vocabularies for Devanagari (Hindi) and English ##
# The Devanagari Unicode block covers code points 2304..2431 in decimal (U+0900..U+097F).
# Source: https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)
pad = "PAD"
hindi_alphabets = [pad]
hindi_alphabets.extend(map(chr, range(2304, 2432)))
hindi_alphabets_indexed = {symbol: position for position, symbol in enumerate(hindi_alphabets)}
print(hindi_alphabets_indexed)

# English vocabulary: index 0 is the padding token, 'A'..'Z' take 1..26.
english_alphabets = string.ascii_uppercase
english_alphabets_indexed = {pad: 0}
english_alphabets_indexed.update(
  (letter, position) for position, letter in enumerate(english_alphabets, start=1)
)
print(len(english_alphabets_indexed))

{'PAD': 0, 'ऀ': 1, 'ँ': 2, 'ं': 3, 'ः': 4, 'ऄ': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ऌ': 13, 'ऍ': 14, 'ऎ': 15, 'ए': 16, 'ऐ': 17, 'ऑ': 18, 'ऒ': 19, 'ओ': 20, 'औ': 21, 'क': 22, 'ख': 23, 'ग': 24, 'घ': 25, 'ङ': 26, 'च': 27, 'छ': 28, 'ज': 29, 'झ': 30, 'ञ': 31, 'ट': 32, 'ठ': 33, 'ड': 34, 'ढ': 35, 'ण': 36, 'त': 37, 'थ': 38, 'द': 39, 'ध': 40, 'न': 41, 'ऩ': 42, 'प': 43, 'फ': 44, 'ब': 45, 'भ': 46, 'म': 47, 'य': 48, 'र': 49, 'ऱ': 50, 'ल': 51, 'ळ': 52, 'ऴ': 53, 'व': 54, 'श': 55, 'ष': 56, 'स': 57, 'ह': 58, 'ऺ': 59, 'ऻ': 60, '़': 61, 'ऽ': 62, 'ा': 63, 'ि': 64, 'ी': 65, 'ु': 66, 'ू': 67, 'ृ': 68, 'ॄ': 69, 'ॅ': 70, 'ॆ': 71, 'े': 72, 'ै': 73, 'ॉ': 74, 'ॊ': 75, 'ो': 76, 'ौ': 77, '्': 78, 'ॎ': 79, 'ॏ': 80, 'ॐ': 81, '॑': 82, '॒': 83, '॓': 84, '॔': 85, 'ॕ': 86, 'ॖ': 87, 'ॗ': 88, 'क़': 89, 'ख़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, 'फ़': 95, 'य़': 96, 'ॠ': 97, 'ॡ': 98, 'ॢ': 99, 'ॣ': 100, '।': 101, '॥': 102, '०': 103, '१': 104, '२': 105, '३': 106, '४': 107, '५': 108, '६': 109, '७': 11

Clean String Lines

In [7]:
non_eng_letters_regex = re.compile('[^a-zA-Z ]')
def _cleanEnglishWord(line):
  line = line.replace('-',' ').replace(',',' ').upper()
  line = non_eng_letters_regex.sub('', line)
  return line.split()

def _cleanLanguageWord(line):
  """Split a target-language string into words made only of known characters.

  Hyphens and commas are treated as word separators; any character that is
  neither a space nor a member of the module-level `hindi_alphabets` list is
  dropped before splitting on whitespace.

  Args:
    line: raw target-language text.

  Returns:
    List of cleaned words (possibly empty).
  """
  spaced = line.replace('-',' ').replace(',',' ')
  kept = [ch for ch in spaced if ch == ' ' or ch in hindi_alphabets]
  return ''.join(kept).split()


# Custom Data Loader

In [8]:
class TextLoader(Dataset):
  """Dataset of aligned (English word, Hindi word) pairs parsed from an XML file.

  For every child element of the XML root, the text of its first sub-element is
  cleaned as English and the text of its second sub-element is cleaned as the
  target language. Entries whose cleaned word counts differ are skipped so that
  `allEngWords[i]` and `allHindiWords[i]` always belong together.
  """
  def __init__(self,xmlFile=None):
    """Parse `xmlFile` immediately and populate the two parallel word lists."""
    super().__init__()
    self.fileName = xmlFile
    self.allEngWords,self.allHindiWords = [],[]
    self._read_clean_data()

  def __len__(self):
    """Number of word pairs."""
    return len(self.allEngWords)

  def _read_clean_data(self):
    """Read the XML file and append every well-aligned word pair."""
    tree = ET.parse(self.fileName)
    root = tree.getroot()
    for child in root:
      engWord = _cleanEnglishWord(child[0].text)
      hindWord = _cleanLanguageWord(child[1].text)
      if len(engWord)!=len(hindWord):
        print("Skipping --> {} --- {}".format(child[0].text,child[1].text))
        # Actually skip: appending unequal word counts would shift every later
        # pair and silently misalign the two lists.
        continue
      self.allEngWords.extend(engWord)
      self.allHindiWords.extend(hindWord)

  def __getitem__(self,idx):
    """Return the pair at `idx` as {"EnglishWord": str, "HindiWord": str}."""
    return {"EnglishWord":self.allEngWords[idx],"HindiWord":self.allHindiWords[idx]}

dataSet = TextLoader(xmlFile='NEWS2012-Training-EnHi-13937.xml')

In [None]:
# Sanity check: show the first six word pairs.
for ind,i in enumerate(dataSet):
  if ind<=5:
    print(i)
  else:
    break

{'EnglishWord': 'RAASAVIHAAREE', 'HindiWord': 'रासविहारी'}
{'EnglishWord': 'DEOGAN', 'HindiWord': 'देवगन'}
{'EnglishWord': 'ROAD', 'HindiWord': 'रोड'}
{'EnglishWord': 'SHATRUMARDAN', 'HindiWord': 'शत्रुमर्दन'}
{'EnglishWord': 'MAHIJUBA', 'HindiWord': 'महिजुबा'}
{'EnglishWord': 'SABINE', 'HindiWord': 'सैबिन'}


In [13]:
class CustomWordLoader():
  """Collate callable that turns a batch of word pairs into padded tensors.

  Called with a list of {"EnglishWord": str, "HindiWord": str} dicts, it returns
  {"Inputs": one-hot Hindi tensor, "Targets": English index tensor}, both moved
  to the module-level `MyDevice`.
  """
  def __init__(self):
    pass

  def indexHindiWords(self,HindiWordList,maxCharWord,device='cpu'):
    """One-hot encode Hindi words.

    Args:
      HindiWordList: sequence of words, in batch order.
      maxCharWord: length of the longest word; one extra all-zero step is added.
      device: device the result is moved to.

    Returns:
      Float tensor of shape (maxCharWord+1, len(HindiWordList), vocab size).
      Padding steps are left as all-zero vectors.

    Raises:
      KeyError: if a character is not in `hindi_alphabets_indexed`.
    """
    finalTensor = torch.zeros(len(HindiWordList),maxCharWord+1,len(hindi_alphabets_indexed))
    # Keep the caller's order. Sorting the words here would pair each input row
    # with a different word's target row, because the targets are not sorted.
    for wordIndex,eWord in enumerate(HindiWordList):
      for charIndex,eChar in enumerate(eWord):
        # Direct lookup: a missing character raises instead of indexing with
        # None, which would set the whole vocabulary row to 1.
        finalTensor[wordIndex,charIndex,hindi_alphabets_indexed[eChar]] = 1
    return finalTensor.permute(1,0,2).to(device)

  def indexEnglishWords(self,EnglishWordList,maxCharWord,device='cpu'):
    """Encode English words as vocabulary indices.

    Args:
      EnglishWordList: sequence of upper-case words, in batch order.
      maxCharWord: length of the longest word; one extra padding step is added.
      device: device the result is moved to.

    Returns:
      Float tensor of shape (len(EnglishWordList), maxCharWord+1, 1); unused
      positions hold 0, the index of the padding token.

    Raises:
      KeyError: if a character is not in `english_alphabets_indexed`.
    """
    finalTensor = torch.zeros(len(EnglishWordList),maxCharWord+1,1) ## (BatchSize,max_str_len,1)
    for wIndex,eWord in enumerate(EnglishWordList):
      for Cindex,eChar in enumerate(eWord):
        finalTensor[wIndex,Cindex,0] = english_alphabets_indexed[eChar]
    return finalTensor.to(device)

  def returnPackedData(self):
    """Build the {"Inputs", "Targets"} dict for the batch stored in `self.batch`."""
    englishList = [eDict['EnglishWord'] for eDict in self.batch]
    HindiList = [eDict['HindiWord'] for eDict in self.batch]
    engMaxChar = max(len(word) for word in englishList)
    hinMaxChar = max(len(word) for word in HindiList)
    OHE_inputs = self.indexHindiWords(HindiList,hinMaxChar,device=MyDevice)
    Targets = self.indexEnglishWords(englishList,engMaxChar,device=MyDevice)
    return {"Inputs":OHE_inputs,"Targets":Targets}

  def __call__(self,batch):
    """Entry point used by DataLoader as `collate_fn`."""
    self.batch =batch
    return self.returnPackedData()

dataLoader = DataLoader(dataSet,batch_size=1,shuffle=True,collate_fn=CustomWordLoader())

In [14]:
# Inspect the first collated batch: tensor contents and shapes.
for ind,data in enumerate(dataLoader):
  if ind!=0:
    break
  print(data["Inputs"],data["Inputs"].size())
  print(data["Targets"].size(),data["Targets"])


tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0

Mount Drive

In [15]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [17]:
%cd /gdrive/MyDrive/Capstone_project_data/ImgtoText
!ls

/gdrive/MyDrive/Capstone_project_data/ImgtoText
 Annotations   Cropped_Images  'Sample Train'


Encoder-Decoder without Attention

In [33]:
class Encoder_Decoder(nn.Module):
  """GRU encoder-decoder (no attention) mapping one-hot input sequences to
  per-step distributions over the output vocabulary.

  Args:
    inputSize: size of each one-hot input vector (encoder feature size).
    hiddenSize: GRU hidden state size, shared by encoder and decoder.
    outputSize: output vocabulary size (decoder input and output feature size).
    num_layers, num_dirns: stored on the instance only; they are not forwarded
      to the GRUs, which are single-layer and unidirectional.
    verbose: when True, tensor shapes are printed during `forward`.
  """
  def __init__(self,inputSize,hiddenSize,outputSize,num_layers =1,num_dirns=1,verbose=True):
    super().__init__()
    self.hiddenSize = hiddenSize
    self.outputSize = outputSize
    self.num_layers = num_layers
    self.num_dirns = num_dirns
    self.encoder_GRU = nn.GRU(inputSize,hiddenSize)
    self.decoder_GRU = nn.GRU(outputSize,hiddenSize)
    self.h2o = nn.Linear(hiddenSize,outputSize)
    self.F = nn.LogSoftmax(dim=2)
    self.Fll = nn.Softmax(dim=2)
    self.verbose = verbose

  def forward(self,inputs,maxCharLen,GT=None,trainFlag =True,device='cpu'):
    """Encode `inputs`, then decode `maxCharLen - 1` steps.

    Args:
      inputs: float tensor of shape (seq_len, batch, inputSize).
      maxCharLen: number of decoding steps plus one.
      GT: optional (batch, >= maxCharLen-1) tensor of target indices. When
        given, the decoder is teacher-forced with the previous ground-truth
        symbol; otherwise it is fed its own argmax prediction.
      trainFlag: True -> log-probabilities are returned; False -> probabilities.
      device: kept for backward compatibility. Work tensors are created on
        `inputs.device`, so they always match the tensors they are combined with.

    Returns:
      List of `maxCharLen - 1` tensors, each of shape (batch, outputSize);
      entry k is the prediction for output position k.
    """
    all_hidden,last_hidden = self.encoder_GRU (inputs)

    if self.verbose:
      print("Encoder Input : ",inputs.size())
      print("Encoder All Hidden Outputs : ",all_hidden.size())
      print("Encoder Last Hidden Output : ",last_hidden.size())

    batchSize = all_hidden.size()[1]
    decoder_state = last_hidden
    # The first decoder input is an all-zero "start" vector.
    decoderInput = torch.zeros(1,batchSize,self.outputSize,device=inputs.device) ##(1,batchSize,no.of English Alphabets)
    if self.verbose:
      print("Decoder Input : ",decoderInput.size())

    # `is not None` instead of `!= None`: the latter is an elementwise-style
    # comparison on tensors and not a reliable presence check.
    GT_trans = torch.transpose(GT,1,0) if GT is not None else None

    outputlist = []
    for i in range(1,maxCharLen):
      out,decoder_state = self.decoder_GRU(decoderInput,decoder_state)
      logits = self.h2o(out)
      output = self.F(logits)
      if trainFlag:
        outputlist.append(output.squeeze(0))
      else:
        outputlist.append(self.Fll(logits).squeeze(0))
      if self.verbose:
        print("Decoder Ouput : ",output.size())
        print("Squeezed Final Output : ",output.squeeze(0).size())

      if GT_trans is not None:
        # This step produced output position i-1, so the next input must be the
        # ground truth for that same position. Feeding GT_trans[i] would hand the
        # decoder the very symbol it is about to be scored on.
        nextIndexes = GT_trans[i-1].reshape(1,batchSize,1).type(torch.int64)
      else:
        nextIndexes = torch.argmax(output,dim=2,keepdim=True)

      one_hot = torch.zeros_like(output)
      one_hot.scatter_(2,nextIndexes,1)
      decoderInput = one_hot.detach()
    return outputlist
      



In [30]:
def trainBatch(modelObj,batchSize,optFn,LossFn,enforceTrain=False,device='cpu'):
  """Run one pass over `dataSet`, accumulating gradients, and return the mean batch loss.

  Args:
    modelObj: model called as modelObj(inputs, maxCharLen, GT=..., device=...).
    batchSize: batch size; batches of any other size are skipped.
    optFn: optimizer. It is not used here: this function only accumulates
      gradients, and the caller is expected to call zero_grad()/step().
    LossFn: per-step loss, called as LossFn(step_output, step_targets).
    enforceTrain: when True, ground-truth targets are passed to the model
      (teacher forcing) and the early-exit check below is active.
    device: device the target indices are moved to before computing the loss.

  Returns:
    Sum of all per-step losses divided by the number of processed batches.

  Raises:
    ValueError: if no batch of exactly `batchSize` items was available.
  """
  dataLoader = DataLoader(dataSet,batch_size=batchSize,shuffle=True,collate_fn=CustomWordLoader())

  total_loss = 0
  batch_counter = 0
  for ind,data in enumerate(dataLoader):
    targets = data["Targets"].squeeze(2)
    if data["Inputs"].size()[1]!=batchSize:continue   ## skip the incomplete last batch
    batch_counter+=1
    pred_ouputs = modelObj(data["Inputs"],targets.size()[1],GT= targets if enforceTrain else None,device=device)
    # .long() converts in place on the current device; .type(torch.LongTensor)
    # would first copy the tensor to the CPU.
    targets_trans = torch.transpose(targets,1,0).long().to(device)

    step_losses = [LossFn(ouputs,targets_trans[index])/batchSize ## Loss per word
                   for index,ouputs in enumerate(pred_ouputs)]
    if step_losses:
      # One backward pass over the summed loss yields the same gradients as a
      # backward per time step, without re-traversing the graph each time.
      torch.stack(step_losses).sum().backward()
      for step_loss in step_losses:
        total_loss+=step_loss.item()         ## Total Loss per batch

    # NOTE(review): this compares a float ratio with an integer using ==, so it
    # only fires for particular loader lengths; confirm the intended cut-off.
    if enforceTrain and len(dataLoader)/(ind +1)==len(dataLoader)//3:break  ## Not allowing to over fit the data

  if batch_counter==0:
    raise ValueError("No full batch of size {} could be drawn from the dataset".format(batchSize))
  return total_loss/batch_counter   ## Total Loss per Epoch



In [56]:
def training_helper(net,lr=0.5,batch_size=100,epochs=11,momentum = 0.9, display_freq=5, device = 'cpu'):
  """Train `net` for `epochs` epochs, checkpointing the best model and plotting the loss.

  Args:
    net: model to train; moved to `device` in place.
    lr: Adam learning rate.
    batch_size: batch size forwarded to `trainBatch`.
    epochs: number of epochs. Teacher forcing is enabled for the first
      `epochs // 3` of them.
    momentum: accepted for backward compatibility; Adam is built without it.
    display_freq: the loss plot is refreshed every `display_freq` epochs.
    device: device used for the model and the loss computation.

  Returns:
    Tensor of length `epochs + 1`. Entry 0 is 0 and entry k is the running mean
    of the epoch losses of the first k epochs.
  """
  net.to(device)
  lossFn = nn.NLLLoss()
  # Optimise the model that was passed in, not whatever global `modelObj`
  # happens to exist in the notebook.
  optFn = optim.Adam(net.parameters(),lr=lr)
  enforce_Till = epochs//3
  sheduler = optim.lr_scheduler.StepLR(optFn,step_size=200,gamma=0.5)

  loss_per_epoch_array = torch.zeros(epochs+1)
  minVal= 1000000
  for i in range(epochs):
    optFn.zero_grad()
    epoch_loss = trainBatch(net, batch_size,optFn, lossFn, device = device, enforceTrain=True if i<enforce_Till else False )
    loss_per_epoch_array[i+1] = (loss_per_epoch_array[i]*i + epoch_loss)/(i + 1)
    optFn.step()

    # Report learning-rate changes by comparing the value before and after the
    # scheduler step; get_lr() is not meant to be called outside step().
    previous_lr = sheduler.get_last_lr()
    sheduler.step()
    current_lr = sheduler.get_last_lr()
    if current_lr!=previous_lr:
      print(current_lr)

    # Use this epoch's entry (index i+1); index i belongs to the previous epoch.
    current_loss = loss_per_epoch_array[i+1].item()
    if current_loss<minVal:
      minVal = current_loss
      torch.save(net,'model_batched.pt')

    if i%display_freq == 0 and i!=0: ## Every display_freq epochs refresh the loss plot ##
      clear_output(wait=True)
      print("For Epoch {} ----> Loss {}".format(i,current_loss))
      plt.figure()
      plt.plot(loss_per_epoch_array[1:i+2].tolist(),'-*')
      plt.xlabel("Epochs")
      plt.ylabel("Epoch Loss")
      plt.show()
  return loss_per_epoch_array


In [45]:
## HyperParameters ##
hiddensize = 256   # hidden size handed to the Encoder_Decoder model
batch_size = 64    # words per training batch
lr = 0.005         # learning rate handed to training_helper
momentum = 0.9     # handed to training_helper

In [57]:
modelObj = Encoder_Decoder(len(hindi_alphabets_indexed),hiddensize,len(english_alphabets_indexed),verbose=False)
# torch.load returns the restored object; without the assignment the checkpoint
# is only displayed and `modelObj` keeps its fresh random weights.
# NOTE(review): recent PyTorch releases may require weights_only=False to load a
# fully pickled module - confirm against the installed version.
try:
  modelObj = torch.load('model_batched.pt')
except FileNotFoundError:
  pass  # first run: no checkpoint yet, keep the freshly initialised model
modelObj

Encoder_Decoder(
  (encoder_GRU): GRU(129, 256)
  (decoder_GRU): GRU(27, 256)
  (h2o): Linear(in_features=256, out_features=27, bias=True)
  (F): LogSoftmax(dim=2)
  (Fll): Softmax(dim=2)
)

In [None]:
# Train for 500 epochs on the selected device; the returned loss history is the cell output.
training_helper(
  modelObj,
  lr=lr,
  momentum=momentum,
  batch_size=batch_size,
  epochs=500,
  device=MyDevice,
)

In [36]:
# Bind the best checkpoint to `modelObj`; a bare torch.load(...) only displays
# the restored model and leaves the variable used by later cells untouched.
modelObj = torch.load('model_batched.pt')
modelObj

Encoder_Decoder(
  (encoder_GRU): GRU(129, 256)
  (decoder_GRU): GRU(27, 256)
  (h2o): Linear(in_features=256, out_features=27, bias=True)
  (F): LogSoftmax(dim=2)
  (Fll): Softmax(dim=2)
)

In [44]:
def test(net,data,device='cpu'):
  """Decode one collated batch with `net` and return the predicted symbols.

  Args:
    net: model called as net(inputs, maxCharLen, trainFlag=False, device=...);
      it is switched to eval mode and moved to `device`.
    data: dict with "Inputs" (seq_len, batch, vocab) and "Targets"
      (batch, steps, 1) tensors, as produced by CustomWordLoader.
    device: device used for inference.

  Returns:
    One list per batch item holding the predicted English symbols (keys of
    `english_alphabets_indexed`, so "PAD" may appear), one per decoding step.
  """
  key = list(english_alphabets_indexed.keys())
  net.eval().to(device)
  # No gradients are needed for inference.
  with torch.no_grad():
    # Forward `device` explicitly so the model does not fall back to its 'cpu' default.
    outputs = net(data["Inputs"].to(device),data["Targets"].size()[1],trainFlag=False,device=device)
  if not outputs:
    # No decoding steps were run: return an empty prediction per batch item.
    return [[] for _ in range(data["Inputs"].size()[1])]
  convertedList = [[] for _ in range(outputs[0].size()[0])]
  for eTensor in outputs:
    for i,index in enumerate(torch.argmax(eTensor,dim=1).tolist()):
      convertedList[i].append(key[index])
  return convertedList


testLoader = DataLoader(dataSet,batch_size=1,shuffle=True,collate_fn=CustomWordLoader())
key = list(hindi_alphabets_indexed.keys())

# Print the prediction next to the decoded Hindi input for six random samples.
i=0
for data in testLoader:
  pred = test(modelObj,data)
  # Each time step holds a single one-hot row (batch size 1); argmax recovers its symbol.
  tempList = [key[int(torch.argmax(eTensor,dim=1).item())] for eTensor in data["Inputs"]]
  print(pred,tempList)
  i+=1
  if i>5:break

  

[['A', 'A', 'A', 'PAD', 'PAD']] ['भ', 'ा', 'ल', 'ी', 'PAD']
[['A', 'A', 'A']] ['ड', 'ै', 'म', 'PAD']
[['A', 'A', 'A', 'A', 'PAD', 'PAD', 'PAD']] ['च', 'ा', 'ल', 'ी', 'स', 'ा', 'PAD']
[['A', 'A', 'A', 'PAD']] ['स', 'े', 'व', 'ा', 'PAD']
[['A', 'A', 'A', 'A']] ['क', 'ि', 'ट', '्', 'ट', 'ी', 'PAD']
[['A', 'A', 'A', 'A', 'PAD']] ['फ', '्', 'र', 'ि', 'क', 'PAD']


In [None]:
batchSize = 1
testSet = TextLoader(xmlFile='NEWS2012-Testing-EnHi-1000.xml')
# Wrap the held-out test set that was just loaded, not the training `dataSet`.
data = DataLoader(testSet,batch_size=batchSize,shuffle=True,collate_fn=CustomWordLoader())


In [None]:
aa = [1,2,3,5,6]
b= 0
if b<