<a href="https://colab.research.google.com/github/bgohrani/Recurrent_Neural_Networks/blob/main/3.Project_Encoder_Decoder_Batched.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , mean_squared_error
import matplotlib.colors
import math
from sklearn.datasets import make_blobs
from sklearn.compose import ColumnTransformer
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import optim
import time
sns.set()
import torchvision.models as models
import copy
import torchvision
from torchvision.transforms import transforms
import os
import sys
import string
import re
import xml.etree.ElementTree as ET
from torch.utils.data import dataset

#importing essential libraries

## English and Hindi Dictionaries

In [None]:
# English (source) vocabulary: index 0 is reserved for the padding token and
# the 26 upper-case letters take indices 1..26 in alphabetical order.
pad_eng = '<pad>'
char_eng = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

eng_dict = {pad_eng: 0}
eng_dict.update((letter, position) for position, letter in enumerate(char_eng, start=1))

print(eng_dict)

#Creating a dictionary that maps each English letter to an integer index, which will be used for encoding (the Hindi dictionary is built in the next cell)

{'<pad>': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26}


In [None]:
# Devanagari Unicode block (code points 2304..2431), kept in code-point order.
char_hindi = ''.join(map(chr, range(2304, 2432)))

# Hindi (target) vocabulary: index 0 is the padding token and the 128
# characters above take indices 1..128.
hindi_dict = {'<pad>': 0}
for position, letter in enumerate(char_hindi, start=1):
  hindi_dict[letter] = position

print(hindi_dict)

#Hindi Dictionary

{'<pad>': 0, 'ऀ': 1, 'ँ': 2, 'ं': 3, 'ः': 4, 'ऄ': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ऌ': 13, 'ऍ': 14, 'ऎ': 15, 'ए': 16, 'ऐ': 17, 'ऑ': 18, 'ऒ': 19, 'ओ': 20, 'औ': 21, 'क': 22, 'ख': 23, 'ग': 24, 'घ': 25, 'ङ': 26, 'च': 27, 'छ': 28, 'ज': 29, 'झ': 30, 'ञ': 31, 'ट': 32, 'ठ': 33, 'ड': 34, 'ढ': 35, 'ण': 36, 'त': 37, 'थ': 38, 'द': 39, 'ध': 40, 'न': 41, 'ऩ': 42, 'प': 43, 'फ': 44, 'ब': 45, 'भ': 46, 'म': 47, 'य': 48, 'र': 49, 'ऱ': 50, 'ल': 51, 'ळ': 52, 'ऴ': 53, 'व': 54, 'श': 55, 'ष': 56, 'स': 57, 'ह': 58, 'ऺ': 59, 'ऻ': 60, '़': 61, 'ऽ': 62, 'ा': 63, 'ि': 64, 'ी': 65, 'ु': 66, 'ू': 67, 'ृ': 68, 'ॄ': 69, 'ॅ': 70, 'ॆ': 71, 'े': 72, 'ै': 73, 'ॉ': 74, 'ॊ': 75, 'ो': 76, 'ौ': 77, '्': 78, 'ॎ': 79, 'ॏ': 80, 'ॐ': 81, '॑': 82, '॒': 83, '॓': 84, '॔': 85, 'ॕ': 86, 'ॖ': 87, 'ॗ': 88, 'क़': 89, 'ख़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, 'फ़': 95, 'य़': 96, 'ॠ': 97, 'ॡ': 98, 'ॢ': 99, 'ॣ': 100, '।': 101, '॥': 102, '०': 103, '१': 104, '२': 105, '३': 106, '४': 107, '५': 108, '६': 109, '७': 

## Text Preprocessing helper functions

In [None]:
def split_hindi_words(hindi_list):
  """Split every Hindi string into its individual words.

  Punctuation characters and the zero-width joiner are treated as word
  separators (each is turned into a space before splitting on whitespace).

  Args:
    hindi_list: iterable of strings.

  Returns:
    A list with one entry per input string; each entry is the list of words
    found in that string (possibly empty).
  """
  separators = (',', '_', '.', "'", '-', '/', '\u200d', '(', ')', '?')
  to_space = str.maketrans({character: ' ' for character in separators})
  return [hind_name.translate(to_space).split() for hind_name in hindi_list]

def split_english_words(english_list):
  """Upper-case every English string and split it into alphabetic words.

  Apostrophes are dropped (so "O'BRIEN" stays one word); every other
  non-letter character acts as a word separator.

  Args:
    english_list: iterable of strings.

  Returns:
    A list with one entry per input string; each entry is the list of
    upper-case words found in that string (possibly empty).
  """
  non_letters = re.compile('[^a-zA-Z]')

  def tokenise(name):
    # Apostrophes are removed first so they do not split a word in two.
    prepared = name.upper().replace("'", '').replace('/', ' ')
    return non_letters.sub(' ', prepared).split()

  return [tokenise(eng_name) for eng_name in english_list]

def clean_english_list(eng_list):
  """Return a copy of `eng_list` with every non-letter character removed from each word."""
  strip_non_letters = re.compile('[^a-zA-Z]').sub
  return [strip_non_letters('', word) for word in eng_list]

def clean_hindi_list(hindi_list):
  """Return a copy of `hindi_list` with commas, underscores, full stops and apostrophes removed from each word."""
  unwanted = (',', '_', '.', "'")
  # Mapping a character to None in a translation table deletes it.
  removal_table = str.maketrans({character: None for character in unwanted})
  return [word.translate(removal_table) for word in hindi_list]

#Some helper functions to preprocess the text we have
#The first function splits each Hindi string into single words and returns a list of word lists; the second does the same for English
#The next two functions remove unnecessary characters from English and Hindi words
#These functions will be used later

## Encoding

In [None]:
def convert_eng_to_encoded(X_train):
  """Encode English words as column tensors of `eng_dict` indices.

  Each word becomes a float tensor of shape (len(word) + 1, 1): row i holds
  the index of the i-th letter and the extra trailing row holds 0, the
  '<pad>' index.

  The trailing row was previously set to 1, which is the index of 'A', so
  every word was encoded as if it ended with an extra 'A'. It is now left at
  0, which matches convert_hindi_to_encoded and the zero padding used when
  words of different lengths are stacked into a batch.

  Args:
    X_train: iterable of words made only of characters present in eng_dict.

  Returns:
    List of tensors, one per word, in input order.

  Raises:
    KeyError: if a word contains a character that is not in eng_dict.
  """
  list_to_return = []
  for word in X_train:
    # torch.zeros already leaves the trailing row at the pad index (0).
    encoded_word = torch.zeros([len(word)+1,1])
    for i,letter in enumerate(word):
      encoded_word[i][0] = eng_dict[letter]
    list_to_return.append(encoded_word)

  return list_to_return

def convert_hindi_to_encoded(Y_train):
  """Encode Hindi words as column tensors of `hindi_dict` indices.

  Each word becomes a float tensor of shape (len(word) + 1, 1): row i holds
  the index of the i-th character and the extra trailing row stays 0, the
  '<pad>' index.

  Args:
    Y_train: iterable of words made only of characters present in hindi_dict.

  Returns:
    List of tensors, one per word, in input order.
  """
  def encode(word):
    column = torch.zeros([len(word) + 1, 1])
    for position, letter in enumerate(word):
      column[position][0] = hindi_dict[letter]
    return column

  return [encode(word) for word in Y_train]

#Functions for encoding but not one hot encoding, the index value is directly used

## Class to process the data

In [None]:
class EncoderDecoderData():
  """Parallel English/Hindi word lists read from a transliteration XML file.

  After construction `final_eng_list[k]` and `final_hindi_list[k]` hold the
  k-th English word and its Hindi counterpart.
  """

  def __init__(self,filename):
    """Parse `filename` and store the two aligned word lists on the instance."""
    self.final_eng_list, self.final_hindi_list = self.create_data_from_XML(filename)


  def create_data_from_XML(self,filename):
    """Build the aligned word lists from the XML file at `filename`.

    The text of every 'SourceName' element is taken as an English string and
    the text of every 'TargetName' element whose ID attribute is '1' as a
    Hindi string. Both are split into words; string pairs whose word counts
    differ are reported with a 'Skipping:' line and dropped, the remaining
    words are paired position by position and cleaned.

    Returns:
      (english_words, hindi_words), also stored on the instance.
    """
    tree = ET.parse(filename)
    root = tree.getroot()
    english_names = []
    hindi_names = []
    # NOTE(review): the two lists are filled independently, so they only line
    # up if every SourceName has exactly one TargetName with ID '1' - confirm
    # against the data files.
    for elem in root.iter():
      if elem.tag == 'SourceName':
        english_names.append(elem.text)
      # .get() so that a TargetName without an ID attribute is ignored
      # instead of raising KeyError.
      if elem.tag == 'TargetName' and elem.attrib.get('ID') == '1':
        hindi_names.append(elem.text)

    new_english_names = split_english_words(english_names)
    new_hindi_names = split_hindi_words(hindi_names)

    final_hindi_data = []
    final_english_data = []

    for eng_word, hindi_word in zip(new_english_names, new_hindi_names):
      if len(eng_word) != len(hindi_word):
        print('Skipping:', eng_word, '-', hindi_word)
      else:
        final_english_data.extend(eng_word)
        final_hindi_data.extend(hindi_word)

    final_hindi_data = clean_hindi_list(final_hindi_data)
    final_english_data = clean_english_list(final_english_data)
    self.final_eng_list = final_english_data
    self.final_hindi_list = final_hindi_data
    return self.final_eng_list, self.final_hindi_list


  def generate_random_sample(self):
    """Return one randomly chosen (english_word, hindi_word) pair."""
    index = np.random.randint(len(self.final_eng_list))
    return self.final_eng_list[index], self.final_hindi_list[index]

  def generate_random_batch(self,batch_size):
    """Return `batch_size` consecutive word pairs starting at a random position.

    The selection wraps around the end of the data as often as needed, so
    `batch_size` may exceed the number of stored words (previously this raised
    IndexError once the request ran more than one full pass past the end).

    Returns:
      (english_words, hindi_words), two lists of length `batch_size`.

    Raises:
      ValueError: from np.random.randint when no words are stored.
    """
    total_words = len(self.final_eng_list)
    index = np.random.randint(total_words)
    batch_list_english = []
    batch_list_hindi = []
    for i in range(index,index+batch_size,1):
      position = i % total_words
      batch_list_english.append(self.final_eng_list[position])
      batch_list_hindi.append(self.final_hindi_list[position])
    return batch_list_english, batch_list_hindi
  
#This is a class we will call on the xml data we have for english and hindi
#In the init function, a create_data_from_xml is called which is defined below
#In create_data_from XML, the file is parsed and its root is iterated over to get the hindi strings and their corresponding english strings
#Only the TargetName with ID = 1 is used, since one source string might have multiple target strings
#We now have equivalent strings in the form of nested lists. Our next step is to remove strings which have an unequal number of words, so we compare and skip
#Finally individual words are added into a new list, and the cleaning functions are called as usual
#Two functions to create data for us are used, one which returns a sample of a hindi and english word and the other which returns a consecutive batch of words


## Encoder Architecture

In [None]:
class Encoder(nn.Module):
  """Embeds a sequence of source indices and runs it through an LSTM.

  Args:
    input_size: number of distinct source indices (embedding rows).
    emb_size: width of each embedding vector.
    hidden_size: width of the LSTM hidden and cell states.
  """

  def __init__(self,input_size,emb_size,hidden_size):
    super().__init__()
    self.input_size, self.emb_size, self.hidden_size = input_size, emb_size, hidden_size

    self.embedding = nn.Embedding(input_size, emb_size)
    # Default nn.LSTM layout: the first dimension of its input is the sequence.
    self.rnn = nn.LSTM(emb_size, hidden_size)

  def forward(self,input_word):
    """Return the LSTM's final (h_state, c_state) for `input_word`.

    `input_word` holds indices (any numeric dtype; it is cast to long before
    the embedding lookup). The per-step LSTM outputs are discarded.
    """
    embedded = self.embedding(input_word.long())
    _, final_states = self.rnn(embedded)
    h_state, c_state = final_states
    return h_state, c_state

#Since we want to do a sequence to sequence problem, the following methodology is followed
#First we will encode our given sequence into a fixed dimension
#Then this data will be passed on to the decoder, which will return the transliterated version of our sequence
#This task will be divided into three steps, the first being that the data is encoded, then decoded and finally a class that calls these classes as needed
#This is the encoder of the model, we specify all the dimensions in the init function and the LSTM cell that we shall be using
#So our input word is embedded first and is passed through the LSTM cell at once
#The hidden and cell states of the LSTM cell are returned as the encoded form of the word to be used

## Decoder Architecture

In [None]:
class Decoder(nn.Module):
  """One decoding step: embedding -> LSTM -> linear layer over the output vocabulary.

  Args:
    output_size: number of distinct target indices (embedding rows and width
      of the prediction).
    emb_size: width of each embedding vector.
    hidden_size: width of the LSTM hidden and cell states.
  """

  def __init__(self,output_size,emb_size,hidden_size):
    super().__init__()
    self.output_size, self.emb_size, self.hidden_size = output_size, emb_size, hidden_size

    self.embedding = nn.Embedding(output_size,emb_size)
    self.rnn = nn.LSTM(emb_size, hidden_size)
    self.fc = nn.Linear(hidden_size,output_size)

  def forward(self,input,h_state,c_state):
    """Run a single time step.

    Args:
      input: one index per batch element (cast to long before the lookup).
      h_state, c_state: LSTM states to start this step from.

    Returns:
      (prediction, h_state, c_state), where prediction holds one row of
      `output_size` unnormalised scores per batch element and the states are
      the LSTM states after this step.
    """
    # Add a leading sequence dimension of length 1 for the LSTM.
    step_indices = input.unsqueeze(0).long()
    embedded = self.embedding(step_indices)
    rnn_output, (h_state, c_state) = self.rnn(embedded, (h_state, c_state))
    return self.fc(rnn_output.squeeze(0)), h_state, c_state

#This is the decoder model, wherein the encoded word is converted to the output
#This is a single cell of the decoder, i.e a single output will be given 
#We specify the dimensions as usual, but there is also a final linear layer that will give us a distribution over the output size
#The distribution will form a part of the input of the next cell so that is also returned, as well as the hidden and cell states of the network

## Seq-2-Seq Architecture

In [None]:
class seq2seq(nn.Module):
  """Encoder-decoder wrapper that decodes one target step at a time.

  Args:
    encoder: module whose call returns (h_state, c_state) for the input.
    decoder: module with an `output_size` attribute whose call
      decoder(input, h_state, c_state) returns (prediction, h_state, c_state).
  """

  def __init__(self,encoder,decoder):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder

  def forward(self,input, target,teacher_forcing_ratio = 0.5):
    """Decode `target.shape[0]` steps for a batch of `target.shape[1]` sequences.

    Args:
      input: source batch, passed to the encoder unchanged.
      target: target indices; the first dimension is the step, the second the
        batch element.
      teacher_forcing_ratio: probability, drawn independently at every step,
        that the next decoder input is the true target for this step instead
        of the decoder's own argmax prediction.

    Returns:
      Tensor of shape (target_length, batch_size, output_size) holding the
      decoder's prediction for every step.
    """
    target_length = target.shape[0]
    batch_size = target.shape[1]
    output_size = self.decoder.output_size

    # Allocate on the target's device: the buffers used to be created on the
    # CPU unconditionally, which fails once model and data live elsewhere.
    device = target.device

    outputs_to_return = torch.zeros(target_length,batch_size,output_size,device=device)

    h_state,c_state = self.encoder(input)

    # The first decoder input is index 0 for every batch element.
    input = torch.zeros(batch_size,device=device)

    for i in range(target_length):
      output,h_state,c_state = self.decoder(input,h_state,c_state)

      outputs_to_return[i] = output

      teacher_force = np.random.random() < teacher_forcing_ratio

      top1 = output.argmax(1)

      input = target[i] if teacher_force else top1

    return outputs_to_return

#Here the task comes all together in the sequence-to-sequence architecture
#The encoder and decoder we have created are taken and a final output sequence structure is initialized to all zeros
#The input is passed through the encoder all at once and the final states are taken
#Then we loop over the target length and pass the hidden states as input to the first decoder cell to get an output, the initial input is zero
#The output is stored in the outputs tensor, and its most likely index (argmax) can be passed on as input to the next cell
#Teacher forcing is applied with a probability of 50% at each step: when it is applied, the actual target value is passed to the next cell instead of the model's own prediction
#Finally the outputs tensor is returned

## Setting up the training process

In [None]:
# Vocabulary sizes include the padding index 0.
input_size, output_size = 27, 129
embedding_size, hidden_size = 256, 512

encoder = Encoder(input_size=input_size, emb_size=embedding_size, hidden_size=hidden_size)
decoder = Decoder(output_size=output_size, emb_size=embedding_size, hidden_size=hidden_size)
model = seq2seq(encoder=encoder, decoder=decoder)

#Instantiating all classes

### Weight initialization

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
# Run init_weights on the model and each of its sub-modules; apply() returns
# the model, so the cell displays its structure.
model.apply(fn=init_weights)

#Method to initialize weights in the model

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(27, 256)
    (rnn): LSTM(256, 512)
  )
  (decoder): Decoder(
    (embedding): Embedding(129, 256)
    (rnn): LSTM(256, 512)
    (fc): Linear(in_features=512, out_features=129, bias=True)
  )
)

### Loss function and optimizer

In [None]:
# Adam over every parameter of the encoder-decoder model.
opt = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Target positions holding the padding index (0) do not contribute to the loss.
loss_func = nn.CrossEntropyLoss(ignore_index=hindi_dict['<pad>'])

### Training Helper function

In [None]:
def train(model, data_loader, loss_func, optimizer,no_of_batches, batch_size=16, pool_size=5000):
  """Accumulate gradients over several mini-batches and take one optimizer step.

  For each of the `no_of_batches` iterations a pool of `pool_size` consecutive
  word pairs is drawn from `data_loader` and encoded, a target length is
  sampled from 3..9, and the first `batch_size` pairs whose encoded target has
  that length form the mini-batch. Inputs are right-padded with zeros to the
  longest input in the mini-batch. Gradients of all mini-batches are summed,
  clipped to norm 1, and applied in a single optimizer step at the end.

  Changes from the earlier version:
    * `data_loader` is actually used (the global `data_store` was used before).
    * A pool containing fewer than `batch_size` matching words yields a smaller
      mini-batch instead of an IndexError; a pool with none is skipped.
    * `retain_graph=True` was dropped: every mini-batch builds its own graph,
      so retaining it only held on to memory.

  Args:
    model: callable as model(input, target) returning per-step scores with the
      vocabulary as last dimension.
    data_loader: object providing generate_random_batch(n).
    loss_func: callable as loss_func(scores_2d, target_indices_1d).
    optimizer: optimizer over the model's parameters.
    no_of_batches: number of mini-batches to accumulate.
    batch_size: maximum number of words per mini-batch.
    pool_size: number of word pairs drawn when searching for a mini-batch.

  Returns:
    Mean loss over the mini-batches that were processed, or NaN if none could
    be formed (in which case no optimizer step is taken).
  """
  model.train()
  total_loss = 0
  batches_used = 0
  optimizer.zero_grad()
  for _ in range(no_of_batches):
    X_pool,Y_pool = data_loader.generate_random_batch(pool_size)
    X_pool, Y_pool = convert_eng_to_encoded(X_pool), convert_hindi_to_encoded(Y_pool)

    # All targets in a mini-batch share one length so they can be stacked
    # without padding; inputs may differ and are padded below.
    sequence_length = np.random.randint(3,10)
    X_batch = []
    Y_batch = []
    for xpoint, ypoint in zip(X_pool,Y_pool):
      if len(X_batch) == batch_size:
        break
      if ypoint.shape[0] == sequence_length:
        X_batch.append(xpoint)
        Y_batch.append(ypoint)

    actual_batch_size = len(X_batch)
    if actual_batch_size == 0:
      continue

    max_input_length = max(xpoint.shape[0] for xpoint in X_batch)
    X_final_batch = torch.zeros(max_input_length,actual_batch_size)
    Y_final_batch = torch.zeros(sequence_length,actual_batch_size)
    for column, (xpoint, ypoint) in enumerate(zip(X_batch,Y_batch)):
      X_final_batch[:xpoint.shape[0],column] = xpoint.view(-1)
      Y_final_batch[:,column] = ypoint.view(-1)

    output = model(X_final_batch,Y_final_batch)

    output_size = output.shape[-1]

    loss = loss_func(output.view(-1,output_size),Y_final_batch.view(-1).long())
    total_loss += loss.item()
    loss.backward()
    batches_used += 1

  if batches_used == 0:
    return float('nan')

  torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

  optimizer.step()

  return total_loss/batches_used

#Training function, which takes a batch of inputs and passes them through the model
#loss is computed, back propagated and weights are updated
#Clipping is done to avoid explosion of gradients

#The difference here is that we want to use batching in our model to train faster
#We take a batch of 5000 words and encode it
#We then get 16 samples for which the y values are of the same size, x need not be the same 
#Once we have those, we find the largest x length and make all x values equal to that length by adding zeros
#We then take all of them together as batches so we now have x of a particular size and y of a particular size
#These batches are passed as input

### Evaluation helper function

In [None]:
def evaluate(model, test_data_loader, loss_func, no_of_samples=32):
  """Return the mean loss of `model` on randomly drawn word pairs.

  `no_of_samples` consecutive pairs are drawn from `test_data_loader`, encoded
  and passed through the model one word at a time.

  Changes from the earlier version:
    * Teacher forcing is switched off (ratio 0), so the model is scored on its
      own predictions instead of being fed the true targets half of the time.
    * The forward passes run under torch.no_grad(), as no gradients are needed.
    * The mean is taken over the number of evaluated words rather than a
      hard-coded 32.

  Args:
    model: callable as model(input, target, teacher_forcing_ratio).
    test_data_loader: object providing generate_random_batch(n).
    loss_func: callable as loss_func(scores_2d, target_indices_1d).
    no_of_samples: number of word pairs to evaluate.

  Returns:
    Mean loss per word, or NaN if no words were drawn.
  """
  model.eval()
  X_test,Y_test = test_data_loader.generate_random_batch(no_of_samples)
  X_test_new = convert_eng_to_encoded(X_test)
  Y_test_new = convert_hindi_to_encoded(Y_test)

  if len(X_test_new) == 0:
    return float('nan')

  total_loss = 0

  with torch.no_grad():
    for input_word, target in zip(X_test_new, Y_test_new):
      # Third argument is the teacher forcing ratio: 0 turns it off.
      output = model(input_word,target,0)

      output_size = output.shape[-1]

      loss = loss_func(output.view(-1,output_size),target.view(-1).long())

      total_loss += loss.item()

  return total_loss/len(X_test_new)

#Evaluation function is similar, returns the loss on test data
#The model is sent into eval mode and no optimization is done

## Starting the training process

In [None]:
# Training pairs; string pairs with mismatched word counts are reported below as 'Skipping:'.
data_store = EncoderDecoderData(filename='training.xml')

Skipping: ['MAHARANI', 'PADMINI'] - ['महारानी', 'पद्', 'मिनी']
Skipping: ['STATE', 'MUSEUM', 'OF', 'THE', 'VERMONT', 'HISTORICAL', 'SOCIETY'] - ['स्टेट', 'म्युज़ियम', 'ऑफ', 'द', 'वरमाउंट', 'हिस्टॉरिकल', 'सोसायट', 'ी']
Skipping: ['I', 'DUKAANT'] - ['इंदुकांत']
Skipping: ['EFFIE', 'AWARDS'] - ['एफी', 'अवार्ड्', 'स']
Skipping: ['LAURENCE', 'OLIVIER', 'AWARDS'] - ['लॉरेंस', 'ओलिवर', 'अवार्ड्', 'स']
Skipping: ['ETTA'] - ['एट्', 'टा']
Skipping: ['COLLEGE', 'FOOTBALL', 'AWARDS'] - ['कॉलेज', 'फुटबॉल', 'अवार्ड्', 'स']
Skipping: ['STEVE', 'RHODES'] - ['स्टीव', 'रोड्', 'स']
Skipping: ['WINDHAM', 'COUNTY', 'HISTORICAL', 'MUSEUM'] - ['व', 'िंडहैम', 'काउंट', 'ी', 'ह', 'िस्टॉर', 'िकल', 'म्युज़ियम']
Skipping: ['PLAZA'] - ['प्लाज़ा', '66']
Skipping: ['ADVAKI'] - ['अद्', 'वाकी']
Skipping: ['BHAALACHAN', 'DR'] - ['भालचन्द्र']
Skipping: ['BARHARWA', 'JUNCTION'] - ['बरहरवा']
Skipping: ['STATE', 'BNK', 'TR'] - ['स्टेट', 'बैंक', 'ऑफ', 'त्रावणकोर']
Skipping: ['IN', 'DRAJEET'] - ['इन्द्रजीत']
Skipping: ['WOODSTO

In [None]:
# Held-out pairs from the testing file.
data_tester = EncoderDecoderData(filename='testing.xml')

Skipping: ['W', 'TTEMBERG'] - ['यूटमबर्ग']


In [None]:
epochs = 100

for epoch in tqdm_notebook(range(epochs)):
  # Each epoch accumulates 20 mini-batches before the optimizer step.
  print('Training Loss:',train(model,data_store,loss_func,opt,20))
  # Validate on the held-out testing data; this used to be computed on the
  # training data (data_store), so it did not measure generalisation.
  print('validation Loss:',evaluate(model,data_tester,loss_func))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Training Loss: 4.867519044876099
validation Loss: 4.817363411188126
Training Loss: 4.80725462436676
validation Loss: 4.745693862438202
Training Loss: 4.7375040531158445
validation Loss: 4.6576035767793655
Training Loss: 4.6462595701217655
validation Loss: 4.498365253210068
Training Loss: 4.441825222969055
validation Loss: 4.193519853055477
Training Loss: 4.060336399078369
validation Loss: 3.8106859177351
Training Loss: 3.8439319133758545
validation Loss: 3.718235857784748
Training Loss: 3.7936527729034424
validation Loss: 3.9040592685341835
Training Loss: 3.633707642555237
validation Loss: 3.4685273990035057
Training Loss: 3.62103590965271
validation Loss: 3.591299779713154
Training Loss: 3.596520471572876
validation Loss: 3.449893295764923
Training Loss: 3.580352222919464
validation Loss: 3.607529856264591
Training Loss: 3.571495199203491
validation Loss: 3.458219677209854
Training Loss: 3.6018193125724793
validation Loss: 3.7150397300720215
Training Loss: 3.473277199268341
validation

In [None]:
epochs = 100

# Continues training the same model for another 100 epochs.
for epoch in tqdm_notebook(range(epochs)):
  # Each epoch accumulates 20 mini-batches before the optimizer step.
  print('Training Loss:',train(model,data_store,loss_func,opt,20))
  # Validate on the held-out testing data; this used to be computed on the
  # training data (data_store), so it did not measure generalisation.
  print('validation Loss:',evaluate(model,data_tester,loss_func))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Training Loss: 2.706491819024086
validation Loss: 2.9528014212846756
Training Loss: 2.653440684080124
validation Loss: 2.6931162290275097
Training Loss: 2.77426078915596
validation Loss: 2.917041133157909
Training Loss: 2.757438653707504
validation Loss: 2.6366179417818785
Training Loss: 2.5670855700969697
validation Loss: 2.7171876579523087
Training Loss: 2.601834911108017
validation Loss: 2.8337081354111433
Training Loss: 2.6417100012302397
validation Loss: 2.859104059636593
Training Loss: 2.6237579762935637
validation Loss: 2.9127277731895447
Training Loss: 2.6222614467144014
validation Loss: 2.723024219274521
Training Loss: 2.508723509311676
validation Loss: 2.618336282670498
Training Loss: 2.5753480434417724
validation Loss: 2.6094206934794784
Training Loss: 2.6305887520313265
validation Loss: 2.6566354297101498
Training Loss: 2.629073303937912
validation Loss: 2.699669197201729
Training Loss: 2.527462565898895
validation Loss: 2.6516761500388384
Training Loss: 2.5356066286563874
