In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import cv2
import pathlib
import glob
import shutil
import os
import random

In [2]:
#--------------------------------caution: destructive cleanup ---------------------------------------------
# Empty the Keras dataset cache so the download below starts from a clean state.
# The directory is addressed by an explicit path: the previous `%cd .keras/datasets/`
# followed by `!rm -r *` would delete the contents of the home directory whenever
# the cache directory did not exist yet (the `%cd` fails, the `rm` still runs).
keras_datasets_dir = pathlib.Path.home() / ".keras" / "datasets"
shutil.rmtree(keras_datasets_dir, ignore_errors=True)
keras_datasets_dir.mkdir(parents=True, exist_ok=True)
# Keep the side effect of the original cell: the working directory ends up in the cache folder.
os.chdir(keras_datasets_dir)

/root
/root/.keras/datasets


In [3]:
########################################### download data from given url ###############################################

# Archive location of the Dakshina dataset (v1.0).
dataset_url = "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"
# Fetch the archive into the Keras cache and unpack it; `data_dir` receives the path returned by get_file.
data_dir = tf.keras.utils.get_file(
    fname='dakshina_dataset_v1.0',
    origin=dataset_url,
    untar=True,
)

Downloading data from https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar


In [4]:
#----------------------------------terminal command -----------------------------------------------
# Make the Hindi lexicon folder of the dataset the working directory and list its files.
# NOTE(review): the path is hard-coded under /root, so it only resolves when the Keras cache lives there.
%cd /root/.keras/datasets/dakshina_dataset_v1.0/hi/lexicons/
!ls

/root/.keras/datasets/dakshina_dataset_v1.0/hi/lexicons
hi.translit.sampled.dev.tsv   hi.translit.sampled.train.tsv
hi.translit.sampled.test.tsv


In [5]:
# File names of the train / test / dev lexicon splits.
# These are relative paths, so they resolve against the current working directory
# (the lexicons folder entered with %cd in the previous cell).
train_data_path = "hi.translit.sampled.train.tsv"
test_data_path = "hi.translit.sampled.test.tsv"
validation_data_path = "hi.translit.sampled.dev.tsv"

**UTILITY FUNCTIONS FOR PREPROCESSING**

In [6]:
################################# function for vectorizing the data ##########################################

def vectorize_data(train_data_path, num_samples=None):
  """Read a tab-separated lexicon file and build character-level vocabularies.

  Every non-blank line must contain exactly three tab-separated fields: the
  first is used as the target text, the second as the input text and the third
  is ignored. Each target text is wrapped as "\\t" + text + "\\n" (tab = start
  of sequence, newline = end of sequence). A space character is always added to
  both vocabularies because the encoders below use it as the padding token.

  Args:
    train_data_path: path of the UTF-8 encoded .tsv file to read.
    num_samples: maximum number of records to use. When None, the
      notebook-level global `num_samples` is used if it exists (this is what
      the function always did before the parameter was added); if that global
      is not defined either, every record in the file is used.

  Returns:
    A tuple (input_details, target_details). Each is a list of
    [sorted characters, texts, char -> index dict, vocabulary size,
     maximum text length].

  Raises:
    ValueError: if a non-blank line does not have exactly three fields.
  """
  if num_samples is None:
    # Backward compatibility: earlier callers relied on the global setting.
    num_samples = globals().get("num_samples")

  with open(train_data_path, "r", encoding="utf-8") as f:
    # Drop blank lines instead of assuming exactly one trailing newline, so the
    # last record is kept when the file does not end with "\n".
    lines = [line for line in f.read().split("\n") if line.strip()]
  if num_samples is not None:
    lines = lines[:num_samples]

  input_texts = []
  target_texts = []
  input_characters = set()
  target_characters = set()
  for line in lines:
    target_text, input_text, _ = line.split("\t")
    #---------------We use "tab" as the "start sequence" character---------------------
    #----------------for the targets, and "\n" as "end sequence" character-----------------.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    input_characters.update(input_text)
    target_characters.update(target_text)

  #--------------------------------padding token, added artificially-----------------------------
  input_characters.add(" ")
  target_characters.add(" ")

  input_characters = sorted(input_characters)
  target_characters = sorted(target_characters)

  num_encoder_tokens = len(input_characters)
  num_decoder_tokens = len(target_characters)
  # default=0 keeps an empty file from raising inside max().
  max_encoder_seq_length = max((len(txt) for txt in input_texts), default=0)
  max_decoder_seq_length = max((len(txt) for txt in target_texts), default=0)

  print("Number of samples:", len(input_texts))
  print("Number of unique input tokens:", num_encoder_tokens)
  print("Number of unique output tokens:", num_decoder_tokens)
  print("Max sequence length for inputs:", max_encoder_seq_length)
  print("Max sequence length for outputs:", max_decoder_seq_length)

  input_token_index = {char: i for i, char in enumerate(input_characters)}
  target_token_index = {char: i for i, char in enumerate(target_characters)}

  input_details = [input_characters, input_texts, input_token_index, num_encoder_tokens, max_encoder_seq_length]
  target_details = [target_characters, target_texts, target_token_index, num_decoder_tokens, max_decoder_seq_length]

  return (input_details, target_details)

In [7]:
################### function for converting the data into appropriate one-hot vectors ######################

def onehot(input_details, target_details):
    """One-hot encode input/target texts for teacher-forced seq2seq training.

    Args:
        input_details: list laid out as
            [characters, texts, char -> index dict, vocabulary size, max length].
        target_details: list with the same layout for the target side.

    Returns:
        (encoder_input_data, decoder_input_data, decoder_target_data), three
        float32 arrays of shape (number of texts, max length, vocabulary size).
        decoder_target_data holds the same characters as decoder_input_data
        shifted one timestep earlier. Every position after the end of a text is
        set to the one-hot vector of the space character (the padding token).

    Raises:
        KeyError: if a text contains a character missing from its index dict.
    """
    #---------------------------unzipping information-----------------------------------
    input_texts = input_details[1]
    input_token_index = input_details[2]
    num_encoder_tokens = input_details[3]
    max_encoder_seq_length = input_details[4]

    target_texts = target_details[1]
    target_token_index = target_details[2]
    num_decoder_tokens = target_details[3]
    max_decoder_seq_length = target_details[4]

    #---------------------------- creating 3-dim matrices with all entries = 0 -----------------------------------
    num_texts = len(input_texts)
    encoder_input_data = np.zeros((num_texts, max_encoder_seq_length, num_encoder_tokens), dtype="float32")
    decoder_input_data = np.zeros((num_texts, max_decoder_seq_length, num_decoder_tokens), dtype="float32")
    decoder_target_data = np.zeros((num_texts, max_decoder_seq_length, num_decoder_tokens), dtype="float32")

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        # Pad from the text length rather than from the loop variable: after an
        # empty text the loop variable is undefined or left over from a
        # previous iteration, which put the padding in the wrong place.
        encoder_input_data[i, len(input_text):, input_token_index[" "]] = 1.0

        for t, char in enumerate(target_text):
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                # decoder_target_data is ahead of decoder_input_data by one timestep
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0

        decoder_input_data[i, len(target_text):, target_token_index[" "]] = 1.0
        # The target sequence is one step shorter, so its padding starts one step earlier.
        decoder_target_data[i, max(len(target_text) - 1, 0):, target_token_index[" "]] = 1.0

    return (encoder_input_data, decoder_input_data, decoder_target_data)

In [8]:
################### function for creating data into appropriate form required for embedding ###################

def get_input_for_embedding(input_details, embed_size, train_details = None):
  """Encode texts as a 2-D array of character indices.

  Args:
    input_details: list laid out as
      [characters, texts, char -> index dict, vocabulary size, max length].
    embed_size: not used by this function; kept so existing calls keep working.
    train_details: optional list with the same layout. When given, its
      char -> index dict is used instead of the one in input_details, and any
      character missing from it is encoded as the space character.

  Returns:
    float32 array of shape (number of texts, max length). Positions after the
    end of a text hold the index of the space character (the padding token).

  Raises:
    KeyError: if the index dict in use has no entry for " ".
  """
  #---------------------------unzipping information-----------------------------------
  input_texts = input_details[1]
  input_token_index = input_details[2]
  max_encoder_seq_length = input_details[4]

  if train_details is not None:
    input_token_index = train_details[2]

  pad_index = input_token_index[" "]
  # Start from an all-padding array so that short and empty texts are padded
  # correctly; the old code padded from a loop variable that was undefined or
  # stale when a text was empty.
  input_array = np.full((len(input_texts), max_encoder_seq_length), pad_index, dtype="float32")
  for i, input_text in enumerate(input_texts):
    for t, char in enumerate(input_text):
      # Characters missing from the index dict fall back to the padding index.
      input_array[i, t] = input_token_index.get(char, pad_index)

  return input_array




**MACHINE TRANSLITERATOR**


In [9]:
class Machine_Transliterator():
  """Builds and trains an encoder-decoder (seq2seq) Keras model.

  Both encoder and decoder are an Embedding layer followed by one or more
  stacked recurrent layers (LSTM, SimpleRNN or GRU, selected by `cell_type`);
  a Dense layer on top of the decoder produces one output vector per timestep.
  """

  # cell_type value -> name of the matching layer class in keras.layers
  _RNN_LAYER_NAMES = {"lstm": "LSTM", "rnn": "SimpleRNN", "gru": "GRU"}

  ############################################# constructor for class Machine_Transliterator ##########################################

  def __init__(self,max_encoder_seq_length,max_decoder_seq_length,encoder_embed_size, decoder_embed_size,
               num_hidden_layers_in_encoder,num_hidden_layers_in_decoder,epochs, hidden_layer_size,
               num_encoder_tokens, cell_type, num_decoder_tokens,input_token_index, target_token_index,
               activation="softmax",optimizer="rmsprop"):
    """Store the model configuration; no Keras objects are created here.

    Args:
      max_encoder_seq_length: longest input sequence length (stored only).
      max_decoder_seq_length: longest target sequence length (stored only).
      encoder_embed_size: output dimension of the encoder Embedding layer.
      decoder_embed_size: output dimension of the decoder Embedding layer.
      num_hidden_layers_in_encoder: number of stacked recurrent encoder layers
        (at least one layer is always built).
      num_hidden_layers_in_decoder: number of stacked recurrent decoder layers
        (at least one layer is always built).
      epochs: stored as `self.epochs`; train_model takes its own `epochs`
        argument and does not read this attribute.
      hidden_layer_size: number of units in every recurrent layer.
      num_encoder_tokens: input vocabulary size (encoder Embedding input dim).
      cell_type: "lstm", "rnn" or "gru".
      num_decoder_tokens: target vocabulary size (decoder Embedding input dim
        and number of units of the output Dense layer).
      input_token_index: char -> index dict of the input side (stored only).
      target_token_index: char -> index dict of the target side (stored only).
      activation: activation of the output Dense layer.
      optimizer: optimizer passed to model.compile.
    """
    self.cell_type = cell_type
    self.hidden_layer_size = hidden_layer_size
    self.optimizer = optimizer
    self.activation = activation
    # Previously accepted but silently dropped.
    self.epochs = epochs

    #-------------------------------------- Number of hidden layers -------------------------------------
    self.num_hidden_layers_in_encoder = num_hidden_layers_in_encoder
    self.num_hidden_layers_in_decoder = num_hidden_layers_in_decoder

    #-------------------------------------- sequence length -------------------------------------
    self.max_decoder_seq_length = max_decoder_seq_length
    self.max_encoder_seq_length = max_encoder_seq_length

    #---------------------------------------------Embedding size-------------------------------------
    self.encoder_embed_size = encoder_embed_size
    self.decoder_embed_size = decoder_embed_size

    #-----------------information obtained after preprocessing of data-------------------------------------
    self.num_encoder_tokens = num_encoder_tokens
    self.num_decoder_tokens = num_decoder_tokens

    #-----------------------------dictionaries----------------------------------------------------
    self.input_token_index = input_token_index
    self.target_token_index = target_token_index

#########################################function to build model ###########################################

  def build_model(self):
    """Assemble the encoder-decoder model described by the stored settings.

    Returns:
      An uncompiled keras.Model taking [encoder_inputs, decoder_inputs]
      (two 2-D inputs of token indices) and returning the Dense output for
      every decoder timestep.

    Raises:
      ValueError: if `cell_type` is not "lstm", "rnn" or "gru". (The previous
        code fell through to a NameError in that case.)
    """
    if self.cell_type not in self._RNN_LAYER_NAMES:
      raise ValueError(
          "Unsupported cell_type %r; expected one of %s"
          % (self.cell_type, sorted(self._RNN_LAYER_NAMES)))
    rnn_layer = getattr(keras.layers, self._RNN_LAYER_NAMES[self.cell_type])

    #--------------------- encoder -----------------------------------
    encoder_inputs = keras.Input(shape=(None,))
    encoder_outputs = keras.layers.Embedding(self.num_encoder_tokens, self.encoder_embed_size)(encoder_inputs)

    encoder_states = []
    for _ in range(max(1, self.num_hidden_layers_in_encoder)):
      # With return_state=True a layer returns its output sequence followed by
      # its final state(s): two tensors for LSTM, one for SimpleRNN / GRU.
      # Only the states of the last encoder layer are kept.
      encoder_outputs, *encoder_states = rnn_layer(
          self.hidden_layer_size, return_state=True, return_sequences=True)(encoder_outputs)

    #---------------------------decoder ---------------------------------------------------
    decoder_inputs = keras.Input(shape=(None,))
    # Every cell type consumes the embedded decoder input. The GRU branch used
    # to pass the raw `decoder_inputs` tensor to the recurrent layer instead.
    decoder_outputs = keras.layers.Embedding(self.num_decoder_tokens, self.decoder_embed_size)(decoder_inputs)

    for _ in range(max(1, self.num_hidden_layers_in_decoder)):
      # Each decoder layer starts from the final encoder states.
      decoder_outputs, *_ = rnn_layer(
          self.hidden_layer_size, return_state=True, return_sequences=True)(
              decoder_outputs, initial_state=encoder_states)

    decoder_dense = keras.layers.Dense(self.num_decoder_tokens, activation=self.activation)
    decoder_outputs = decoder_dense(decoder_outputs)
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

#-------------------------- return final model ---------------------------------------------------------
    return model

#########################################function for training the model ###########################################

  def train_model(self,encoder_input_data,decoder_input_data,decoder_target_data,epochs,batch_size,
                  val_encoder_input_data, val_decoder_input_data, val_decoder_target_data):
    """Build, compile and fit a fresh model.

    Args:
      encoder_input_data: encoder inputs for training.
      decoder_input_data: decoder inputs for training.
      decoder_target_data: training targets; the model is compiled with
        categorical cross-entropy.
      epochs: number of epochs passed to model.fit.
      batch_size: batch size passed to model.fit.
      val_encoder_input_data: encoder inputs for validation.
      val_decoder_input_data: decoder inputs for validation.
      val_decoder_target_data: validation targets.

    Returns:
      The trained keras.Model.
    """
    model = self.build_model()

    #-----------------compile the model -------------------------------------
    model.compile(
        optimizer=self.optimizer,
        loss="categorical_crossentropy",
        metrics=["accuracy"]
        )
    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size = batch_size,
        epochs = epochs,
        validation_data = ([val_encoder_input_data, val_decoder_input_data],val_decoder_target_data),)
    return model

#===================================== end of class Machine_Transliterator ==========================================



**PARAMETERS**

In [10]:
cell_type = "rnn" # Type of recurrent unit: "lstm", "rnn" or "gru" (the values build_model checks for)
batch_size = 64  # Batch size for training.
epochs = 20  # Number of epochs to train for.
hidden_layer_size= 256  # Number of units in every recurrent layer.
num_samples = 10000  # Maximum number of lines vectorize_data reads from a file (it reads this global).
activation = "softmax" # Activation of the final Dense layer
optimizer = "Adam"  # Optimizer passed to model.compile
encoder_embed_size = 27 # Output dimension of the encoder Embedding layer
decoder_embed_size = 64 # Output dimension of the decoder Embedding layer
num_hidden_layers_in_encoder=2  # number of stacked recurrent layers in the encoder
num_hidden_layers_in_decoder=2   # number of stacked recurrent layers in the decoder

**PREPROCESSING THE DATA**

In [11]:
############################# preprocessing the train data ################################

#---------------------------- vectorizing train data ----------------------------------
# Step 1: read the training file and build the character vocabularies.
input_details, target_details = vectorize_data(train_data_path)

# Step 2: one-hot encode. The decoder targets stay one-hot; the two input
# arrays are replaced by index arrays in step 4.
(encoder_input_data,
 decoder_input_data,
 decoder_target_data) = onehot(input_details, target_details)

# Step 3: expose the individual fields of both detail lists as notebook-level names.
(input_characters,
 input_texts,
 input_token_index,
 num_encoder_tokens,
 max_encoder_seq_length) = input_details[:5]
(target_characters,
 target_texts,
 target_token_index,
 num_decoder_tokens,
 max_decoder_seq_length) = target_details[:5]

# Step 4: integer index arrays, the form fed to the Embedding layers.
encoder_input_data = get_input_for_embedding(input_details, encoder_embed_size)
decoder_input_data = get_input_for_embedding(target_details, decoder_embed_size)

Number of samples: 10000
Number of unique input tokens: 27
Number of unique output tokens: 64
Max sequence length for inputs: 18
Max sequence length for outputs: 20


In [12]:
############################# preprocessing the validation data ################################

#---------------------------- vectorizing validation data ----------------------------------
val_input_details, val_target_details = vectorize_data(validation_data_path)

#------------------------------- unzipping the data ----------------------------------------
val_input_characters = val_input_details[0]
val_input_texts = val_input_details[1]
val_input_token_index = val_input_details[2]
val_num_encoder_tokens = val_input_details[3]
val_max_encoder_seq_length = val_input_details[4]
val_target_characters = val_target_details[0]
val_target_texts = val_target_details[1]
val_target_token_index = val_target_details[2]
val_num_decoder_tokens = val_target_details[3]
val_max_decoder_seq_length = val_target_details[4]

#------------------------converting data into one-hot representation ---------------------------
# The model's output layer is sized and indexed by the TRAINING vocabulary, so
# the validation targets must be one-hot encoded with the training char -> index
# dict. Encoding them with the validation file's own vocabulary (as before)
# mislabels every class whose index differs between the two files and breaks
# the shapes whenever the vocabulary sizes differ.
def _details_in_train_vocab(val_details, train_details):
    """Return val_details re-expressed with the training vocabulary.

    Characters that never occurred in the training data are replaced by the
    space (padding) character, matching what get_input_for_embedding does.
    The validation set's own maximum length is kept.
    """
    train_index = train_details[2]
    texts = ["".join(ch if ch in train_index else " " for ch in text)
             for text in val_details[1]]
    return [train_details[0], texts, train_index, train_details[3], val_details[4]]

val_encoder_input_data, val_decoder_input_data, val_decoder_target_data = onehot(
    _details_in_train_vocab(val_input_details, input_details),
    _details_in_train_vocab(val_target_details, target_details),
)

#-------------------- converting input data into appropriate embedding ----------------------
val_encoder_input_data = get_input_for_embedding(val_input_details, encoder_embed_size, input_details)
val_decoder_input_data = get_input_for_embedding(val_target_details, decoder_embed_size, target_details)

Number of samples: 4358
Number of unique input tokens: 27
Number of unique output tokens: 64
Max sequence length for inputs: 18
Max sequence length for outputs: 16


**CREATING MACHINE TRANSLITERATOR**

In [15]:
########################### creating machine transliterator object ###############################
# Keyword arguments keep the long parameter list from being mis-ordered: the
# positional call used to pass `batch_size` in the `epochs` slot.
machine = Machine_Transliterator(
    max_encoder_seq_length=max_encoder_seq_length,
    max_decoder_seq_length=max_decoder_seq_length,
    encoder_embed_size=encoder_embed_size,
    decoder_embed_size=decoder_embed_size,
    num_hidden_layers_in_encoder=num_hidden_layers_in_encoder,
    num_hidden_layers_in_decoder=num_hidden_layers_in_decoder,
    epochs=epochs,
    hidden_layer_size=hidden_layer_size,
    num_encoder_tokens=num_encoder_tokens,
    cell_type=cell_type,
    num_decoder_tokens=num_decoder_tokens,
    input_token_index=input_token_index,
    target_token_index=target_token_index,
    activation=activation,
    optimizer=optimizer,
    )

**TRAINING**

In [16]:
# Build, compile and fit the model; the trained Keras model is returned.
model = machine.train_model(
    encoder_input_data=encoder_input_data,
    decoder_input_data=decoder_input_data,
    decoder_target_data=decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    val_encoder_input_data=val_encoder_input_data,
    val_decoder_input_data=val_decoder_input_data,
    val_decoder_target_data=val_decoder_target_data,
    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
