The following code trains a model with the best set of hyperparameters obtained from a set of 87 sweeps and evaluates it on the test data. The program outputs a predictions_vanilla.csv file.

#Importing essentials and Auxiliary functions

In [36]:
# Show the GPU attached to this runtime (driver / CUDA versions and memory in use).
!nvidia-smi

Thu May 13 04:28:21 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    33W / 250W |    841MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [37]:
import io
import csv
import numpy as np
import tensorflow 
from keras.optimizers import Adam
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense, Embedding, GRU, Dropout, SimpleRNN
from keras.utils.vis_utils import plot_model
from math import log
from numpy import array
from numpy import argmax
from math import log1p 
import keras

**Fetching the dataset** 

Lexicons for Latin-Tamil are taken from Google's Dakshina dataset. The necessary datasets have been uploaded to GitHub, cloned, and used for the remainder of the code.

In [38]:
!git clone https://github.com/borate267/lexicon-dataset.git

fatal: destination path 'lexicon-dataset' already exists and is not an empty directory.


In [39]:
# GLOBAL VARIABLES

# When True, the vocabulary/tokenisation cell below prints dataset statistics
# (sample counts, vocabulary sizes and maximum sequence lengths).
print_data = True  # Set to False to silence that summary

#Reading the dataset


In [40]:
# Both splits live in the cloned dataset directory.
DATASET_ROOT = "lexicon-dataset"
train_dir = DATASET_ROOT + "/ta.translit.sampled.train.tsv"
test_dir = DATASET_ROOT + "/ta.translit.sampled.test.tsv"

def read_corpus(corpus_file):
  """Read a tab-separated lexicon file into parallel word lists.

  Each usable line has at least two tab-separated fields: the native-script
  (Tamil) word first and its romanized (Latin) spelling second. Any further
  fields are ignored.

  Lines without a tab, and lines that do not yield two fields once trailing
  whitespace is stripped (e.g. ``"word\\t\\n"``), are skipped rather than
  raising an IndexError.

  Args:
    corpus_file: Path to a UTF-8 encoded TSV file.

  Returns:
    A tuple ``(latin_words, tamil_words)`` of two equally long lists, where
    ``latin_words[i]`` is the romanization of ``tamil_words[i]``.
  """
  tamil_words = []
  latin_words = []
  with io.open(corpus_file, encoding='utf-8') as f:
    for line in f:
      if '\t' not in line:
        continue
      tokens = line.rstrip().split("\t")
      # rstrip() also removes a trailing tab, so a line such as "word\t\n"
      # collapses to a single field; skip it instead of indexing past the end.
      if len(tokens) < 2:
        continue
      tamil_words.append(tokens[0])
      latin_words.append(tokens[1])
  return latin_words, tamil_words

# Load the (romanized, native) word lists for each split.
corpora = {
    split: read_corpus(path)
    for split, path in (("train", train_dir), ("test", test_dir))
}
train_source, train_target = corpora["train"]
test_source, test_target = corpora["test"]


#Shuffling the datasets, creating a vocabulary and tokenising the same

In [41]:
# Fixed seed so that the shuffled order (and therefore the row order of the
# prediction file) is the same on every Restart & Run All. A private
# RandomState is used so the global NumPy random state is left untouched.
SHUFFLE_SEED = 0
_shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
arr = _shuffle_rng.permutation(len(train_source))
arr1 = _shuffle_rng.permutation(len(test_source))

# Unshuffled datasets.
# The target words are wrapped in B and E, which stand for
# Beginning (start sequence character) and End (end sequence character) respectively.
# zip() keeps only aligned (source, target) pairs, as the pairing requires.
input_texts_ns = []
target_texts_ns = []
for input_text, target_text in zip(train_source, train_target):
    input_texts_ns.append(input_text)
    target_texts_ns.append("B" + target_text + "E")

test_input_texts_ns = []
test_target_texts_ns = []
for input_text, target_text in zip(test_source, test_target):
    test_input_texts_ns.append(input_text)
    test_target_texts_ns.append("B" + target_text + "E")

# Holds the vocabulary. Characters from both splits are collected so that
# every test character has a token index.
source_characters = set()
target_characters = set()
for input_text in input_texts_ns + test_input_texts_ns:
    source_characters.update(input_text)
for target_text in target_texts_ns + test_target_texts_ns:
    target_characters.update(target_text)

# Shuffling the datasets (source and target use the same permutation so the
# pairs stay aligned)
input_texts = [input_texts_ns[idx] for idx in arr]
target_texts = [target_texts_ns[idx] for idx in arr]

test_input_texts = [test_input_texts_ns[idx] for idx in arr1]
test_target_texts = [test_target_texts_ns[idx] for idx in arr1]

# Adding the padding character
source_characters.add(" ")
target_characters.add(" ")

# Creating the vocabulary
source_characters = sorted(source_characters)
target_characters = sorted(target_characters)

# Essential parameters which will be periodically used in the remainder of the code

num_encoder_tokens = len(source_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)
test_max_encoder_seq_length = max(len(txt) for txt in test_input_texts)
test_max_decoder_seq_length = max(len(txt) for txt in test_target_texts)

# Tokenising elements in the vocabulary
source_token_index = {char: i for i, char in enumerate(source_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}
reverse_source_char_index = {i: char for char, i in source_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

# Print out some data
if print_data:
  print("Number of training samples:", len(input_texts))
  print("Number of testing samples: ", len(test_source))
  print("Number of unique input tokens:", num_encoder_tokens)
  print("Number of unique output tokens:", num_decoder_tokens)
  print("Max sequence length for inputs:", max_encoder_seq_length)
  print("Max sequence length for outputs:", max_decoder_seq_length)
  print("Max sequence length for test inputs:", test_max_encoder_seq_length)
  print("Max sequence length for test outputs:", test_max_decoder_seq_length)

Number of training samples: 68218
Number of testing samples:  6864
Number of unique input tokens: 27
Number of unique output tokens: 49
Max sequence length for inputs: 30
Max sequence length for outputs: 28
Max sequence length for test inputs: 23
Max sequence length for test outputs: 24


Character Embedding

**Encoder Input Sequences**: Padded to a maximum length of max_encSeqLen characters. 
**SHAPE: (len(train_source), max_encSeqLen)**

**Decoder Input Sequences**: Padded to a maximum length of max_decSeqLen characters. 
**SHAPE: (len(train_source), max_decSeqLen)**

**Decoder Target Sequences**: Padded to a maximum length of max_decSeqLen characters with a vocabulary of sizeofTamilVocab different characters. 
**SHAPE: (len(train_source), max_decSeqLen, sizeofTamilVocab)**

In [42]:
# For training
def vectorize_pairs(src_texts, tgt_texts, max_src_len, max_tgt_len):
    """Turn parallel word lists into padded model inputs/targets.

    Uses the module-level `source_token_index`, `target_token_index` and
    `num_decoder_tokens` built in the vocabulary cell.

    Args:
        src_texts: Source (romanized) words.
        tgt_texts: Target words, already wrapped in the "B"/"E" markers.
        max_src_len: Width of the encoder input array.
        max_tgt_len: Width of the decoder input/target arrays.

    Returns:
        (encoder_input, decoder_input, decoder_target) float32 arrays of shape
        (n, max_src_len), (n, max_tgt_len) and
        (n, max_tgt_len, num_decoder_tokens), where n = len(src_texts).
        decoder_target is one-hot and is decoder_input shifted one step to
        the left (teacher forcing); all remaining positions are the padding
        token " ".
    """
    n_samples = len(src_texts)
    enc_in = np.zeros((n_samples, max_src_len), dtype="float32")
    dec_in = np.zeros((n_samples, max_tgt_len), dtype="float32")
    dec_tgt = np.zeros((n_samples, max_tgt_len, num_decoder_tokens), dtype="float32")

    src_pad = source_token_index[" "]
    tgt_pad = target_token_index[" "]

    for i, (src_text, tgt_text) in enumerate(zip(src_texts, tgt_texts)):
        # Lengths come from len() rather than a leftover loop variable, so an
        # empty word is padded correctly instead of reusing a stale index.
        src_len = len(src_text)
        for t, char in enumerate(src_text):
            enc_in[i, t] = source_token_index[char]
        enc_in[i, src_len:] = src_pad

        tgt_len = len(tgt_text)
        for t, char in enumerate(tgt_text):
            token = target_token_index[char]
            dec_in[i, t] = token
            if t > 0:
                # The target at step t-1 is the decoder input at step t.
                dec_tgt[i, t - 1, token] = 1.0
        dec_in[i, tgt_len:] = tgt_pad
        dec_tgt[i, max(tgt_len - 1, 0):, tgt_pad] = 1.0

    return enc_in, dec_in, dec_tgt


# For training
encoder_input_data, decoder_input_data, decoder_target_data = vectorize_pairs(
    input_texts, target_texts, max_encoder_seq_length, max_decoder_seq_length)

# For testing: arrays are sized by the number of TEST samples (they were
# previously allocated with the training-set row count).
test_encoder_input_data, test_decoder_input_data, test_decoder_target_data = vectorize_pairs(
    test_input_texts, test_target_texts, test_max_encoder_seq_length, test_max_decoder_seq_length)


#TESTING GROUND

In [43]:
# Evaluation inputs (token-index matrix) and their reference target strings.
x_test, y_test = test_encoder_input_data, test_target_texts

Defining the model

In [44]:
class MyRNN(object):
  """Character-level encoder-decoder (seq2seq) transliteration model.

  The encoder and the decoder are stacks of `num_enc` / `num_dec` recurrent
  layers of the same kind (`cell_type`: 'LSTM', 'GRU' or 'RNN'). Every decoder
  layer is initialised with the final state of the last encoder layer.

  The methods rely on these module-level names built by the data cells:
  `num_encoder_tokens`, `num_decoder_tokens`, `target_token_index`,
  `reverse_target_char_index`, `max_decoder_seq_length` and `test_input_texts`.
  """

  _CELL_TYPES = ('LSTM', 'GRU', 'RNN')
  _PRED_TYPES = ('greedy', 'beam_search')

  def __init__(self,cell_type = 'RNN',in_emb = 32, hidden_size=32, learning_rate= 1e-3, 
               dropout=0.4,pred_type ='greedy',epochs = 10, batch_size = 32,beam_width = 5,
               num_enc = 1,num_dec = 1):
    """Store the hyperparameters; no Keras objects are created here.

    Args:
      cell_type: 'LSTM', 'GRU' or 'RNN'.
      in_emb: Encoder embedding dimension.
      hidden_size: Units per recurrent layer (also the decoder embedding dimension).
      learning_rate: Adam learning rate.
      dropout: Input dropout of every recurrent layer.
      pred_type: 'greedy' or 'beam_search' (see `decode_sequence`).
      epochs: Training epochs.
      batch_size: Training batch size.
      beam_width: Beam width associated with pred_type 'beam_search'.
      num_enc: Number of encoder layers (>= 1).
      num_dec: Number of decoder layers (>= 1).
    """
    self.cell_type = cell_type
    self.in_emb = in_emb
    self.hidden_size = hidden_size
    self.learning_rate = learning_rate
    self.dropout = dropout
    self.pred_type = pred_type
    self.epochs = epochs
    self.batch_size = batch_size
    self.beam_width = beam_width
    self.num_enc = num_enc
    self.num_dec = num_dec

  def _cell_class(self):
    """Return the Keras layer class for `self.cell_type`; ValueError if unknown."""
    if self.cell_type not in self._CELL_TYPES:
      raise ValueError("cell_type must be one of %s, got %r" % (self._CELL_TYPES, self.cell_type))
    return {'LSTM': LSTM, 'GRU': GRU, 'RNN': SimpleRNN}[self.cell_type]

  def build_fit(self,encoder_input_data,decoder_input_data,decoder_target_data,x_test, y_test):
    """Build and train the model, then decode `x_test` and write the predictions.

    Args:
      encoder_input_data: Encoder token-index matrix used for training.
      decoder_input_data: Decoder token-index matrix used for training.
      decoder_target_data: One-hot decoder targets used for training.
      x_test: Encoder token-index matrix to decode, one row per sample.
      y_test: Reference target strings wrapped in the 'B'/'E' markers; row i
        must correspond to `x_test[i]` and to the global `test_input_texts[i]`.

    Side effects:
      Prints the model summary and the exact-match word accuracy and writes
      'predictions_vanilla.csv' (tab-separated) to the working directory.

    Raises:
      ValueError: If `cell_type` is unknown or `num_enc` / `num_dec` is < 1.
    """
    # Fail before training rather than after it.
    cell = self._cell_class()
    if self.num_enc < 1 or self.num_dec < 1:
      raise ValueError("num_enc and num_dec must both be >= 1")

    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(None, ),name = 'Enc_inputs')

    # Embedding layer expecting an input vocab of size num_encoder_tokens and
    # producing vectors of size in_emb. Index 0 is treated as padding (mask_zero).
    encoder_outputs = Embedding(num_encoder_tokens, self.in_emb , mask_zero = True,name = 'Enc_emb')(encoder_inputs)

    # Stack num_enc recurrent layers; each layer after the first starts from
    # the final state of the previous one. A layer returns [outputs, *states]
    # (two states for LSTM, one for GRU/RNN).
    encoder_states = None
    for i in range(1, self.num_enc + 1):
      encoder_layer = cell(self.hidden_size, return_state=True, dropout = self.dropout,
                           return_sequences=True, name='Enc_hidden_%d' % i)
      if encoder_states is None:
        encoder_outputs, *encoder_states = encoder_layer(encoder_outputs)
      else:
        encoder_outputs, *encoder_states = encoder_layer(encoder_outputs, initial_state = encoder_states)

    # Set up the decoder, using `encoder_states` as initial state of every layer.
    decoder_inputs = Input(shape=(None,), name = 'Dec_inputs')
    decoder_outputs = Embedding(num_decoder_tokens, self.hidden_size, mask_zero = True, name = 'Dec_emb')(decoder_inputs)

    for i in range(1, self.num_dec + 1):
      decoder_layer = cell(self.hidden_size, return_sequences=True, return_state=True,
                           dropout = self.dropout, name='Dec_hidden_%d' % i)
      decoder_outputs = decoder_layer(decoder_outputs, initial_state = encoder_states)[0]

    decoder_outputs = Dense(num_decoder_tokens, activation='softmax', name = 'dense')(decoder_outputs)

    # The training model maps (encoder input, decoder input) to decoder_outputs.
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.summary()

    # `learning_rate` replaces the deprecated `lr` argument.
    optimizer = Adam(learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999)
    model.compile(loss = "categorical_crossentropy", optimizer = optimizer, metrics=['accuracy'])

    model.fit(
        [encoder_input_data, decoder_input_data],
        decoder_target_data,
        batch_size=self.batch_size,
        epochs=self.epochs
        )

    encoder_model,decoder_model = self.inference_model(model)
    data_list = [["SNO", "Input Data", "Target Data", "Predicted Data"]]

    global_total = 0
    global_correct = 0
    for i in range(len(y_test)):
      input_seq = x_test[i : i + 1]
      result = self.decode_sequence(encoder_model,decoder_model,input_seq)
      # Drop the 'B'/'E' markers from the reference.
      target = y_test[i][1:-1]
      # Drop the end marker from the prediction only if it is really there:
      # decoding may also stop on the length limit without emitting 'E'.
      if result.endswith('E'):
        result = result[:-1]
      data_list.append([i+1, test_input_texts[i], target, result])

      if result.strip() == target.strip():
        global_correct = global_correct + 1
      global_total = global_total + 1

    # Explicit UTF-8: the rows contain Tamil text.
    with open('predictions_vanilla.csv', 'w', newline='', encoding='utf-8') as file:
      writer = csv.writer(file, delimiter='\t')
      writer.writerows(data_list)
    val_accuracy = global_correct/global_total if global_total else 0.0
    print(val_accuracy)

  def inference_model(self,model):
    """Split a trained model into step-wise encoder and decoder models.

    Args:
      model: The training model built by `build_fit` (layers are looked up by
        the names 'Enc_hidden_<n>', 'Dec_emb', 'Dec_hidden_<n>' and 'dense').

    Returns:
      (encoder_model, decoder_model). The encoder maps an input sequence to
      the final state(s) of the last encoder layer. The decoder takes
      [decoder tokens] + one set of state inputs per decoder layer and returns
      [token probabilities] + the new states, in the same layer order.
    """
    self._cell_class()  # validates cell_type
    states_per_layer = 2 if self.cell_type == 'LSTM' else 1

    encoder_inputs = model.input[0]
    # Layer output is [sequence outputs, *final states]; keep only the states.
    _, *encoder_states = model.get_layer('Enc_hidden_'+ str(self.num_enc)).output
    encoder_model = Model(encoder_inputs, encoder_states)

    decoder_inputs = model.input[1]
    decoder_outputs = model.get_layer('Dec_emb')(decoder_inputs)
    decoder_states_inputs = []
    decoder_states = []

    for i in range(1,self.num_dec +1):
      # One fresh state input per state tensor of this layer (h, and c for LSTM).
      curr_states_inputs = [keras.Input(shape=(self.hidden_size,)) for _ in range(states_per_layer)]
      decoder = model.get_layer('Dec_hidden_'+ str(i))
      decoder_outputs, *curr_states = decoder(decoder_outputs, initial_state=curr_states_inputs)

      decoder_states += curr_states
      decoder_states_inputs += curr_states_inputs

    decoder_outputs = model.get_layer('dense')(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

    return encoder_model,decoder_model

  def decode_sequence(self,encoder_model,decoder_model,input_seq):
    """Decode one input sequence character by character.

    At every step the most probable next character is chosen (argmax). This
    is also what happens for pred_type 'beam_search': the step-wise beam
    helper's result was never used for the emitted character, so it is no
    longer called here. `beam_search_decoder` remains available for scoring a
    complete probability matrix.

    Args:
      encoder_model: Encoder returned by `inference_model`.
      decoder_model: Decoder returned by `inference_model`.
      input_seq: Encoder token-index array holding a single sample.

    Returns:
      The decoded string. It ends with 'E' unless decoding stopped because
      the length exceeded the global `max_decoder_seq_length`.

    Raises:
      ValueError: If `pred_type` is not 'greedy' or 'beam_search'.
    """
    if self.pred_type not in self._PRED_TYPES:
      raise ValueError("pred_type must be one of %s, got %r" % (self._PRED_TYPES, self.pred_type))

    # predict() returns a bare array for a single state (GRU/RNN) and a list
    # for LSTM; normalise to a flat list and repeat it once per decoder layer.
    encoder_states = encoder_model.predict(input_seq)
    if not isinstance(encoder_states, (list, tuple)):
      encoder_states = [encoder_states]
    states_value = list(encoder_states) * self.num_dec

    # Target sequence of length 1 holding the start character.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['B']

    decoded_sentence = ""
    while True:
      step_outputs = decoder_model.predict([target_seq] + states_value)
      output_tokens, states_value = step_outputs[0], list(step_outputs[1:])

      # Sample a token
      sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
      sampled_char = reverse_target_char_index[sampled_token_index]
      decoded_sentence += sampled_char

      # Exit when the decoded sequence either hits max length
      # or finds stop character
      if sampled_char == 'E' or len(decoded_sentence) > max_decoder_seq_length:
        break

      # Feed the sampled character back in as the next decoder input.
      target_seq = np.zeros((1, 1))
      target_seq[0, 0] = sampled_token_index

    return decoded_sentence
  
  def beam_search_decoder(self,data, k):
    """Return the k lowest-cost index sequences through a probability matrix.

    Args:
      data: Iterable of rows; row t holds the probability of each token at step t.
      k: Number of candidates kept after every step.

    Returns:
      A list of at most k `[sequence, score]` pairs sorted by ascending score,
      where `sequence` is a list of column indices (one per row) and `score`
      is the summed negative log-probability. A zero probability costs
      infinity instead of raising a math domain error.
    """
    sequence = [[list(), 0.0]]
    # walk over each step in sequence
    for row in data:
      all_cands = list()
      # expand each current candidate
      for seq, score in sequence:
        for j in range(len(row)):
          prob = row[j]
          step_cost = -log(prob) if prob > 0 else float('inf')
          all_cands.append([seq + [j], score + step_cost])
      # order all candidates by score (stable, so ties keep index order)
      ordered = sorted(all_cands, key=lambda tup:tup[1])
      sequence = ordered[:k]
    return sequence

BEST Hyperparameters 

In [45]:
# Values passed to MyRNN in the next cell.
best_batch_size = 128        # batch size for model.fit
best_beam_width = 5          # only read when the decoding strategy is 'beam_search'
best_cell_type = 'LSTM'      # recurrent layer kind: 'LSTM', 'GRU' or 'RNN'
best_dec_search = 'greedy'   # decoding strategy: 'greedy' or 'beam_search'
best_dropout = 0.2           # `dropout` argument of every recurrent layer
best_epochs = 15             # training epochs
best_hidden_size = 128       # units per recurrent layer and decoder embedding size
best_in_emb = 128            # encoder embedding size
best_learning_rate = 0.001   # Adam learning rate
best_num_dec = 3             # number of stacked decoder layers
best_num_enc = 2             # number of stacked encoder layers

Compile and fit model and get predictions

In [47]:
# Collect the selected hyperparameters, build the model, then train it and
# write the test-set predictions.
best_config = dict(
    cell_type=best_cell_type,
    in_emb=best_in_emb,
    hidden_size=best_hidden_size,
    learning_rate=best_learning_rate,
    dropout=best_dropout,
    pred_type=best_dec_search,
    epochs=best_epochs,
    batch_size=best_batch_size,
    beam_width=best_beam_width,
    num_enc=best_num_enc,
    num_dec=best_num_dec,
)
model_rnn = MyRNN(**best_config)

model_rnn.build_fit(encoder_input_data, decoder_input_data, decoder_target_data, x_test, y_test)

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Enc_inputs (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
Enc_emb (Embedding)             (None, None, 128)    3456        Enc_inputs[0][0]                 
__________________________________________________________________________________________________
Dec_inputs (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
Enc_hidden_1 (LSTM)             [(None, None, 128),  131584      Enc_emb[0][0]                    
____________________________________________________________________________________________

KeyboardInterrupt: ignored

In [None]:
# Colab only: download the predictions file written by build_fit.
from google.colab import files
files.download("predictions_vanilla.csv")
