In [None]:
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
!tar -xf 'dakshina_dataset_v1.0.tar'

--2021-05-20 17:04:11--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 173.194.203.128, 173.194.202.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2021-05-20 17:04:28 (112 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [None]:
import time
import random
import numpy as np
from importlib import reload
import RNN
RNN = reload(RNN)
import pandas as pd
from google.colab import files

batch_size = 64  # Batch size for training.
epochs = 25  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space (hidden-state size hyperparameter).
shuffle_seed = 0  # Seed for the training-set shuffle, so the sample order survives a kernel restart.
# Paths to the tab-separated lexicon files on disk.
train_data = "dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
val_data = "dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv"
# Read each file into a list of lines. Empty lines are filtered out explicitly rather than
# blindly popping the last element: popping would discard a real sample whenever a file
# does not end with a newline.
with open(train_data, "r", encoding="utf-8") as f:
    train_lines = [line for line in f.read().split("\n") if line]
with open(val_data, "r", encoding="utf-8") as f:
    val_lines = [line for line in f.read().split("\n") if line]
# Shuffle with a dedicated, seeded generator: the order is reproducible and the global
# `random` state is left untouched.
random.Random(shuffle_seed).shuffle(train_lines)
print(train_lines[0:2])

# Embedding pre-processing: collect the (input, target) word pairs and both character vocabularies.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
# Every line has three tab-separated fields. Iterate over ALL lines: the trailing empty
# line has already been removed from `train_lines`, so slicing off one more element
# would silently drop a real training sample.
for line in train_lines:
    # Field 0 is the Tamil word (target), field 1 its Latin-script spelling (input);
    # the third field is not used here.
    target_text, input_text, _ = line.split("\t")
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    # Sets ignore duplicates, so every character can be added unconditionally.
    input_characters.update(input_text)
    target_characters.update(target_text)
# The space character is used as the padding token on both sides.
input_characters.add(" ")
target_characters.add(" ")
# Sort so that the character -> index mapping is deterministic.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
# Vocabulary sizes.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
# Longest input word and longest target word (the latter includes the start/end markers).
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)
# Character -> integer index lookups.
input_token_index = {char: i for i, char in enumerate(input_characters)}
print(input_token_index)
target_token_index = {char: i for i, char in enumerate(target_characters)}
print(target_token_index)
# Encoder input: one row per sample holding character indices, shape
# (num_samples, max_encoder_seq_length). The array is pre-filled with the index of " "
# (the padding token), so padding no longer depends on the loop variable `t` leaking out
# of the inner loops. That pattern raised NameError or reused a stale offset for an empty word.
encoder_input_data = np.full(
    (len(input_texts), max_encoder_seq_length), input_token_index[" "], dtype="float32"
)
# Decoder input: character indices of the target word, padded the same way.
decoder_input_data = np.full(
    (len(input_texts), max_decoder_seq_length), target_token_index[" "], dtype="float32"
)
# Decoder target: one-hot encoded, shape (num_samples, max_decoder_seq_length, num_decoder_tokens).
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
# Encoder and decoder inputs store the character index at each position (an embedding
# layer consumes them later); only the decoder target is one-hot encoded.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t] = input_token_index[char]
    for t, char in enumerate(target_text):
        decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one timestep
            # and does not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    # The target row holds len(target_text) - 1 real characters; every later timestep is
    # marked as the padding token. target_text always contains at least "\t" and "\n",
    # so the slice start is never negative.
    decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

# Embedding for the validation split: same encoding as the training data, reusing the
# character indices built from the training set.
val_input_texts = []
val_target_texts = []
# Iterate over ALL lines: the trailing empty line has already been removed from
# `val_lines`, so slicing off one more element would drop a real validation sample.
for line in val_lines:
    target_text, input_text, _ = line.split("\t")
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    val_input_texts.append(input_text)
    val_target_texts.append(target_text)
# The arrays are sized by the validation set's own longest words, not the training maxima.
val_max_encoder_seq_length = max(len(txt) for txt in val_input_texts)
val_max_decoder_seq_length = max(len(txt) for txt in val_target_texts)
# Index arrays are pre-filled with the padding token (" ") so that padding does not rely
# on the loop variable `t` leaking out of the inner loops.
val_encoder_input_data = np.full(
    (len(val_input_texts), val_max_encoder_seq_length), input_token_index[" "], dtype="float32"
)
val_decoder_input_data = np.full(
    (len(val_input_texts), val_max_decoder_seq_length), target_token_index[" "], dtype="float32"
)
val_decoder_target_data = np.zeros(
    (len(val_input_texts), val_max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
for i, (input_text, target_text) in enumerate(zip(val_input_texts, val_target_texts)):
    # A character that never occurred in the training data raises KeyError here.
    for t, char in enumerate(input_text):
        val_encoder_input_data[i, t] = input_token_index[char]
    for t, char in enumerate(target_text):
        val_decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one timestep
            # and does not include the start character.
            val_decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    # Every timestep after the last real target character is marked as padding.
    val_decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

# Inverse lookups (integer index -> character) for both vocabularies.
reverse_input_char_index = {index: char for char, index in input_token_index.items()}
reverse_target_char_index = {index: char for char, index in target_token_index.items()}

# Build the model from the local `RNN` module: 2 encoder layers, 3 decoder layers,
# LSTM cells, dropout 0.2, with embedding and latent sizes of 256.
model = RNN.RNN(
    embedding_size=256,
    n_encoder_tokens=num_encoder_tokens,
    n_decoder_tokens=num_decoder_tokens,
    n_encoder_layers=2,
    n_decoder_layers=3,
    latent_dimension=latent_dim,
    cell_type='lstm',
    target_token_index=target_token_index,
    max_decoder_seq_length=max_decoder_seq_length,
    reverse_target_char_index=reverse_target_char_index,
    dropout=0.2,
)
# Train on the encoded training set; batch_size is passed positionally, as before.
model.fit(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    batch_size,
    epochs=epochs,
)
# Validation accuracy is not computed here; a later cell evaluates it on a subset
# of the validation data via model.accuracy.

# Load and encode the test split (same encoding as the training and validation data).
print('Reading test data')
test_data = "dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.test.tsv"
# Read the file into a list of lines. Empty lines are filtered out explicitly rather than
# blindly popping the last element: popping would discard a real sample whenever the file
# does not end with a newline.
with open(test_data, "r", encoding="utf-8") as f:
    test_lines = [line for line in f.read().split("\n") if line]
test_input_texts = []
test_target_texts = []
# Iterate over ALL lines: slicing off the last element here would drop a real test sample
# and make the reported test accuracy cover one word fewer than the test set contains.
for line in test_lines:
    target_text, input_text, _ = line.split("\t")
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    test_input_texts.append(input_text)
    test_target_texts.append(target_text)
# The arrays are sized by the test set's own longest words, not the training maxima.
test_max_encoder_seq_length = max(len(txt) for txt in test_input_texts)
test_max_decoder_seq_length = max(len(txt) for txt in test_target_texts)
# Index arrays are pre-filled with the padding token (" ") so that padding does not rely
# on the loop variable `t` leaking out of the inner loops.
test_encoder_input_data = np.full(
    (len(test_input_texts), test_max_encoder_seq_length), input_token_index[" "], dtype="float32"
)
test_decoder_input_data = np.full(
    (len(test_input_texts), test_max_decoder_seq_length), target_token_index[" "], dtype="float32"
)
test_decoder_target_data = np.zeros(
    (len(test_input_texts), test_max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
for i, (input_text, target_text) in enumerate(zip(test_input_texts, test_target_texts)):
    # A character that never occurred in the training data raises KeyError here.
    for t, char in enumerate(input_text):
        test_encoder_input_data[i, t] = input_token_index[char]
    for t, char in enumerate(target_text):
        test_decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one timestep
            # and does not include the start character.
            test_decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    # Every timestep after the last real target character is marked as padding.
    test_decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

# Exact-match word accuracy on the test set for beam sizes 1 to 4. For every beam size the
# per-word predictions are also written to predictions_<beamSize>.csv.
print('Calculating test accuracy')
test_accuracy = {}
for beamSize in range(1, 5):
    # Collect plain dicts and build the DataFrame once per beam size: DataFrame.append
    # was removed in pandas 2.0 and copied the whole frame on every row.
    rows = []
    n_correct = 0
    for seq_index in range(len(test_encoder_input_data)):
        # Decode one word at a time; the slice keeps the leading batch dimension.
        decoded_sentence = model.beam_search(test_encoder_input_data[seq_index:seq_index + 1], beam_size=beamSize)
        # NOTE(review): [0][0] is taken to be the text of the top-ranked candidate; confirm against RNN.beam_search.
        prediction = decoded_sentence[0][0].strip()
        # strip() removes the "\t" start marker and "\n" end marker before comparing.
        ground_truth = test_target_texts[seq_index].strip()

        if ground_truth == prediction:
            n_correct += 1

        rows.append({
            'SourceText': test_input_texts[seq_index].strip(),
            'Prediction': prediction,
            'GroundTruth': ground_truth,
        })
    n_total = len(rows)
    # `columns` pins the same column order the CSV had before.
    df = pd.DataFrame(rows, columns=['SourceText', 'Prediction', 'GroundTruth'])
    df.to_csv('predictions_' + str(beamSize) + '.csv', index=False)
    # Guard against an empty test set instead of raising ZeroDivisionError.
    test_accuracy[beamSize] = (n_correct * 100.0 / n_total) if n_total else 0.0
print('Test accuracy ', test_accuracy)

# Send each predictions CSV (beam sizes 1 to 4) to the browser through Colab's `files`
# helper, waiting 30 seconds after every download request.
# NOTE(review): the pause presumably gives the browser time to start each download; confirm.
for beamSize in range(1, 5):
    files.download(f'predictions_{beamSize}.csv')
    time.sleep(30)

['கடத்தி\tkatatthi\t1', 'ஊர்களிலும்\toorkalilum\t1']
Number of samples: 68217
Number of unique input tokens: 27
Number of unique output tokens: 49
Max sequence length for inputs: 30
Max sequence length for outputs: 28
{' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{'\t': 0, '\n': 1, ' ': 2, 'ஃ': 3, 'அ': 4, 'ஆ': 5, 'இ': 6, 'ஈ': 7, 'உ': 8, 'ஊ': 9, 'எ': 10, 'ஏ': 11, 'ஐ': 12, 'ஒ': 13, 'ஓ': 14, 'க': 15, 'ங': 16, 'ச': 17, 'ஜ': 18, 'ஞ': 19, 'ட': 20, 'ண': 21, 'த': 22, 'ந': 23, 'ன': 24, 'ப': 25, 'ம': 26, 'ய': 27, 'ர': 28, 'ற': 29, 'ல': 30, 'ள': 31, 'ழ': 32, 'வ': 33, 'ஷ': 34, 'ஸ': 35, 'ஹ': 36, 'ா': 37, 'ி': 38, 'ீ': 39, 'ு': 40, 'ூ': 41, 'ெ': 42, 'ே': 43, 'ை': 44, 'ொ': 45, 'ோ': 46, 'ௌ': 47, '்': 48}
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epo

In [None]:
import time

# Request the four predictions CSVs again through Colab's `files` helper (imported in
# the main cell), this time pausing only 5 seconds after each request.
for beamSize in range(1, 5):
    files.download(f'predictions_{beamSize}.csv')
    time.sleep(5)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Sanity-check the validation split: show the first five source/target words ...
print(val_input_texts[:5])
print(val_target_texts[:5])
# ... then the number of samples on each side (one value per line) ...
print(len(val_input_texts), len(val_target_texts), sep="\n")
# ... and the number of distinct source and target words.
print(len(set(val_input_texts)), len(set(val_target_texts)), sep="\n")

['ankan', 'angkor', 'angira', 'angithi', 'angrej']
['\tअंकन\n', '\tअंगकोर\n', '\tअंगिरा\n', '\tअंगीठी\n', '\tअंग्रेज\n']
4357
4357
4320
2500


In [None]:
# Score the model on the first `subset` validation words only, using model.accuracy
# from the local RNN module.
subset = 200
val_accuracy = model.accuracy(
    val_encoder_input_data[:subset],
    val_target_texts[:subset],
)
print('Validation accuracy: ', val_accuracy)

Validation accuracy:  15.0


In [None]:
# Print the layer table (layer names, output shapes, parameter counts) of the
# `decoder_model` attribute of the trained model.
model.decoder_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, None, 32)     2112        decoder_input[0][0]              
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 256)]        0                                            
____________________________________________________________________________________________

In [None]:
# Print the layer table (layer names, output shapes, parameter counts) of the
# `encoder_model` attribute of the trained model.
model.encoder_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   [(None, None)]            0         
_________________________________________________________________
encoder_embedding (Embedding (None, None, 32)          864       
_________________________________________________________________
encoder_hidden_1 (GRU)       [(None, None, 256), (None 222720    
Total params: 223,584
Trainable params: 223,584
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Print the layer table (layer names, output shapes, parameter counts, connections)
# of the full model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 32)     864         encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_hidden_1 (GRU)          [(None, None, 256),  222720      encoder_embedding[0][0]          
______________________________________________________________________________________________