In [115]:
# Essentials
import numpy as np
import tensorflow 
from tensorflow import keras
from tensorflow.keras import layers
import io
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Fetching the dataset
!git clone https://github.com/borate267/lexicon-dataset.git

Cloning into 'lexicon-dataset'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (5/5), done.


In [95]:
# Reading the dataset

train_dir = "lexicon-dataset/ta.translit.sampled.train.tsv"
dev_dir = "lexicon-dataset/ta.translit.sampled.dev.tsv"
test_dir = "lexicon-dataset/ta.translit.sampled.test.tsv"

# The following function reads the raw text document and returns a list of lists comprising the romanized and native versions of the words
def read_corpus(corpus_file):
  tamil_words = []
  latin_words = []
  with io.open(corpus_file, encoding ='utf-8') as f:
    for line in f:
      if '\t' not in line:
        continue
      tokens = line.rstrip().split("\t")
      latin_words.append(tokens[1])
      tamil_words.append(tokens[0])
  return latin_words, tamil_words

train_input, train_target = read_corpus(train_dir)
valid_input, valid_target = read_corpus(dev_dir)
test_input, test_target = read_corpus(test_dir)

print("Number of training samples: ", len(train_input))
print("Number of validation samples: ", len(valid_input))
print("Number of testing samples: ", len(test_input))


Number of training samples:  68218
Number of validation samples:  6827
Number of testing samples:  6864


In [119]:
# PRE-PROCESSING

#### Appending decoder inputs with <bow> and <eow>

bow = "<bow>"
eow = "<bos>"
decoder_inputs = [bow + text + eow for text in train_target] 

#### Creating vocabularies for the dataset

vocab_tamil = set()
vocab_latin = set()

for i in range(len(train_input)):
  input_str = train_input[i].lower()
  for char in input_str:
    if char not in vocab_latin:
      vocab_latin.add(char)

for i in range(len(train_target)):
  input_str = train_target[i] 
  for char in input_str:
    if char not in vocab_tamil:
      vocab_tamil.add(char)
vocab_tamil.add('<bow>')
vocab_tamil.add('<eow>')

vocab_tamil = sorted(list(vocab_tamil))
vocab_latin = sorted(list(vocab_latin))
sizeofTamilVocab = len(vocab_tamil)
sizeofLatinVocab = len(vocab_latin)
max_encSeqLen = max([len(sample) for sample in train_input])
max_decSeqLen = max([len(sample) for sample in train_target])

#print("Tamil vocabulary: ", vocab_tamil)
#print("Latin vocabulary: ", vocab_latin)
#print("Size of Tamil vocabulary: ", sizeofTamilVocab)
#print("Size of Latin vocabulary: ", sizeofLatinVocab)
#print("Maximum length of encoder size: ", max_encSeqLen)
#print("Maximum length of decoder size: ", max_decSeqLen)

#### Tokenising the encoder and decoder inputs

latin_token_index = dict([(char, i) for i, char in enumerate(vocab_latin)])
tamil_token_index = dict([(char, i) for i, char in enumerate(vocab_tamil)])
  
#### Convert sequences of characters to sequences of tokens

encoder_seq = []
decoder_seq = []

for i in range(len(train_input)):
  input_str = train_input[i].lower()
  dummy = []
  for char in input_str:
    if char in vocab_latin:
      dummy.append(latin_token_index[char])
  encoder_seq.append(dummy)
#print(latin_token_index)
#print(encoder_seq[0:2])

for i in range(len(decoder_inputs)):
  input_str = decoder_inputs[i]
  dummy = []
  for char in input_str:
    if char in vocab_tamil:
      dummy.append(tamil_token_index[char])
  decoder_seq.append(dummy)
#print(tamil_token_index)
#print(decoder_seq[0:2])

#### Padding sequences that are to be fed as input to the encoder and decoder RNN

encoder_inputdata = pad_sequences(encoder_seq, maxlen= max_encSeqLen, dtype='int32', padding='post', truncating='post')
decoder_inputdata = pad_sequences(decoder_seq, maxlen= max_decSeqLen, dtype='int32', padding='post', truncating='post')
#print(encoder_inputdata[0])
#print(decoder_inputdata[0])

#### CHARACTER EMBEDDING





[ 5  8  0 19  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]
[ 2 24 37 26 19 47  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


[ 5  8  0 19  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


In [None]:
# Configuration

input_emb_size = 256 #Input embedding size
num_enc = 128 #Number of encoder layers
num_dec = 128 #Number of decoder layers
hidden_size = 16 #Hidden layer size
cell_type = 'RNN' #Cell type
dropout = 0.3 #Dropout

#TODO: Beam search in decoder, add more hyperparas


In [None]:
# seq2seq model architecture

# Input embedding layer : A feedforward layer of size sizeofLatinVocab x input_emb_size


