In [2]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
import pandas as pd
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, GRU, SimpleRNN
from keras.models import Sequential
# Report the versions of the core libraries so the run can be reproduced.
for _label, _module in (
    ("numpy", np),
    ("tensorflow", tf),
    ("tensorflow Addons", tfa),
    ("keras", keras),
    ("pandas", pd),
):
    print(f"Using {_label}:", _module.__version__)

Using numpy: 1.19.5
Using tensorflow: 2.4.1
Using tensorflow Addons: 0.12.1
Using keras: 2.4.0
Using pandas: 1.2.3


In [3]:
# List the devices TensorFlow can see; the cell output shows whether a GPU was detected.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Load and Pre-process Data

In [4]:
# Each lexicon is a header-less, tab-separated file, so columns are addressed
# by position (0, 1, ...).
# NA detection is switched off on purpose: these files contain words, and with
# the default settings pandas converts legitimate entries such as "nan",
# "null" or "NA" into float NaN, silently corrupting those word pairs.
LEXICON_READ_KWARGS = dict(sep='\t', header=None, na_filter=False)

val_df = pd.read_csv("./lexicons/hi.translit.sampled.dev.tsv", **LEXICON_READ_KWARGS)
train_df = pd.read_csv("./lexicons/hi.translit.sampled.train.tsv", **LEXICON_READ_KWARGS)
test_df = pd.read_csv("./lexicons/hi.translit.sampled.test.tsv", **LEXICON_READ_KWARGS)
print("Data Loaded to Dataframes!")

Data Loaded to Dataframes!


In [5]:
class LexDataset:
    """Plain container bundling one data split with the tokenizers used to encode it.

    Attributes:
        input_tensor: encoded input words for the split.
        target_tensor: encoded target words for the split.
        inp_word_tokenizer: tokenizer that produced ``input_tensor``.
        targ_word_tokenizer: tokenizer that produced ``target_tensor``.
    """

    def __init__(self, input_tensor, target_tensor, inp_word_tokenizer, targ_word_tokenizer):
        # Encoded sequences for the two sides of the split.
        self.input_tensor, self.target_tensor = input_tensor, target_tensor
        # Tokenizers are kept alongside so indices can be mapped back to characters.
        self.inp_word_tokenizer, self.targ_word_tokenizer = inp_word_tokenizer, targ_word_tokenizer

In [6]:
def tokenize(words, tokenizer):
    """Encode ``words`` with an already-fitted tokenizer and pad them to equal length.

    Args:
        words: list of strings to encode.
        tokenizer: object exposing ``texts_to_sequences``.

    Returns:
        A ``(tensor, tokenizer)`` pair: the post-padded sequences and the same
        tokenizer object that was passed in.
    """
    sequences = tokenizer.texts_to_sequences(words)
    # Shorter words are padded at the end so every row has the same length.
    padded = pad_sequences(sequences, padding='post')
    return padded, tokenizer

In [7]:
# Turn a lexicon dataframe into parallel lists of delimited input/target words.
def create_dataset(data_frame):
    """Build the input and target word lists from a lexicon frame.

    Column 1 supplies the input words and column 0 the target words. Every
    word is converted with ``str`` and wrapped as ``"@" + word + "#"``.

    Returns:
        ``(input_words, target_words)`` - two lists of equal length.
    """
    def _wrap(word):
        return f"@{word!s}#"

    pairs = [(_wrap(src), _wrap(tgt)) for src, tgt in zip(data_frame[1], data_frame[0])]
    input_words = [src for src, _ in pairs]
    target_words = [tgt for _, tgt in pairs]
    return input_words, target_words

In [8]:
def load_dataset(data_frame_list):
    """Encode several lexicon frames with one shared pair of character tokenizers.

    The tokenizers are fitted on the words of *all* frames before any frame is
    encoded, so every split uses the same character-to-index mapping.

    Args:
        data_frame_list: iterable of frames accepted by ``create_dataset``.

    Returns:
        A list of ``LexDataset`` objects, one per frame, in the same order.
    """
    input_tokenizer = Tokenizer(num_words=None, char_level=True)
    target_tokenizer = Tokenizer(num_words=None, char_level=True)

    # Pass 1: collect the delimited words of every frame and grow both vocabularies.
    word_lists = [create_dataset(frame) for frame in data_frame_list]
    for source_words, target_words in word_lists:
        input_tokenizer.fit_on_texts(source_words)
        target_tokenizer.fit_on_texts(target_words)

    # Give index 0 a printable character so padded positions can be decoded.
    for fitted_tokenizer in (target_tokenizer, input_tokenizer):
        fitted_tokenizer.index_word[0] = " "

    # Pass 2: encode each frame with the now-complete vocabularies.
    words_data = []
    for source_words, target_words in word_lists:
        encoded_input, inp_tk = tokenize(source_words, input_tokenizer)
        encoded_target, targ_tk = tokenize(target_words, target_tokenizer)
        words_data.append(LexDataset(encoded_input, encoded_target, inp_tk, targ_tk))
    return words_data

In [9]:
dataset = load_dataset([val_df, train_df, test_df])

# load_dataset returns the splits in the order they were passed: val, train, test.
for split_name, split in zip(("Val", "Train", "Test"), dataset):
    print(
        f'Shape of {split_name} input tensor: {np.shape(split.input_tensor)} | '
        f'Shape of {split_name} target tensor: {np.shape(split.target_tensor)}'
    )

Shape of Val input tensor: (4358, 20) | Shape of Val target tensor: (4358, 16)
Shape of Train input tensor: (44204, 22) | Shape of Train target tensor: (44204, 21)
Shape of Test input tensor: (4502, 18) | Shape of Test target tensor: (4502, 17)


In [10]:
def convert(tk, tensor):
    """Print the ``index ----> character`` mapping for one encoded word.

    Entries equal to 0 are skipped; every other index is looked up in
    ``tk.index_word``.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print(f'{token_id} ----> {tk.index_word[token_id]}')

In [11]:
# Inspect the first validation pair: dataset[0] is the validation split.
val_split = dataset[0]

print("Val Input Word; index to character mapping")
convert(val_split.inp_word_tokenizer, val_split.input_tensor[0])
print()
print("Val Target Word; index to character mapping")
convert(val_split.targ_word_tokenizer, val_split.target_tensor[0])

Val Input Word; index to character mapping
2 ----> @
1 ----> a
4 ----> n
13 ----> k
1 ----> a
4 ----> n
3 ----> #

Val Target Word; index to character mapping
1 ----> @
31 ----> अ
10 ----> ं
8 ----> क
6 ----> न
2 ----> #


In [12]:
# Vocabulary sizes for the input and target sides. The tokenizers are shared by
# all splits, so reading them from the validation split is sufficient.
# NOTE(review): load_dataset already inserts a 0 -> " " entry into index_word,
# so len(index_word) + 1 appears to be one more than the number of distinct
# indices - confirm whether the extra slot is intended.
num_encoder_tokens, num_decoder_tokens = (
    len(tk.index_word) + 1
    for tk in (dataset[0].inp_word_tokenizer, dataset[0].targ_word_tokenizer)
)
num_encoder_tokens, num_decoder_tokens

(30, 67)

In [13]:
# Each split is padded independently, so take the widest padded length over all splits.
max_encoder_seq_length = max(np.shape(split.input_tensor)[1] for split in dataset)
max_decoder_seq_length = max(np.shape(split.target_tensor)[1] for split in dataset)

In [14]:
# dataset[0].targ_word_tokenizer.index_word

## TensorFlow Dataset from the data

In [15]:
# Hyper-parameters shared by the data pipeline and the models below.
BATCH_SIZE = 32      # number of word pairs per batch
embedding_dim = 32   # size of each character embedding vector
units = 128          # hidden size passed to the recurrent layers

#### Training Dataset

In [16]:
# dataset[1] is the training split. The shuffle buffer covers the whole split,
# and drop_remainder=True keeps every batch at exactly BATCH_SIZE rows.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dataset[1].input_tensor, dataset[1].target_tensor))
    .shuffle(len(dataset[1].input_tensor))
    .batch(BATCH_SIZE, drop_remainder=True)
)

#### Validation Dataset

In [17]:
# dataset[0] is the validation split. The shuffle buffer covers the whole split,
# and drop_remainder=True keeps every batch at exactly BATCH_SIZE rows.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dataset[0].input_tensor, dataset[0].target_tensor))
    .shuffle(len(dataset[0].input_tensor))
    .batch(BATCH_SIZE, drop_remainder=True)
)

#### Test Dataset

In [18]:
# dataset[2] is the test split. The shuffle buffer covers the whole split,
# and drop_remainder=True keeps every batch at exactly BATCH_SIZE rows.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dataset[2].input_tensor, dataset[2].target_tensor))
    .shuffle(len(dataset[2].input_tensor))
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [19]:
# Pull a single batch from the training pipeline to sanity-check tensor shapes.
train_batches = iter(train_dataset)
example_input_batch, example_target_batch = next(train_batches)
example_input_batch.shape, example_target_batch.shape

(TensorShape([32, 22]), TensorShape([32, 21]))

## Encoder

In [30]:
class Encoder(tf.keras.Model):
    """Character-level encoder: an embedding followed by one or more recurrent layers.

    Args:
        vocab_size: number of distinct input indices the embedding must cover.
        embedding_dim: size of each embedding vector.
        encoder_units: hidden size of every recurrent layer.
        batch_size: batch size used when building the zero initial state.
        dropout: input dropout rate passed to every recurrent layer.
        layer_type: ``"LSTM"`` or ``"GRU"``; any other value selects ``SimpleRNN``.
        num_layers: number of stacked recurrent layers (values below 1 are
            treated as 1, matching the single layer that was always created).
    """

    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size, dropout=0.2, layer_type="GRU", num_layers=1):
        super().__init__()
        self.encoder_units = encoder_units
        self.batch_size = batch_size
        self.layer_type = layer_type
        self.num_layers = num_layers

        # Embedding Layer
        self.embedding = Embedding(vocab_size, embedding_dim)

        # RNN Layer(s)
        if self.layer_type == "LSTM":
            rnn_class, prefix = LSTM, 'LSTM'
        elif self.layer_type == "GRU":
            rnn_class, prefix = GRU, 'GRU'
        else:
            rnn_class, prefix = SimpleRNN, 'SimpleRNN'

        # The layers are kept in a list and chained inside call(). Previously
        # each extra layer was *called on the previous layer object* at
        # construction time, which fails for num_layers > 1 because a layer is
        # not a tensor.
        self.rnn_layers = [
            rnn_class(self.encoder_units, return_state=True, return_sequences=True,
                      dropout=dropout, name=f'{prefix}_encoder_{i + 1}')
            for i in range(max(1, num_layers))
        ]
        # Backward-compatible alias: with a single layer this is the same
        # object the attribute always referred to.
        self.layer = self.rnn_layers[0]

    def call(self, x, hidden):
        """Encode a batch of index sequences.

        Args:
            x: batch of encoded input words.
            hidden: initial state, as produced by ``initialize_hidden_state``;
                it is used as the initial state of every stacked layer.

        Returns:
            ``(output, state_h, state_c)`` for LSTM, otherwise
            ``(output, state, None)``. Output and states come from the last
            recurrent layer.
        """
        x = self.embedding(x)
        states = []
        for rnn in self.rnn_layers:
            # With return_state=True each layer yields [sequence_output, *final_states].
            x, *states = rnn(x, initial_state=hidden)
        if self.layer_type == "LSTM":
            state_h, state_c = states
            return x, state_h, state_c
        state, = states
        return x, state, None

    def initialize_hidden_state(self):
        """Return an all-zero initial state sized ``(batch_size, encoder_units)``.

        LSTM needs two state tensors (hidden and cell), so a list of two is
        returned in that case; other layer types get a single tensor.
        """
        if self.layer_type == "LSTM":
            return [tf.zeros((self.batch_size, self.encoder_units)), tf.zeros((self.batch_size, self.encoder_units))]
        else:
            return tf.zeros((self.batch_size, self.encoder_units))


In [31]:
# Build a single-layer LSTM encoder and push one example batch through it as a smoke test.
encoder = Encoder(
    num_encoder_tokens,
    embedding_dim,
    units,
    BATCH_SIZE,
    dropout=0.2,
    layer_type="LSTM",
    num_layers=1,
)

# sample input
sample_hidden = encoder.initialize_hidden_state()
# sample_hidden is rebound here: from the zero initial state to the returned hidden state.
sample_output, sample_hidden, sample_cell = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units)', sample_output.shape)
print('Encoder Hidden state shape: (batch size, units)', sample_hidden.shape)
# Only LSTM returns a cell state; for the other layer types sample_cell is None.
if encoder.layer_type == "LSTM":
    print('Encoder Cell state shape: (batch size, units)', sample_cell.shape)

Encoder output shape: (batch size, sequence length, units) (32, 22, 128)
Encoder Hidden state shape: (batch size, units) (32, 128)
Encoder Cell state shape: (batch size, units) (32, 128)


## Decoder

In [None]:
class Decoder(tf.keras.Model):
    """Decoder model; the constructor stores its configuration and builds the embedding.

    Args:
        vocab_size: number of distinct target indices the embedding must cover.
        embedding_dim: size of each embedding vector.
        decoder_units: hidden size stored as ``self.decoder_units``.
        batch_size: batch size stored as ``self.batch_size``.
        dropout: dropout rate (accepted here; not used by the lines of the
            constructor shown in this cell).
        layer_type: recurrent layer kind, stored as ``self.layer_type``.
        num_layers: number of recurrent layers, stored as ``self.num_layers``.
        attention_type: attention variant name, stored as ``self.attention_type``.
    """
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size, dropout=0.2, layer_type="GRU", num_layers=1, attention_type='luong'):
        # Was super(Encoder, self).__init__(): a Decoder is not an Encoder
        # instance, so that call raised TypeError on construction.
        super().__init__()
        self.decoder_units = decoder_units
        self.batch_size = batch_size
        self.layer_type = layer_type
        self.num_layers = num_layers
        self.attention_type = attention_type
        
        # Embedding Layer
        self.embedding = Embedding(vocab_size, embedding_dim)
        
        