In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
import pandas as pd
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, GRU, SimpleRNN, SimpleRNNCell, LSTMCell, GRUCell
from keras.models import Sequential
import os
import time

print("Using numpy:",np.__version__)
print("Using tensorflow:",tf.__version__)
print("Using tensorflow Addons:",tfa.__version__)
print("Using keras:",keras.__version__)
print("Using pandas:",pd.__version__)

Using numpy: 1.19.5
Using tensorflow: 2.4.1
Using tensorflow Addons: 0.12.1
Using keras: 2.4.0
Using pandas: 1.2.3


In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Load and Pre-process Data

In [3]:
val_df = pd.read_csv("./lexicons/hi.translit.sampled.dev.tsv", sep='\t', header=None)
train_df = pd.read_csv("./lexicons/hi.translit.sampled.train.tsv", sep='\t', header=None)
test_df = pd.read_csv("./lexicons/hi.translit.sampled.test.tsv", sep='\t', header=None)
print("Data Loaded to Dataframes!")

Data Loaded to Dataframes!


In [4]:
class LexDataset:
    def __init__(self, input_tensor, target_tensor, inp_word_tokenizer, targ_word_tokenizer):
        self.input_tensor = input_tensor
        self.target_tensor = target_tensor
        self.inp_word_tokenizer = inp_word_tokenizer
        self.targ_word_tokenizer = targ_word_tokenizer

In [5]:
def tokenize(words, tokenizer):
    tensor = tokenizer.texts_to_sequences(words)
    
    #Pad the smaller words
    tensor = pad_sequences(tensor, padding='post')
    
    # Return the tensor and the tokenizer
    return tensor, tokenizer

In [6]:
# Process the dataframe to 
def create_dataset(data_frame):
    input_words = []
    target_words = []
    for x, y in zip(data_frame[1], data_frame[0]):
        # Add words to respective lists
        input_words.append("@"+str(x)+"#")
        target_words.append("@"+str(y)+"#")
    return input_words, target_words

In [7]:
def load_dataset(data_frame_list):
    # Initialize the tokenizer
    input_tokenizer = Tokenizer(num_words = None, char_level = True)
    target_tokenizer = Tokenizer(num_words = None, char_level = True)
    
    dataset_list = []
    
    for df in data_frame_list:
        # Get the words list
        input_words, target_words = create_dataset(df)
        # Fit on the set of words
        input_tokenizer.fit_on_texts(input_words)
        target_tokenizer.fit_on_texts(target_words)
        dataset_list.append((input_words, target_words))
    
    words_data = []
    
    target_tokenizer.index_word.update({0:" "})
    input_tokenizer.index_word.update({0:" "})
    
    for (input_words, target_words) in dataset_list:
        # Tokenize the words
        input_tensor, inp_word_tokenizer = tokenize(input_words, input_tokenizer)
        target_tensor, targ_word_tokenizer = tokenize(target_words, target_tokenizer)
        words_data.append(LexDataset(input_tensor, target_tensor, inp_word_tokenizer, targ_word_tokenizer))

    return words_data

In [8]:
dataset = load_dataset([val_df, train_df, test_df])

print(f'Shape of Val input tensor: {np.shape(dataset[0].input_tensor)} | Shape of Val target tensor: {np.shape(dataset[0].target_tensor)}')
print(f'Shape of Train input tensor: {np.shape(dataset[1].input_tensor)} | Shape of Train target tensor: {np.shape(dataset[1].target_tensor)}')
print(f'Shape of Test input tensor: {np.shape(dataset[2].input_tensor)} | Shape of Test target tensor: {np.shape(dataset[2].target_tensor)}')

Shape of Val input tensor: (4358, 20) | Shape of Val target tensor: (4358, 16)
Shape of Train input tensor: (44204, 22) | Shape of Train target tensor: (44204, 21)
Shape of Test input tensor: (4502, 18) | Shape of Test target tensor: (4502, 17)


In [9]:
def convert(tk, tensor):
    for t in tensor:
        if t != 0:
            print(f'{t} ----> {tk.index_word[t]}')

In [10]:
print("Val Input Word; index to character mapping")
convert(dataset[0].inp_word_tokenizer, dataset[0].input_tensor[0])
print()
print("Val Target Word; index to character mapping")
convert(dataset[0].targ_word_tokenizer, dataset[0].target_tensor[0])

Val Input Word; index to character mapping
2 ----> @
1 ----> a
4 ----> n
13 ----> k
1 ----> a
4 ----> n
3 ----> #

Val Target Word; index to character mapping
1 ----> @
31 ----> अ
10 ----> ं
8 ----> क
6 ----> न
2 ----> #


In [11]:
num_encoder_tokens = len(dataset[0].inp_word_tokenizer.index_word)+1
num_decoder_tokens = len(dataset[0].targ_word_tokenizer.index_word)+1
num_encoder_tokens, num_decoder_tokens

(30, 67)

In [12]:
max_encoder_seq_length = max([np.shape(dataset[i].input_tensor)[1] for i in range(len(dataset))])
max_decoder_seq_length = max([np.shape(dataset[i].target_tensor)[1] for i in range(len(dataset))])

In [13]:
# dataset[0].targ_word_tokenizer.index_word

## Tensorflow Dataset from the data

In [14]:
BATCH_SIZE = 32
embedding_dim = 16
units = 128

#### Training Dataset

In [15]:
train_dataset = tf.data.Dataset.from_tensor_slices((dataset[1].input_tensor, dataset[1].target_tensor)).shuffle(len(dataset[1].input_tensor))
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

#### Validation Dataset

In [16]:
val_dataset = tf.data.Dataset.from_tensor_slices((dataset[0].input_tensor, dataset[0].target_tensor)).shuffle(len(dataset[0].input_tensor))
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

#### Test Dataset

In [17]:
test_dataset = tf.data.Dataset.from_tensor_slices((dataset[2].input_tensor, dataset[2].target_tensor)).shuffle(len(dataset[2].input_tensor))
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([32, 22]), TensorShape([32, 21]))

## Encoder

In [19]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size, dropout=0.2, layer_type="GRU", num_layers=1):
        super(Encoder, self).__init__()
        self.encoder_units = encoder_units
        self.batch_size = batch_size
        self.layer_type = layer_type
        self.num_layers = num_layers
        
        # Embedding Layer
        self.embedding = Embedding(vocab_size, embedding_dim)
        
        # RNN Layer(s)
        if self.layer_type == "LSTM":
            self.layer = LSTM(self.encoder_units, return_state = True, return_sequences=True, dropout = dropout, name = 'LSTM_encoder_1', recurrent_initializer='glorot_uniform')
            for i in range(num_layers - 1):
                self.layer = LSTM(self.encoder_units, return_state = True, return_sequences=True, dropout = dropout, 
                                  name = 'LSTM_encoder_'+str(i+2), recurrent_initializer='glorot_uniform')(self.layer)
        elif self.layer_type == "GRU":
            self.layer = GRU(self.encoder_units, return_state = True, return_sequences=True, dropout = dropout, name = 'GRU_encoder_1', recurrent_initializer='glorot_uniform')
            for i in range(num_layers - 1):
                self.layer = GRU(self.encoder_units, return_state = True, return_sequences=True, dropout = dropout, 
                                 name = 'GRU_encoder_'+str(i+2), recurrent_initializer='glorot_uniform')(self.layer)
        else:
            self.layer = SimpleRNN(self.encoder_units, return_state = True, return_sequences=True, dropout = dropout, name = 'SimpleRNN_encoder_1', recurrent_initializer='glorot_uniform')
            for i in range(num_layers - 1):
                self.layer = SimpleRNN(self.encoder_units, return_state = True, return_sequences=True, dropout = dropout, 
                                       name = 'SimpleRNN_encoder_'+str(i+2), recurrent_initializer='glorot_uniform')(self.layer)
    
    def call(self, x, hidden):
        x = self.embedding(x)
        if self.layer_type == "LSTM":
            output, state_h, state_c = self.layer(x, initial_state = hidden)
            return output, state_h, state_c
        else:
            output, state = self.layer(x, initial_state=hidden)
            return output, state, None

    def initialize_hidden_state(self):
        if self.layer_type == "LSTM":
            return [tf.zeros((self.batch_size, self.encoder_units)), tf.zeros((self.batch_size, self.encoder_units))] 
        else:
            return tf.zeros((self.batch_size, self.encoder_units))


In [20]:
encoder = Encoder(num_encoder_tokens, embedding_dim, units, BATCH_SIZE, 0.2, "LSTM", 1)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden, sample_cell = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units)', sample_output.shape)
print('Encoder Hidden state shape: (batch size, units)', sample_hidden.shape)
if encoder.layer_type == "LSTM":
    print('Encoder Cell state shape: (batch size, units)', sample_cell.shape)

Encoder output shape: (batch size, sequence length, units) (32, 22, 128)
Encoder Hidden state shape: (batch size, units) (32, 128)
Encoder Cell state shape: (batch size, units) (32, 128)


## Decoder

In [30]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size, dropout=0.2, layer_type="LSTM", num_layers=1, attention_type='luong'):
        super(Decoder, self).__init__()
        self.decoder_units = decoder_units
        self.batch_size = batch_size
        self.layer_type = layer_type
        self.num_layers = num_layers
        self.attention_type = attention_type
        
        # Embedding Layer
        self.embedding = Embedding(vocab_size, embedding_dim)
        
        # Final Dense Layer - Fully Connected: on which softmax will be applied
        self.fc = Dense(vocab_size, activation='softmax')
        
        # Define the fundamental cell for decoder recurrent structure
        if self.layer_type == "LSTM":
            self.decoder_rnn_cell = LSTMCell(self.decoder_units)
        elif self.layer_type == "GRU":
            self.decoder_rnn_cell = GRUCell(self.decoder_units)
        else:
            self.decoder_rnn_cell = SimpleRNNCell(self.decoder_units)
                    
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        
        # Create attention mechanism
        self.attention_mechanism = self.build_attention_mechanism(self.decoder_units, None, self.batch_size*[max_decoder_seq_length], 
                                                                  self.attention_type)
        
        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell()

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)
    
    
        
    def build_rnn_cell(self):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, self.attention_mechanism, attention_layer_size=self.decoder_units)
        return rnn_cell

    def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
        # ------------- #
        # typ: Which sort of attention (Bahdanau, Luong)
        # dec_units: final dimension of attention outputs 
        # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
        # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

        if(attention_type=='bahdanau'):
            return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
        else:
            return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

    def build_initial_state(self, batch_size, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_size, dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state


    def call(self, inputs, initial_state):
        x = self.embedding(inputs)
        outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_size*[max_decoder_seq_length-1])
        return outputs      

In [31]:
# Test decoder stack

decoder = Decoder(num_decoder_tokens, embedding_dim, units, BATCH_SIZE, 0.2, "LSTM", 1, 'luong')
sample_x = tf.random.uniform((BATCH_SIZE, max_decoder_seq_length))
decoder.attention_mechanism.setup_memory(sample_output)
if decoder.layer_type == "LSTM":
    initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_hidden, sample_cell], tf.float32)
else:
    initial_state = decoder.build_initial_state(BATCH_SIZE, sample_hidden, tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (32, 20, 67)


## Optimizer and the loss function

In [32]:
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, target_vocab_size )
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  
    loss = mask* loss
    loss = tf.reduce_mean(loss)
    return loss 

## Checkpoints (Object-based saving)

In [33]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

## Training Step

In [34]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


    dec_input = targ[ : , :-1 ] # Ignore <end> token
    real = targ[ : , 1: ]         # ignore <start> token

    # Set the AttentionMechanism object with encoder_outputs
    decoder.attention_mechanism.setup_memory(enc_output)

    # Create AttentionWrapperState as initial_state for decoder
    if decoder.layer_type == "LSTM":
        decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
    else:
        decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, enc_h, tf.float32)
        
    pred = decoder(dec_input, decoder_initial_state)
    logits = pred.rnn_output
    loss = loss_function(real, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

## Training The Model

In [38]:
EPOCHS = 1
steps_per_epoch = np.shape(dataset[1].input_tensor)[0] // BATCH_SIZE

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # print(enc_hidden[0].shape, enc_hidden[1].shape)

    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

(32, 22) (32, 21)
tf.Tensor(
[[ 2  9 24 10 10 13  1  1  7 18  1  3  0  0  0  0  0  0  0  0  0  0]
 [ 2 14 10 10 14  1 11  4  3  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 23  1 26 19  1  1  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  9  6  1 14 14 18 22  5 13  5  8  9  1  3  0  0  0  0  0  0  0]
 [ 2 23 10  9  5  1  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 17  1  7  5  8  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 22  6  6  1  4  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  1  9  9 10  9  9 16 10  4  8  3  0  0  0  0  0  0  0  0  0  0]
 [ 2 19  1  1 14 20 10 10  7  3  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  9  6  7 10  9  6  8  6  8  1  3  0  0  0  0  0  0  0  0  0  0]
 [ 2 23 15 17  5  8 10  7  3  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  9  6  1  7 16  4  1  1 13  3  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 24 10  9  8 14  1  4 12  3  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 14 15 22 18  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2 

ValueError: in user code:

    <ipython-input-25-d02f339dd27e>:27 train_step  *
        optimizer.apply_gradients(zip(gradients, variables))
    c:\users\shibo\.pyenv\pyenv-win\versions\3.8.8\lib\site-packages\tensorflow\python\keras\optimizer_v2\optimizer_v2.py:598 apply_gradients  **
        grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
    c:\users\shibo\.pyenv\pyenv-win\versions\3.8.8\lib\site-packages\tensorflow\python\keras\optimizer_v2\utils.py:78 filter_empty_gradients
        raise ValueError("No gradients provided for any variable: %s." %

    ValueError: No gradients provided for any variable: ['encoder/embedding/embeddings:0', 'encoder/LSTM_encoder_1/lstm_cell/kernel:0', 'encoder/LSTM_encoder_1/lstm_cell/recurrent_kernel:0', 'encoder/LSTM_encoder_1/lstm_cell/bias:0', 'decoder_2/embedding_3/embeddings:0', 'decoder_2/basic_decoder_2/decoder/dense_2/kernel:0', 'decoder_2/basic_decoder_2/decoder/dense_2/bias:0', 'decoder_2/basic_decoder_2/decoder/attention_wrapper_2/lstm_cell_3/kernel:0', 'decoder_2/basic_decoder_2/decoder/attention_wrapper_2/lstm_cell_3/recurrent_kernel:0', 'decoder_2/basic_decoder_2/decoder/attention_wrapper_2/lstm_cell_3/bias:0', 'LuongAttention/memory_layer/kernel:0', 'decoder_2/basic_decoder_2/decoder/attention_wrapper_2/attention_layer/kernel:0'].
