In [50]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, GRU, SimpleRNN
from keras.models import Sequential
print("Using tensorflow:",tf.__version__)

Using tensorflow: 2.4.1


In [2]:
# import wandb
# from wandb.keras import WandbCallback

In [3]:
# wandb.init(project='Assignment 3', entity='iitm-cs6910-jan-may-2021-cs20m059-cs20m007')
# wandb.run.name = "LSTM-Transliteration-" + datetime.datetime.now().isoformat()
# wandb.run.save()

## Prepare and load Data

In [4]:
# Load the dev / train / test lexicon splits (tab-separated, no header row).
# na_filter=False keeps every field as literal text: with pandas' default NA
# detection, words spelled like "nan", "null" or "NA" are parsed as missing
# values, and create_dataset() would then turn them into the string "nan",
# silently corrupting those word pairs.
val_df = pd.read_csv("./lexicons/hi.translit.sampled.dev.tsv", sep='\t', header=None, na_filter=False)
train_df = pd.read_csv("./lexicons/hi.translit.sampled.train.tsv", sep='\t', header=None, na_filter=False)
test_df = pd.read_csv("./lexicons/hi.translit.sampled.test.tsv", sep='\t', header=None, na_filter=False)
print("Data Loaded to Dataframes!")

Data Loaded to Dataframes!


In [12]:
class LexDataset:
    """Plain container for one tokenized data split.

    Attributes:
        input_tensor: encoded (and padded) input words of the split.
        target_tensor: encoded (and padded) target words of the split.
        inp_word_tokenizer: tokenizer used to encode the input words.
        targ_word_tokenizer: tokenizer used to encode the target words.
    """

    def __init__(self, input_tensor, target_tensor, inp_word_tokenizer, targ_word_tokenizer):
        # Encoded word matrices.
        self.input_tensor, self.target_tensor = input_tensor, target_tensor
        # Tokenizers that produced them (needed to map ids back to characters).
        self.inp_word_tokenizer, self.targ_word_tokenizer = inp_word_tokenizer, targ_word_tokenizer

In [5]:
# Process the dataframe into lists of delimited input and target words
def create_dataset(data_frame):
    """Turn a lexicon frame into parallel lists of delimited words.

    Column 1 of ``data_frame`` supplies the input words and column 0 the
    target words. Every value is converted with ``str`` and wrapped as
    ``"@" + word + "#"`` (presumably start/end-of-word markers for the
    seq2seq model).

    Returns:
        A tuple ``(input_words, target_words)`` of equally long lists.
    """
    wrapped_pairs = [
        ("@" + str(source) + "#", "@" + str(target) + "#")
        for source, target in zip(data_frame[1], data_frame[0])
    ]
    input_words = [pair[0] for pair in wrapped_pairs]
    target_words = [pair[1] for pair in wrapped_pairs]
    return input_words, target_words

In [6]:
def tokenize(words, tokenizer):
    """Encode ``words`` with an already fitted tokenizer and pad them.

    Args:
        words: list of strings to encode.
        tokenizer: object providing ``texts_to_sequences``.

    Returns:
        ``(padded, tokenizer)`` where ``padded`` is the result of
        ``pad_sequences(..., padding='post')`` (shorter words are padded at
        the end) and ``tokenizer`` is the same object that was passed in.
    """
    sequences = tokenizer.texts_to_sequences(words)
    padded = pad_sequences(sequences, padding='post')
    return padded, tokenizer

In [13]:
def load_dataset(data_frame_list):
    """Tokenize several lexicon frames with one shared pair of tokenizers.

    Both character-level tokenizers are fitted on the words of *every* frame
    in ``data_frame_list`` before any frame is encoded, so all returned
    splits use the same character vocabulary.

    Args:
        data_frame_list: iterable of frames accepted by ``create_dataset``.

    Returns:
        A list of ``LexDataset`` objects, one per frame, in input order.
        Each split is padded independently by ``tokenize``.
    """
    input_tokenizer = Tokenizer(num_words=None, char_level=True)
    target_tokenizer = Tokenizer(num_words=None, char_level=True)

    # Pass 1: collect the delimited words of each frame and grow both vocabularies.
    word_pairs = [create_dataset(frame) for frame in data_frame_list]
    for source_words, dest_words in word_pairs:
        input_tokenizer.fit_on_texts(source_words)
        target_tokenizer.fit_on_texts(dest_words)

    # Pass 2: encode every split with the now complete vocabularies.
    words_data = []
    for source_words, dest_words in word_pairs:
        source_tensor, source_tok = tokenize(source_words, input_tokenizer)
        dest_tensor, dest_tok = tokenize(dest_words, target_tokenizer)
        words_data.append(LexDataset(source_tensor, dest_tensor, source_tok, dest_tok))
    return words_data

In [18]:
# Build the tokenized splits. The list order fixes the indices used below:
# dataset[0] = validation, dataset[1] = train, dataset[2] = test.
dataset = load_dataset([val_df, train_df, test_df])

# Report the tensor shapes per split. tokenize() pads without a fixed maxlen,
# so the second dimension (padded word length) can differ between splits.
print(f'Shape of Val input tensor: {np.shape(dataset[0].input_tensor)} | Shape of Val target tensor: {np.shape(dataset[0].target_tensor)}')
print(f'Shape of Train input tensor: {np.shape(dataset[1].input_tensor)} | Shape of Train target tensor: {np.shape(dataset[1].target_tensor)}')
print(f'Shape of Test input tensor: {np.shape(dataset[2].input_tensor)} | Shape of Test target tensor: {np.shape(dataset[2].target_tensor)}')

Shape of Val input tensor: (4358, 20) | Shape of Val target tensor: (4358, 16)
Shape of Train input tensor: (44204, 22) | Shape of Train target tensor: (44204, 21)
Shape of Test input tensor: (4502, 18) | Shape of Test target tensor: (4502, 17)


In [23]:
def convert(tk, tensor):
    """Print the ``index ----> character`` mapping of one encoded word.

    Args:
        tk: tokenizer exposing an ``index_word`` lookup.
        tensor: iterable of token ids for a single word.

    Ids equal to 0 are skipped (presumably the padding value added by
    ``pad_sequences``). Nothing is returned.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print(f'{token_id} ----> {tk.index_word[token_id]}')

In [24]:
# Sanity check: show the id -> character mapping of the first validation
# example (dataset[0] is the validation split) for both input and target.
print("Val Input Word; index to character mapping")
convert(dataset[0].inp_word_tokenizer, dataset[0].input_tensor[0])
print()
print("Val Target Word; index to character mapping")
convert(dataset[0].targ_word_tokenizer, dataset[0].target_tensor[0])

Val Input Word; index to character mapping
2 ----> @
1 ----> a
4 ----> n
13 ----> k
1 ----> a
4 ----> n
3 ----> #

Val Target Word; index to character mapping
1 ----> @
31 ----> अ
10 ----> ं
8 ----> क
6 ----> न
2 ----> #


In [28]:
# Vocabulary sizes for the encoder and decoder sides. load_dataset() stores the
# same two tokenizer objects in every split, so dataset[0] is representative.
# The +1 presumably reserves id 0, which convert() skips as a non-character
# (padding) - TODO confirm.
num_encoder_tokens = len(dataset[0].inp_word_tokenizer.index_word)+1
num_decoder_tokens = len(dataset[0].targ_word_tokenizer.index_word)+1

In [33]:
# Longest padded word length over *all* splits, for each side of the model.
# (The decoder line previously indexed dataset[1] on every iteration, so it
# ignored the other splits and only matched by coincidence.)
max_encoder_seq_length = max(np.shape(split.input_tensor)[1] for split in dataset)
max_decoder_seq_length = max(np.shape(split.target_tensor)[1] for split in dataset)

## Build Model

In [54]:
def get_model(embedding_size, num_encoder_layers, num_decoder_layers, hidden_layer_size, layer_type, dropout, num_encoder_tokens, num_decoder_tokens):
    """Build an encoder-decoder (seq2seq) Keras model over token-id sequences.

    Args:
        embedding_size: output dimension of both embedding layers.
        num_encoder_layers: number of stacked recurrent layers in the encoder;
            they are named ``encoder_1`` ... ``encoder_<num_encoder_layers>``.
        num_decoder_layers: number of stacked recurrent layers in the decoder;
            they are named ``decoder_1`` ... ``decoder_<num_decoder_layers>``.
        hidden_layer_size: number of units of every recurrent layer.
        layer_type: ``LSTM_LAYER`` selects LSTM, ``GRU_LAYER`` selects GRU and
            any other value falls back to SimpleRNN.
        dropout: value passed as ``dropout`` to every recurrent layer.
        num_encoder_tokens: ``input_dim`` of the encoder embedding.
        num_decoder_tokens: ``input_dim`` of the decoder embedding and width
            of the final softmax layer.

    Returns:
        A ``keras.Model`` mapping ``[encoder_inputs, decoder_inputs]`` to a
        softmax distribution over ``num_decoder_tokens`` for every decoder
        time step.
    """
    # The three cell types are wired identically; only the layer class (and
    # therefore the number of returned states) differs.
    if layer_type == LSTM_LAYER:
        rnn_layer = LSTM
    elif layer_type == GRU_LAYER:
        rnn_layer = GRU
    else:
        rnn_layer = SimpleRNN

    encoder_inputs = keras.Input(shape=(None,))
    encoder = Embedding(input_dim=num_encoder_tokens, output_dim=embedding_size)(encoder_inputs)

    decoder_inputs = keras.Input(shape=(None,))
    decoder = Embedding(input_dim=num_decoder_tokens, output_dim=embedding_size)(decoder_inputs)

    # ENCODER ====================================================================================================

    # Intermediate layers emit full sequences. Every layer keeps
    # return_state=True, but only the sequence (element 0) is passed on:
    # handing the whole [sequence, state, ...] list to the next recurrent layer
    # makes Keras treat the trailing entries as that layer's initial_state.
    for i in range(num_encoder_layers - 1):
        encoder = rnn_layer(hidden_layer_size, return_state=True, return_sequences=True,
                            dropout=dropout, name='encoder_' + str(i + 1))(encoder)[0]

    # Last encoder layer: its output is discarded, only the final state(s) are
    # kept ([state_h, state_c] for LSTM, [state_h] for GRU / SimpleRNN).
    encoder_result = rnn_layer(hidden_layer_size, return_state=True, return_sequences=False,
                               dropout=dropout, name='encoder_' + str(num_encoder_layers))(encoder)
    encoder_states = list(encoder_result[1:])

    # DECODER ====================================================================================================

    # Only the first decoder layer is initialised with the encoder's final state(s).
    decoder = rnn_layer(hidden_layer_size, return_state=True, return_sequences=True,
                        dropout=dropout, name='decoder_1')(decoder, initial_state=encoder_states)[0]

    # Further decoder layers consume just the previous layer's sequence, so a
    # layer never starts from a state that has already seen the whole target
    # sequence (which would leak future characters during teacher forcing).
    for i in range(num_decoder_layers - 1):
        decoder = rnn_layer(hidden_layer_size, return_state=True, return_sequences=True,
                            dropout=dropout, name='decoder_' + str(i + 2))(decoder)[0]

    # Per-time-step distribution over the target vocabulary.
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
    decoder_outputs = decoder_dense(decoder)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

In [47]:
# Values accepted by get_model's `layer_type` argument. get_model compares
# against LSTM_LAYER and GRU_LAYER only; any other value (including
# SIMPLE_RNN_LAYER) builds SimpleRNN layers.
# NOTE(review): this cell must be executed before get_model() is called.
LSTM_LAYER = "LSTM"
GRU_LAYER = "GRU"
SIMPLE_RNN_LAYER = "SIMPLE_RNN"