<a href="https://colab.research.google.com/github/abisubramanya27/CS6910_Assignment3/blob/master/src/Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"

--2021-04-25 18:28:05--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.81.208, 142.250.73.208, 142.250.65.80, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.81.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2021-04-25 18:28:23 (106 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [2]:
# Extract the downloaded archive into the current working directory
!tar xopf dakshina_dataset_v1.0.tar

In [3]:
# List the lexicon files under the `hi` language directory of the extracted dataset
!ls dakshina_dataset_v1.0/hi/lexicons

hi.translit.sampled.dev.tsv   hi.translit.sampled.train.tsv
hi.translit.sampled.test.tsv


In [4]:
import numpy as np

def read_data(data_path, characters = False):
    """Load (input, target) pairs from a tab-separated lexicon file.

    Each non-empty line is split on tabs; the second column becomes the input
    and the first column becomes the target.

    Args:
        data_path: Path of the UTF-8 encoded TSV file to read.
        characters: When True, every entry is returned as a list of single
            characters instead of a string.

    Returns:
        A tuple ``(inputs, targets)`` of two equally long lists.
    """
    with open(data_path, "r", encoding="utf-8") as handle:
        # Blank lines (including the one after a trailing newline) are dropped
        rows = [row.split("\t") for row in handle.read().split("\n") if row]

    sources = [columns[1] for columns in rows]
    targets = [columns[0] for columns in rows]

    if not characters:
        return sources, targets
    return [list(word) for word in sources], [list(word) for word in targets]


def process_data(input, output, enc_timesteps, dec_timesteps, input_char_enc, target_char_enc):
    """Encode character sequences as padded integer arrays plus one-hot decoder targets.

    Every input sequence is mapped through ``input_char_enc`` and right-padded
    with the space token up to ``enc_timesteps``. Every output sequence is
    wrapped as <tab> chars <newline> (start / end of sequence) and right-padded
    with the space token up to ``dec_timesteps``. The decoder target is the
    decoder input shifted one step to the left (teacher forcing), one-hot
    encoded, with the space token in the final timestep.

    Args:
        input: Sequences (strings or lists of characters) for the encoder.
        output: Sequences (strings or lists of characters) for the decoder.
        enc_timesteps: Padded length of every encoder row.
        dec_timesteps: Padded length of every decoder row (must leave room for
            the start and end tokens).
        input_char_enc: Dict mapping input characters (and the space pad) to integers.
        target_char_enc: Dict mapping target characters (and tab, newline,
            space) to integers.

    Returns:
        ``(encoder_input, decoder_input, decoder_target)`` with shapes
        ``(n, enc_timesteps)``, ``(n, dec_timesteps)`` and
        ``(n, dec_timesteps, len(target_char_enc))``; the first two are integer
        arrays, the last is float32.

    Raises:
        ValueError: If a sequence does not fit into the requested number of timesteps.
        KeyError: If a character is missing from the corresponding encoder dict.
    """
    # ' ' -- space (equivalent to no meaningful input)
    enc_pad = input_char_enc[' ']
    encoder_rows = []
    for string in input:
        # A negative pad count would silently produce an over-long (ragged) row
        if len(string) > enc_timesteps:
            raise ValueError("input sequence of length %d exceeds enc_timesteps=%d"
                             % (len(string), enc_timesteps))
        encoder_rows.append([input_char_enc[ch] for ch in string] + [enc_pad] * (enc_timesteps - len(string)))
    # The reshape is a no-op for non-empty data and gives (0, enc_timesteps) for empty data
    encoder_input = np.array(encoder_rows, dtype=int).reshape(len(encoder_rows), enc_timesteps)

    # '\t' -- start of sequence, '\n' -- end of sequence
    dec_start, dec_end, dec_pad = target_char_enc['\t'], target_char_enc['\n'], target_char_enc[' ']
    decoder_rows = []
    for string in output:
        if len(string) + 2 > dec_timesteps:
            raise ValueError("output sequence of length %d (+2 for start/end tokens) exceeds dec_timesteps=%d"
                             % (len(string), dec_timesteps))
        decoder_rows.append([dec_start] + [target_char_enc[ch] for ch in string] + [dec_end]
                            + [dec_pad] * (dec_timesteps - len(string) - 2))
    decoder_input = np.array(decoder_rows, dtype=int).reshape(len(decoder_rows), dec_timesteps)

    num_samples = decoder_input.shape[0]
    decoder_target = np.zeros((num_samples, dec_timesteps, len(target_char_enc)), dtype='float32')
    if num_samples:
        # target[i, t] is the one-hot of decoder_input[i, t + 1] for every step except the last ...
        sample_idx = np.arange(num_samples)[:, None]
        step_idx = np.arange(dec_timesteps - 1)[None, :]
        decoder_target[sample_idx, step_idx, decoder_input[:, 1:]] = 1.0
        # ... and the last step, which has no successor, is marked as padding
        decoder_target[:, dec_timesteps - 1, dec_pad] = 1.0

    return encoder_input, decoder_input, decoder_target


def encode_decode_characters(train_input, train_target, valid_input, valid_target):
    """Build character <-> integer vocabularies for the input and target data.

    Characters are numbered in order of first appearance over the training data
    followed by the validation data. The input vocabulary additionally holds
    the space character (padding). The target vocabulary starts with the tab
    character (start of sequence) and additionally holds the newline character
    (end of sequence) and the space character (padding).

    A special character that already occurs in the data keeps the index it was
    given there instead of being registered a second time, so each encoder dict
    and its decoder list always have the same length.

    Args:
        train_input, valid_input: Lists of input sequences (strings or lists of characters).
        train_target, valid_target: Lists of target sequences (strings or lists of characters).

    Returns:
        ``(input_char_enc, input_char_dec, target_char_enc, target_char_dec,
        max_encoder_seq_length, max_decoder_seq_length)`` where the ``*_enc``
        values are dicts (character -> integer), the ``*_dec`` values are lists
        (integer -> character), and the decoder length includes the two
        start/end tokens. Both lengths are at least 1.
    """
    def register(char, char_enc, char_dec):
        # Hand out the next free index, unless the character already has one
        if char not in char_enc:
            char_enc[char] = len(char_dec)
            char_dec.append(char)

    def scan(strings, char_enc, char_dec, extra_steps):
        # Register every character and return the longest length (plus extra_steps) seen
        longest = 1
        for string in strings:
            longest = max(longest, len(string) + extra_steps)
            for char in string:
                register(char, char_enc, char_dec)
        return longest

    input_char_enc, input_char_dec = {}, []
    max_encoder_seq_length = scan(train_input + valid_input, input_char_enc, input_char_dec, 0)
    register(' ', input_char_enc, input_char_dec)

    target_char_enc, target_char_dec = {}, []
    # The start-of-sequence token goes first so it always gets index 0
    register('\t', target_char_enc, target_char_dec)
    # +2 leaves room for the start and end tokens around every target
    max_decoder_seq_length = scan(train_target + valid_target, target_char_enc, target_char_dec, 2)
    register('\n', target_char_enc, target_char_dec)
    register(' ', target_char_enc, target_char_dec)

    print("Number of training samples:", len(train_input))
    print("Number of validation samples:", len(valid_input))
    print("Number of unique input tokens:", len(input_char_dec))
    print("Number of unique output tokens:", len(target_char_dec))
    print("Max sequence length for inputs:", max_encoder_seq_length)
    print("Max sequence length for outputs:", max_decoder_seq_length)

    return input_char_enc, input_char_dec, target_char_enc, target_char_dec, max_encoder_seq_length, max_decoder_seq_length



In [5]:
# Reading training and validation data (each word as a list of characters)
train_inp, train_out = read_data('./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv', True)
valid_inp, valid_out = read_data('./dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv', True)

# Assigning encoder and decoder for input and target characters, plus the padded sequence lengths
(input_char_enc, input_char_dec, target_char_enc, target_char_dec,
 max_encoder_seq_length, max_decoder_seq_length) = encode_decode_characters(train_inp, train_out, valid_inp, valid_out)

# Assigning training and validation encoder input, decoder input, decoder output
train_enc_input, train_dec_input, train_dec_target = process_data(train_inp, train_out, max_encoder_seq_length, max_decoder_seq_length,
                                                                  input_char_enc, target_char_enc)
# The validation encoder input must be padded to the ENCODER length, exactly like the training data,
# otherwise its width does not match the encoder input layer of the model
valid_enc_input, valid_dec_input, valid_dec_target = process_data(valid_inp, valid_out, max_encoder_seq_length, max_decoder_seq_length,
                                                                  input_char_enc, target_char_enc)


Number of training samples: 44204
Number of validation samples: 4358
Number of unique input tokens: 27
Number of unique output tokens: 66
Max sequence length for inputs: 20
Max sequence length for outputs: 21


In [18]:
import tensorflow as tf
import tensorflow.keras as keras

def create_model(encoder_vocab_size, decoder_vocab_size, encoder_timesteps, decoder_timesteps,
                 inp_emb_size=16, no_enc_layers=1, no_dec_layers=1, hid_layer_size=32, cell_type='LSTM', dropout=0, cell_activation='tanh'):
    """Build an encoder-decoder Keras model for character-level sequence-to-sequence training.

    Both sides embed their integer inputs and feed them through a stack of
    recurrent layers. The state returned by the last encoder layer is used as
    the initial state of every decoder layer, and a softmax Dense layer maps
    the decoder sequence to a distribution over the decoder vocabulary at each
    timestep. The model summary is printed before the model is returned.

    Args:
        encoder_vocab_size: Number of distinct encoder tokens (embedding input size).
        decoder_vocab_size: Number of distinct decoder tokens (embedding input
            size and width of the softmax output).
        encoder_timesteps: Length of every encoder input row.
        decoder_timesteps: Length of every decoder input row.
        inp_emb_size: Embedding dimension used on both sides.
        no_enc_layers: Number of encoder recurrent layers (at least one is always built).
        no_dec_layers: Number of decoder recurrent layers (at least one is always built).
        hid_layer_size: Number of units in every recurrent layer.
        cell_type: One of 'RNN', 'GRU' or 'LSTM'; any other value raises KeyError.
        dropout: Passed to the recurrent layers as ``recurrent_dropout``.
        cell_activation: Activation of the recurrent layers.

    Returns:
        A ``keras.Model`` taking ``[encoder_input, decoder_input]`` and
        producing the per-timestep softmax output.
    """
    cell_classes = {
        'RNN': keras.layers.SimpleRNN,
        'GRU': keras.layers.GRU,
        'LSTM': keras.layers.LSTM,
    }

    def recurrent_layer(layer_name):
        # All recurrent layers share the same hyperparameters; only the name differs
        return cell_classes[cell_type](
            hid_layer_size,
            activation=cell_activation,
            return_sequences=True,
            return_state=True,
            recurrent_dropout=dropout,
            name=layer_name,
        )

    # Encoder: input -> embedding -> stacked recurrent layers
    encoder_input = keras.layers.Input(shape=(encoder_timesteps,))
    encoder_seq = keras.layers.Embedding(encoder_vocab_size, inp_emb_size, input_length=encoder_timesteps)(encoder_input)
    for layer_no in range(max(no_enc_layers, 1)):
        # encoder_state is overwritten on every pass, so only the last layer's state survives
        encoder_seq, *encoder_state = recurrent_layer("encoder_" + str(layer_no))(encoder_seq)

    # Decoder: input -> embedding -> stacked recurrent layers seeded with the encoder state
    decoder_input = keras.layers.Input(shape=(decoder_timesteps,))
    decoder_seq = keras.layers.Embedding(decoder_vocab_size, inp_emb_size, input_length=decoder_timesteps)(decoder_input)
    for layer_no in range(max(no_dec_layers, 1)):
        decoder_seq, *_ = recurrent_layer("decoder_" + str(layer_no))(decoder_seq, initial_state=encoder_state)

    # Softmax FC layer applied at every decoder timestep
    decoder_output = keras.layers.Dense(decoder_vocab_size, activation="softmax")(decoder_seq)

    # The model turns encoder_input_data and decoder_input_data into decoder_target_data
    model = keras.Model([encoder_input, decoder_input], decoder_output)
    model.summary()
    return model

# Build the model with default hyperparameters, sized from the vocabularies and padded sequence lengths
model = create_model(
    encoder_vocab_size=len(input_char_dec),
    decoder_vocab_size=len(target_char_dec),
    encoder_timesteps=max_encoder_seq_length,
    decoder_timesteps=max_decoder_seq_length,
)

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 21)]         0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 20, 16)       432         input_19[0][0]                   
__________________________________________________________________________________________________
embedding_19 (Embedding)        (None, 21, 16)       1056        input_20[0][0]                   
____________________________________________________________________________________________

In [None]:
# NOTE(review): looks like a leftover scratch cell (it has no execution count) that only
# prints the length of a literal string — consider removing it from the final notebook
a = 'abc'
print(len(a))