#### CS20M059 Shibobrota Das | CS20M007 Abhishek Kumar

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
import pandas as pd
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, GRU, SimpleRNN, SimpleRNNCell, LSTMCell, GRUCell
from keras.models import Sequential
import os
import time
from sklearn.utils import shuffle

# Report the versions of the core libraries this notebook was run with.
for _label, _module in (
    ("numpy", np),
    ("tensorflow", tf),
    ("tensorflow Addons", tfa),
    ("keras", keras),
    ("pandas", pd),
):
    print(f"Using {_label}:", _module.__version__)

Using numpy: 1.19.5
Using tensorflow: 2.4.1
Using tensorflow Addons: 0.12.1
Using keras: 2.4.0
Using pandas: 1.2.3


In [2]:
def read_lexicon(path):
    """Read one tab-separated, header-less lexicon file into a DataFrame.

    ``na_filter=False`` switches off pandas' missing-value detection, so a
    word that happens to spell one of pandas' NA markers (e.g. "nan", "null",
    "NA") stays a string instead of being replaced by a float NaN. Blank
    fields are kept as empty strings for the same reason.

    Args:
        path: Location of the ``.tsv`` file.

    Returns:
        A DataFrame whose columns are labelled 0, 1, 2, ... in file order.
    """
    return pd.read_csv(path, sep='\t', header=None, na_filter=False)


val_df = read_lexicon("./lexicons/hi.translit.sampled.dev.tsv")
train_df = read_lexicon("./lexicons/hi.translit.sampled.train.tsv")
test_df = read_lexicon("./lexicons/hi.translit.sampled.test.tsv")
print("Data Loaded to Dataframes!")

Data Loaded to Dataframes!


#### Dataset Samples

In [3]:
# Show three randomly chosen validation rows to eyeball the column layout.
val_df.sample(3)

Unnamed: 0,0,1,2
3408,रीड,reed,2
2005,नक्सलवादी,naxalvaadi,1
145,अर्थनीति,arthaniti,1


In [4]:
# Single-character markers wrapped around every word before tokenization:
# `sos` is prepended (start of sequence), `eos` is appended (end of sequence).
sos, eos = "@", "#"

In [5]:
class LexDataset:
    """One data split: the raw tensors plus a shuffled, batched tf.data pipeline.

    Attributes:
        input_tensor: Encoded input sequences, exactly as passed in.
        target_tensor: Encoded target sequences, exactly as passed in.
        batch: ``tf.data.Dataset`` of ``(input, target)`` batches.
    """

    def __init__(self, input_tensor, target_tensor, batch_size):
        """Keep both tensors and build the batched dataset over them.

        Args:
            input_tensor: Input sequences; sliced along the first axis.
            target_tensor: Target sequences paired row-by-row with the inputs.
            batch_size: Number of pairs per batch.
        """
        self.input_tensor = input_tensor
        self.target_tensor = target_tensor

        pairs = tf.data.Dataset.from_tensor_slices((self.input_tensor, self.target_tensor))
        # The shuffle buffer spans the whole split.
        pairs = pairs.shuffle(len(self.input_tensor))
        # drop_remainder=True discards a final batch smaller than batch_size,
        # so every emitted batch has the same leading dimension.
        self.batch = pairs.batch(batch_size, drop_remainder=True)

In [8]:
class TransliterationDatatset:
    """Character-level transliteration data: two tokenizers plus train/val/test splits.

    Each frame in ``df_list`` supplies the target word in column ``0`` and the
    input word in column ``1``. Words are wrapped in the module-level
    ``sos``/``eos`` markers, tokenized one character at a time, zero-padded on
    the right and stored as ``LexDataset`` objects.

    Attributes:
        problem_type: Label supplied by the caller; it is only stored, nothing
            in this class reads it.
        batch_size: Batch size handed to every ``LexDataset``.
        input_tokenizer: Character-level ``Tokenizer`` for input words.
        target_tokenizer: Character-level ``Tokenizer`` for target words.
        train: ``LexDataset`` built from the first frame of ``df_list``.
        val: ``LexDataset`` built from the second frame.
        test: ``LexDataset`` built from the third frame (any further frame
            overwrites it).
    """

    # Attribute that receives the split found at position 0, 1, 2+ of df_list.
    _SPLIT_ATTRS = ("train", "val", "test")

    def __init__(self, df_list, problem_type = "en-hi", batch_size = 32):
        """Store the settings and immediately build every split.

        Args:
            df_list: Data frames ordered train -> val -> test.
            problem_type: Free-form label kept on the instance.
            batch_size: Batch size used for all splits.
        """
        self.problem_type = problem_type
        self.batch_size = batch_size
        self.input_tokenizer = None
        self.target_tokenizer = None
        self.train = self.val = self.test = None
        # Load Data
        self.load_dataset(df_list)

    def preprocess_word(self, w):
        """Return ``w`` as text with ``sos`` in front and ``eos`` behind it."""
        # !s forces str(w), so non-string cells are converted to text first.
        return f"{sos}{w!s}{eos}"

    def create_dataset(self, data_frame):
        """Shuffle ``data_frame`` and return ``(input_words, target_words)``.

        Input words come from column ``1`` and target words from column ``0``;
        both lists follow the same shuffled row order.
        """
        # Shuffle the data_frame before creating dataset
        shuffled_rows = shuffle(data_frame)
        input_words = [self.preprocess_word(word) for word in shuffled_rows[1]]
        target_words = [self.preprocess_word(word) for word in shuffled_rows[0]]
        return (input_words, target_words)

    @staticmethod
    def _encode(tokenizer, words):
        """Turn ``words`` into a right-padded integer matrix using ``tokenizer``."""
        # Without maxlen the width is the longest sequence in *this* call,
        # so each split ends up with its own sequence length.
        return pad_sequences(tokenizer.texts_to_sequences(words), padding='post')

    def load_dataset(self, df_list):
        """Fit both tokenizers and populate ``train`` / ``val`` / ``test``.

        Args:
            df_list: Data frames ordered train -> val -> test.
        """
        self.input_tokenizer = Tokenizer(num_words = None, char_level = True)
        self.target_tokenizer = Tokenizer(num_words = None, char_level = True)

        # Word lists for every split, in the order the frames were given.
        word_lists = [self.create_dataset(frame) for frame in df_list]

        # Both vocabularies are fitted on every split, not only on train.
        for source_words, dest_words in word_lists:
            self.input_tokenizer.fit_on_texts(source_words)
            self.target_tokenizer.fit_on_texts(dest_words)

        # Register id 0 (the value used for padding) as a blank in the
        # id -> character lookup of both tokenizers.
        for tokenizer in (self.target_tokenizer, self.input_tokenizer):
            tokenizer.index_word.update({0:" "})

        for position, (source_words, dest_words) in enumerate(word_lists):
            split = LexDataset(
                self._encode(self.input_tokenizer, source_words),
                self._encode(self.target_tokenizer, dest_words),
                self.batch_size,
            )
            # Position 0 -> train, 1 -> val, everything after that -> test.
            setattr(self, self._SPLIT_ATTRS[min(position, 2)], split)

In [9]:
# Build tokenizers and all three splits; the frames must be ordered train, val, test.
dataset = TransliterationDatatset(df_list=[train_df, val_df, test_df])

In [10]:
# Training split: (input shape, target shape) of the padded tensors
tuple(tensor.shape for tensor in (dataset.train.input_tensor, dataset.train.target_tensor))

((44204, 22), (44204, 21))

In [11]:
# Validation split: (input shape, target shape) of the padded tensors
tuple(tensor.shape for tensor in (dataset.val.input_tensor, dataset.val.target_tensor))

((4358, 20), (4358, 16))

In [12]:
# Test split: (input shape, target shape) of the padded tensors
tuple(tensor.shape for tensor in (dataset.test.input_tensor, dataset.test.target_tensor))

((4502, 18), (4502, 17))

#### Number of Tokens

In [13]:
# Vocabulary sizes used for the encoder (input) and decoder (target) sides.
# NOTE(review): index_word already holds the id-0 padding entry that
# TransliterationDatatset.load_dataset adds, so the extra +1 looks like it
# makes each count one larger than the number of distinct ids - confirm
# before changing, since later cells may depend on these values.
num_encoder_tokens, num_decoder_tokens = (
    len(tokenizer.index_word) + 1
    for tokenizer in (dataset.input_tokenizer, dataset.target_tokenizer)
)
num_encoder_tokens, num_decoder_tokens

(30, 67)

#### Maximum Sequence Lengths

In [14]:
# Longest padded sequence length across train, val and test, per side.
_all_splits = (dataset.train, dataset.val, dataset.test)
max_encoder_seq_length = np.max([split.input_tensor.shape[1] for split in _all_splits])
max_decoder_seq_length = np.max([split.target_tensor.shape[1] for split in _all_splits])
max_encoder_seq_length, max_decoder_seq_length

(22, 21)

In [15]:
# Model size settings.
embedding_dim, units = 16, 128
# Whole batches per pass over the training split (floor division).
steps_per_epoch = len(dataset.train.input_tensor) // dataset.batch_size
steps_per_epoch

1381

#### Example batch - dataset

In [16]:
# Pull the first (input, target) batch from the training pipeline and show its shapes.
_first_batch = next(iter(dataset.train.batch))
example_input_batch, example_target_batch = _first_batch
(example_input_batch.shape, example_target_batch.shape)

(TensorShape([32, 22]), TensorShape([32, 21]))