In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint,TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


In [2]:
mark_start  = 'ssss '
mark_end = ' eeee'

In [3]:
data = pd.read_csv("../input/dictionary.csv")

In [4]:
def preProcess(data):
    data = str(data).lower()
    return data

In [5]:
data['english_word'] = mark_start + data['english_word'].astype(str) + mark_end

In [6]:
sanskrit_data = data['sanskrit_word'].apply(preProcess)
english_data = data['english_word'].apply(preProcess)

In [7]:
english_data[1]

'ssss  the syllable om eeee'

In [8]:
# naxunyn number of vocabulary.
num_words = 10000

In [9]:
class TokenizerWrap(Tokenizer):
    
    def __init__(self, texts, padding, reverse=False,num_words=None):
        
        Tokenizer.__init__(self, num_words=num_words)
        
        self.fit_on_texts(texts)
        
        self.index_to_word = dict(zip(self.word_index.values(),
                                     self.word_index.keys()))
        
        self.tokens = self.texts_to_sequences(texts)
        
        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = "pre"
        else:
            truncating = "post"
        
        self.num_tokens = [len(x) for x in self.tokens]
        
        self.max_tokens = np.mean(self.num_tokens) + 2 * np.std(self.num_tokens)
        
        self.max_tokens = int(self.max_tokens)
        
        self.tokens_padded = pad_sequences(self.tokens, maxlen=self.max_tokens, padding= padding, truncating = truncating)
        
    def token_to_word(self,token):
        word = " " if token == 0 else self.index_to_word[token]
        return word
    
    def tokens_to_string(self, tokens):
        words = [self.index_to_word[token]
                 for token in tokens
                 if token != 0]
        text = " ".join(words)
        return text
    
    def text_to_tokens(self, text, reverse=False, padding=False):
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)

        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = 'pre'
        else:
            truncating = 'post'

        if padding:
            tokens = pad_sequences(tokens,
                                   maxlen=self.max_tokens,
                                   padding='pre',
                                   truncating=truncating)

        return tokens

In [10]:
sanskrit_tokenizer = TokenizerWrap(texts=sanskrit_data,
                              padding='pre',
                              reverse=True,
                              num_words=num_words)

In [11]:
english_tokenizer = TokenizerWrap(texts=english_data,
                              padding='post',
                              reverse=False,
                              num_words=num_words)

In [12]:
sanskrit_tokens = sanskrit_tokenizer.tokens_padded
english_tokens = english_tokenizer.tokens_padded
print(sanskrit_tokens.shape)
print(english_tokens.shape)

(9323, 2)
(9323, 10)


In [13]:
print(english_tokens[:5])

[[   1 2559  122    2    0    0    0    0    0    0]
 [   1    3 1167 1636    2    0    0    0    0    0]
 [   1   36    2    0    0    0    0    0    0    0]
 [   1    8 2560 2561  599   65    5  136 1637    2]
 [   1  455    3  456 2562   17   53    3  357    2]]


In [14]:
token_start = english_tokenizer.word_index[mark_start.strip()]
token_start

1

In [15]:
token_end = english_tokenizer.word_index[mark_end.strip()]
token_end

2

In [16]:
# Training data
encoder_input_data = sanskrit_tokens

In [17]:
decoder_input_data = english_tokens[:,:-1]
decoder_input_data.shape

(9323, 9)

In [18]:
decoder_output_data = english_tokens[:,1:]
decoder_output_data.shape

(9323, 9)

In [19]:
#Creating the encoder

encoder_input = Input(shape=(None,), name="encoder_input")
embedding_size = 128
encoder_embedding = Embedding(input_dim=num_words,
                             output_dim= embedding_size,
                             name="encoder_embedding")

In [20]:
state_size = 512

encoder_gru1 = GRU(state_size, name = "encoder_gru1", return_sequences=True)
encoder_gru2 = GRU(state_size, name = "encoder_gru2", return_sequences=True)
encoder_gru3 = GRU(state_size, name = "encoder_gru3", return_sequences=False)

In [21]:
def connect_encoder():
    net = encoder_input
    
    net = encoder_embedding(net)
    
    net = encoder_gru1(net)
    net = encoder_gru2(net)
    net = encoder_gru3(net)
    
    encoder_output = net
    
    return encoder_output

In [22]:
encoder_output = connect_encoder()

In [23]:
#Decoder
decoder_initial_state = Input(shape=(state_size,),
                             name="decoder_initial_state")

In [24]:
decoder_input = Input(shape=(None,), name = "decoder_input")

In [25]:
decoder_embedding = Embedding(input_dim=num_words,
                             output_dim=embedding_size,
                             name="decoder_embedding")

In [26]:
decoder_gru1 = GRU(state_size,name="decoder_gru1",return_sequences=True)
decoder_gru2 = GRU(state_size,name="decoder_gru2",return_sequences=True)
decoder_gru3 = GRU(state_size,name="decoder_gru3",return_sequences=True)

In [27]:
decoder_dense = Dense(num_words,
                     activation="linear",
                     name="decoder_output")

In [28]:
def connect_decoder(initial_state):
    net = decoder_input
    
    net = decoder_embedding(net)
    
    net = decoder_gru1(net,initial_state=initial_state)
    net = decoder_gru2(net,initial_state=initial_state)
    net = decoder_gru3(net,initial_state=initial_state)
    
    decoder_output = decoder_dense(net)
    
    return decoder_output

In [29]:
decoder_output = connect_decoder(initial_state=encoder_output)

In [30]:
model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

In [31]:
model_encoder = Model(inputs=[encoder_input],
                     outputs=[encoder_output])

In [32]:
decoder_output= connect_decoder(initial_state=decoder_initial_state)
model_decoder = Model(inputs=[decoder_input,decoder_initial_state],
                     outputs=[decoder_output])

In [33]:
def sparse_cross_entropy(y_true, y_pred):
    
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true,
                                                          logits=y_pred)
    loss_mean = tf.reduce_mean(loss)
    
    #loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    #    logits=logits, labels=labels))
    return loss_mean

In [34]:
optimizer = RMSprop(lr=1e-3)

In [35]:
decoder_target = tf.placeholder(dtype='int32', shape=(None,None))

In [36]:
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

In [37]:
path_checkpoint = '21_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

In [38]:
callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=3, verbose=1)

In [39]:

callback_tensorboard = TensorBoard(log_dir='./21_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

In [40]:
callbacks = [callback_early_stopping,
             callback_checkpoint,
             callback_tensorboard]

In [41]:
try:
    model_train.load_weights(path_checkpoint)
except Exception as error:
    print("Error trying to load checkpoint.")
    print(error)

Error trying to load checkpoint.
Unable to open file (unable to open file: name = '21_checkpoint.keras', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


In [42]:
x_data = \
{
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data
}

In [43]:
decoder_output_data.shape

(9323, 9)

In [44]:
y_data = \
{
    'decoder_output': decoder_output_data
}

In [45]:
validation_split = 0.0050792360828931325
validation_split

0.0050792360828931325

In [46]:
model_train.fit(x=x_data,
                y=y_data,
                batch_size=512,
                epochs=10,
                validation_split=validation_split)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 9275 samples, validate on 48 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f136b923e10>

In [47]:
def translate(input_text, true_output_text=None):

    input_tokens = sanskrit_tokenizer.text_to_tokens(text=input_text,
                                                reverse=True,
                                                padding=True)
    
    initial_state = model_encoder.predict(input_tokens)

    max_tokens = english_tokenizer.max_tokens
    
    shape = (1, max_tokens)
    decoder_input_data = np.zeros(shape=shape, dtype=np.int)

    token_int = token_start

    output_text = ''

    count_tokens = 0

    while token_int != token_end and count_tokens < max_tokens:

        decoder_input_data[0, count_tokens] = token_int

        x_data = \
        {
            'decoder_initial_state': initial_state,
            'decoder_input': decoder_input_data
        }

        decoder_output = model_decoder.predict(x_data)

        token_onehot = decoder_output[0, count_tokens, :]
        
        token_int = np.argmax(token_onehot)

        sampled_word = english_tokenizer.token_to_word(token_int)

        output_text += " " + sampled_word

        count_tokens += 1

    output_tokens = decoder_input_data[0]
    
    print("Input text:")
    print(input_text)
    print()

    print("Translated text:")
    print(output_text)
    print()

    if true_output_text is not None:
        print("True output text:")
        print(true_output_text)
        print()

In [48]:
idx = 500
translate(input_text=sanskrit_data[idx],
          true_output_text=english_data[idx])

Input text:
apraapya 

Translated text:
 maternal eeee

True output text:
ssss  failing to attain eeee

