## Seq to Seq Modelling : English to Hindi Translation

In [70]:
import numpy as np
np.random.seed(42)

In [None]:
local_path="G:/UpX Academy/Deep Learning/Week6SeqToSeq/"

In [None]:
import zipfile
import io

In [None]:
zf = zipfile.ZipFile(local_path+'hin-eng.zip', 'r')

In [None]:
# Read the whole corpus with a single call instead of growing a string one
# line at a time (repeated `+=` on a str can cost quadratic time).
with zf.open('hin.txt') as readfile:
    data = io.TextIOWrapper(readfile, 'utf-8').read()

In [None]:
data

In [None]:
data =  data.split('\n')

In [None]:
data

In [None]:
data[0]

### Review data

In [None]:
len(data)

### Separate out encoder and decoder input

In [None]:
encoder_text = []

In [None]:
decoder_text = []

In [None]:
# Split every corpus line into its source (encoder) and target (decoder) text.
for line in data:
    try:
        in_txt, out_txt = line.split('\t')
    except ValueError:
        # The line does not hold exactly two tab-separated fields (for
        # example the empty string left after the final newline): skip it.
        continue

    encoder_text.append(in_txt)

    # Wrap the target in '<start>' / '<end>' markers so the decoder sees an
    # explicit beginning and end for every sentence.
    decoder_text.append('<start> ' + out_txt + ' <end>')

In [None]:
decoder_text[105:110]

In [None]:
encoder_text[105:110]

### Build input and output sequences

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

Encoder Tokenizer

In [None]:
encoder_t = Tokenizer()

In [None]:
encoder_t.fit_on_texts(encoder_text)

In [None]:
encoder_seq = encoder_t.texts_to_sequences(encoder_text)

In [None]:
encoder_seq[105:110]

In [None]:
# Length (in tokens) of the longest encoder sequence; used later as the
# padding length for the encoder input.
max_encoder_seq_length = max(map(len, encoder_seq))
max_encoder_seq_length

In [None]:
encoder_vocab_size = len(encoder_t.word_index)

In [None]:
encoder_vocab_size

Decoder Tokenizer

In [None]:
# Same characters as the Keras default filter list except that '<' and '>'
# are left out, so the '<start>' / '<end>' markers added to every target
# sentence are kept intact (decoder_t.word_index['<start>'] is looked up
# when decoding).
decoder_t = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

In [None]:
decoder_t.fit_on_texts(decoder_text)

In [None]:
decoder_seq = decoder_t.texts_to_sequences(decoder_text)

In [None]:
decoder_seq[105:110]

In [None]:
# Length (in tokens) of the longest decoder sequence; used later as the
# padding length for the decoder input.
max_decoder_seq_length = max(map(len, decoder_seq))
max_decoder_seq_length

In [None]:
decoder_vocab_size = len(decoder_t.word_index)

In [None]:
decoder_vocab_size

### Pad Sequences

In [None]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [None]:
encoder_input_data = pad_sequences(encoder_seq, maxlen=max_encoder_seq_length, padding='post')

In [None]:
decoder_input_data = pad_sequences(decoder_seq, maxlen=max_decoder_seq_length, padding='post')

In [99]:
encoder_input_data.shape

(2808, 22)

In [100]:
encoder_input_data[1456]

array([  38,  743,    8,  519, 1085,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [None]:
decoder_input_data.shape

Integer to Word Converter for Decoder Data

In [None]:
# Reverse lookup table (token id -> word) built from the decoder vocabulary.
int_to_word_decoder = {index: word for word, index in decoder_t.word_index.items()}

In [None]:
int_to_word_decoder[15]

## Build Decoder Output

In [None]:
decoder_target_data = np.zeros((decoder_input_data.shape[0], decoder_input_data.shape[1]))

In [None]:
decoder_target_data.shape

In [None]:
# The target at position t is the decoder input at position t + 1, i.e. the
# decoder input shifted left by one step; the last target column is left
# untouched. One slice assignment replaces the element-by-element double loop.
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

In [None]:
decoder_input_data[0]

In [None]:
decoder_target_data[0]

Convert target data into one-hot encoding

In [None]:
from tensorflow.python.keras.utils import  to_categorical

In [None]:
decoder_target_one_hot = np.zeros((decoder_input_data.shape[0], 
                                   decoder_input_data.shape[1],
                                   len(decoder_t.word_index)+1))

In [None]:
# One-hot encode every target token id with a single vectorized assignment
# instead of calling to_categorical once per (sentence, time-step) pair.
decoder_target_one_hot[...] = 0  # every row is rewritten, as in the loop form
sample_idx, step_idx = np.indices(decoder_target_data.shape)
decoder_target_one_hot[sample_idx, step_idx, decoder_target_data.astype(int)] = 1

In [None]:
decoder_target_one_hot.shape

In [None]:
decoder_target_one_hot[0:1]

## Building the Training Model

In [None]:
from tensorflow.python.keras.layers import Input, LSTM, Dense, Embedding

In [None]:
#define config parameters
encoder_embedding_size = 50
decoder_embedding_size = 50
rnn_units = 256

Build Encoder

In [None]:
encoder_inputs = Input(shape=(None,))

In [None]:
encoder_inputs.shape

In [None]:
encoder_embedding = Embedding(encoder_vocab_size+1, encoder_embedding_size)

In [None]:
encoder_embedding_output = encoder_embedding(encoder_inputs)

In [None]:
x, state_h, state_c = LSTM(rnn_units,return_state=True)(encoder_embedding_output)

In [None]:
state_c.graph

In [None]:
encoder_states = [state_h, state_c]

Build Decoder

In [None]:
decoder_inputs = Input(shape=(None,))

In [None]:
decoder_embedding = Embedding(decoder_vocab_size + 1, decoder_embedding_size)

In [None]:
decoder_embedding_output = decoder_embedding(decoder_inputs)

In [None]:
decoder_rnn = LSTM(rnn_units, return_sequences=True, return_state=True)

In [None]:
#Initialize initial state with encoder_states
#Output will be all hidden sequences, last 'h' state and last 'c' state
x,_,_ = decoder_rnn(decoder_embedding_output, initial_state=encoder_states)

In [None]:
decoder_dense = Dense(decoder_vocab_size + 1, activation='softmax')

In [None]:
decoder_outputs = decoder_dense(x)

## Build model using both Encoder and Decoder

In [None]:
from tensorflow.python.keras.models import Model

In [None]:
model = Model([encoder_inputs, decoder_inputs], #2 Inputs to the model
              decoder_outputs) #Output of the model

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [71]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 50)     119450      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     149650      input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM

In [86]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_one_hot,
          batch_size=64,
          epochs=100,
          validation_split=0.2)

Train on 2246 samples, validate on 562 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x19600de73c8>

In [None]:
model.save('seq2seq_training_translation.hd5')

In [87]:
decoder_outputs

<tf.Tensor 'dense_1/truediv:0' shape=(?, ?, 2993) dtype=float32>

## Building Model for Prediction

### Build the encoder model to predict Encoder States

In [88]:
encoder_model = Model(encoder_inputs, encoder_states)

### Build the decoder model

In [89]:
decoder_state_input_h = Input(shape=(rnn_units,))

In [90]:
decoder_state_input_c = Input(shape=(rnn_units,))

In [91]:
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

Get RNN outputs and state(s)

In [92]:
x = decoder_embedding(decoder_inputs)

In [93]:
#We will use the layer which we trained earlier
rnn_outputs, state_h, state_c = decoder_rnn(x, initial_state=decoder_states_inputs)

In [94]:
#Why do we need this?
#We need this to predict the sequence of target language words
decoder_states = [state_h, state_c]

Get decoder output

In [95]:
decoder_outputs = decoder_dense(rnn_outputs)

Build decoder model

In [96]:
decoder_model = Model([decoder_inputs] + decoder_states_inputs,  #Model inputs
                     [decoder_outputs] + decoder_states)

## Predicting Output

Build prediction function

In [97]:
def decode_sentence(input_sequence):
    """Greedily decode the translation of one encoded source sentence.

    Args:
        input_sequence: Input for ``encoder_model.predict``; the caller in
            this notebook passes a one-row slice of ``encoder_input_data``.

    Returns:
        The predicted target words joined by single spaces, without the
        '<start>' / '<end>' markers.
    """
    # The encoder's final states become the decoder's initial states.
    decoder_initial_states_value = encoder_model.predict(input_sequence)

    # The decoder is fed one token at a time, beginning with '<start>'.
    target_seq = np.zeros((1, 1))
    target_seq[0][0] = decoder_t.word_index['<start>']

    predicted_words = []

    # Bound the number of generated *words*. The limit is a token count, so
    # it must not be compared with the character length of the sentence.
    while len(predicted_words) < max_decoder_seq_length:

        predicted_outputs, h, c = decoder_model.predict(
            [target_seq] + decoder_initial_states_value)

        # Pick the token id with the highest probability at the last step.
        predicted_output = np.argmax(predicted_outputs[0, -1, :])

        # Ids missing from the lookup table (e.g. the 0 used for padding)
        # yield None and end decoding instead of raising KeyError.
        predicted_word = int_to_word_decoder.get(predicted_output)
        if predicted_word is None or predicted_word == '<end>':
            break

        predicted_words.append(predicted_word)

        # Feed the prediction and the updated states into the next step.
        target_seq[0][0] = predicted_output
        decoder_initial_states_value = [h, c]

    return ' '.join(predicted_words)

Call Prediction function

In [98]:
# Pick a random start index; the `- 10` leaves room for the 10 consecutive
# sentences decoded below.
start_num = np.random.randint(0, high=len(encoder_text) - 10)
print(start_num)

# Decode 10 consecutive sentences and print each next to its source text.
for i in range(start_num, start_num + 10):
    # Slice (rather than index) so the sample keeps a leading axis of size 1.
    input_seq = encoder_input_data[i : i+1]
    predicted_sentence = decode_sentence(input_seq)
    print('--------')
    print ('Input sentence: ', encoder_text[i])
    print ('Predicted sentence: ', predicted_sentence )

1373
--------
Input sentence:  My telephone is out of order.
Predicted sentence:  मेरा फ़ोन खराब है।
--------
Input sentence:  Now they have three children.
Predicted sentence:  अब उनके तीन बच्चे हैं।
--------
Input sentence:  One language is never enough.
Predicted sentence:  एक भाषा कभी काफ़ी नहीं होती।
--------
Input sentence:  Please make yourself at home.
Predicted sentence:  इसको अपना घर ही समझो।
--------
Input sentence:  Please make yourself at home.
Predicted sentence:  इसको अपना घर ही समझो।
--------
Input sentence:  Please show me your notebook.
Predicted sentence:  मुझे लगता है कि तुम ग़लत हो।
--------
Input sentence:  Please wait for five minutes.
Predicted sentence:  कृपया पाँच मिनट इंतेज़ार करें।
--------
Input sentence:  Say which one you would like.
Predicted sentence:  बोलो तुम्हे कौनसा चाहिए।
--------
Input sentence:  Seicho Matumoto died in 1992.
Predicted sentence:  सेइचो मात्सुमोतो का निधन सन १९९२
--------
Input sentence:  She died yesterday afternoon.
Predicted sen

## Save prediction model and tokenizers

In [102]:
#Save encoder and decoder model for Prediction
# NOTE(review): both inference models are compiled right before saving even
# though no fit call on them is visible in this notebook - presumably only so
# the saved files carry a compile configuration; confirm this is needed.
encoder_model.compile(optimizer='adam', loss='mse')
decoder_model.compile(optimizer='adam', loss='categorical_crossentropy')
# NOTE(review): the directory below repeats the value assigned to
# `local_path` near the top of the notebook; keep the two in sync.
encoder_model.save('G:/UpX Academy/Deep Learning/Week6SeqToSeq/seq2seq_encoder_eng_hin.hd5')
decoder_model.save('G:/UpX Academy/Deep Learning/Week6SeqToSeq/seq2seq_decoder_eng_hin.hd5')



In [103]:
#Save tokenizers
import pickle

# Open each output file in a `with` block so it is flushed and closed as soon
# as the tokenizer has been written, rather than leaking the file handle.
with open('G:/UpX Academy/Deep Learning/Week6SeqToSeq/encoder_tokenizer_eng', 'wb') as encoder_file:
    pickle.dump(encoder_t, encoder_file)
with open('G:/UpX Academy/Deep Learning/Week6SeqToSeq/decoder_tokenizer_hin', 'wb') as decoder_file:
    pickle.dump(decoder_t, decoder_file)