In [19]:
import os, sys
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt


In [20]:
# --- Training hyper-parameters ---
BATCH_SIZE = 64              # samples per gradient update passed to model.fit
EPOCHS = 20                  # passes over the training data
LSTM_NODES = 256             # units in the encoder/decoder LSTMs (also the decoder embedding width)

# --- Corpus / vocabulary limits ---
NUM_SENTENCES = 20_000       # number of corpus lines read from the translation file
MAX_SENTENCE_LENGTH = 50     # NOTE(review): not referenced by any visible cell — confirm it is still needed
MAX_NUM_WORDS = 20_000       # `num_words` cap handed to both Tokenizers
EMBEDDING_SIZE = 100         # width of the pre-trained word vectors used by the encoder embedding

### Data Preprocessing
Neural machine translation models are often based on the seq2seq architecture. The seq2seq architecture is an encoder-decoder architecture that consists of two LSTMs: an encoder LSTM and a decoder LSTM. The input to the encoder LSTM is the sentence in the original language; the input to the decoder LSTM is the sentence in the translated language with a start-of-sentence token. The output is the actual target sentence with an end-of-sentence token.

* eos = end of sentence
* sos = start of sentence

In [21]:
# Three parallel lists, one entry per sentence pair kept from the corpus.
input_sentences = []            # source sentences (fed to the encoder)
output_sentences = []           # target sentences with ' <eos>' appended (decoder targets)
output_sentences_inputs = []    # target sentences with '<sos> ' prepended (decoder inputs)

# `with` closes the corpus file even if parsing raises; a bare open() in the
# for-statement leaves the handle open until garbage collection.
with open('datasets/fra.txt', encoding='utf-8') as corpus_file:
    for count, line in enumerate(corpus_file, start=1):
        # Only the first NUM_SENTENCES lines of the file are considered
        # (skipped lines still count towards the limit).
        if count > NUM_SENTENCES:
            break

        # Each usable line is "source<TAB>target[<TAB>extra...]". Lines with fewer
        # than two fields after stripping (no tab, or only a trailing tab) are skipped
        # instead of raising IndexError.
        a = line.rstrip().split('\t')
        if len(a) < 2:
            continue
        input_sentence, output = a[0], a[1]

        # The decoder target ends with <eos>; the decoder input starts with <sos>.
        output_sentence = output + ' <eos>'
        output_sentences_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentences_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))


num samples input: 20000
num samples output: 20000
num samples output input: 20000


In [22]:
# Show the same sample (index 172) from each of the three parallel lists:
# raw source, target with <eos>, and target with <sos>.
for sentence_list in (input_sentences, output_sentences, output_sentences_inputs):
    print(sentence_list[172])

Beat it.
Pars ! <eos>
<sos> Pars !


### Tokenization and Padding




In [23]:
# Fit the source-side vocabulary and turn every source sentence into a list of word indices.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# word -> integer index mapping learned by the tokenizer
word2idx_inputs = input_tokenizer.word_index
# Longest tokenised source sentence; used later as the encoder padding length.
max_input_len = max(map(len, input_integer_seq))

print('Total unique words in the input: %s' % len(word2idx_inputs))
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 3441
Length of longest sentence in input: 5


In [24]:
# One tokenizer is shared by decoder inputs and decoder targets so that both use the
# same word indices. filters='' disables character filtering, so the '<sos>' / '<eos>'
# markers stay available as vocabulary entries (they are looked up by name at inference time).
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq, output_input_integer_seq = (
    output_tokenizer.texts_to_sequences(texts)
    for texts in (output_sentences, output_sentences_inputs)
)

# word -> integer index mapping for the target language
word2idx_outputs = output_tokenizer.word_index
# One more than the vocabulary size, so the largest word index is a valid class id.
num_words_output = 1 + len(word2idx_outputs)
# Longest tokenised target sentence; used later as the decoder padding length.
max_out_len = max(map(len, output_integer_seq))

print('Total unique words in the output: %s' % len(word2idx_outputs))
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 9499
Length of longest sentence in the output: 12


In [25]:
# Pad every source sequence to max_input_len so the encoder input is a rectangular array.
# No `padding` argument is given, so zeros go at the front (see the printed row below).
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (20000, 5)
encoder_input_sequences[172]: [  0   0   0 304   4]


Since there are 20,000 input sentences and each one is padded to the maximum input length of 5, the shape of the input is now (20000, 5). Looking at the integer sequence for the sentence at index 172, you can see three zeros followed by the values 304 and 4. The sentence at index 172 is `Beat it.`. The tokenizer split the sentence into the two words `beat` and `it`, converted them into integers, and then applied pre-padding by adding three zeros at the start of the integer sequence.


To verify that integer for `beat` and `it` are 304 and 4


In [26]:
# Look up the integer index assigned to each word of sample 172.
for token in ("beat", "it"):
    print(word2idx_inputs[token])


304
4


In [27]:
# Pad the <sos>-prefixed target sequences to max_out_len; padding='post' appends the zeros
# after the real tokens (see the printed row below).
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

decoder_input_sequences.shape: (20000, 12)
decoder_input_sequences[172]: [  2 373   4   0   0   0   0   0   0   0   0   0]


In the case of decoder, the post-padding is applied, which means that zeros are appended at the end of the sentence. In encoder, zeros were padded at the beginning. The reason behind this approach is that encoder output is based on the words occurring at the end of the sentence, therefore the original words were kept at the end of the sentence and zeros were padded at the beginning. On the other hand, in the case of decoder, the processing starts from beginning of sentence, and therefore post padding is performed on decoder inputs and outputs.


### Word embedding



In [28]:
from numpy import array
from numpy import asarray
from numpy import zeros

# word -> float32 vector, read from the pre-trained GloVe text file.
embeddings_dictionary = dict()

# `with` closes the file even if a line fails to parse; an explicit close() after
# the loop is skipped when an exception is raised mid-file.
with open(r'datasets/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        # Each line is: <word> <v1> <v2> ... separated by whitespace.
        records = line.split()
        if not records:
            # Blank line: nothing to parse (records[0] would raise IndexError).
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions


Create a matrix where the row number will represent the integer value for the word and the columns will correspond to the dimensions of the word. This matrix will contain the word embeddings for the words in our input sentences.



In [29]:
# Row i of embedding_matrix holds the pre-trained vector for the word whose tokenizer
# index is i. Row 0 and rows of words missing from the pre-trained vocabulary stay all-zero.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs)+1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    # word_index contains every word seen during fitting, even beyond the num_words cap,
    # so indices past the last matrix row must be skipped to avoid an IndexError.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [30]:
# Pre-trained vector stored for the word 'beat' (compare with the matrix row printed in the next cell).
print(embeddings_dictionary['beat'])

[-0.36376    0.28693    0.94244   -0.63514    0.076384   0.83271
  0.58714    0.0082005 -1.0876    -0.13608    0.31405   -0.069519
 -0.84956    0.27327   -0.052305   0.25085   -0.25873    0.37005
 -0.59384    0.29734    0.9568     0.046776   0.62049    1.2733
  0.57751   -0.24495    0.23065   -0.67114    0.9366    -0.40403
 -0.73548    0.57319    0.22002    0.62443   -0.023422  -0.87126
 -0.87828    0.10236   -0.0058819 -0.54341   -0.084448  -1.2349
 -0.32515   -0.57239    0.2542    -0.38591    0.30615    0.15316
  0.57722   -0.8711    -0.62893    0.48035   -0.49498    0.73514
  0.3135    -2.2475    -0.36309    0.69576    0.46218    0.21857
 -0.22019   -0.60873   -0.66334    0.18873   -0.09517    0.067118
  0.23001    1.633     -0.41638    0.17992   -0.31783    0.056987
 -0.1619    -0.0047663  0.26996   -0.049623  -0.39014   -0.40589
  0.22046    0.1226     0.84783    0.36986   -1.2954     0.075642
 -1.0363    -1.0294    -0.77231    1.123     -0.16174    0.30077
  0.092628  -0.34509   

In [31]:
# Row 304 of the embedding matrix; 304 is the tokenizer index printed for 'beat' earlier.
print(embedding_matrix[304])

[-0.36375999  0.28692999  0.94243997 -0.63514     0.076384    0.83271003
  0.58714002  0.0082005  -1.08759999 -0.13608     0.31404999 -0.069519
 -0.84956002  0.27327001 -0.052305    0.25084999 -0.25872999  0.37005001
 -0.59384     0.29734001  0.95679998  0.046776    0.62049001  1.27330005
  0.57751    -0.24495     0.23064999 -0.67114002  0.93660003 -0.40403
 -0.73548001  0.57318997  0.22002     0.62443    -0.023422   -0.87125999
 -0.87827998  0.10236    -0.0058819  -0.54341    -0.084448   -1.2349
 -0.32515001 -0.57239002  0.25420001 -0.38591     0.30614999  0.15316001
  0.57722002 -0.87110001 -0.62892997  0.48034999 -0.49498001  0.73514003
  0.31349999 -2.24749994 -0.36309001  0.69576001  0.46217999  0.21856999
 -0.22019    -0.60873002 -0.66333997  0.18873    -0.09517     0.067118
  0.23001     1.63300002 -0.41637999  0.17992    -0.31783     0.056987
 -0.1619     -0.0047663   0.26995999 -0.049623   -0.39014    -0.40588999
  0.22046     0.1226      0.84783     0.36985999 -1.29540002  0.

In [32]:
# Encoder embedding layer: a num_words x EMBEDDING_SIZE lookup table initialised from
# embedding_matrix, declared for input sequences of length max_input_len.
# NOTE(review): `trainable=False` is not passed, so the pre-trained weights are presumably
# updated during training — confirm that is intended.
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)

### Creating Model
First we define our outputs. The output will be a sequence of words. The output vocabulary has 9500 entries (9499 unique words plus one extra slot for index 0), so each position in the output can take any of 9500 values. The maximum length of an output sentence is 12, and each input sentence needs a corresponding output sentence. Therefore, the final shape of the output will be:
`(number of inputs, length of output sentence, number of words in output)`

In [33]:
# create empty output array
# Shape: (number of sentence pairs, max_out_len, num_words_output); every entry starts at 0.
# NOTE(review): with the shape reported in the next cell, (20000, 12, 9500) as float32,
# this array needs roughly 9 GB — confirm the machine has enough memory.
decoder_targets_one_hot = np.zeros((len(input_sentences), max_out_len, num_words_output), dtype='float32')

In [34]:
# Display the target array's shape: (samples, time steps, output vocabulary size).
decoder_targets_one_hot.shape


(20000, 12, 9500)

To make predictions, the final layer of the model will be dense layer, therefore we need the outputs in the form of one-hot encoded vectors, since we will be using softmax activation function at the dense layer. To create such one-hot encoded output, the next step is to assign 1 to the column number that corresponds to the integer representation of the word.




In [35]:
# The targets must be the <eos>-terminated sentences (output_integer_seq), i.e. the decoder
# input shifted one step to the left. Building them from decoder_input_sequences (the
# <sos>-prefixed inputs) would train the model to echo its own input instead of
# predicting the next word.
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')

# Set a single 1 per (sentence i, time step t) at the index of the expected word.
# Padding positions carry index 0, so they are one-hot encoded at column 0.
for i, d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

The input to the encoder will be the sentence in English and output will be the hidden state and cell state of the LSTM

In [38]:
# Encoder graph: padded word indices -> embedding layer -> LSTM.
# (The variable name is misspelled; it is kept because later cells reference it.)
encoder_inputs_placeholdre = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholdre)
# return_state=True makes the layer return its final hidden and cell states next to its output.
encoder = LSTM(LSTM_NODES, return_state=True)

# Only the two final states are kept; they are handed to the decoder as its initial state.
encoder_outputs, h,c = encoder(x)
encoder_states = [h,c]

The decoder will have two inputs: the hidden state and cell state from the encoder, and the decoder input sentence, which is the output sentence with an `<sos>` token prepended at the beginning.

In [39]:
# Training-time decoder: it receives the whole <sos>-prefixed target sentence at once.
decoder_inputs_placeholder = Input(shape=(max_out_len,))
# Target-side embedding learned from scratch; its output width is LSTM_NODES.
decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# return_sequences=True yields one output per time step; the returned states are discarded here.
# The LSTM starts from the encoder's final hidden and cell states.
decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _,_ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

Finally, the output from the decoder LSTM is passed through a dense layer to predict decoder outputs



In [40]:
# Softmax over the num_words_output classes, applied to the decoder LSTM output.
decoder_dense = Dense(num_words_output, activation='softmax')
# decoder_outputs is rebound: from here on it holds the softmax output rather than the raw LSTM output.
decoder_outputs = decoder_dense(decoder_outputs)

In [41]:
# Training model: inputs are [encoder word indices, decoder word indices], output is the
# per-time-step softmax produced by decoder_dense.
model = Model([encoder_inputs_placeholdre, decoder_inputs_placeholder], decoder_outputs)

# categorical_crossentropy matches the one-hot encoded targets built earlier.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [42]:
# Write a diagram of the training model to model_plot4a.png.
# plot_model needs pydot and graphviz installed; the recorded cell output shows the
# message emitted when they are missing.
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model_plot4a.png', show_shapes=True, show_layer_names=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


Here we have two types of input. `input_1` is the input placeholder for the encoder, which is embedded and passed through `lstm_1` layer, which basically is the encoder LSTM. There are three outputs from `lstm1` layer:
* the output layer
* the hidden layer
* the cell state
However, only the cell state and hidden state are passed to the decoder.

Here `lstm2` layer is decoder LSTM. the `input2` contains the output sentences with `<sos>` token appended at the start. `input2` is also passed through an embedding layer and is used as input to the decoder LSTM, `lstm2`. Finally, the output from decoder LSTM is passed through the dense layer to make predictions.



In [None]:
# Train the model: the encoder gets the padded source sentences, the decoder gets the
# <sos>-prefixed target sentences, and the loss compares against the one-hot targets.
# validation_split=0.1 asks Keras to hold out 10% of the samples for validation.
# The object returned by fit() is kept in `r`.
r = model.fit([encoder_input_sequences, decoder_input_sequences],
              decoder_targets_one_hot,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              validation_split=0.1)

Inputs on the left of Encoder/Decoder, outputs on the right.

step 1:

`I'm ill -> Encoder -> enc(h1,c1)`

`enc(h1,c1) + <sos> -> Decoder -> je + dec(h1,c1)`

step 2:

`dec(h1,c1) + je -> Decoder -> suis + dec(h2,c2)`

step 3:

`dec(h2,c2) + suis -> Decoder -> malade. + dec(h3,c3)`

step 4:

`dec(h3,c3) + malade. -> Decoder -> <eos> + dec(h4,c4)`

However, during prediction the next word is predicted on the basis of the previous word, which was itself predicted in the previous time step. This is the purpose of the `<sos>` and `<eos>` tokens. While making actual predictions, the full output sequence is not available; in fact, that is what we have to predict. At the start of prediction the only word available to us is `<sos>`, since all output sentences start with `<sos>`.


// Inputs on the left of Encoder/Decoder, outputs on the right.

Step 1:
`I'm ill -> Encoder -> enc(h1,c1)`

`enc(h1,c1) + <sos> -> Decoder -> je + dec(h1,c1)`

step 2:

`dec(h1,c1) + je -> Decoder -> suis + dec(h2,c2)`

step 3:

`dec(h2,c2) + suis -> Decoder -> malade. + dec(h3,c3)`

step 4:

`dec(h3,c3) + malade. -> Decoder -> <eos> + dec(h4,c4)`

In step 1, the hidden state and cell state of the encoder, together with the `<sos>` token, are used as input to the decoder. The decoder predicts a word `y1`, which may or may not be correct. However, as per our model, the probability of a correct prediction is `0.7911`. In step 2, the decoder hidden state and cell state from step 1, along with `y1`, are used as input to the decoder, which predicts `y2`. The process continues until the `<eos>` token is encountered. All the predicted outputs from the decoder are then concatenated to form the final output sentence.

In [None]:
# Inference-time encoder: maps a padded source sentence to the encoder's final [h, c] states.
# It is built from the same input tensor and state tensors as the training model.
encoder_model = Model(encoder_inputs_placeholdre, encoder_states)

Since now at each step, we need the decoder hidden and cell states, we will modify our model to accept the hidden and cell state as:

In [None]:
# Inference-time decoder state inputs: at each step the previous hidden and cell states
# (each of width LSTM_NODES) are fed in explicitly.
decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

Now at each time step, there will be only single word in the decoder input, we need to modify the decoder embedding layer

In [None]:
# At inference the decoder sees one word index per step, hence the input shape (1,).
decoder_inputs_single = Input(shape=(1,))
# Reuses the decoder_embedding layer from the training graph.
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

Next, we need to create the placeholder for decoder outputs

In [None]:
# One decoding step through the same decoder_lstm layer used in training, starting from
# the externally supplied states. Note: decoder_outputs, h and c are rebound here.
decoder_outputs, h,c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

To make predictions, the decoder output is passed through the dense layer

In [None]:
# New decoder states after the step (note the capital 'S' in this variable name).
decoder_States = [h,c]
# Reuses the decoder_dense softmax layer to turn the LSTM output into word probabilities.
decoder_outputs = decoder_dense(decoder_outputs)

The final step is to define the updated decoder model.

In [None]:
# Inference-time decoder.
# Inputs:  [single word index, previous hidden state, previous cell state]
# Outputs: [softmax over the output vocabulary, new hidden state, new cell state]
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_States
)

In [None]:
# Write a diagram of the inference decoder to model_plot_dec.png.
# NOTE(review): plot_model was already imported in an earlier cell; this import is redundant but harmless.
from tensorflow.keras.utils import plot_model
plot_model(decoder_model, to_file='model_plot_dec.png', show_shapes=True, show_layer_names=True)

In the image above, lstm_2 is the modified decoder LSTM. You can see that it accepts a sentence with one word, as shown in input_5, and the hidden and cell states from the previous output (input_3 and input_4). The shape of the input sentence is now (None, 1), since there is only one word in the decoder input. In contrast, during training the shape of the decoder input was (None, 12), since the input contained a complete sentence padded to the maximum output length of 12.

### Making predictions

In [None]:
# Reverse lookups (integer index -> word) for the source and target vocabularies.
idx2word_input = {index: word for word, index in word2idx_inputs.items()}
idx2word_target = {index: word for word, index in word2idx_outputs.items()}

In [None]:
def translate_sentence(input_seq):
    """Greedily decode a translation for one padded source sequence.

    Args:
        input_seq: array holding a single padded source sentence, as produced by
            slicing ``encoder_input_sequences[i:i+1]``.

    Returns:
        The predicted target words joined by single spaces. Decoding stops when
        ``<eos>`` is predicted or after ``max_out_len`` steps.
    """
    # Encoder states summarising the source sentence; they seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # 1x1 decoder input that initially holds the index of '<sos>'.
    target_seq = np.full((1, 1), word2idx_outputs['<sos>'], dtype='float64')
    eos = word2idx_outputs['<eos>']
    output_sentence1 = []

    step = 0
    while step < max_out_len:
        step += 1
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Greedy choice: the most probable word index at this step.
        idx = np.argmax(output_tokens[0, 0, :])

        if idx == eos:
            break

        # Index 0 has no entry in idx2word_target, so it is not emitted as a word.
        if idx > 0:
            word1 = idx2word_target[idx]
            output_sentence1.append(word1)

        # Feed the prediction and the new states back in for the next step.
        target_seq[0, 0], states_value = idx, [h, c]

    return ' '.join(output_sentence1)

In the script above, we pass the input sequence to the `encoder_model`, which predicts the hidden state and the cell state, which are stored in `states_value` variable.

Next, we define a variable `target_seq`, which is a `1x1` matrix of all zeros. The `target_seq` variable then receives the first word fed to the decoder model, which is `<sos>`.

After that, the `eos` variable is initialized, which stores the integer value for the `<eos>` token. In the next line, the `output_sentence1` list is defined, which will contain the predicted translation.

Next, we execute a loop that runs for at most the length of the longest sentence in the output. In the first iteration the `decoder_model` predicts the output and the new hidden and cell states, using the h and c states of the encoder and the input token `<sos>`. The index of the predicted word is stored in the `idx` variable. If the predicted index is equal to the index of the `<eos>` token, the loop terminates. Otherwise, if the predicted index is greater than zero, the corresponding word is retrieved from the `idx2word_target` dictionary and stored in the `word1` variable, which is then appended to the `output_sentence1` list.

The `states_value` variable is updated with the new hidden and cell states of the decoder, and the index of the predicted word is stored in the `target_seq` variable. In the next loop cycle, the updated hidden and cell states, along with the index of the previously predicted word, are used to make new predictions. The loop continues until the maximum output sequence length is reached or the `<eos>` token is encountered.

### Testing model

In [None]:
# Pick one training sentence at random and translate it with the inference models.
i = np.random.choice(len(input_sentences))
# Slicing (rather than indexing) keeps the leading batch dimension.
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)

print('-')
for label, text in (('Input:', input_sentences[i]), ('Response:', translation)):
    print(label, text)