In [1]:
import os, sys
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt


2021-08-30 21:04:35.362396: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-30 21:04:35.362417: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# --- Corpus / vocabulary limits ---
NUM_SENTENCES = 20000        # upper bound on the number of corpus lines read by the loading cell
MAX_NUM_WORDS = 20000        # passed as `num_words` to both tokenizers
MAX_SENTENCE_LENGTH = 50     # not referenced in the cells shown here -- TODO confirm it is still needed

# --- Model dimensions ---
EMBEDDING_SIZE = 100         # column count of the embedding matrix / Embedding output size
LSTM_NODES = 256             # presumably the LSTM hidden size used by the model cells further down

# --- Training settings (presumably consumed by the fit call further down) ---
BATCH_SIZE = 64
EPOCHS = 20

### Data Preprocessing
Neural machine translation models are often based on the seq2seq architecture. Seq2seq is an encoder-decoder architecture that consists of two LSTMs: an encoder LSTM and a decoder LSTM. The input to the encoder LSTM is the sentence in the original language; the input to the decoder LSTM is the sentence in the translated language, prefixed with a start-of-sentence token. The output is the actual target sentence, followed by an end-of-sentence token.

* eos = end of sentence
* sos = start of sentence

In [8]:
# Three parallel lists, one entry per usable corpus line:
input_sentences = []          # source sentence (first tab-separated column)
output_sentences = []         # target sentence with ' <eos>' appended
output_sentences_inputs = []  # target sentence with '<sos> ' prepended

# Read the corpus inside a `with` block so the file handle is closed even if
# parsing raises part-way through.
with open('datasets/fra.txt', encoding='utf-8') as corpus_file:
    # Line numbers start at 1 so that only the first NUM_SENTENCES lines are examined.
    for line_number, line in enumerate(corpus_file, start=1):
        if line_number > NUM_SENTENCES:
            break

        fields = line.rstrip().split('\t')

        # Skip lines without both a source and a target column. Checking the
        # field count after stripping also covers a line whose target column is
        # empty, which would otherwise raise IndexError on fields[1].
        if len(fields) < 2:
            continue

        # Any columns beyond the first two are ignored.
        input_sentence, output = fields[0], fields[1]

        output_sentence = output + ' <eos>'
        output_sentences_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentences_input)

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))


num samples input: 20000
num samples output: 20000
num samples output input: 20000


In [9]:
# Print the same sample (index 172) from each of the three parallel lists:
# encoder input, decoder target, decoder input.
for sentences in (input_sentences, output_sentences, output_sentences_inputs):
    print(sentences[172])

Beat it.
Pars ! <eos>
<sos> Pars !


### Tokenization and Padding




In [11]:
# Fit a tokenizer on the source sentences and convert each one into a list of
# integer word indices.
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

# word -> integer index mapping learned by the tokenizer
word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

# The longest encoded sentence determines the encoder padding length.
max_input_len = len(max(input_integer_seq, key=len))
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 3441
Length of longest sentence in input: 5


In [12]:
# One tokenizer is shared by the decoder inputs ('<sos> ...') and the decoder
# targets ('... <eos>'), so both use the same word indices.
# filters='' switches off the tokenizer's character filtering; the Keras default
# filter set contains '<' and '>', which would strip the brackets from the
# '<sos>' / '<eos>' markers.
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)

output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

# word -> integer index mapping learned by the tokenizer
word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

# One extra slot on top of the vocabulary size, since the padded sequences
# also contain the value 0.
num_words_output = 1 + len(word2idx_outputs)

# The longest encoded target sentence determines the decoder padding length.
max_out_len = len(max(output_integer_seq, key=len))
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 9499
Length of longest sentence in the output: 12


In [14]:
# Pad every encoder sequence to the length of the longest input sentence.
# 'pre' is pad_sequences' default mode, spelled out here: zeros go in front,
# so the real words sit at the end of each row.
encoder_input_sequences = pad_sequences(
    input_integer_seq,
    maxlen=max_input_len,
    padding='pre',
)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (20000, 5)
encoder_input_sequences[172]: [  0   0   0 304   4]


Since there are 20,000 input sentences and each one is padded to length 5 (the length of the longest input sentence), the shape of the input is now (20000, 5). The integer sequence for the sentence at index 172 consists of three zeros followed by the values 304 and 4. The sentence at index 172 is `Beat it.` The tokenizer split it into the two words `beat` and `it` and converted them into integers; `pad_sequences` then applied pre-padding by adding three zeros at the start of the integer sequence.


To verify that the integers for `beat` and `it` are 304 and 4, look them up in the input word index:


In [16]:
# Look up the integer index the input tokenizer assigned to each word.
for token in ("beat", "it"):
    print(word2idx_inputs[token])


304
4


In [17]:
# Pad every decoder input sequence to the length of the longest target sentence.
# padding='post' appends the zeros after the real tokens.
decoder_input_sequences = pad_sequences(
    output_input_integer_seq,
    maxlen=max_out_len,
    padding='post',
)
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

decoder_input_sequences.shape: (20000, 12)
decoder_input_sequences[172]: [  2 373   4   0   0   0   0   0   0   0   0   0]


In the case of the decoder, post-padding is applied, which means that zeros are appended at the end of the sentence. In the encoder, zeros were padded at the beginning. The reason behind this approach is that the encoder output is based on the words occurring at the end of the sentence, so the original words were kept at the end of the sequence and the zeros were placed at the beginning. The decoder, on the other hand, starts processing from the beginning of the sentence, and therefore post-padding is performed on the decoder inputs and outputs.


### Word embedding



In [18]:
from numpy import array
from numpy import asarray
from numpy import zeros

# word -> float32 vector, filled from the GloVe text file
embeddings_dictionary = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open(r'datasets/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        # A blank line yields no fields; skip it rather than fail on records[0].
        if not records:
            continue
        # The first field is the word, the remaining fields are its vector components.
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions


Create a matrix in which the row number is the integer index of a word and the columns hold the components of that word's embedding vector. This matrix will contain the word embeddings for the words in our input sentences.



In [21]:
# Number of rows: the vocabulary size plus one extra row for index 0, capped at MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)

# Words with no pre-trained vector keep their all-zero row.
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    # The tokenizer's word_index lists every word it saw, regardless of the
    # `num_words` cap, so indices can exceed the matrix size once the
    # vocabulary is larger than MAX_NUM_WORDS. Skip those rather than raise
    # IndexError.
    if index >= num_words:
        continue
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [22]:
# Raw pre-trained vector for 'beat', shown so it can be compared with the
# matching row of embedding_matrix.
print(embeddings_dictionary['beat'])

[-0.36376    0.28693    0.94244   -0.63514    0.076384   0.83271
  0.58714    0.0082005 -1.0876    -0.13608    0.31405   -0.069519
 -0.84956    0.27327   -0.052305   0.25085   -0.25873    0.37005
 -0.59384    0.29734    0.9568     0.046776   0.62049    1.2733
  0.57751   -0.24495    0.23065   -0.67114    0.9366    -0.40403
 -0.73548    0.57319    0.22002    0.62443   -0.023422  -0.87126
 -0.87828    0.10236   -0.0058819 -0.54341   -0.084448  -1.2349
 -0.32515   -0.57239    0.2542    -0.38591    0.30615    0.15316
  0.57722   -0.8711    -0.62893    0.48035   -0.49498    0.73514
  0.3135    -2.2475    -0.36309    0.69576    0.46218    0.21857
 -0.22019   -0.60873   -0.66334    0.18873   -0.09517    0.067118
  0.23001    1.633     -0.41638    0.17992   -0.31783    0.056987
 -0.1619    -0.0047663  0.26996   -0.049623  -0.39014   -0.40589
  0.22046    0.1226     0.84783    0.36986   -1.2954     0.075642
 -1.0363    -1.0294    -0.77231    1.123     -0.16174    0.30077
  0.092628  -0.34509   

In [23]:
# Row of the embedding matrix for 'beat'. The row number is looked up from the
# tokenizer's word index instead of being hard-coded, because the integer
# assigned to a word depends on the corpus slice the tokenizer was fitted on
# (it is 304 for the run shown here).
print(embedding_matrix[word2idx_inputs['beat']])

[-0.36375999  0.28692999  0.94243997 -0.63514     0.076384    0.83271003
  0.58714002  0.0082005  -1.08759999 -0.13608     0.31404999 -0.069519
 -0.84956002  0.27327001 -0.052305    0.25084999 -0.25872999  0.37005001
 -0.59384     0.29734001  0.95679998  0.046776    0.62049001  1.27330005
  0.57751    -0.24495     0.23064999 -0.67114002  0.93660003 -0.40403
 -0.73548001  0.57318997  0.22002     0.62443    -0.023422   -0.87125999
 -0.87827998  0.10236    -0.0058819  -0.54341    -0.084448   -1.2349
 -0.32515001 -0.57239002  0.25420001 -0.38591     0.30614999  0.15316001
  0.57722002 -0.87110001 -0.62892997  0.48034999 -0.49498001  0.73514003
  0.31349999 -2.24749994 -0.36309001  0.69576001  0.46217999  0.21856999
 -0.22019    -0.60873002 -0.66333997  0.18873    -0.09517     0.067118
  0.23001     1.63300002 -0.41637999  0.17992    -0.31783     0.056987
 -0.1619     -0.0047663   0.26995999 -0.049623   -0.39014    -0.40588999
  0.22046     0.1226      0.84783     0.36985999 -1.29540002  0.

In [24]:
# Embedding layer for the encoder input, initialised with the matrix built
# from the pre-trained vectors above.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_SIZE,
    weights=[embedding_matrix],
    input_length=max_input_len,
)

### Creating Model
First we define our outputs. The output is a sequence of words. There are 9,499 unique words in the output, plus one extra slot for the padding index 0, which gives 9,500 possible values for each output position. The longest output sentence has 12 tokens, and each input sentence needs a corresponding output sentence. Therefore, the final shape of the output will be:
`(number of inputs, length of output sentence, number of words in output)`

In [25]:
# Pre-allocate the (initially all-zero) decoder target array with one slot per
# (sentence, output position, output word).
# NOTE: at the (20000, 12, 9500) shape shown below, a float32 array of this
# size takes roughly 9 GB of memory.
one_hot_shape = (len(input_sentences), max_out_len, num_words_output)
decoder_targets_one_hot = np.zeros(one_hot_shape, dtype='float32')

In [26]:
# Display the allocated shape: (number of sentences, max_out_len, num_words_output).
decoder_targets_one_hot.shape


(20000, 12, 9500)