# Neural Machine Translation

Translating French to English using Neural Networks

In [1]:
import os, sys

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
BATCH_SIZE=100
EPOCHS=10
LSTM_NODES=256
NUM_SENTENCES=20000
MAX_SENTENCE_LENGTH=20
MAX_NUM_WORDS=20000
EMBEDDING_SIZE=100

In [4]:
input_sentences=[]
output_sentences=[]
output_sentences_inputs=[]

Neural Machine Translation models are built on a seq2seq architecture which is an encoder - decoder based architectureconsisiting of two LSTM networks. The input to the encoder LSTM is the sentence in the original language; the input to the decoder LSTM is the sentence in the translated language with a start-of-sentence token. The output is the actual target sentence with an end-of-sentence token.

In [5]:
count = 0
for line in open("./data/fra.txt", encoding="utf-8"):
    count += 1

    if count > NUM_SENTENCES:
        break

    if "\t" not in line:
        continue

    input_sentence, output, _ = line.rstrip().split("\t")

    output_sentence = output + " <eos>"
    output_sentences_input = "<sos> " + output
    input_sentences.append(input_sentence)
    output_sentences.append(output_sentence)
    output_sentences_inputs.append(output_sentences_input)


In [6]:
print(input_sentences[172])
print(output_sentences[172])
print(output_sentences_inputs[172])

Beat it.
Pars ! <eos>
<sos> Pars !


The next step is tokenizing the original and translated sentences and applying padding to the sentences that are longer or shorter than a certain length, which in case of inputs will be the length of the longest input sentence. And for the output this will be the length of the longest sentence in the output.

For tokenization, the Tokenizer class from the keras.preprocessing.text library can be used. The tokenizer class performs two tasks:

* It divides a sentence into the corresponding list of word
* Then it converts the words to integers

In [7]:
input_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)
input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

Total unique words in the input: 3449
Length of longest sentence in input: 5


In [8]:
output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = output_tokenizer.word_index
print('Total unique words in the output: %s' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the output: 9543
Length of longest sentence in the output: 12


In [9]:
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)
print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print("encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (20000, 5)
encoder_input_sequences[172]: [  0   0   0 304   4]


In [10]:
decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

decoder_input_sequences.shape: (20000, 12)
decoder_input_sequences[172]: [  2 370   4   0   0   0   0   0   0   0   0   0]


In [11]:
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_output_sequences.shape)
print("decoder_input_sequences[172]:", decoder_output_sequences[172])

decoder_input_sequences.shape: (20000, 12)
decoder_input_sequences[172]: [370   4   1   0   0   0   0   0   0   0   0   0]


In [12]:
print("Input Sentence: ",input_sentences[172])
print("Input Sentence Encoded: ",input_integer_seq[172])
print("Padded Input Sentence for Encoder: ",encoder_input_sequences[172])

Input Sentence:  Beat it.
Input Sentence Encoded:  [304, 4]
Padded Input Sentence for Encoder:  [  0   0   0 304   4]


In [13]:
print("Output Sentence: ",output_sentences[172])
print("Output Sentence Encoded: ",output_integer_seq[172])
print("Padded Input Decoder Sentence: ",decoder_input_sequences[172])

Output Sentence:  Pars ! <eos>
Output Sentence Encoded:  [370, 4, 1]
Padded Input Decoder Sentence:  [  2 370   4   0   0   0   0   0   0   0   0   0]


In [14]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()

glove_file = open(
    "C:/Users/Ashwin/Data-Science/Natural-Language-Processing/Neural Machine Translation/glove.6B.100d.txt",
    encoding="utf-8",
    errors="ignore",
)

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype="float32")
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()


In [15]:
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = zeros((num_words, EMBEDDING_SIZE))
for word, index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [16]:
print(embeddings_dictionary["beat"])

[-0.36376    0.28693    0.94244   -0.63514    0.076384   0.83271
  0.58714    0.0082005 -1.0876    -0.13608    0.31405   -0.069519
 -0.84956    0.27327   -0.052305   0.25085   -0.25873    0.37005
 -0.59384    0.29734    0.9568     0.046776   0.62049    1.2733
  0.57751   -0.24495    0.23065   -0.67114    0.9366    -0.40403
 -0.73548    0.57319    0.22002    0.62443   -0.023422  -0.87126
 -0.87828    0.10236   -0.0058819 -0.54341   -0.084448  -1.2349
 -0.32515   -0.57239    0.2542    -0.38591    0.30615    0.15316
  0.57722   -0.8711    -0.62893    0.48035   -0.49498    0.73514
  0.3135    -2.2475    -0.36309    0.69576    0.46218    0.21857
 -0.22019   -0.60873   -0.66334    0.18873   -0.09517    0.067118
  0.23001    1.633     -0.41638    0.17992   -0.31783    0.056987
 -0.1619    -0.0047663  0.26996   -0.049623  -0.39014   -0.40589
  0.22046    0.1226     0.84783    0.36986   -1.2954     0.075642
 -1.0363    -1.0294    -0.77231    1.123     -0.16174    0.30077
  0.092628  -0.34509   

In [17]:
print(embedding_matrix[539])

[ 0.23665    -0.041405    0.64863002  0.16824    -0.80225003  0.25167999
 -0.15488     0.44527999  1.11880004  0.031041    0.69330001  0.38863999
 -0.12191    -0.030912    0.057447   -0.22809     0.21014     0.41951999
 -0.46458    -0.040251    0.011725    0.21571    -0.36116001 -0.85667002
 -0.075501   -0.0056213  -0.71068001 -0.26758999  0.63815999 -0.78713
  0.036123    0.78338999  0.29251999  0.22616    -0.63032001  0.012733
 -0.33213001  0.0094381   0.48791999 -0.41505    -0.064688    0.36812001
  0.27667001 -0.54086    -0.93717003  0.40316999  0.25663999 -0.15063
  0.35049    -0.81292999  0.25003999  0.53745002  0.29888999  0.033292
 -0.39787    -0.32912999 -0.097228    0.16338    -0.069737    0.22385
  0.28185999  0.67523003 -0.28990999 -0.22619     0.29635    -0.38473001
  0.54737002 -0.037079   -0.40792999 -0.19731    -0.11675     0.14914
  0.18508001 -0.21537    -0.43698001  0.61523998 -0.071701   -0.031935
 -0.02658     0.41485    -0.38890001 -0.20225    -0.54961997  0.05770

In [18]:
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len)
decoder_targets_one_hot = np.zeros((
        len(input_sentences),
        max_out_len,
        num_words_output
    ),
    dtype='float32'
)

In [19]:
decoder_targets_one_hot.shape

(20000, 12, 9544)

In [20]:
decoder_input_sequences.shape

(20000, 12)

In [21]:
for i, d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

In [22]:
decoder_targets_one_hot

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [23]:
encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs,h,c=encoder(x)
encoder_states=[h,c]


In [24]:
decoder_inputs_placeholder=Input(shape=(max_out_len,))
decoder_embedding=Embedding(num_words_output,LSTM_NODES)
decoder_inputs_x=decoder_embedding(decoder_inputs_placeholder)

decoder_lstm=LSTM(LSTM_NODES,return_sequences=True,return_state=True)
decoder_outputs, _, _=decoder_lstm(decoder_inputs_x,initial_state=encoder_states)

In [25]:
decoder_dense=Dense(num_words_output,activation='softmax')
decoder_outputs=decoder_dense(decoder_outputs)

model=Model([
    encoder_inputs_placeholder,decoder_inputs_placeholder
], decoder_outputs)
model.compile(
    optimizer="rmsprop",
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [26]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 5)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 12)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 5, 100)       345000      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 12, 256)      2443264     ['input_2[0][0]']                
                                                                                              

In [27]:
encoder

<keras.layers.recurrent_v2.LSTM at 0x2b0cf6bd2e0>

In [28]:
results=model.fit(
    [encoder_input_sequences,decoder_input_sequences],
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
