# Neural machine translation

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

## Download and prepare the dataset

We'll use a language dataset provided by http://www.manythings.org/anki/

In [2]:
!wget http://www.manythings.org/anki/fra-eng.zip

--2023-08-28 16:24:31--  http://www.manythings.org/anki/fra-eng.zip
Распознаётся www.manythings.org (www.manythings.org)… 173.254.30.110
Подключение к www.manythings.org (www.manythings.org)|173.254.30.110|:80... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 200 OK
Длина: 7757635 (7.4M) [application/zip]
Сохранение в: «fra-eng.zip.1»


2023-08-28 16:24:34 (3.41 MB/s) - «fra-eng.zip.1» сохранён [7757635/7757635]



In [3]:
!mkdir fra-eng
!unzip fra-eng.zip -d fra-eng/

mkdir: fra-eng: File exists
Archive:  fra-eng.zip
replace fra-eng/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [4]:
!ls fra-eng/ -lah

ls: -lah: No such file or directory
fra-eng/:
_about.txt fra.txt


In [5]:
# Download the file
path_to_file = "fra-eng/fra.txt"

In [6]:
def preprocess_sentence(w):
  w = w.lower().strip()

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [7]:
preprocess_sentence("I can't go.")

"<start> i can't go . <end>"

In [8]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENG, RUS]
def create_dataset(path, num_examples):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]

  return zip(*word_pairs)

In [9]:
en, fr = create_dataset(path_to_file, None)
print(en[0])
print(fr[0])

<start> go . <end>
<start> va ! <end>


In [10]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

  return tensor, lang_tokenizer

In [11]:
def load_dataset(path, num_examples=None):
  # creating cleaned input, output pairs
  targ_lang, inp_lang = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

### Limit the size of the dataset to experiment faster (optional)


In [12]:
len(en), len(fr)

(227815, 227815)

In [13]:
# Try experimenting with the size of that dataset
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [14]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

80000 80000 20000 20000


In [15]:
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ----> %s" % (t, lang.index_word[t]))

In [16]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
4279 ----> voyagez
6 ----> vous
315 ----> seule
5 ----> ?
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
19 ----> are
5 ----> you
1236 ----> traveling
136 ----> alone
6 ----> ?
2 ----> <end>


### Create a tf.data dataset

In [17]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 300
units = 1024
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 19]), TensorShape([64, 12]))

In [19]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=False,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_hidden = encoder(example_input_batch, sample_hidden)
# print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder Hidden state shape: (batch size, units) (64, 1024)


In [21]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden):
    # enc_output shape == (batch_size, max_length, hidden_size)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x, initial_state=hidden)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state

In [22]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_sample_x, decoder_sample_h = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)



In [23]:
decoder_sample_x.shape

TensorShape([64, 8799])

In [24]:
decoder_sample_h.shape

TensorShape([64, 1024])

## Define the optimizer and the loss function

In [25]:
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)



## Checkpoints (Object-based saving)

In [26]:
checkpoint_dir = './training_nmt_checkpoints'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [27]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [28]:
EPOCHS = 30

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.6005
Epoch 1 Batch 100 Loss 2.1720
Epoch 1 Batch 200 Loss 1.8914
Epoch 1 Batch 300 Loss 1.8370
Epoch 1 Batch 400 Loss 1.7023
Epoch 1 Batch 500 Loss 1.6483
Epoch 1 Batch 600 Loss 1.6539
Epoch 1 Batch 700 Loss 1.8174
Epoch 1 Batch 800 Loss 1.6562
Epoch 1 Batch 900 Loss 1.4531
Epoch 1 Batch 1000 Loss 1.2654
Epoch 1 Batch 1100 Loss 1.3141
Epoch 1 Batch 1200 Loss 1.2204
Epoch 1 Loss 1.6605
Time taken for 1 epoch 446.1971559524536 sec

Epoch 2 Batch 0 Loss 1.1915
Epoch 2 Batch 100 Loss 1.1043
Epoch 2 Batch 200 Loss 1.2825
Epoch 2 Batch 300 Loss 1.0896
Epoch 2 Batch 400 Loss 1.1058
Epoch 2 Batch 500 Loss 1.0992
Epoch 2 Batch 600 Loss 1.0320
Epoch 2 Batch 700 Loss 0.9814
Epoch 2 Batch 800 Loss 0.8737
Epoch 2 Batch 900 Loss 0.8621
Epoch 2 Batch 1000 Loss 1.0349
Epoch 2 Batch 1100 Loss 0.8576
Epoch 2 Batch 1200 Loss 0.8188
Epoch 2 Loss 0.9754
Time taken for 1 epoch 443.89236092567444 sec

Epoch 3 Batch 0 Loss 0.6267
Epoch 3 Batch 100 Loss 0.6422
Epoch 3 Batch 200 Loss 0.63

Epoch 18 Batch 1000 Loss 0.0997
Epoch 18 Batch 1100 Loss 0.0749
Epoch 18 Batch 1200 Loss 0.1389
Epoch 18 Loss 0.0785
Time taken for 1 epoch 438.77354979515076 sec

Epoch 19 Batch 0 Loss 0.0503
Epoch 19 Batch 100 Loss 0.0660
Epoch 19 Batch 200 Loss 0.0541
Epoch 19 Batch 300 Loss 0.0456
Epoch 19 Batch 400 Loss 0.0906
Epoch 19 Batch 500 Loss 0.0556
Epoch 19 Batch 600 Loss 0.0772
Epoch 19 Batch 700 Loss 0.0981
Epoch 19 Batch 800 Loss 0.0675
Epoch 19 Batch 900 Loss 0.0839
Epoch 19 Batch 1000 Loss 0.1079
Epoch 19 Batch 1100 Loss 0.0961
Epoch 19 Batch 1200 Loss 0.0817
Epoch 19 Loss 0.0773
Time taken for 1 epoch 436.8631088733673 sec

Epoch 20 Batch 0 Loss 0.0540
Epoch 20 Batch 100 Loss 0.0756
Epoch 20 Batch 200 Loss 0.0400
Epoch 20 Batch 300 Loss 0.1066
Epoch 20 Batch 400 Loss 0.0673
Epoch 20 Batch 500 Loss 0.1059
Epoch 20 Batch 600 Loss 0.0745
Epoch 20 Batch 700 Loss 0.1181
Epoch 20 Batch 800 Loss 0.1009
Epoch 20 Batch 900 Loss 0.1075
Epoch 20 Batch 1000 Loss 0.0868
Epoch 20 Batch 1100 Loss 

## Translate

* The evaluate function is similar to the training loop, except we don't use *teacher forcing* here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the *end token*.
* And store the *attention weights for every time step*.

Note: The encoder output is calculated only once for one input.

In [29]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden = decoder(dec_input, dec_hidden)

    # storing the attention weights to plot later on
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [30]:
def translate(sentence):
  result, sentence = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

## Restore the latest checkpoint and test

In [31]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x28a80e7d0>

In [34]:
translate('Ce site utilise Google.')

Input: <start> ce site utilise google . <end>
Predicted translation: this guy make a word . <end> 


In [36]:
translate('Cette est là pour vous.')

Input: <start> cette est l pour vous . <end>
Predicted translation: this is for you . <end> 


In [38]:
translate(u"ancien étudiant de l'université.")

Input: <start> ancien tudiant de l'universit . <end>
Predicted translation: the man kiss me open a stone . <end> 


In [40]:
translate(u'des démarches pour vous inscrire.')

Input: <start> des d marches pour vous inscrire . <end>
Predicted translation: sorry costs the effort . <end> 


In [42]:
translate(u'Préparer votre professionnelle.')

Input: <start> pr parer votre professionnelle . <end>
Predicted translation: go your friend . <end> 


In [44]:
translate(u'un nouvel élan à votre carrière.')

Input: <start> un nouvel lan votre carri re . <end>
Predicted translation: a girl is being paid to me . <end> 


In [46]:
translate(u'ou développer vos compétences.')

Input: <start> ou d velopper vos comp tences . <end>
Predicted translation: hand over your breathing . <end> 
