### Урок 10. Машинный перевод. Модель seq2seq и механизм внимания<br>
**Задание**

Разобраться, как устроена модель перевода (без механизма внимания), и запустить её для перевода с русского на английский (при желании можно взять другие пары языков).


Подготовим окружение

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

Посмотрим на данные

In [2]:
from google.colab import files

In [3]:
# Upload the corpus file through the Colab `files` helper (the cell output shows rus.txt being saved).
uploaded = files.upload()

Saving rus.txt to rus.txt


In [4]:
# Path of the uploaded corpus file; passed to create_dataset / load_dataset below.
path = "rus.txt"

In [5]:
def preprocess_sentence(w):
    """Normalize a raw sentence and wrap it in ``<start>``/``<end>`` markers.

    The text is lower-cased, the punctuation marks ``? . ! ,`` are split off
    into separate tokens, and every run of characters other than Latin or
    Cyrillic letters, those punctuation marks and the apostrophe is replaced
    by a single space.

    Args:
        w: Raw sentence (English or Russian).

    Returns:
        The cleaned sentence in the form ``'<start> ... <end>'``.
    """
    w = w.lower().strip()

    # Surround punctuation with spaces so that each mark becomes its own token.
    w = re.sub(r"([?.!,])", r" \1 ", w)
    # Collapse runs of spaces and double quotes into a single space.
    w = re.sub(r'[" "]+', " ", w)

    # Replace everything except letters, ".", "?", "!", "," and "'" with a space.
    # "ё"/"Ё" sit outside the "а-я"/"А-Я" code point ranges, so they are listed
    # explicitly; otherwise words such as "зелёная" would be split in two.
    w = re.sub(r"[^a-zA-Zа-яА-ЯёЁ?.!,']+", " ", w)

    w = w.strip()

    # Add the start-of-sentence and end-of-sentence markers.
    w = '<start> ' + w + ' <end>'
    return w

In [6]:
# Quick check of the preprocessing on a sample English sentence.
preprocess_sentence("I can't go.")

"<start> i can't go . <end>"

In [7]:
# Build the dataset from the corpus file:
# clean the sentences and return the pairs in the format [ENG, RUS].
def create_dataset(path, num_examples):
    """Read the tab-separated corpus and return the preprocessed columns.

    Only the first two tab-separated fields of every line are used; each of
    them is passed through ``preprocess_sentence``.

    Args:
        path: Path to the UTF-8 encoded corpus file.
        num_examples: Number of leading lines to use; ``None`` keeps all lines.

    Returns:
        A ``zip`` iterator over the columns of the cleaned pairs, i.e. one
        tuple of sentences per column (unpacked by callers as ``en, ru``).
    """
    # The context manager closes the file handle as soon as the text is read.
    with io.open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]
                  for l in lines[:num_examples]]

    return zip(*word_pairs)

In [9]:
# Load every sentence pair (num_examples=None keeps all lines) and print the first pair.
en, ru = create_dataset(path, None)
print(en[0])
print(ru[0])

<start> go . <end>
<start> марш ! <end>


In [10]:
# Tokenization helper
def tokenize(lang):
    """Fit a Keras tokenizer on `lang` and convert it to a padded id matrix.

    The tokenizer is created with ``filters=''``, i.e. with an empty filter
    string, and the id sequences are padded at the end (``padding='post'``).

    Args:
        lang: Collection of preprocessed sentences.

    Returns:
        A ``(tensor, tokenizer)`` pair: the padded id sequences and the fitted
        tokenizer.
    """
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    sequences = lang_tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences,
                                                           padding='post')
    return padded, lang_tokenizer

In [11]:
# Build the cleaned input/target tensors
def load_dataset(path, num_examples=None):
    """Load the corpus and tokenize both sides of it.

    The first column returned by ``create_dataset`` is used as the target
    language and the second one as the input language.

    Args:
        path: Path to the corpus file.
        num_examples: Number of sentence pairs to load; ``None`` loads all.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)``.
    """
    target_sentences, input_sentences = create_dataset(path, num_examples)

    # The input side is tokenized first, then the target side.
    input_tensor, inp_lang_tokenizer = tokenize(input_sentences)
    target_tensor, targ_lang_tokenizer = tokenize(target_sentences)

    return (input_tensor, target_tensor,
            inp_lang_tokenizer, targ_lang_tokenizer)

Опционально, ограничим размер датасета для большей скорости обучения

In [12]:
# Number of sentences on each side of the full corpus.
len(en), len(ru)

(479223, 479223)

In [14]:
# Use only the first num_examples sentence pairs to speed up training.
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path, num_examples)

# Padded sequence lengths (second dimension) of the target and input tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [15]:
# Split the dataset into training and validation parts (test_size=0.2 holds out 20%)
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)


# Sizes of the four resulting arrays
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

80000 80000 20000 20000


In [16]:
def convert(lang, tensor):
    """Print every non-zero token id of `tensor` next to its word.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of integer token ids; ids equal to 0 are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        word = lang.index_word[token_id]
        print("%d ----> %s" % (token_id, word))

In [17]:
# Compare the first input tensor of the training set with its target tensor
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
8 ----> это
9510 ----> слива
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
18 ----> this
8 ----> is
9 ----> a
3757 ----> plum
3 ----> .
2 ----> <end>


In [18]:
# Create the tf.data dataset and the model hyperparameters
# Shuffle buffer spans the whole training set
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
# Number of full batches per epoch
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
# Vocabulary sizes: number of words known to each tokenizer plus one
# (presumably to leave room for id 0, which is skipped as padding in convert() — verify)
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True discards a final batch smaller than BATCH_SIZE
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [19]:
# Take one batch from the dataset and look at the shapes of its input and target parts.
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 15]), TensorShape([64, 11]))

In [20]:
# Encoder class
class Encoder(tf.keras.Model):
    """Embedding layer followed by a GRU that encodes the input sentence."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the layers.

        Args:
            vocab_size: Size of the input vocabulary (embedding rows).
            embedding_dim: Dimension of the token embeddings.
            enc_units: Number of GRU units.
            batch_sz: Batch size used by ``initialize_hidden_state``.
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns both the per-step outputs and its final state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode token ids `x`, starting the GRU from state `hidden`.

        Returns:
            ``(output, state)``: the GRU output sequence and its final state.
        """
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero tensor of shape ``(batch_sz, enc_units)``."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [21]:
# Instantiate the encoder for the input-language vocabulary
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Run the example batch through the encoder and print the resulting shapes
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 15, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [22]:
# One more class: the attention mechanism
class BahdanauAttention(tf.keras.layers.Layer):
    """Attention layer scoring values with ``V(tanh(W1(query) + W2(values)))``."""

    def __init__(self, units):
        """Create the three dense layers; `units` is the size of W1 and W2."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector for `query` over `values`.

        Args:
            query: Query tensor; an axis is inserted at position 1 so that it
                can be added to the projected `values`.
            values: Tensor to attend over (the encoder output in this notebook).

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            `values` over axis 1 and the softmax weights over axis 1.
        """
        query_with_time_axis = tf.expand_dims(query, 1)

        projected = self.W1(query_with_time_axis) + self.W2(values)
        score = self.V(tf.nn.tanh(projected))

        # Normalize the scores along axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)

        weighted_values = attention_weights * values
        context_vector = tf.reduce_sum(weighted_values, axis=1)

        return context_vector, attention_weights

In [23]:
# Try the attention layer (10 units) on the sample encoder state/output and print the shapes
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 15, 1)


In [24]:
# Decoder class
class Decoder(tf.keras.Model):
    """Single-step decoder: attention, embedding, GRU and an output projection."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the layers.

        Args:
            vocab_size: Size of the target vocabulary (embedding rows and
                width of the output layer).
            embedding_dim: Dimension of the token embeddings.
            dec_units: Number of GRU units; also the attention layer size.
            batch_sz: Batch size (stored on the instance).
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: Current decoder input token ids.
            hidden: State used as the attention query. It is not passed to the
                GRU, which is called without an initial state.
            enc_output: Encoder output sequence to attend over.

        Returns:
            ``(x, state, attention_weights)``: output of the final dense layer,
            the GRU state and the attention weights.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)

        embedded = self.embedding(x)

        # Prepend the context vector to the embedding along the last axis.
        gru_input = tf.concat(
            [tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # Pass the concatenated vector to the GRU
        output, state = self.gru(gru_input)

        # Merge the leading axes so that the last dimension is output.shape[2].
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(flat_output)

        return x, state, attention_weights

In [25]:
# Instantiate the decoder for the target-language vocabulary
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Smoke-test the decoder on a random (BATCH_SIZE, 1) input with the sample encoder state/output
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 7386)


Определим оптимизатор и функцию потерь

In [26]:
# Adam optimizer with default settings
optimizer = tf.keras.optimizers.Adam()
# Cross-entropy on logits; reduction='none' keeps the individual loss values
# so that loss_function can mask them before averaging
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy loss that ignores positions whose target id is 0.

    Args:
        real: Target token ids.
        pred: Predicted logits for those targets.

    Returns:
        The mean of the masked loss values. The mean is taken over all
        positions, including the ones that were zeroed out by the mask.
    """
    per_token_loss = loss_object(real, pred)

    is_padding = tf.math.equal(real, 0)
    keep = tf.cast(tf.math.logical_not(is_padding),
                   dtype=per_token_loss.dtype)

    return tf.reduce_mean(per_token_loss * keep)

Контрольные точки объектов

In [27]:
# Directory and file prefix for the training checkpoints
checkpoint_dir = './training_attention_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# The checkpoint tracks the optimizer together with both models
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [28]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one optimization step on a batch using teacher forcing.

    Args:
        inp: Batch of input token ids.
        targ: Batch of target token ids; column 0 is never used as a label,
            decoding starts from a batch of '<start>' ids instead.
        enc_hidden: Initial hidden state for the encoder.

    Returns:
        The loss summed over the decoding steps divided by ``targ.shape[1]``.
    """
    loss = 0

    with tf.GradientTape() as tape:
      enc_output, enc_hidden = encoder(inp, enc_hidden)

      # The decoder starts from the final encoder state
      dec_hidden = enc_hidden

      # First decoder input: the '<start>' id repeated for every batch element
      dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

      # Teacher forcing: the target token is fed as the next decoder input
      for t in range(1, targ.shape[1]):
        # Pass the current input, hidden state and encoder output to the decoder
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

        loss += loss_function(targ[:, t], predictions)

        dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported value: summed loss divided by the target sequence length
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    # Gradients are taken of the summed loss, not of batch_loss
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [29]:
# Number of passes over the training dataset
EPOCHS = 50

for epoch in range(EPOCHS):
    start = time.time()

    # A zero encoder state is created once per epoch and passed to every step
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
      batch_loss = train_step(inp, targ, enc_hidden)
      total_loss += batch_loss

      # Progress report every 100 batches
      if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                    batch,
                                                    batch_loss.numpy()))
    # Save a checkpoint every second epoch
    if (epoch + 1) % 2 == 0:
      checkpoint.save(file_prefix = checkpoint_prefix)

    # Average batch loss of the epoch and the wall-clock time it took
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.6688
Epoch 1 Batch 100 Loss 2.0629
Epoch 1 Batch 200 Loss 1.8643
Epoch 1 Batch 300 Loss 1.7428
Epoch 1 Batch 400 Loss 1.5708
Epoch 1 Batch 500 Loss 1.5245
Epoch 1 Batch 600 Loss 1.3327
Epoch 1 Batch 700 Loss 1.2896
Epoch 1 Batch 800 Loss 1.1371
Epoch 1 Batch 900 Loss 1.1635
Epoch 1 Batch 1000 Loss 1.0995
Epoch 1 Batch 1100 Loss 1.0088
Epoch 1 Batch 1200 Loss 0.8496
Epoch 1 Loss 1.4429
Time taken for 1 epoch 116.59973812103271 sec

Epoch 2 Batch 0 Loss 0.8492
Epoch 2 Batch 100 Loss 0.8622
Epoch 2 Batch 200 Loss 0.7692
Epoch 2 Batch 300 Loss 0.5689
Epoch 2 Batch 400 Loss 0.5288
Epoch 2 Batch 500 Loss 0.6445
Epoch 2 Batch 600 Loss 0.5273
Epoch 2 Batch 700 Loss 0.5473
Epoch 2 Batch 800 Loss 0.6305
Epoch 2 Batch 900 Loss 0.4630
Epoch 2 Batch 1000 Loss 0.5625
Epoch 2 Batch 1100 Loss 0.4410
Epoch 2 Batch 1200 Loss 0.4731
Epoch 2 Loss 0.6174
Time taken for 1 epoch 97.69097685813904 sec

Epoch 3 Batch 0 Loss 0.3724
Epoch 3 Batch 100 Loss 0.3731
Epoch 3 Batch 200 Loss 0.35

In [30]:
# The evaluation function mirrors the training loop, except that teacher forcing is not used here.
def evaluate(sentence):
    """Translate one sentence by greedy decoding.

    The sentence is preprocessed, mapped to ids with ``inp_lang``, padded to
    ``max_length_inp`` and encoded once. The decoder is then run for at most
    ``max_length_targ`` steps, feeding its own previous prediction back as the
    next input, and stops early when it predicts ``'<end>'``.

    Args:
        sentence: Raw input sentence.

    Returns:
        ``(result, sentence, attention_plot)``: the predicted words joined by
        spaces (with a trailing space), the preprocessed input sentence and an
        array of shape ``(max_length_targ, max_length_inp)`` holding the
        attention weights of every decoding step.

    Raises:
        KeyError: If the preprocessed sentence contains a token that is not in
            ``inp_lang.word_index``.
    """
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)
    tokens = sentence.split(' ')

    # Report all out-of-vocabulary tokens at once instead of failing with a
    # bare KeyError on the first one inside the lookup below.
    unknown = [token for token in tokens if token not in inp_lang.word_index]
    if unknown:
        raise KeyError(
            'Tokens missing from the input vocabulary: {}'.format(unknown))

    inputs = [inp_lang.word_index[token] for token in tokens]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_inp,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)

    predicted_words = []

    # Zero initial encoder state for a batch of one sentence
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # Store the attention weights of this step for plotting
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # Greedy choice: the id with the highest score
        predicted_id = tf.argmax(predictions[0]).numpy()
        word = targ_lang.index_word[predicted_id]
        predicted_words.append(word)

        if word == '<end>':
            break

        # The predicted id is fed back as the next decoder input
        dec_input = tf.expand_dims([predicted_id], 0)

    # Every word is followed by a space, including the last one
    result = ''.join(word + ' ' for word in predicted_words)

    return result, sentence, attention_plot

In [31]:
# Helper that plots the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    """Show the attention matrix with the words as tick labels.

    Args:
        attention: 2-D array of attention weights to display.
        sentence: List of input words, used as x-axis labels.
        predicted_sentence: List of predicted words, used as y-axis labels.
    """
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    label_style = {'fontsize': 14}

    # An empty label is prepended to both label lists
    # (presumably because the first tick lies before the first cell — verify).
    x_labels = [''] + sentence
    y_labels = [''] + predicted_sentence
    ax.set_xticklabels(x_labels, fontdict=label_style, rotation=90)
    ax.set_yticklabels(y_labels, fontdict=label_style)

    # One major tick per unit on both axes
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [32]:
# Translation helper
def translate(sentence):
    """Translate `sentence` and print the preprocessed input and the prediction.

    Args:
        sentence: Raw input sentence, passed to ``evaluate``.
    """
    translation, cleaned_sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (cleaned_sentence))
    print('Predicted translation: {}'.format(translation))

    # Attention plotting is currently disabled:
    # attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    # plot_attention(attention_plot, sentence.split(' '), result.split(' '))

Восстановим последнюю контрольную точку и протестируем

In [33]:
# Restore the most recent checkpoint found in checkpoint_dir.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7da518104e80>

In [35]:
# Sample translation 1
translate('Хороший переводчик хорошо пишет.')

Input: <start> хороший переводчик хорошо пишет . <end>
Predicted translation: it's a good work . <end> 


In [36]:
# Sample translation 2
translate('Как твои дела?')

Input: <start> как твои дела ? <end>
Predicted translation: how are you doing ? <end> 


In [37]:
# Sample translation 3
translate('Я никогда такого не делаю.')

Input: <start> я никогда такого не делаю . <end>
Predicted translation: i never do that . <end> 


In [38]:
# Sample translation 4
translate('Вы пойдете в кино?')

Input: <start> вы пойдете в кино ? <end>
Predicted translation: will you see a movie ? <end> 


In [39]:
# Sample translation 5
translate('Сегодня плохая погода.')

Input: <start> сегодня плохая погода . <end>
Predicted translation: it's a bad today . <end> 


Результаты работы нейросети-переводчика можно признать удовлетворительными: примерно в двух третях случаев перевод попадает в контекст. Входными данными декодера на каждом временном шаге являются его предыдущие прогнозы, а также скрытое состояние и выходные данные энкодера. Когда модель предсказывает конечный токен, прогноз останавливается, а веса внимания сохраняются для каждого временного шага. Выходной сигнал энкодера рассчитывается только один раз для одного входа.