### КУРС "Введение в обработку естественного языка"

#### дз к уроку 10 “Перевод без механизма внимания SEQ2SEQ модель”

Разобраться с моделькой перевода (без механизма внимания) как она устроена, запустить для перевода с русского на английский (при желании можно взять другие пары языков)

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import sklearn
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [2]:
devices = tf.config.list_physical_devices('GPU')
devices

[]

!wget http://www.manythings.org/anki/rus-eng.zip
!mkdir rus-eng
!unzip rus-eng.zip -d rus-eng/

In [3]:
!ls ./rus-eng/

"ls" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.


In [4]:
path_to_file = './rus-eng/rus.txt'

In [5]:
def preprocess_sentence(w):
  w = w.lower().strip()

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Zа-яА-Я?.!,']+", " ", w)

  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [6]:
preprocess_sentence('Just do it!')

'<start> just do it ! <end>'

In [7]:
def create_dataset(path, num_examples):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')

  word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]]  for l in lines[:num_examples]]

  return zip(*word_pairs)

In [8]:
en, ru = create_dataset(path_to_file, None)
print(en[140])
print(ru[140])

<start> i lied . <end>
<start> я солгал . <end>


In [9]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                         padding='post')

  return tensor, lang_tokenizer

In [10]:
def load_dataset(path, num_examples=None):
  # creating cleaned input, output pairs
  targ_lang, inp_lang = create_dataset(path, num_examples)

  input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
  target_tensor, targ_lang_tokenizer = tokenize(targ_lang)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

### Let's create model and train it

In [11]:
num_examples = 100000
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [12]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, 
                                                                                                target_tensor,
                                                                                                test_size=0.2)

In [13]:
def convert(lang, tensor):
    for t in tensor:
        if t != 0:
            print(f"{t} ----> {lang.index_word[t]}")

In [14]:
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])

Input Language; index to word mapping
1 ----> <start>
4 ----> я
75 ----> никогда
7 ----> не
820 ----> пью
181 ----> одна
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
4 ----> i
104 ----> never
211 ----> drink
119 ----> alone
3 ----> .
2 ----> <end>


#### create a tf.data and dataset

In [15]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 300
units = 1024
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [16]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 15]), TensorShape([64, 11]))

In [17]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences = False,
                                       return_state= True,
                                       recurrent_initializer = 'glorot_uniform')
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [18]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

sample_hidden = encoder.initialize_hidden_state()
sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder Hidden state shape: (batch size, units) (64, 1024)


In [19]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences = True,
                                       return_state = True,
                                       recurrent_initializer = 'glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        x = self.fc(output)
        
        return x, state

In [20]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_sample_x, decoder_sample_h = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)

#### optimizer and loss function

In [21]:
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [22]:
checkpoint_dir = './training_text_translate_checkpoints'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [23]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

#### let's training

In [24]:
EPOCHS = 50

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.6398
Epoch 1 Batch 100 Loss 2.1077
Epoch 1 Batch 200 Loss 1.7835
Epoch 1 Batch 300 Loss 1.6146
Epoch 1 Batch 400 Loss 1.5024
Epoch 1 Batch 500 Loss 1.4629
Epoch 1 Batch 600 Loss 1.3938
Epoch 1 Batch 700 Loss 1.2825
Epoch 1 Batch 800 Loss 1.2586
Epoch 1 Batch 900 Loss 1.1891
Epoch 1 Batch 1000 Loss 1.2587
Epoch 1 Batch 1100 Loss 1.0392
Epoch 1 Batch 1200 Loss 0.9754
Epoch 1 Loss 1.4506
Time taken for 1 epoch 1264.1662464141846 sec

Epoch 2 Batch 0 Loss 0.9118
Epoch 2 Batch 100 Loss 0.9356
Epoch 2 Batch 200 Loss 0.8833
Epoch 2 Batch 300 Loss 0.9317
Epoch 2 Batch 400 Loss 0.7873
Epoch 2 Batch 500 Loss 0.8788
Epoch 2 Batch 600 Loss 0.7924
Epoch 2 Batch 700 Loss 0.7030
Epoch 2 Batch 800 Loss 0.6695
Epoch 2 Batch 900 Loss 0.6539
Epoch 2 Batch 1000 Loss 0.5643
Epoch 2 Batch 1100 Loss 0.6885
Epoch 2 Batch 1200 Loss 0.6019
Epoch 2 Loss 0.7770
Time taken for 1 epoch 1518.204639673233 sec

Epoch 3 Batch 0 Loss 0.5024
Epoch 3 Batch 100 Loss 0.4817
Epoch 3 Batch 200 Loss 0.56

Epoch 18 Batch 1000 Loss 0.0663
Epoch 18 Batch 1100 Loss 0.0705
Epoch 18 Batch 1200 Loss 0.0968
Epoch 18 Loss 0.0643
Time taken for 1 epoch 1228.3260338306427 sec

Epoch 19 Batch 0 Loss 0.0275
Epoch 19 Batch 100 Loss 0.0412
Epoch 19 Batch 200 Loss 0.0482
Epoch 19 Batch 300 Loss 0.0554
Epoch 19 Batch 400 Loss 0.0550
Epoch 19 Batch 500 Loss 0.0438
Epoch 19 Batch 600 Loss 0.0730
Epoch 19 Batch 700 Loss 0.0814
Epoch 19 Batch 800 Loss 0.0759
Epoch 19 Batch 900 Loss 0.0383
Epoch 19 Batch 1000 Loss 0.0620
Epoch 19 Batch 1100 Loss 0.0585
Epoch 19 Batch 1200 Loss 0.0712
Epoch 19 Loss 0.0632
Time taken for 1 epoch 1213.810257434845 sec

Epoch 20 Batch 0 Loss 0.0356
Epoch 20 Batch 100 Loss 0.0401
Epoch 20 Batch 200 Loss 0.0718
Epoch 20 Batch 300 Loss 0.0294
Epoch 20 Batch 400 Loss 0.0666
Epoch 20 Batch 500 Loss 0.0324
Epoch 20 Batch 600 Loss 0.0445
Epoch 20 Batch 700 Loss 0.0622
Epoch 20 Batch 800 Loss 0.0543
Epoch 20 Batch 900 Loss 0.0888
Epoch 20 Batch 1000 Loss 0.1135
Epoch 20 Batch 1100 Loss 

Epoch 36 Batch 100 Loss 0.0663
Epoch 36 Batch 200 Loss 0.0319
Epoch 36 Batch 300 Loss 0.0354
Epoch 36 Batch 400 Loss 0.0374
Epoch 36 Batch 500 Loss 0.0569
Epoch 36 Batch 600 Loss 0.0541
Epoch 36 Batch 700 Loss 0.0460
Epoch 36 Batch 800 Loss 0.0508
Epoch 36 Batch 900 Loss 0.0532
Epoch 36 Batch 1000 Loss 0.0462
Epoch 36 Batch 1100 Loss 0.0475
Epoch 36 Batch 1200 Loss 0.0618
Epoch 36 Loss 0.0502
Time taken for 1 epoch 1443.7949714660645 sec

Epoch 37 Batch 0 Loss 0.0333
Epoch 37 Batch 100 Loss 0.0474
Epoch 37 Batch 200 Loss 0.0416
Epoch 37 Batch 300 Loss 0.0435
Epoch 37 Batch 400 Loss 0.0444
Epoch 37 Batch 500 Loss 0.0487
Epoch 37 Batch 600 Loss 0.0263
Epoch 37 Batch 700 Loss 0.0582
Epoch 37 Batch 800 Loss 0.0612
Epoch 37 Batch 900 Loss 0.0775
Epoch 37 Batch 1000 Loss 0.0559
Epoch 37 Batch 1100 Loss 0.0434
Epoch 37 Batch 1200 Loss 0.0532
Epoch 37 Loss 0.0494
Time taken for 1 epoch 26675.997575759888 sec

Epoch 38 Batch 0 Loss 0.0274
Epoch 38 Batch 100 Loss 0.0454
Epoch 38 Batch 200 Loss 0

In [25]:
def evaluate(sentence):
  #attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden = decoder(dec_input, dec_hidden)

    # storing the attention weights to plot later on
    predicted_id = tf.argmax(predictions[0]).numpy()
    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [26]:
def translate(sentence):
  result, sentence = evaluate(sentence)

  print(f'Input: {sentence}')
  print(f'Predicted translation: {result}')

In [27]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x26d018cafa0>

In [28]:
translate('Все было хорошо')

Input: <start> все было хорошо <end>
Predicted translation: was all right ? <end> 


In [29]:
translate('Что вы делали вчера?')

Input: <start> что вы делали вчера ? <end>
Predicted translation: what were you seen ? <end> 
