# Разобраться с задачей перевода с вниманием и без внимания (материалы в архиве с пометкой actual)

## Libs

In [1]:
import os
import time

import re

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import layers

## Prep data

In [2]:
# Tab-separated parallel corpus; create_dataset() keeps the first two columns of each line.
PATH_TO_FILE = '../data/rus-eng/rus.txt'
# Maximum number of corpus lines passed on to create_dataset().
NUM_EXAMPLES = 100_000

In [3]:
def preprocess_sentence(w):
    """Normalise a raw sentence and wrap it in ``<start>``/``<end>`` markers.

    The sentence is lower-cased, the punctuation marks ``? . ! ,`` are
    separated from neighbouring words by spaces, and every run of characters
    other than Latin/Russian letters, those punctuation marks and the
    apostrophe is replaced with a single space.

    Args:
        w: Raw sentence text.

    Returns:
        The cleaned sentence in the form ``'<start> ... <end>'``.
    """
    w = w.lower().strip()
    # Put spaces around punctuation so that each mark becomes its own token.
    w = re.sub(r"([?.!,])", r" \1 ", w)
    # Collapse runs of spaces and double quotes into one space.
    w = re.sub(r'[" "]+', " ", w)
    # "ё"/"Ё" sit outside the а-я / А-Я code-point ranges, so they have to be
    # listed explicitly; otherwise words such as "пьёт" are split in two.
    w = re.sub(r"[^a-zA-Zа-яА-ЯёЁ?.!,']+", " ", w)
    w = w.strip()
    return '<start> ' + w + ' <end>'

def create_dataset(path, num_examples=None):
    """Read a tab-separated corpus and return its cleaned columns.

    Args:
        path: Location of the corpus file.
        num_examples: Number of leading lines to keep; ``None`` keeps all.

    Returns:
        A ``zip`` iterator that yields one tuple of preprocessed sentences
        per column (only the first two columns of each line are used).
    """
    with open(path, 'r', encoding='utf') as source:
        raw_text = source.read()

    rows = raw_text.strip().split('\n')[:num_examples]

    cleaned_pairs = []
    for row in rows:
        columns = row.split('\t')[:2]
        cleaned_pairs.append([preprocess_sentence(column) for column in columns])

    # Transpose: a list of [first, second] pairs becomes (firsts, seconds).
    return zip(*cleaned_pairs)

def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as a padded id matrix.

    Args:
        lang: Collection of preprocessed sentences.

    Returns:
        A ``(tensor, tokenizer)`` pair: the post-padded id sequences and the
        fitted tokenizer.
    """
    # filters='' disables the tokenizer's own character filtering.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)

    sequences = tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

    return padded, tokenizer

def load_dataset(path, num_examples=None):
    """Load the corpus and tokenize both of its sides.

    The first column returned by ``create_dataset`` is used as the target
    language and the second as the input language.

    Args:
        path: Location of the corpus file.
        num_examples: Number of leading lines to keep; ``None`` keeps all.

    Returns:
        ``(input_tensor, target_tensor, input_tokenizer, target_tokenizer)``.
    """
    target_sentences, input_sentences = create_dataset(path, num_examples)

    target_tensor, target_tokenizer = tokenize(target_sentences)
    input_tensor, input_tokenizer = tokenize(input_sentences)

    return input_tensor, target_tensor, input_tokenizer, target_tokenizer

In [4]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(PATH_TO_FILE, NUM_EXAMPLES)

# Seed the split: weights are restored from checkpoints between sessions, so
# an unseeded split would yield a different validation set on every run and
# could mix validation sentences into data an earlier session trained on.
X_train, X_val, y_train, y_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42
)

## Convert to a tf.data dataset

In [5]:
# Shuffle buffer spans the whole training set.
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 64

# Number of full batches in one pass over the training set.
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# Size of the embedding vectors and of the GRU state in the encoder/decoder.
embedding_dim = 256
units = 1024

# One more than the number of known words; presumably the extra slot is for
# id 0, which loss_function treats as padding — TODO confirm.
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

In [6]:
# Shuffled training pipeline; the final partial batch is dropped so every
# batch holds exactly BATCH_SIZE examples.
dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# 2. попробовать поэкспериментировать с архитектурой энкодера и декодера

## Build model

In [7]:
class Encoder(tf.keras.Model):
    """GRU encoder that reduces an input id sequence to its final state."""

    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        """Create the embedding and GRU layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector.
            units: Size of the GRU state.
            batch_size: Batch size used by ``initialize_hidden_state``.
        """
        super().__init__()

        self.batch_size = batch_size
        self.units = units

        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        # Only the final state is consumed, so the per-step outputs are not
        # requested (return_sequences=False).
        self.gru = layers.GRU(
            units,
            return_sequences=False,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode ``x`` starting from ``hidden`` and return the final GRU state."""
        embedded = self.embedding(x)
        _, final_state = self.gru(embedded, initial_state=hidden)
        return final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_size, units)``."""
        return tf.zeros((self.batch_size, self.units))

In [8]:
class Decoder(tf.keras.Model):
    """GRU decoder without attention that maps token ids to vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        """Create the embedding, GRU and output projection layers.

        Args:
            vocab_size: Number of rows in the embedding table and width of
                the output projection.
            embedding_dim: Width of each embedding vector.
            units: Size of the GRU state.
            batch_size: Stored on the instance; not used by ``call``.
        """
        super().__init__()
        self.batch_size = batch_size
        self.units = units

        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.gru = layers.GRU(
            units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        # NOTE: the attribute name is misspelt ("dence"), but it is part of
        # the object graph saved by tf.train.Checkpoint, so it is kept as is.
        self.dence = layers.Dense(vocab_size)

    def call(self, x, hidden):
        """Run the decoder on ``x`` starting from ``hidden``.

        Returns:
            ``(logits, state)``: the projection of the GRU outputs after
            their first two axes are flattened together, and the new GRU
            state.
        """
        embedded = self.embedding(x)
        sequence_out, new_state = self.gru(embedded, initial_state=hidden)
        # Merge the leading two axes so Dense receives a 2-D tensor.
        flattened = tf.reshape(sequence_out, (-1, sequence_out.shape[2]))
        logits = self.dence(flattened)
        return logits, new_state

In [9]:
# The encoder is sized for the input vocabulary and the decoder for the target
# vocabulary; both share the embedding width, state size and batch size.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Optimizer

In [10]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that loss_function can
# mask individual positions; from_logits=True because the decoder's final
# Dense layer has no activation.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy that ignores positions whose target id is 0.

    Args:
        real: Target token ids.
        pred: Logits produced by the decoder.

    Returns:
        The mean of the per-example losses after those with ``real == 0``
        have been zeroed. The mean runs over all positions, masked ones
        included.
    """
    per_example = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
    return tf.reduce_mean(per_example * keep)

## Checkpoint

In [11]:
# Directory holding the checkpoints and the filename prefix used when saving.
checkpoint_dir = './training_nmt_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Track the optimizer together with both models so that they are saved and
# restored as one unit.
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder
)

# Fitting the model

In [12]:
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        inp: Batch of input id sequences.
        targ: Batch of target id sequences; column 0 is skipped because the
            decoder is seeded with the ``<start>`` id instead.
        enc_hidden: Initial encoder state.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    start_id = targ_lang.word_index['<start>']
    target_len = targ.shape[1]
    summed_loss = 0

    with tf.GradientTape() as tape:
        # The encoder's final state initialises the decoder.
        dec_hidden = encoder(inp, enc_hidden)
        dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

        for step in range(1, target_len):
            expected = targ[:, step]
            predictions, dec_hidden = decoder(dec_input, dec_hidden)
            summed_loss += loss_function(expected, predictions)
            # Teacher forcing: the ground-truth token is the next input.
            dec_input = tf.expand_dims(expected, 1)

    trainable = encoder.trainable_variables + decoder.trainable_variables
    # Gradients come from the unnormalised sum, not the value that is returned.
    grads = tape.gradient(summed_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    return summed_loss / int(target_len)

In [25]:
EPOCHS = 1
training = True

# Resume from the newest checkpoint in checkpoint_dir; when there is none,
# latest_checkpoint() returns None and the models keep their fresh weights.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

if training:
    for epoch in range(EPOCHS):
        start = time.time()

        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = train_step(inp, targ, enc_hidden)
            total_loss += batch_loss

            # Periodic progress report and checkpoint.
            if batch % 200 == 0:
                print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy():.4f}')
                checkpoint.save(file_prefix=checkpoint_prefix)
                print('checkpoint saved')

        # Save once more at the end of the epoch; otherwise the updates made
        # after the last multiple-of-200 batch would never be persisted.
        checkpoint.save(file_prefix=checkpoint_prefix)
        print('checkpoint saved')

        print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch:.4f}')
        # The timer wraps the whole epoch, so report seconds per epoch.
        print(f'{round(time.time() - start)} sec./epoch\n')

Epoch 1 Batch 0 Loss 0.4004
checkpoint saved
Epoch 1 Batch 200 Loss 0.4720
checkpoint saved
Epoch 1 Batch 400 Loss 0.3943
checkpoint saved
Epoch 1 Batch 600 Loss 0.3866
checkpoint saved
Epoch 1 Batch 800 Loss 0.3549
checkpoint saved
Epoch 1 Batch 1000 Loss 0.4530
checkpoint saved
Epoch 1 Batch 1200 Loss 0.3657
checkpoint saved
Epoch 1 Loss 0.3956
373 sec./batch



# 1. сделать выводы о качестве перевода в зависимости от длины предложений

In [14]:
def evaluate(sentence):
    """Greedily translate one sentence with the trained encoder/decoder.

    Words that are missing from the input vocabulary are skipped (with a
    warning) rather than aborting the translation with a ``KeyError``.

    Args:
        sentence: Raw input sentence.

    Returns:
        ``(result, sentence)``: the predicted words, each followed by a
        space, and the preprocessed input sentence.
    """
    import warnings  # local import: only needed on the unknown-word path

    sentence = preprocess_sentence(sentence)

    # split() without an argument also drops the empty token that an empty
    # sentence produces between '<start>' and '<end>'.
    tokens = sentence.split()
    unknown = [token for token in tokens if token not in inp_lang.word_index]
    if unknown:
        warnings.warn(
            f'Skipping words missing from the input vocabulary: {unknown}',
            stacklevel=2,
        )

    inputs = [inp_lang.word_index[token] for token in tokens if token in inp_lang.word_index]
    # Pad to the input length used for the training tensors.
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs],
        maxlen=input_tensor.shape[1],
        padding='post'
    )
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # A batch of one sentence, hence a (1, units) zero state.
    enc_hidden = [tf.zeros((1, units))]
    enc_hidden = encoder(inputs, enc_hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    # Decode for at most as many steps as the training targets are long.
    for t in range(target_tensor.shape[1]):
        pred, dec_hidden = decoder(dec_input, dec_hidden)
        pred = tf.argmax(pred[0]).numpy()

        result += targ_lang.index_word[pred] + ' '

        if targ_lang.index_word[pred] == '<end>':
            return result, sentence

        # Feed the predicted id back in as the next decoder input.
        dec_input = tf.expand_dims([pred], 0)

    return result, sentence

In [15]:
def translate(sentence):
    """Translate ``sentence`` and print the preprocessed input and the prediction."""
    translation, prepared = evaluate(sentence)

    print(
        f'Input: {prepared}',
        f'Predicted translation: {translation}',
        sep='\n',
    )

In [26]:
# Shortest probe: a two-word sentence.
translate('Жирный кот.')

Input: <start> жирный кот . <end>
Predicted translation: the cat is dead . <end> 


In [27]:
# Medium-length probe containing a dash, which preprocessing strips out.
translate('Мои чувства - это причина искусства.')

Input: <start> мои чувства это причина искусства . <end>
Predicted translation: i have lots of this . <end> 


In [28]:
# Two-clause probe with a comma and no final punctuation.
translate('Чем больше понимаешь, тем меньше говоришь')

Input: <start> чем больше понимаешь , тем меньше говоришь <end>
Predicted translation: what more can fly ? <end> 


In [29]:
# Longest probe: several clauses with repeated words.
translate('минус один - это уже один, а один - это уже не ноль')

Input: <start> минус один это уже один , а один это уже не ноль <end>
Predicted translation: be quick as it as you . <end> 


#### Вывод: 
- при тестировании на недообученной модели наблюдалась цикличность одних и тех же фраз