<a href="https://colab.research.google.com/github/amukhsimov/colab_notebooks/blob/master/ml_opus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import cv2
import logging
import os
import sys
import pathlib
import re
import multiprocessing as mp
import time
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

from sklearn.model_selection import train_test_split

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all at
# start-up. Iterating over the device list (rather than indexing [0]) keeps this
# cell runnable on CPU-only runtimes and applies the setting to every visible GPU.
for gpu_device in tf.config.list_physical_devices('GPU'):
  tf.config.experimental.set_memory_growth(gpu_device, True)

In [None]:
# Only let TensorFlow log records of level ERROR and above through (hides warnings).
tf_logger = logging.getLogger('tensorflow')
tf_logger.setLevel(logging.ERROR)

In [None]:
# Wider console rendering for pandas frames and NumPy arrays.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 14)
np.set_printoptions(linewidth=500, edgeitems=7)

---
### `LOAD AND PREPARE DATA:`
---

#### `LOAD DATA:`

In [None]:
%%time
# Load the TED ru->en translation corpus as (ru, en) sentence pairs.
examples, metadata = tfds.load(
    'ted_hrlr_translate/ru_to_en',
    with_info=True,
    as_supervised=True,
)
train_examples = examples['train']
val_examples = examples['validation']

# Single-language views of the training split.
train_ru = train_examples.map(lambda ru_text, en_text: ru_text)
train_en = train_examples.map(lambda ru_text, en_text: en_text)

#### `DATA PREVIEW:`

In [None]:
# Print the third batch of three sentence pairs: Russian lines, a blank line,
# then the matching English lines.
preview_batches = train_examples.batch(3).skip(2).take(1)
for ru_examples, en_examples in preview_batches:
  print(*(raw.decode('utf-8') for raw in ru_examples.numpy()), sep='\n')
  print()
  print(*(raw.decode('utf-8') for raw in en_examples.numpy()), sep='\n')

#### `CREATE TOKENIZER:`

In [None]:
# Keyword arguments forwarded to `text.BertTokenizer`.
bert_tokenizer_params = {'lower_case': True}

# Special tokens that must be present in every generated vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Settings for building a WordPiece vocabulary from a dataset.
bert_vocab_args = {
    'vocab_size': 15000,                             # target vocabulary size
    'reserved_tokens': reserved_tokens,              # tokens that must be included
    'bert_tokenizer_params': bert_tokenizer_params,  # arguments for `text.BertTokenizer`
    'learn_params': {},                              # arguments for the wordpiece learner
}

In [None]:
def _ensure_vocab_file(vocab_path, text_ds):
  """Build the WordPiece vocabulary file at `vocab_path` if it does not exist.

  The vocabulary is learned from `text_ds` (a dataset of sentences) using the
  settings in `bert_vocab_args` and written one token per line. Existing files
  are left untouched, so a vocabulary copied in from elsewhere is still used.
  """
  if os.path.exists(vocab_path):
    return
  vocab = bert_vocab.bert_vocab_from_dataset(
      text_ds.batch(1000).prefetch(2), **bert_vocab_args)
  with open(vocab_path, 'wt', encoding='utf-8') as fp:
    fp.write('\n'.join(vocab) + '\n')


def _read_vocab(vocab_path):
  """Return the non-empty lines of a vocabulary file as a list of tokens."""
  # Explicit UTF-8: the Russian vocabulary must not depend on the locale default.
  with open(vocab_path, 'rt', encoding='utf-8') as fp:
    return [token for token in fp.read().split('\n') if token]


# Without this step the cell only works if the vocab files were created outside
# the notebook, which breaks "Restart & Run All" on a fresh runtime.
_ensure_vocab_file('en_vocab.txt', train_en)
_ensure_vocab_file('ru_vocab.txt', train_ru)

en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)
ru_tokenizer = text.BertTokenizer('ru_vocab.txt', **bert_tokenizer_params)

en_vocab = _read_vocab('en_vocab.txt')
ru_vocab = _read_vocab('ru_vocab.txt')

#### `ENCODING EXAMPLE:`

In [None]:
# Take the first batch of three pairs and print the raw English byte strings.
first_batch = train_examples.batch(3).take(1)
for ru_examples, en_examples in first_batch:
  for raw_sentence in en_examples.numpy():
    print(raw_sentence)

In [None]:
# Tokenize the examples -> (batch, word, word-piece)
token_batch = en_tokenizer.tokenize(en_examples)
# Merge the word and word-piece axes -> (batch, tokens)
token_batch = token_batch.merge_dims(-2,-1)

In [None]:
# One list of token ids per example.
for token_ids in token_batch.to_list():
  print(token_ids)

#### `DECODER:`

In [None]:
def decode_en(batch):
  """Convert a batch of English token ids back into one string per example.

  `batch` is cast to int32, detokenized with `en_tokenizer`, and the words of
  each example are joined with single spaces along the last axis.
  """
  token_ids = tf.cast(batch, tf.int32)
  return tf.strings.reduce_join(
      en_tokenizer.detokenize(token_ids), separator=' ', axis=-1)

def decode_ru(batch):
  """Convert a batch of Russian token ids back into one string per example.

  `batch` is cast to int32, detokenized with `ru_tokenizer`, and the words of
  each example are joined with single spaces along the last axis.
  """
  token_ids = tf.cast(batch, tf.int32)
  return tf.strings.reduce_join(
      ru_tokenizer.detokenize(token_ids), separator=' ', axis=-1)

#### `DECODING EXAMPLE:`

In [None]:
# Round trip: turn the token ids from the encoding example back into sentences.
tf.strings.reduce_join(
    en_tokenizer.detokenize(token_batch), separator=' ', axis=-1)

#### `SETUP INPUT PIPELINE:`

In [None]:
def train_prepared(ru, en):
  """Turn a tokenized (ru, en) batch into a next-token training example.

  Both inputs are cast to float32. For every row of `en` a random cut position
  is drawn; the decoder input keeps the tokens before the cut and the target
  keeps one token more, so the model has to produce the token at the cut.

  Args:
    ru: source token ids, zero padded — assumed (B, T) as produced by
      `tokenize_pairs`.
    en: target token ids, zero padded — assumed (B, T) as produced by
      `tokenize_pairs`.

  Returns:
    `((inp, decoder_input), target)` where `decoder_input = tar * outp_mask`
    and `target = tar * tar_mask`; masked-out positions become 0.

  Note: `SENTENCE_MAX_LENGTH` is a module-level constant read when the function
  runs.
  """
  inp, tar = ru, en  # (B, T), zero padded
  inp = tf.cast(inp, tf.float32)
  tar = tf.cast(tar, tf.float32)

  _ = tf.shape(tar)
  tar_B, tar_T = _[0], _[1]
  T_range = tf.expand_dims(tf.range(tar_T, dtype=tf.float32), 0)  # row vector range, (1, T)

  # `zero_indices` and `rnd_indices`:
  # here we select random prefix part of the sentence for the sake of
  # predicting just the next word

  # compute the indices of first "0" entry of a batch ("0" - means [PAD] token)
  zero_indices = tf.cast(tf.argmax(tar == 0, axis=-1), tf.float32)  # (B, )
  zero_indices = tf.expand_dims(zero_indices, 1)  # transform to column vector, (B, 1)

  # draw one random integer per row from [1, SENTENCE_MAX_LENGTH)
  rnd_indices = tf.random.uniform((tar_B, 1), 1, SENTENCE_MAX_LENGTH, dtype=tf.int32) # (B, 1)
  rnd_indices = tf.cast(rnd_indices, tf.float32)
  # From here on `zero_indices` is reused for the random cut position:
  # random between 0. and `zero_indices - 1`.
  # NOTE(review): if the first-zero index of a row is 0 this is a modulo by
  # zero — TODO confirm that such rows cannot occur.
  zero_indices = rnd_indices % zero_indices
  zero_indices = tf.maximum(1., zero_indices)  # random between 1. and `zero_indices - 1`
  # outp_mask keeps positions before the cut; tar_mask keeps one position more
  outp_mask = tf.cast(T_range < zero_indices, tf.float32)  # (B, T)
  tar_mask = tf.cast(T_range < (zero_indices + 1.), tf.float32)  # (B, T)

  return (inp, tar * outp_mask), tar * tar_mask


def tokenize_pairs(ru, en):
  """Tokenize a batch of (ru, en) sentences into zero-padded id tensors.

  Each side is tokenized with its own tokenizer, the word and word-piece axes
  are merged, and the ragged result is densified with zero padding. One extra
  zero column is then appended on the right, so every row contains at least
  one 0 entry.
  """
  one_extra_column = [[0, 0], [0, 1]]

  # Ragged (batch, word, word-piece) -> dense (batch, tokens), padded with zeros.
  ru_ids = ru_tokenizer.tokenize(ru).merge_dims(-2, -1).to_tensor()
  en_ids = en_tokenizer.tokenize(en).merge_dims(-2, -1).to_tensor()

  ru_ids = tf.pad(ru_ids, one_extra_column, constant_values=0.)
  en_ids = tf.pad(en_ids, one_extra_column, constant_values=0.)
  return ru_ids, en_ids

In [None]:
BUFFER_SIZE = 500000        # shuffle buffer size
BATCH_SIZE = 48             # sentence pairs per batch
SENTENCE_MAX_LENGTH = 500   # length of the positional-encoding table / random cut range

def make_batches(ds: tf.data.Dataset):
  """Cache, shuffle, batch and tokenize a dataset of (ru, en) sentence pairs."""
  cached = ds.cache()
  shuffled = cached.shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE)
  return batched.map(tokenize_pairs)

In [None]:
# Tokenized, batched views of the training and validation splits.
train_batches, val_batches = map(make_batches, (train_examples, val_examples))

---
### `CREATE MODEL:`
---

##### `POSITIONAL ENCODING:`

In [None]:
def get_positional_encoding(sentence_max_len, d_model):
    """Build the sinusoidal positional-encoding table.

    Entry (pos, i) uses the angle pos / 10000 ** (2 * (i // 2) / d_model);
    even feature indices hold its sine and odd indices its cosine.

    Returns:
        float32 array of shape (1, sentence_max_len, d_model).
    """
    pos = np.arange(sentence_max_len)[:, np.newaxis]  # column vector
    dim = np.arange(d_model)[np.newaxis, :]           # row vector

    exponents = (dim // 2) * 2. / d_model
    phase = pos / np.power(10000., exponents)

    # sine on even feature indices, cosine on odd ones
    table = np.where(dim % 2 == 0, np.sin(phase), np.cos(phase))
    return table[np.newaxis, ...].astype(np.float32)

##### `ENCODER-DECODER:`

In [None]:
class MultiHeadAttention(keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to `d_model` features, split
    into `num_heads` heads of `d_model // num_heads` features, attended per
    head, merged back and passed through a final linear projection.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads

        # every head receives an equal slice of the model dimension
        assert self.d_model % self.num_heads == 0
        self.depth = self.d_model // self.num_heads

        # input projections for queries, keys and values
        self.wq = keras.layers.Dense(self.d_model)
        self.wk = keras.layers.Dense(self.d_model)
        self.wv = keras.layers.Dense(self.d_model)

        # output projection applied to the merged heads
        self.linear = keras.layers.Dense(self.d_model)

    def split_heads(self, x, batch_size):
        """Reshape (B, T, D_model) into (B, H, T, D_depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, q, k, v, mask):
        """Attend from `q` over `k`/`v`.

        `mask` may be None. Otherwise it is broadcast against the
        (B, H, T_q, T_k) score tensor, and every position where it equals 0
        has 1e9 subtracted from its score before the softmax.
        Returns a tensor of shape (B, T_q, D_model).
        """
        batch_size = tf.shape(q)[0]

        # project to D_model, then split into heads: (B, H, T, D_depth)
        q_heads = self.split_heads(self.wq(q), batch_size)
        k_heads = self.split_heads(self.wk(k), batch_size)
        v_heads = self.split_heads(self.wv(v), batch_size)

        # scaled dot-product scores, (B, H, T_q, T_k)
        scale = tf.sqrt(tf.cast(tf.shape(q_heads)[-1], tf.float32))
        scores = tf.matmul(q_heads, k_heads, transpose_b=True) / scale

        if mask is not None:
            blocked = tf.cast(tf.equal(mask, 0.), tf.float32)
            scores = scores - blocked * 1e9

        weights = tf.nn.softmax(scores)          # (B, H, T_q, T_k)
        context = tf.matmul(weights, v_heads)    # (B, H, T_q, D_depth)
        context = tf.transpose(context, perm=[0, 2, 1, 3])  # (B, T_q, H, D_depth)

        # merge heads back into the model dimension and project
        merged = tf.reshape(context, (batch_size, -1, self.d_model))  # (B, T_q, D_model)
        return self.linear(merged)
    
class EncoderLayer(keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-block adds a residual connection, applies layer normalisation and
    then dropout.
    """

    def __init__(self, d_model, n_heads, dropout_rate):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads

        self.mha = MultiHeadAttention(d_model, n_heads)

        # position-wise feed-forward network: D -> 4D (relu) -> D
        self.ffw = keras.Sequential([
            keras.layers.Dense(d_model * 4, activation='relu'),
            keras.layers.Dense(d_model),
        ])

        self.norm1 = keras.layers.LayerNormalization()
        self.norm2 = keras.layers.LayerNormalization()

        self.dropout1 = keras.layers.Dropout(dropout_rate)
        self.dropout2 = keras.layers.Dropout(dropout_rate)

    def call(self, x, padding_mask):
        """Apply self-attention (masked by `padding_mask`) and the feed-forward net."""
        attended = self.mha(x, x, x, padding_mask)
        hidden = self.dropout1(self.norm1(x + attended))

        transformed = self.ffw(hidden)
        return self.dropout2(self.norm2(hidden + transformed))
    
class DecoderLayer(keras.layers.Layer):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Each of the three sub-blocks adds a residual connection, applies its own
    layer normalisation and then dropout.
    """

    def __init__(self, d_model, n_heads, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads

        self.mha1 = MultiHeadAttention(d_model, n_heads)  # self-attention
        self.mha2 = MultiHeadAttention(d_model, n_heads)  # attention over the encoder output

        # position-wise feed-forward network: D -> 4D (relu) -> D
        self.ffw = keras.Sequential()
        self.ffw.add(keras.layers.Dense(d_model * 4, activation='relu'))
        self.ffw.add(keras.layers.Dense(d_model))

        self.norm1 = keras.layers.LayerNormalization()
        self.norm2 = keras.layers.LayerNormalization()
        self.norm3 = keras.layers.LayerNormalization()

        self.dropout1 = keras.layers.Dropout(dropout_rate)
        self.dropout2 = keras.layers.Dropout(dropout_rate)
        self.dropout3 = keras.layers.Dropout(dropout_rate)

    def call(self, x, enc_output, look_ahead_mask, padding_mask):
        """Run the block.

        Args:
            x: decoder-side activations.
            enc_output: encoder output used as keys/values of the second attention.
            look_ahead_mask: mask for the decoder self-attention.
            padding_mask: mask for the encoder-decoder attention.
        """
        x_mha1 = self.mha1(x, x, x, look_ahead_mask)
        x_mha1 = self.norm1(x + x_mha1)
        x_mha1 = self.dropout1(x_mha1)

        x_mha2 = self.mha2(x_mha1, enc_output, enc_output, padding_mask)
        x_mha2 = self.norm2(x_mha1 + x_mha2)
        x_mha2 = self.dropout2(x_mha2)

        x_ffw = self.ffw(x_mha2)
        # The feed-forward residual has its own normalisation. Previously
        # `norm2` was applied here a second time and `norm3` was never used,
        # so weights saved before this fix contain no `norm3` variables.
        x_ffw = self.norm3(x_mha2 + x_ffw)
        x_ffw = self.dropout3(x_ffw)

        return x_ffw
    
class Encoder(keras.layers.Layer):
    """Stack of `n_layers` `EncoderLayer` blocks applied one after another."""

    def __init__(self, d_model, n_heads, n_layers, dropout_rate):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers

        self.encoder_layers = [
            EncoderLayer(d_model, n_heads, dropout_rate)
            for _ in range(n_layers)
        ]

    def call(self, x, padding_mask):
        """Feed `x` through every encoder layer with the same `padding_mask`."""
        hidden = x
        for layer in self.encoder_layers:
            hidden = layer(hidden, padding_mask)
        return hidden

class Decoder(keras.layers.Layer):
    """Stack of `n_layers` `DecoderLayer` blocks applied one after another."""

    def __init__(self, d_model, n_heads, n_layers, dropout_rate):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers

        self.decoder_layers = [
            DecoderLayer(d_model, n_heads, dropout_rate)
            for _ in range(n_layers)
        ]

    def call(self, x, enc_output, look_ahead_mask, padding_mask):
        """Feed `x` through every decoder layer with the same encoder output and masks."""
        hidden = x
        for layer in self.decoder_layers:
            hidden = layer(hidden, enc_output, look_ahead_mask, padding_mask)
        return hidden

##### `CUSTOM TRAINING MODEL:`

##### `MODEL FUNCTION:`

In [None]:
def model_v1(num_words_l, num_words_r, sentence_max_len, dropout_rate):
    """Build the encoder-decoder Keras model.

    The model takes two float32 inputs of token ids, `[x_input, x_output]`
    (source sentence and target prefix), in which the value 0 is treated as
    padding. It outputs a softmax over `num_words_r` classes for every decoder
    position, multiplied by the decoder prefix mask.

    Args:
        num_words_l: size of the source embedding table.
        num_words_r: size of the target embedding table and of the output layer.
        sentence_max_len: number of positions in the positional-encoding table.
        dropout_rate: dropout rate passed to the encoder and decoder stacks.

    Returns:
        `keras.Model([x_input, x_output], x_final)`.
    """
    # architecture hyper-parameters are fixed inside this function
    d_model = 512
    n_heads = 8
    n_layers = 6
    
    dk = tf.cast(tf.constant(d_model), tf.float32)
    
    # Encoder
    x_input = keras.Input((None, ), dtype=tf.float32)  # shape of (batch_size, T)
    # 1. for real tokens, 0. where the input equals 0 (padding)
    padding_mask = 1. - tf.cast(tf.equal(x_input, 0.), tf.float32)  # (B, T)
    B, T = tf.shape(padding_mask)
    enc_padding_mask = tf.reshape(padding_mask, (B, 1, 1, T))
    
    pos_encoding = tf.constant(get_positional_encoding(sentence_max_len, d_model))  # position encoding, (1, sentence_max_len, D_model)
    
    # (batch_size, T_inp, d_model)
    x = layers.Embedding(num_words_l, d_model)(x_input)
    x *= tf.sqrt(dk)  # scale embeddings by sqrt(d_model) before adding positions
    x += pos_encoding[:, :T, :]
    
    enc_output = Encoder(d_model, n_heads, n_layers, dropout_rate)(x, enc_padding_mask)  # (B, T_inp, D)
    
    # Decoder
    x_output = keras.Input((None, ), dtype=tf.float32)  # given N words, predict next one, (B, T)
    B, T = tf.shape(x_output)  # B and T now refer to the decoder input
    # look_ahead_mask = tf.linalg.band_part(tf.ones((1, T_inp, T_inp)), -1, 0)
    
    # calculate look-ahead mask
    # for example, if x_output is something like [1,4,2,0,0], then look ahead mask will be [1,1,1,1,0]
    # NOTE(review): argmin returns the position of the smallest value; this is
    # presumably the first padding zero because token ids are non-negative — confirm.
    first_zeros = tf.expand_dims(tf.argmin(x_output, axis=-1), -1)  # column vector
    first_zeros = tf.cast(first_zeros, tf.float32)
    rng = tf.expand_dims(tf.range(T), 0)  # T range
    rng = tf.cast(rng, tf.float32)
    look_ahead_mask = tf.cast(rng <= first_zeros, tf.float32)  # (B, T). Include first zeros as well
    # One mask row per sequence, shared by all query positions (a prefix mask,
    # not the triangular mask of the commented-out line above).
    look_ahead_mask = tf.reshape(look_ahead_mask, (B, 1, 1, T))  # (B, 1, 1, T_outp)
    
    # decoder padding mask: the encoder padding mask is reused unchanged
    dec_padding_mask = enc_padding_mask  # (B, 1, 1, T)
    
    x = layers.Embedding(num_words_r, d_model)(x_output)  # (B, T, D)
    x *= tf.sqrt(dk)
    x += pos_encoding[:, :T, :]
    
    x_decode = Decoder(d_model, n_heads, n_layers, dropout_rate)(x, enc_output, look_ahead_mask, dec_padding_mask)  # (B, T, D)
    x_final = layers.Dense(num_words_r, activation='softmax')(x_decode)  # (B, T, num_words)
    # zero the predictions at positions outside the prefix mask
    x_final = x_final * tf.reshape(look_ahead_mask, (B, T, 1))
    
    return keras.Model([x_input, x_output], x_final)

In [None]:
def loss_fn(y_true, y_pred):
    """Masked focal loss, summed over all tokens and divided by the batch size.

    y_true: token ids, shape of (B, T); positions equal to 0 do not contribute.
    y_pred: per-class predictions; the size of its last axis is used as the
        one-hot depth.
    """
    n_classes = tf.shape(y_pred)[-1]
    batch_size = tf.cast(tf.shape(y_true)[0], tf.float32)

    # 1. for real tokens, 0. where the target equals 0
    keep = 1. - tf.cast(tf.equal(y_true, 0.), tf.float32)  # shape of (B, T)

    targets = tf.one_hot(tf.cast(y_true, tf.int32), n_classes)
    # alternative kept for reference:
    #   keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
    per_token = tfa.losses.sigmoid_focal_crossentropy(targets, y_pred)

    return tf.reduce_sum(per_token * keep) / batch_size  # mean by batch size

##### `CREATE MODEL:`

In [None]:
# Run identity and hyper-parameters.
model_name = 'transformer_v1'
remark = 'none'
dropout_rate = 0.2
start_epoch = 0

# Log and checkpoint locations are namespaced by model name and remark.
run_parts = (model_name, remark)
log_path = os.path.join('logs', *run_parts)
ckpt_path = os.path.join('models_ckpt', *run_parts)

model = model_v1(len(ru_vocab), len(en_vocab), SENTENCE_MAX_LENGTH, dropout_rate)
tf_writer = tf.summary.create_file_writer(log_path)

# Optional warm start from a checkpoint directory:
#model.load_weights(os.path.join(ckpt_path, f'{model_name}-0000.ckpt'))

---
### `TRAIN:`
---

##### `TRAIN FUNCTION, CALLBACKS:`

In [None]:
def train(n_epochs, train_ds, val_ds,
          train_steps_per_epoch=None, val_steps_per_epoch=None):
    """Fit the global `model` for `n_epochs` more epochs.

    Training resumes at the global `start_epoch`, which is incremented after
    every finished epoch so that a later call continues the epoch numbering.
    The global `callbacks` list is used, with the epoch counter appended.
    """
    global start_epoch

    class _EpochCounter(keras.callbacks.Callback):
        """Keeps the global `start_epoch` in step with finished epochs."""

        def on_epoch_end(self, epoch, logs=None):
            global start_epoch
            start_epoch += 1

        def on_train_batch_end(self, batch, logs=None):
            """Deliberately does nothing (hook kept for optional throttling)."""
            #time.sleep(0.1)

    fit_callbacks = callbacks + [_EpochCounter()]
    final_epoch = start_epoch + n_epochs

    model.fit(
        train_ds,
        validation_data=val_ds,
        initial_epoch=start_epoch,
        epochs=final_epoch,
        steps_per_epoch=train_steps_per_epoch,
        validation_steps=val_steps_per_epoch,
        callbacks=fit_callbacks,
        workers=3,
    )


def get_callbacks():
    """Create the Keras callbacks used for training.

    Two weight-checkpoint callbacks are constructed as well but are currently
    not part of the returned list; add them to `active` to enable them.
    """
    cb_lr_schedule = keras.callbacks.LearningRateScheduler(lr_schedule)
    callback_tb = keras.callbacks.TensorBoard(log_dir=log_path)

    last_path = os.path.join(ckpt_path, f'{model_name}-last.ckpt')
    best_path = os.path.join(ckpt_path, f'{model_name}-best.ckpt')
    # weights after every epoch
    cb_ckpt_last = keras.callbacks.ModelCheckpoint(
        last_path, save_best_only=False, save_weights_only=True)
    # weights of the epoch with the lowest validation loss
    cb_ckpt_best = keras.callbacks.ModelCheckpoint(
        best_path, monitor='val_loss', mode='min',
        save_best_only=True, save_weights_only=True)

    active = [cb_lr_schedule, callback_tb]  # disabled: cb_ckpt_last, cb_ckpt_best
    return active


def lr_schedule(epoch, lr):
    """Learning-rate schedule: currently returns the incoming rate unchanged.

    A piecewise-constant schedule that was used before is kept for reference:
        epochs  0-10 -> 1e-3
        epochs 11-47 -> 5e-4
        epochs 48-60 -> 1e-4
        afterwards   -> 1e-5
    """
    del epoch  # not used by the identity schedule
    return lr

#### `PREPARE FOR TRAINING:`

In [None]:
# Adam with a 1e-3 learning rate and the masked focal loss defined above.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss=loss_fn)

In [None]:
# Reset the epoch counter and (re)create the callbacks used by `train`.
start_epoch = 0
callbacks = get_callbacks()

#### `TRAINING:`

In [None]:
# Resume from previously saved weights when they exist. On a fresh runtime no
# checkpoint has been written yet, so an unconditional load would fail before
# the first training run; in that case training starts from the initial weights.
resume_prefix = f'{model_name}-0000.ckpt'
# A TensorFlow-format checkpoint is several files sharing this prefix; the
# `.index` file is used to detect it.
if os.path.exists(resume_prefix + '.index'):
  model.load_weights(resume_prefix)
else:
  print(f'No checkpoint found at {resume_prefix!r}; training from scratch.')

In [None]:
# The datasets are repeated indefinitely, so one epoch is defined as one pass
# over the batched (un-repeated) dataset.
train_stream = train_batches.repeat().map(train_prepared)
val_stream = val_batches.repeat().map(train_prepared)
steps_train = train_batches.__len__().numpy()
steps_val = val_batches.__len__().numpy()

train(n_epochs=200,
      train_ds=train_stream,
      val_ds=val_stream,
      train_steps_per_epoch=steps_train,
      val_steps_per_epoch=steps_val)

In [None]:
# Persist the current weights under the same prefix the training section loads from.
weights_prefix = f'{model_name}-0000.ckpt'
model.save_weights(weights_prefix)

In [None]:
# Copy every local file whose name starts with `transformer_v1-0000` to the
# checkpoint folder of the GCS bucket.
!gsutil cp ./transformer_v1-0000* gs://bucket-eu-colab/data/ml_opus/en_ru/checkpoint/

#### `TESTING:`

##### `PREPARE THE DATA:`

In [None]:
# Re-batch the training data into single-example batches and apply the same
# random-prefix preparation that is used for training.
train_ds = (train_batches
            .unbatch()
            .batch(1)
            .repeat()
            .map(train_prepared))

##### `EVALUATE:`

In [None]:
# Pull one prepared example and take the most probable token at every position.
sample = train_ds.take(1).get_single_element()
(inp, outp), y = sample
predicted_probs = model.predict([inp, outp])
y_hat = tf.argmax(predicted_probs, axis=-1)

In [None]:
def _first_as_text(decoded):
  """Return the first element of a decoded string batch as a Python str."""
  return decoded.numpy()[0].decode('utf8')

text_inp = _first_as_text(decode_ru(inp))
text_outp = _first_as_text(decode_en(outp))
text_y = _first_as_text(decode_en(y))
text_yhat = _first_as_text(decode_en(y_hat))

##### `PRINT RESULTS:`

In [None]:
# Source sentence, decoder input prefix, expected target and model prediction,
# separated by blank lines.
print(
    f"Input text: {text_inp}",
    f"Outp text: {text_outp}",
    f"Target text: {text_y}",
    f"Predicted text: {text_yhat}",
    sep='\n\n',
)