In [1]:
import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

ROOTPATH = 'drive/My Drive/'

In [2]:
total_examples = tf.data.experimental.CsvDataset(ROOTPATH+'wmt14_60k.csv', record_defaults=['string', 'string'])

train_examples = total_examples.take(55000)
valid_examples= total_examples.skip(55000)

In [3]:
def create_tokenizers(train_examples=train_examples):
    tokenizer_src = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        (src.numpy() for src, trg in train_examples), target_vocab_size=2**13)

    tokenizer_trg = tfds.features.text.SubwordTextEncoder.build_from_corpus(
        (trg.numpy() for src, trg in train_examples), target_vocab_size=2**13)
    
    return tokenizer_src, tokenizer_trg

In [None]:
tokenizer_src, tokenizer_trg = create_tokenizers()

In [None]:
tokenizer_src.save_to_file('src_vocab'), tokenizer_trg.save_to_file('trg_vocab')

(None, None)

In [4]:
tokenizer_src = tfds.features.text.SubwordTextEncoder.load_from_file(ROOTPATH+'src_vocab') 
tokenizer_trg = tfds.features.text.SubwordTextEncoder.load_from_file(ROOTPATH+'trg_vocab')

In [5]:
BUFFERSIZE = 20000
BATCHSIZE = 64
MAXLENGTH = 40

In [6]:
def encode(lang1, lang2):
  lang1 = [tokenizer_src.vocab_size] + tokenizer_src.encode(
      lang1.numpy()) + [tokenizer_src.vocab_size+1]

  lang2 = [tokenizer_trg.vocab_size] + tokenizer_trg.encode(
      lang2.numpy()) + [tokenizer_trg.vocab_size+1]
  
  return lang1, lang2

def tf_encode(src, trg):
  result_src, result_trg = tf.py_function(encode, [src, trg], [tf.int64, tf.int64])
  result_src.set_shape([None])
  result_trg.set_shape([None])

  return result_src, result_trg

def filter_max_length(x, y, max_length=MAXLENGTH):
  return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [7]:
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
# cache the dataset to memory to get a speedup while reading from it.

train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFERSIZE).padded_batch(BATCHSIZE)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

valid_dataset = valid_examples.map(tf_encode)
valid_dataset = valid_dataset.filter(filter_max_length).padded_batch(BATCHSIZE)

In [8]:
len(list(iter(train_dataset))) * BATCHSIZE

30528

In [9]:
def GetAngles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

def PositionalEncoding(position, d_model):
  angle_rads = GetAngles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  
  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
  pos_encoding = angle_rads[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

In [10]:
def createPaddingMask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [11]:
def createLookAheadMask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

In [12]:
def ScaledDotProductAttention(Q, K, V, mask):
      '''
      Args:
      Q: query shape == (..., len_q, d_q)
      K: key shape == (..., len_k, d_k)
      V: value shape == (..., len_v, d_v)
      mask: Float tensor with shape broadcastable 
            to (..., len_q, len_k). Defaults to None.
      '''

      d_k = tf.cast(tf.shape(K)[-1], tf.float32) # (Here d_k == d_q)
      logits = tf.linalg.matmul(Q, K, transpose_b=True) / tf.math.sqrt(d_k)

      if mask is not None:
            logits += (mask * -1e9)

      weights = tf.nn.softmax(logits, axis=-1) # logits.shape (..., len_q, len_k)
      attention = tf.linalg.matmul(weights, V) 

      return attention, weights  # output.shape (..., len_q, d_v)

In [13]:
class MultiHeadAttention(tf.keras.layers.Layer):
      def __init__(self, d_model, n_heads):
            super(MultiHeadAttention, self).__init__()
            self.d_model = d_model
            self.n_heads = n_heads

            self.d_split = self.d_model // self.n_heads

            self.w_k = tf.keras.layers.Dense(d_model)
            self.w_q = tf.keras.layers.Dense(d_model)
            self.w_v = tf.keras.layers.Dense(d_model) # for K, Q and V

            self.w_o = tf.keras.layers.Dense(d_model) # for output

      def split_heads(self, M, batch_size):
            '''
            Args: 
            M: matrix shape == (batch_size, len_seq, d_model)
            Returns: matrix with shape == (batch_size, n_heads, len_seq, d_split)
            '''
            #print(f'M in the MultiHeadAttention layer, split_heads function:{M.shape}')
            M = tf.reshape(M, (batch_size, -1, self.n_heads, self.d_split)) 
            #print(f'M in the MultiHeadAttention layer, split_heads function:{M.shape}')
            return tf.transpose(M, perm=[0, 2, 1, 3])

      def call(self, Q, K, V, mask):
            '''
            the idea is to split Q, K or V such that it is shape is 
            (batch_size, n_heads, len_seq, d_split)
            '''

            batch_size = tf.shape(Q)[0]

            Q = self.w_q(Q) # Q.shape (batch_size, len_trg, d_model)
            K = self.w_k(K) # K.shape (batch_size, len_src, d_model)
            V = self.w_v(V) # V.shape (batch_size, len_src, d_model)

            Q = self.split_heads(Q, batch_size)
            K = self.split_heads(K, batch_size)
            V = self.split_heads(V, batch_size)
            
            attention, weights = ScaledDotProductAttention(Q, K, V, mask)
            ## attention.shape (batch_size, n_heads, len_seq, d_split)

            attention = tf.transpose(attention, perm=[0, 2, 1, 3]) 
            attention = tf.reshape(attention, (batch_size, -1, self.d_model))
            ## attention.shape (batch_size, len_seq, d_model)

            output = self.w_o(attention)
            ## output.shape (batch_size, len_seq, d_model)

            return output, weights

In [14]:
def PointwiseFeedForwardNet(d_model, d_feed):
      return tf.keras.models.Sequential([
            tf.keras.layers.Dense(d_feed, activation='relu'),
            tf.keras.layers.Dense(d_model)
      ])

In [15]:
class EncoderLayer(tf.keras.layers.Layer):
      def __init__(self, d_model, d_feed, n_heads, drop_rate=0.1):
            super(EncoderLayer, self).__init__()

            self.mha = MultiHeadAttention(d_model, n_heads)
            self.pff = PointwiseFeedForwardNet(d_model, d_feed)

            self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
            self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

            self.dropout1 = tf.keras.layers.Dropout(drop_rate)
            self.dropout2 = tf.keras.layers.Dropout(drop_rate)

      def call(self, src, pa_mask, train):
            out1, _ = self.mha(src, src, src, pa_mask) # (batch_size, len_src, d_model)
            out1 = self.dropout1(out1, training=train)
            add1 = self.layernorm1(src + out1) # (batch_size, len_src, d_model) 

            out2 = self.pff(add1) # (batch_size, len_src, d_model)
            out2 = self.dropout2(out2, training=train) 
            add2 = self.layernorm2(add1 + out2) # (batch_size, len_src, d_model)

            return add2

In [16]:
class DecoderLayer(tf.keras.layers.Layer):
      def __init__(self, d_model, d_feed, n_heads, drop_rate=0.1):
            super(DecoderLayer, self).__init__()

            self.mha1 = MultiHeadAttention(d_model, n_heads)
            self.mha2 = MultiHeadAttention(d_model, n_heads)

            self.pff = PointwiseFeedForwardNet(d_model, d_feed)

            self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
            self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
            self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

            self.dropout1 = tf.keras.layers.Dropout(drop_rate)
            self.dropout2 = tf.keras.layers.Dropout(drop_rate)
            self.dropout3 = tf.keras.layers.Dropout(drop_rate)

      def call(self, trg, enc, la_mask, pa_mask, train):
            out1, weights1 = self.mha1(trg, trg, trg, la_mask) # (batch_size, len_trg, d_model)
            out1 = self.dropout1(out1, training=train)
            add1 = self.layernorm1(trg + out1) # (batch_size, len_trg, d_model)

            out2, weights2 = self.mha2(add1, enc, enc, pa_mask) # (batch_size, len_trg, d_model)
            out2 = self.dropout2(out2, training=train)
            add2 = self.layernorm2(add1 + out2) # (batch_size, len_trg, d_model)

            out3 = self.pff(add2)
            out3 = self.dropout3(out3, training=train)
            add3 = self.layernorm3(add2 + out3) # (batch_size, len_trg, d_model)

            return add3, weights1, weights2

In [17]:
class EncoderNet(tf.keras.layers.Layer):
      def __init__(self, d_model, d_feed, n_heads, n_layers, 
                  vocab_size, max_positions, drop_rate=0.1):
            super(EncoderNet, self).__init__()

            self.d_model = d_model
            self.n_layers = n_layers

            self.pos_encoding = PositionalEncoding(max_positions, d_model)

            self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
            
            self.enc_layers = [EncoderLayer(d_model, d_feed, n_heads) for _ in range(n_layers)]

            self.dropout = tf.keras.layers.Dropout(drop_rate)

      def call(self, src, pa_mask, train):
            len_src = tf.shape(src)[1]

            emb = self.embedding(src)
            emb *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
            emb += self.pos_encoding[:, :len_src, :]

            out = self.dropout(emb, training=train)
            
            for i in range(self.n_layers):
                  out = self.enc_layers[i](out, pa_mask, train)

            return out # (batch_size, len_src, d_model)

In [18]:
class DecoderNet(tf.keras.layers.Layer):
      def __init__(self, d_model, d_feed, n_heads, n_layers, 
                  vocab_size, max_positions, drop_rate=0.1):
            super(DecoderNet, self).__init__()

            self.d_model = d_model
            self.n_layers = n_layers

            self.pos_encoding = PositionalEncoding(max_positions, d_model)

            self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
            
            self.dec_layers = [DecoderLayer(d_model, d_feed, n_heads) for _ in range(n_layers)]

            self.dropout = tf.keras.layers.Dropout(drop_rate)
      
      def call(self, trg, enc, la_mask, pa_mask, train):
            len_trg = tf.shape(trg)[1]
            attn = {}

            emb = self.embedding(trg)
            emb *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
            emb += self.pos_encoding[:, :len_trg, :]

            out = self.dropout(emb, training=train)

            for i in range(self.n_layers):
                  out, weights1, weights2 = self.dec_layers[i](out, enc, la_mask, pa_mask, train)

                  attn['decoder_layer{}_block1'.format(i+1)] = weights1
                  attn['decoder_layer{}_block2'.format(i+1)] = weights2

            return out, attn

In [19]:
class Transformer(tf.keras.Model):
      def __init__(self, d_model, d_feed, n_heads, n_layers, 
                  src_vsize, trg_vsize, maxp_src, maxp_trg):
            super(Transformer, self).__init__()

            self.encoder = EncoderNet(d_model, d_feed, n_heads, n_layers, src_vsize, maxp_src)
            self.decoder = DecoderNet(d_model, d_feed, n_heads, n_layers, trg_vsize, maxp_trg)

            self.dense = tf.keras.layers.Dense(trg_vsize)

      def call(self, src, trg, la_mask, enc_mask, dec_mask, train):
            enc = self.encoder(src, enc_mask, train)
            out2, attn = self.decoder(trg, enc, la_mask, dec_mask, train)

            logits = self.dense(out2)

            return logits, attn #(batch_size, len_trg, trg_vsize)

In [20]:
n_layers = 4
d_model = 128
d_feed = 512
n_heads = 8

src_vsize = tokenizer_src.vocab_size + 2
trg_vsize = tokenizer_trg.vocab_size + 2
drop_rate = 0.1

In [21]:
src_vsize, trg_vsize

(8255, 8222)

In [22]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, w_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.w_steps = w_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.w_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [23]:
l_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(l_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [24]:
cat_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [25]:
def loss_full(y_true, y_pred):
    '''
    y_true.shape == (batch_size, len_seq)
    y_pred.shape == (batch_size, len_seq, trg_vsize)

    bascially calculate the cross-entropy loss but discount the padding loss

    returns loss_total (a scalar representing actual loss without padding consideration)
    '''

    loss_obj = cat_loss(y_true, y_pred) #loss_obj.shape == (batch_size, len_seq)
    
    not_mask = tf.math.logical_not(tf.math.equal(y_true, 0))
    not_mask = tf.cast(not_mask, dtype=loss_obj.dtype)

    loss_total = loss_obj * not_mask

    return tf.reduce_sum(loss_total) / tf.reduce_sum(not_mask)

In [26]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [27]:
transformer_v1 = Transformer(d_model, d_feed, n_heads, n_layers, 
                             src_vsize, trg_vsize, src_vsize, trg_vsize)

In [28]:
def createMasks(src_seq, trg_seq):
    '''
    pase_mask: padding source encoder mask 
    pasd_mask: padding source decoder mask (used in mha2 - decoder)
    patd_mask: padding target decoder mask (used in mha1 - decoder)
    '''
    #print(src_seq, trg_seq)

    pase_mask = createPaddingMask(src_seq)
    pasd_mask = createPaddingMask(src_seq)
    patd_mask = createPaddingMask(trg_seq)
    
    la_mask = createLookAheadMask(tf.shape(trg_seq)[1])
    comb_mask = tf.maximum(patd_mask, la_mask) 

    return comb_mask, pase_mask, pasd_mask

In [29]:
temp_input = tf.random.uniform((64, 38), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

combined_mask, paenc_mask, padec_mask = createMasks(temp_input, temp_target)
fn_out, _ = transformer_v1(temp_input, temp_target, combined_mask, paenc_mask, padec_mask, True)

fn_out.shape  # (batch_size, len_trg, target_vocab_size)

TensorShape([64, 36, 8222])

In [30]:
ckpt = tf.train.Checkpoint(transformer=transformer_v1, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, "./checkpoints/train", max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

In [31]:
EPOCHS = 20

In [32]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=step_signature)
def train_step(src_seq, trg_seq):
    trg_inp = trg_seq[:, :-1]
    trg_true = trg_seq[:, 1:]

    combined_mask, paenc_mask, padec_mask = createMasks(src_seq, trg_inp)

    with tf.GradientTape() as tape:
        trg_pred, _ = transformer_v1(src_seq, trg_inp, combined_mask, paenc_mask, padec_mask, True)
        ent_loss = loss_full(trg_true, trg_pred)

        gradients = tape.gradient(ent_loss, transformer_v1.trainable_variables)    
        optimizer.apply_gradients(zip(gradients, transformer_v1.trainable_variables))

    train_loss(ent_loss)
    train_accuracy(trg_true, trg_pred)

In [33]:
transformer_v1.decoder.dec_layers

ListWrapper([<__main__.DecoderLayer object at 0x7fe2a1927ef0>, <__main__.DecoderLayer object at 0x7fe2a18caba8>, <__main__.DecoderLayer object at 0x7fe2a18ec860>, <__main__.DecoderLayer object at 0x7fe2a188e518>])

In [34]:
transformer_v1.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_net (EncoderNet)     multiple                  1849728   
_________________________________________________________________
decoder_net (DecoderNet)     multiple                  2110720   
_________________________________________________________________
dense_64 (Dense)             multiple                  1060638   
Total params: 5,021,086
Trainable params: 5,021,086
Non-trainable params: 0
_________________________________________________________________


In [35]:
src_vsize, trg_vsize

(8255, 8222)

In [36]:
for e in range(EPOCHS):
    start = time.time()
    
    train_loss.reset_states()
    train_accuracy.reset_states()

    for (b, (src_, trg_))  in enumerate(train_dataset):
        train_step(src_, trg_)

        if b % 50 == 0:
            print(f'Epoch: {e+1}, Batch: {b}, Loss: {train_loss.result():.4f}, Acc: {train_accuracy.result():.4f}')
    
    saved_path = ckpt_manager.save()

    print(f'Model saved at epoch {e+1} at {saved_path}')
    print(f'Time taken for epoch: {time.time() - start} seconds')

Epoch: 1, Batch: 0, Loss: 9.0163, Acc: 0.0000
Epoch: 1, Batch: 50, Loss: 8.9748, Acc: 0.0026


KeyboardInterrupt: ignored

In [42]:
def predict_seq(inp_seq):
  start_token = [tokenizer_src.vocab_size]
  end_token = [tokenizer_src.vocab_size + 1]
  
  # inp sentence is german, hence adding the start and end token
  inp_seq = start_token + tokenizer_src.encode(inp_seq) + end_token
  enc_inp = tf.expand_dims(inp_seq, 0)
  
  # as the target is english, the first word to the transformer should be the
  # english start token.
  dec_inp = [tokenizer_trg.vocab_size]
  dec_out = tf.expand_dims(dec_inp, 0)
    
  for i in range(MAXLENGTH):
    print(dec_out.shape, enc_inp.shape)
    combined_mask, paenc_mask, padec_mask = createMasks(enc_inp, dec_out)
  
    # dec_pred.shape == (batch_size, len_seq, trg_vsize)
    dec_pred, attn = transformer_v1(enc_inp, dec_out, 
                                 combined_mask, paenc_mask, padec_mask, False)
    
    # select the last word from the seq_len dimension
    dec_pred = dec_pred[: ,-1:, :]  # (batch_size, 1, vocab_size)

    pred_id = tf.cast(tf.argmax(dec_pred, axis=-1), tf.int32)
    
    if pred_id == tokenizer_trg.vocab_size+1:
      return tf.squeeze(dec_out, axis=0)
    
    dec_out = tf.concat([dec_out, pred_id], axis=-1)

  return tf.squeeze(dec_out, axis=0)

In [43]:
valid_sample = next(iter(train_dataset))

src_sample = tokenizer_src.decode([i for i in valid_sample[0][32] if i < tokenizer_src.vocab_size])
trg_sample = tokenizer_trg.decode([i for i in valid_sample[1][32] if i < tokenizer_trg.vocab_size])

src_sample, trg_sample

('Experience teaches us that big countries sometimes do not do things very well , and that small countries sometimes do things very well indeed .',
 'Erfahrungsgemäß sind die großen Länder bisweilen nicht ganz so erfolgreich , während die kleinen Länder mitunter vorzügliche Arbeit leisten .')

In [44]:
def translate_text(sentence, plot=''):
    result = predict_seq(sentence)

    output = tokenizer_trg.decode([i for i in result 
                                            if i < tokenizer_trg.vocab_size])  

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(output))

In [45]:
translate_text(src_sample)

(1, 1) (1, 30)
(1, 2) (1, 30)
(1, 3) (1, 30)
(1, 4) (1, 30)
(1, 5) (1, 30)
(1, 6) (1, 30)
(1, 7) (1, 30)
(1, 8) (1, 30)
(1, 9) (1, 30)
(1, 10) (1, 30)
(1, 11) (1, 30)
(1, 12) (1, 30)
(1, 13) (1, 30)
(1, 14) (1, 30)
(1, 15) (1, 30)
(1, 16) (1, 30)
(1, 17) (1, 30)
(1, 18) (1, 30)
(1, 19) (1, 30)
(1, 20) (1, 30)
(1, 21) (1, 30)
(1, 22) (1, 30)
(1, 23) (1, 30)
(1, 24) (1, 30)
(1, 25) (1, 30)
(1, 26) (1, 30)
(1, 27) (1, 30)
(1, 28) (1, 30)
(1, 29) (1, 30)
(1, 30) (1, 30)
(1, 31) (1, 30)
(1, 32) (1, 30)
(1, 33) (1, 30)
(1, 34) (1, 30)
(1, 35) (1, 30)
(1, 36) (1, 30)
(1, 37) (1, 30)
(1, 38) (1, 30)
(1, 39) (1, 30)
(1, 40) (1, 30)
Input: Experience teaches us that big countries sometimes do not do things very well , and that small countries sometimes do things very well indeed .
Predicted translation:                                         
