In [1]:
### Reference : https://www.tensorflow.org/tutorials/text/transformer?hl=zh-cn

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import time
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pprint import pprint
from IPython.display import clear_output

In [3]:
output_dir = "E:\\Coding\\Projects\\transformer\\Data file"
en_vocab_file = os.path.join(output_dir, "en_vocab")
zh_vocab_file = os.path.join(output_dir, "zh_vocab")
checkpoint_path = os.path.join(output_dir, "checkpoints")
log_dir = os.path.join(output_dir, 'logs')
download_dir = "tensorflow-datasets/downloads"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [4]:
## reference:https://www.tensorflow.org/datasets/catalog/wmt_t2t_translate
translate_builder = tfds.builder("wmt19_translate/zh-en")
pprint(translate_builder.subsets)

{NamedSplit('train'): ['newscommentary_v14',
                       'wikititles_v1',
                       'uncorpus_v1',
                       'casia2015',
                       'casict2011',
                       'casict2015',
                       'datum2015',
                       'datum2017',
                       'neu2017'],
 NamedSplit('validation'): ['newstest2018']}


In [5]:
## I use sub-dataset:newscommentary
## download the data
## extract the data to csv. file
## load every row in the file
## shuffle data
## change the data to TFR data, reference: https://www.tensorflow.org/guide/data#consuming_tfrecord_data

config = tfds.translate.wmt.WmtConfig(version=tfds.core.Version('0.0.3', experiments={tfds.core.Experiment.S3: False}),
                                      language_pair=("zh", "en"),subsets={tfds.Split.TRAIN: ["newscommentary_v14"]})

builder = tfds.builder("wmt_translate", config=config)
builder.download_and_prepare(download_dir=output_dir)



In [6]:
## use 30% of original dataset as train data and 1% as val data because original dataset is too huge

train_part = 30
val_part = 1
drop_part = 100 - train_part - val_part

split = tfds.Split.TRAIN.subsplit([train_part, val_part, drop_part])
split

(NamedSplit('train')(tfds.percent[0:30]),
 NamedSplit('train')(tfds.percent[30:31]),
 NamedSplit('train')(tfds.percent[31:100]))

In [7]:
## as_supervised: bool, if True, the returned tf.data.Dataset will have a 2-tuple structure (input, label). 
## If False, the default, the returned tf.data.Dataset will have a dictionary with all the features.

examples = builder.as_dataset(split=split, as_supervised=True)
train_examples, val_examples, _ = examples

print(train_examples)
print(val_examples)

<_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>
<_OptionsDataset shapes: ((), ()), types: (tf.string, tf.string)>


In [8]:
## sentence in the train data are presented by unicode in two languages as tf.tensor type

for en, zh in train_examples.take(3):
    print(en)
    print(zh)
    print('-' * 10)

tf.Tensor(b'Making Do With More', shape=(), dtype=string)
tf.Tensor(b'\xe5\xa4\x9a\xe5\x8a\xb3\xe5\xba\x94\xe5\xa4\x9a\xe5\xbe\x97', shape=(), dtype=string)
----------
tf.Tensor(b'If the Putins, Erdo\xc4\x9fans, and Orb\xc3\xa1ns of the world want to continue to benefit economically from the open international system, they cannot simply make up their own rules.', shape=(), dtype=string)
tf.Tensor(b'\xe5\xa6\x82\xe6\x9e\x9c\xe6\x99\xae\xe4\xba\xac\xe3\x80\x81\xe5\x9f\x83\xe5\xb0\x94\xe5\xa4\x9a\xe5\xae\x89\xe5\x92\x8c\xe6\xac\xa7\xe5\xb0\x94\xe7\x8f\xad\xe5\xb8\x8c\xe6\x9c\x9b\xe7\xbb\xa7\xe7\xbb\xad\xe4\xba\xab\xe6\x9c\x89\xe5\xbc\x80\xe6\x94\xbe\xe5\x9b\xbd\xe9\x99\x85\xe4\xbd\x93\xe7\xb3\xbb\xe6\x8f\x90\xe4\xbe\x9b\xe7\x9a\x84\xe7\xbb\x8f\xe6\xb5\x8e\xe5\x88\xa9\xe7\x9b\x8a\xef\xbc\x8c\xe5\xb0\xb1\xe4\xb8\x8d\xe8\x83\xbd\xe7\xae\x80\xe5\x8d\x95\xe5\x9c\xb0\xe5\x88\xb6\xe5\xae\x9a\xe8\x87\xaa\xe5\xb7\xb1\xe7\x9a\x84\xe8\xa7\x84\xe5\x88\x99\xe3\x80\x82', shape=(), dtype=string)
-------

In [9]:
## decode

sample_examples = []
num_samples = 10

for en_t, zh_t in train_examples.take(num_samples):
    en = en_t.numpy().decode("utf-8")
    zh = zh_t.numpy().decode("utf-8")
    
    print(type(en_t))
    print(en)
    print(zh)
    print('-' * 100)
  
   
    sample_examples.append((en, zh))

<class 'tensorflow.python.framework.ops.EagerTensor'>
Making Do With More
多劳应多得
----------------------------------------------------------------------------------------------------
<class 'tensorflow.python.framework.ops.EagerTensor'>
If the Putins, Erdoğans, and Orbáns of the world want to continue to benefit economically from the open international system, they cannot simply make up their own rules.
如果普京、埃尔多安和欧尔班希望继续享有开放国际体系提供的经济利益，就不能简单地制定自己的规则。
----------------------------------------------------------------------------------------------------
<class 'tensorflow.python.framework.ops.EagerTensor'>
This ceiling can be raised only in a deep depression or other exceptional circumstances, allowing for counter-cyclical policy so long as it is agreed that the additional deficit is cyclical, rather than structural.
只有在发生深度萧条或其他反常事件时，这一上限才能做出调整，以便让反周期政策实施足够的长度，使人们一致认为增加的赤字是周期性的，而不是结构性的。
----------------------------------------------------------------------------------------------------
<cla

In [10]:
## build corpus

try:
    subword_encoder_en = tfds.features.text.SubwordTextEncoder.load_from_file(en_vocab_file)
    print(f'loading data from corpus： {en_vocab_file}')
except:
    print('building corpus....')
    subword_encoder_en = tfds.features.text.SubwordTextEncoder.build_from_corpus((en.numpy() for en, _ in train_examples), 
                         target_vocab_size=2**13) 
  
    subword_encoder_en.save_to_file(en_vocab_file)
  


print(f'corpus size：{subword_encoder_en.vocab_size}')
print(f'first ten subwords：{subword_encoder_en.subwords[:10]}')

loading data from corpus： E:\Coding\Projects\transformer\Data file\en_vocab
corpus size：8135
first ten subwords：[', ', 'the_', 'of_', 'to_', 'and_', 's_', 'in_', 'a_', 'that_', 'is_']


In [11]:
## demo

sample_string = 'Taiwan is a nice country and I study in the Netherlands.'
indices = subword_encoder_en.encode(sample_string)
decoded_string = subword_encoder_en.decode(indices)

print(indices)
assert decoded_string == sample_string
pprint((sample_string, decoded_string))

[2700, 7911, 10, 8, 1169, 42, 161, 5, 125, 2026, 7, 2, 7428, 7925]
('Taiwan is a nice country and I study in the Netherlands.',
 'Taiwan is a nice country and I study in the Netherlands.')


In [12]:
print("{0:10}{1:6}".format("Index", "Subword"))
print("-" * 15)
for idx in indices:
    subword = subword_encoder_en.decode([idx])
    print('{0:5}{1:6}'.format(idx, ' ' * 5 + subword))

Index     Subword
---------------
 2700     Taiwan
 7911      
   10     is 
    8     a 
 1169     nic
   42     e 
  161     country 
    5     and 
  125     I 
 2026     study 
    7     in 
    2     the 
 7428     Netherlands
 7925     .


In [13]:
## max_subword_length=1:every Chinese character can be seen as a subword in corpus.

try:
    subword_encoder_zh = tfds.features.text.SubwordTextEncoder.load_from_file(zh_vocab_file)
    print(f'loading data from corpus： {zh_vocab_file}')
except:
    print('building corpus....')
    subword_encoder_zh = tfds.features.text.SubwordTextEncoder.build_from_corpus((zh.numpy() for _, zh in train_examples), 
                         target_vocab_size=2**13, max_subword_length=1) # a Chinese character is a subword in corpus
    
    subword_encoder_zh.save_to_file(zh_vocab_file)

    
print(f'corpus size：{subword_encoder_zh.vocab_size}')
print(f'first ten subwords：{subword_encoder_zh.subwords[:10]}')

loading data from corpus： E:\Coding\Projects\transformer\Data file\zh_vocab
corpus size：4201
first ten subwords：['的', '，', '。', '国', '在', '是', '一', '和', '不', '这']


In [14]:
## demo 

sample_string = sample_examples[0][1]
indices = subword_encoder_zh.encode(sample_string)
print(sample_string)
print(indices)

多劳应多得
[48, 557, 116, 48, 81]


In [15]:
## add BOS and EOS to the sequence, use subword_encoder_en.vocab_size/+1 as index of BOS/EOS

def encode(en_t, zh_t):
   
    en_indices = [subword_encoder_en.vocab_size] + subword_encoder_en.encode(en_t.numpy())\
               + [subword_encoder_en.vocab_size + 1]
    
    zh_indices = [subword_encoder_zh.vocab_size] + subword_encoder_zh.encode(zh_t.numpy())\
               + [subword_encoder_zh.vocab_size + 1]
  
    return en_indices, zh_indices

In [16]:
## tensor doesn't have the numpy attribute, can't directly use dataset.map(en_t,zh_t)
## tf.py_function allows expressing computations in a TensorFlow graph as Python functions.

def tf_encode(en_t, zh_t):
 
    return tf.py_function(encode, [en_t, zh_t], [tf.int64, tf.int64])

In [17]:
## build tf.filter

max_len = 40

def filter_max_length(en, zh, max_length=max_len):
    
  # tf.logical_and == (True,True), only return when en and zh are both True

    return tf.logical_and(tf.size(en) <= max_length,tf.size(zh) <= max_length)

In [18]:
batch_size = 128
buffer_size = 15000

train_encode = train_examples.map(tf_encode)  # output:(en_seq, zh_seq)
train_filt = train_encode.filter(filter_max_length).cache()  # output：(en_index, zh_index)
train_shuffle = train_filt.shuffle(buffer_size)  # 將例子洗牌確保隨機性
train_dataset = train_shuffle.padded_batch(batch_size, padded_shapes=([-1], [-1])).prefetch(tf.data.experimental.AUTOTUNE)

val_encode = train_examples.map(tf_encode)
val_filt = train_encode.filter(filter_max_length).cache() 
val_shuffle = train_filt.shuffle(buffer_size) 
val_dataset = train_shuffle.padded_batch(batch_size, padded_shapes=([-1], [-1])).prefetch(tf.data.experimental.AUTOTUNE)

In [19]:
## make demo examples

demo_examples = [
    ("You know nothing.", "你什么也不知道。"),
    ("Long may sunshine!", "愿阳光明媚！"),
]
pprint(demo_examples)

[('You know nothing.', '你什么也不知道。'), ('Long may sunshine!', '愿阳光明媚！')]


In [20]:
batch_size_demo = 2
demo_examples = tf.data.Dataset.from_tensor_slices((
    [en for en, _ in demo_examples], [zh for _, zh in demo_examples]
))


# add padding token

demo_dataset = demo_examples.map(tf_encode)\
  .padded_batch(batch_size_demo, padded_shapes=([-1], [-1]))

# show

inp, tar = next(iter(demo_dataset))
print('inp:', inp)
print('' * 10)
print('tar:', tar)

inp: tf.Tensor(
[[8135 2634 7911  562  581 2150 7925 8136    0]
 [8135 1210   64   98 5105 1405  687 7912 8136]], shape=(2, 9), dtype=int64)

tar: tf.Tensor(
[[4201  621  338  231   43    9  418  273    3 4202]
 [4201  524 1211  830  189 3276 1629 4202    0    0]], shape=(2, 10), dtype=int64)


In [21]:
## position encoding

# PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
# PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
# pos.shape == (sentence_len,1)
# d_model.shape == (1,d_model)
# d_model == word embedding units
# get_angles.shape:(sentence_len, d_model)

def get_angles(pos, i, d_model):
    
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model)) 
    
    return pos * angle_rates

def positional_encoding(position, d_model):
    
    # position  == max_seq_len
    
    angle_rads = get_angles(np.arange(position)[:, np.newaxis], 
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
    
    # sin.shape == (sentence_len,d_model/2)
    # cos.shape == (sentence_len,d_model/2)
    
    sin = np.sin(angle_rads[:,0::2]) 
    cos = np.cos(angle_rads[:, 1::2]) 
    
    # pos_embedding.shape ==(sentence_len,d_model)
    
    pos_encoding = tf.concat([sin,cos],axis=-1)
    
    # pos_embedding.shape ==(1, sentence_len, d_model)
    
    pos_encoding = pos_encoding[np.newaxis, ...]
    
    return tf.cast(pos_encoding, dtype=tf.float32)

In [22]:
## embedding layer demo

vocab_size_en_demo = subword_encoder_en.vocab_size + 2
vocab_size_zh_demo = subword_encoder_zh.vocab_size + 2

d_model_demo = 4
embedding_layer_en = tf.keras.layers.Embedding(vocab_size_en_demo, d_model_demo)
embedding_layer_zh = tf.keras.layers.Embedding(vocab_size_zh_demo, d_model_demo)

emb_inp = embedding_layer_en(inp)
emb_tar = embedding_layer_zh(tar)
emb_inp, emb_tar

(<tf.Tensor: shape=(2, 9, 4), dtype=float32, numpy=
 array([[[ 0.01505223, -0.01816765,  0.04863017,  0.03451839],
         [-0.00471901, -0.03347461,  0.00661217,  0.04263944],
         [-0.03741904,  0.0162987 , -0.02005308, -0.04318934],
         [-0.02287551,  0.0213968 ,  0.02386638,  0.01451081],
         [ 0.04134926,  0.04590184, -0.04658299,  0.02239406],
         [ 0.04545938,  0.04026873,  0.01543689,  0.03578669],
         [ 0.00587896,  0.04076074, -0.03942049, -0.01949289],
         [-0.03144058, -0.01643362,  0.00941877, -0.02804999],
         [ 0.03923755,  0.00184336,  0.03753285, -0.00180028]],
 
        [[ 0.01505223, -0.01816765,  0.04863017,  0.03451839],
         [-0.0191102 , -0.01843125,  0.04677094, -0.01295252],
         [-0.03195452, -0.00884617,  0.04023999,  0.01639371],
         [ 0.04439774,  0.03465059, -0.01233953,  0.03018674],
         [-0.03772911,  0.04060903, -0.00965424,  0.02662456],
         [-0.04792821,  0.04831452,  0.02385906,  0.02973593],


In [23]:
## build padding mask

def create_padding_mask(seq):

    mask = tf.cast(tf.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :] #　broadcasting to fit the attention layer

In [24]:
## scaled_dot_product_attention

def scaled_dot_product_attention(q, k, v, mask):
   
    # dot product q and k 
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
    dk = tf.cast(tf.shape(k)[-1], tf.float32)  
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)  # scale by sqrt(dk)

    # add mask
    if mask is not None:
        scaled_attention_logits += (mask * -1e9) # * -1e9 make the value after doing softmax round to 0

    # do the softmax and sum up the v vector to 1
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
  
    # do the weighted average of v with attention_weights
    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [25]:
"""
if attention weights ==(3,3) 
eg: [[1,2,3]
     [2,3,4]
     [3,4,6]]
     
--------------->

   [[1,0,0]
    [2,3,0]
    [3,4,6]]
    
former words would not see latter words

"""

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

In [26]:
## matrix demo

print(1-tf.linalg.band_part(tf.ones((3, 3)), -1, 0))

tf.Tensor(
[[0. 1. 1.]
 [0. 0. 1.]
 [0. 0. 0.]], shape=(3, 3), dtype=float32)


In [27]:
## build multiheadAttention

# 在初始的時候指定輸出維度 `d_model` & `num_heads，
# 在呼叫的時候輸入 `v`, `k`, `q` 以及 `mask`
# 輸出跟 scaled_dot_product_attention 函式一樣有兩個：
# output.shape            == (batch_size, seq_len_q, d_model)
# attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)

"""
convert Q, K and V to (....., d_model) shape ，
seperate each of them to N (....,depth) tensors and put them in scale dot product layer to get N results,
and concat them.

q->Wq->Q->split->q0,q1,q2

"""

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads # seperate 'd_model' to N heads
        self.d_model = d_model 
        
        assert d_model % self.num_heads == 0
        
        self.depth = d_model // self.num_heads # -1 dim of every tensor
        
        # linear transformation 
        
        self.Wq = tf.keras.layers.Dense(d_model)
        self.Wk = tf.keras.layers.Dense(d_model)
        self.Wv = tf.keras.layers.Dense(d_model)
        
        # linear transformation after concatenate
            
        self.dense = tf.keras.layers.Dense(d_model) 
        
        
    def split_heads(self, x, batch_size):
    
        # original x.shape == (batch_size, senquence_len, d_model)
        # d_model = num_heads * depth
        # we need x ->(batch_size, num_heads, seq_len, depth)

        
        # (batch_size, seq_len, num_heads, depth)
        reshaped_x = tf.reshape(x, shape=(batch_size, -1, self.num_heads, self.depth))
        
        # transpose the shape to (batch_size, num_heads, seq_len, depth), we need this shape for calculation of scale dot product
        
        split_output = tf.transpose(reshaped_x, perm=[0, 2, 1, 3]) 
        
        return split_output
    
    def call(self, v, k, q, mask):
        
        batch_size = tf.shape(q)[0]
        

        
        q = self.Wq(q)  # (batch_size, seq_len, d_model) 
        k = self.Wk(k)  # (batch_size, seq_len, d_model)
        v = self.Wv(v)  # (batch_size, seq_len, d_model)
        

        
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
        # scaled_attention_output.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        

        # seq_len_q  == seq_len_v
        
        scaled_attention_output, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        
        scaled_attention = tf.transpose(scaled_attention_output, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
        
        # reshape
        
        concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
        
        # now the shape is (batch_size, seq_len_q, d_model)

        
        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
        return output, attention_weights

In [28]:
## Build Feed Forward 

def point_wise_feed_forward_network(d_model, dff):
  
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

In [29]:
## build encoder layer

# there are N EncoderLayers in Encoder，there are two sub-layers: MHA & FFN in Encoder layer.

"""
x->self attention->add & normalize  & dropout
 ->feed forward->add & normalize & dropout
"""

class EncoderLayer(tf.keras.layers.Layer):
    
    # set dropout rate to 0.1
    
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        # layer norm
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
        # dropout layer
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
    
    def call(self, x, training, mask):
        
        # x.shape == (batch_Size,seq_len, embedding_units)
        # attn.shape == (batch_size, seq_len,d_model)
        # output1.shape == (batch_size, seq_len,d_model)
        
        # sub-layer 1: MHA

        
        attn_output, attn = self.mha(x, x, x, mask)  #self_attention
        attn_output = self.dropout1(attn_output, training=training) # dropout 
        out1 = self.layernorm1(x + attn_output)  # add & norm, 且embedding_units == d_model
    
        # sub-layer 2: FFN
        # ffm_output.shape == (batch_size, seq_len,d_model)
        # output2.shape == (batch_size, seq_len,d_model)
        
        ffn_output = self.ffn(out1) 
        ffn_output = self.dropout2(ffn_output, training=training)  # training
        out2 = self.layernorm2(out1 + ffn_output)
    
        return out2

In [30]:
## build decoder layer

"""

x->self attention->add & nomalize & dropout ->out1
 out1, encoding_output-> attention -> add & nomalize & dropout ->out2
 out2 ->ffn ->add & nomalize & dropout->out3
 
"""

# Decoder:N DecoderLayers，
# DecoderLayer: 3 sub-layers : MHA, MHA focusing on encoder , & FFN

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()
        
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads) # attention of encoder & decoder
        self.ffn = point_wise_feed_forward_network(d_model, dff)
 
        # LayerNorm
    
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
        # Dropout
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
    def call(self, x, enc_output, training, 
           combined_mask, inp_padding_mask):
        
        # x.shape == (batch_size,seq_len, d_model)       
        # attn_weights1 : (batch_size, num_heads, target_seq_len, target_seq_len)
        # attn_weights2 : (batch_size, num_heads, target_seq_len, input_seq_len)
        # we also need look ahead mask and padding mask 
        
        attn1, attn_weights1 = self.mha1(x, x, x, combined_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
    
        # sub-layer 2: Decoder layer attention for Encoder output
        
        attn2, attn_weights2 = self.mha2(
                                enc_output, enc_output, out1, inp_padding_mask)  # (batch_size, target_seq_len, d_model)
        
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
        
        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
        
        return out3, attn_weights1, attn_weights2

In [31]:
## build encoder

# - num_layers: N

class Encoder(tf.keras.layers.Layer):
    
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
    
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        
        # pos_encoding.shape == (1,input_vocab_size, d_model)
        
        self.pos_encoding = positional_encoding(input_vocab_size, self.d_model)
    
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask):
        
        # input x.shape == (batch_size, input_seq_len)
        # output.shape of every layer:(batch_size, input_seq_len, d_model)
        
        input_seq_len = tf.shape(x)[1]
    
        # after embedding:x.shape == (batch_size, input_seq_len, d_model)
        
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :input_seq_len, :]
        
        x = self.dropout(x, training=training)
        
        for i, enc_layer in enumerate(self.enc_layers):
            x = enc_layer(x, training, mask)
        
        # x.shape == (batch_size, input_seq_len, d_model)
          
        return x 

In [32]:
## build decoder

class Decoder(tf.keras.layers.Layer):
    

    
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, 
               rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, self.d_model)
        self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)
    
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
    
    def call(self, x, enc_output, training, 
               combined_mask, inp_padding_mask):
    
        tar_seq_len = tf.shape(x)[1]
        attention_weights = {}  # to store attention weights
        
        x = self.embedding(x)  # (batch_size, tar_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :tar_seq_len, :]
        x = self.dropout(x, training=training)
    
        for i, dec_layer in enumerate(self.dec_layers):
            x, block1, block2 = dec_layer(x, enc_output, training,
                                    combined_mask, inp_padding_mask)
        
            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2
    
        # x.shape == (batch_size, tar_seq_len, d_model)
        
        return x, attention_weights

In [33]:
## process of training

# 1.initialize model
# 2.define loss optimizer, define learning rate schedule
# 3.train step 
# 4.train process

In [34]:
# learning rate = (d_model ** -0.5) * min(step_num ** -0.5, step_num *warm_up ** -1.5)
# learning rate 先增後減

class learning_rate_schedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    
    # warmup_steps == 4000 in thesis
    def __init__(self, d_model, warmup_steps=4000):
        super(learning_rate_schedule, self).__init__()
    
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
    
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2) 
    

In [35]:
"""

set reduction to none because we don't want loss_object sum up the error in every position。
we don't want the loss of <pad> token.

"""

def loss_function(real, pred):
    
    # set 1 to the non-zero value in seq 
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                                  from_logits=True, reduction='none')
    mask = tf.math.logical_not(tf.math.equal(real, 0))    
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask  # just calculate the loss of non padding value
    
    return tf.reduce_mean(loss_)

In [36]:
## prepare all the masks for Transformer

def create_masks(inp, tar):
    
    # padding mask for encoder
    
    enc_padding_mask = create_padding_mask(inp)
  
    # padding mask for Decoder layer
    
    dec_padding_mask = create_padding_mask(inp)
  
    # combined_mask: padding mask of zh_seq and look ahead mask 
    
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
    return enc_padding_mask, combined_mask, dec_padding_mask

In [37]:
## build checkpoint directory path

run_id = 'WMT19en-zh'
checkpoint_path = os.path.join(checkpoint_path, run_id)
log_dir = os.path.join(log_dir, run_id)

In [38]:
## build transformer

"""

input of Encoder: en_seq.shape == （batch_size, inp_seq_len）
input of Decoder: zh_seq.shape == （batch_size, inp_seq_len）

output of Decoder:（batch_size, tar_seq_len, target_vocab_size）
target_vocab_size represents the probability distribution of Chinese characters

"""

class Transformer(tf.keras.Model):

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, rate)
    
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
        
        # set lr rate
        self.learning_rate = learning_rate_schedule(d_model)
        
        self.train_loss = tf.keras.metrics.Mean(name='train_loss')
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
        
        self.val_loss = tf.keras.metrics.Mean(name='val_loss')
        self.val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')
        
        # optimizer
        self.optimizer = tf.keras.optimizers.Adam(self.learning_rate, beta_1=0.9, beta_2=0.98, 
                                                  epsilon=1e-9)
        
        
    def call(self, inp, tar, training, enc_padding_mask, 
           combined_mask, dec_padding_mask):

        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    
        # dec_output.shape == (batch_size, tar_seq_len, d_model)
    
        dec_output, attention_weights = self.decoder(
                                        tar, enc_output, training, combined_mask, dec_padding_mask)
    
        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
        return final_output, attention_weights
    
    #define train_step
    
    @tf.function
    def train_step(self, inp, tar):
        
        # eg:use <start> to predict next word 
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        
        # masks
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
        
        with tf.GradientTape() as tape:
            
            predictions, _ = self.call(inp, tar_inp, True, 
                                        enc_padding_mask, 
                                        combined_mask, 
                                        dec_padding_mask)
            
            loss = loss_function(tar_real, predictions)
        
        gradients = tape.gradient(loss, self.trainable_variables)    
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        

        self.train_loss(loss) # store the loss
        self.train_accuracy(tar_real, predictions) 
    
    def val_step(self, inp, tar):
        
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
        
        with tf.GradientTape() as tape:
            
            predictions, _ = self.call(inp, tar_inp, False, 
                                        enc_padding_mask, 
                                        combined_mask, 
                                        dec_padding_mask)
            
            loss = loss_function(tar_real, predictions)
        
        gradients = tape.gradient(loss, self.trainable_variables)    
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        

        self.val_loss(loss) # store the loss
        self.val_accuracy(tar_real, predictions) 
    

    def train(self, num_epochs):
        
        ckpt = tf.train.Checkpoint(transformer=self,
                           optimizer=self.optimizer)

        ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

        if ckpt_manager.latest_checkpoint:
            ckpt.restore(ckpt_manager.latest_checkpoint)
  
            last_epoch = int(ckpt_manager.latest_checkpoint.split("-")[-1])
            print(f'loading checkpoint..., model has trained {last_epoch} epochs。')
        else:
            last_epoch = 0
            print("No checkpoint.")
        
        summary_writer = tf.summary.create_file_writer(log_dir)
        
        print(f"this Transformer has trained {last_epoch} epochs。")
        print(f"the rest of epochs：{min(0, last_epoch - num_epochs)}")
        
        for epoch in range(last_epoch, num_epochs):
            
            start_time = time.time()
            
            self.train_loss.reset_states()
            self.train_accuracy.reset_states()
            
            self.val_loss.reset_states()
            self.val_accuracy.reset_states()
            
            for (step_idx, (inp, tar)) in enumerate(train_dataset):
                
                self.train_step(inp, tar) 
            
            for (step_idx, (inp, tar)) in enumerate(val_dataset):
                
                self.val_step(inp, tar) 
                
             # save the model for every epoch 
            if (epoch + 1) % 1 == 0:
                ckpt_save_path = ckpt_manager.save()
                print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                                     ckpt_save_path))
    
             # write the loss and accuracy to TensorBoard 
    
            with summary_writer.as_default():
                tf.summary.scalar("train_loss", self.train_loss.result(), step=epoch + 1)
                tf.summary.scalar("train_acc", self.train_accuracy.result(), step=epoch + 1)
                tf.summary.scalar("val_loss", self.val_loss.result(), step=epoch + 1)
                tf.summary.scalar("val_acc", self.val_accuracy.result(), step=epoch + 1)  
           
            print('Epoch {}, Running time: {:.2f}, Train Loss {:.4f},Train Accuracy {:.4f}'.format(epoch + 1, 
                                                                                time.time() - start_time, 
                                                                                self.train_loss.result(), 
                                                                                self.train_accuracy.result()))
            
            print('Epoch {}, Running time: {:.2f}, Val Loss {:.4f},Val Accuracy {:.4f}'.format(epoch + 1, 
                                                                                time.time() - start_time,
                                                                                self.val_loss.result(),
                                                                                self.val_accuracy.result()))
            print('-'*100)
            print('-'*100)

In [39]:
## create object of model

d_model = 128
num_layers = 4 
dff = 512
num_heads = 8

input_vocab_size = subword_encoder_en.vocab_size + 2
target_vocab_size = subword_encoder_zh.vocab_size + 2
dropout_rate = 0.1  

print("input_vocab_size:", input_vocab_size)
print("target_vocab_size:", target_vocab_size)

transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

input_vocab_size: 8137
target_vocab_size: 4203


In [40]:
transformer.train(40)

No checkpoint.
this Transformer has trained 0 epochs。
the rest of epochs：-40
Instructions for updating:
Use tf.identity instead.


Instructions for updating:
Use tf.identity instead.


Saving checkpoint for epoch 1 at E:\Coding\Projects\transformer\Data file\checkpoints\WMT19en-zh\ckpt-1
Epoch 1, Running time: 201.74, Train Loss 4.9183,Train Accuracy 0.0321
Epoch 1, Running time: 201.74, Val Loss 3.7334,Val Accuracy 0.1062
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Saving checkpoint for epoch 2 at E:\Coding\Projects\transformer\Data file\checkpoints\WMT19en-zh\ckpt-2
Epoch 2, Running time: 146.69, Train Loss 3.1741,Train Accuracy 0.1615
Epoch 2, Running time: 146.69, Val Loss 2.7112,Val Accuracy 0.2086
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Saving checkpoint for epoch 3 at E:\Coding\Projects\transformer\Data file\checkpoints\WMT19en-zh\ckpt-3
Epoch 3,

Saving checkpoint for epoch 20 at E:\Coding\Projects\transformer\Data file\checkpoints\WMT19en-zh\ckpt-20
Epoch 20, Running time: 123.08, Train Loss 1.0325,Train Accuracy 0.4254
Epoch 20, Running time: 123.08, Val Loss 0.4986,Val Accuracy 0.5407
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Saving checkpoint for epoch 21 at E:\Coding\Projects\transformer\Data file\checkpoints\WMT19en-zh\ckpt-21
Epoch 21, Running time: 123.83, Train Loss 1.0218,Train Accuracy 0.4270
Epoch 21, Running time: 123.84, Val Loss 0.4712,Val Accuracy 0.5468
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Saving checkpoint for epoch 22 at E:\Coding\Projects\transformer\Data file\checkpoints\WMT19en-zh\ckpt-2

Saving checkpoint for epoch 39 at E:\Coding\Projects\transformer\Data file\checkpoints\WMT19en-zh\ckpt-39
Epoch 39, Running time: 121.44, Train Loss 0.9275,Train Accuracy 0.4398
Epoch 39, Running time: 121.44, Val Loss 0.1937,Val Accuracy 0.6111
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Saving checkpoint for epoch 40 at E:\Coding\Projects\transformer\Data file\checkpoints\WMT19en-zh\ckpt-40
Epoch 40, Running time: 120.98, Train Loss 0.9246,Train Accuracy 0.4398
Epoch 40, Running time: 120.98, Val Loss 0.1854,Val Accuracy 0.6134
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------


In [43]:
## tensorboard visualization

%load_ext tensorboard
%tensorboard --logdir {logdir}

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 10060), started 4:47:10 ago. (Use '!kill 10060' to kill it.)

In [44]:
## evaluate the sentence

"""
eval : A B C D ->E
       A B C D E ->F
       
we don't need teacher forcing when doing evaluation,
we add the latest prediction to the Chinese sequence Transformer 
     
"""


def evaluate(inp_sentence):
  
    # add <start>, <end>
    start_token = [subword_encoder_en.vocab_size]
    end_token = [subword_encoder_en.vocab_size + 1]
  
    inp_sentence = start_token + subword_encoder_en.encode(inp_sentence) + end_token
    
    # encoder_input.shape == (1,inp_seq_len)
    
    encoder_input = tf.expand_dims(inp_sentence, 0) # add batch
    
    # decoder_input.shape == (1,1)
    decoder_input = [subword_encoder_zh.vocab_size]
    
    output = tf.expand_dims(decoder_input, 0)  
  
    # auto-regressive, add the precidtion to seq and input to Transformer repeatly
    
    for i in range(max_len):
        
        # for every newly generated word create mask
        
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                            encoder_input, output)
  
        # predictions.shape == (batch_size, seq_len, vocab_size)
    
        predictions, attention_weights = transformer(encoder_input, 
                                                    output,
                                                    False,
                                                    enc_padding_mask,
                                                    combined_mask,
                                                    dec_padding_mask)
        
        predictions = predictions[: , -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
    
        # when get <end> token, then stop loop 
        
        if tf.equal(predicted_id, subword_encoder_zh.vocab_size + 1):
            
            return tf.squeeze(output, axis=0), attention_weights 
     
        # decoder can see the new predicted_id
        output = tf.concat([output, predicted_id], axis=-1)

        # get rid of batch dimension
        
    return tf.squeeze(output, axis=0), attention_weights

In [48]:
## translate

sentence = "China, India and others have continuing economic growth."

predicted_seq, _ = evaluate(sentence)

def translate(predicted_seq):
    
    target_vocab_size = subword_encoder_zh.vocab_size
    predicted_seq_without_bos_eos = [idx for idx in predicted_seq if idx < target_vocab_size]
    predicted_sentence = subword_encoder_zh.decode(predicted_seq_without_bos_eos)
    
    return predicted_sentence


predicted_seq, _ = evaluate(sentence)
predicted_sentence = translate(predicted_seq)
print("sentence:", sentence)
print("-" * 20)
print("predicted_seq:", predicted_seq)
print("-" * 20)
print("predicted_sentence:", predicted_sentence)

sentence: China, India and others have continuing economic growth.
--------------------
predicted_seq: tf.Tensor(
[4201   16    4    2  386  101    8   34   32    4   33    1   22   52
  107   84  377    5  521  292    3], shape=(21,), dtype=int32)
--------------------
predicted_sentence: 中国，印度和其他国家的经济增长仍在继续。
