In [None]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive/.
drive.mount('/content/drive/')

In [None]:
# Switch the working directory to the project folder on the mounted Drive
# (a bare `cd` relies on IPython automagic resolving it to `%cd`).
cd /content/drive/MyDrive/Colab Notebooks/practice

In [None]:
# Initialise a git repository in the current directory and set the global
# commit identity (user.email / user.name).
!git init
!git config --global user.email "gusdudac@gmail.com"
!git config --global user.name "jiseung ahn"

# Show the working-tree status of the repository.
!git status

* Tensorflow document에서 transformer 예제 따라하며 내용 파악
    * Input -> Positional Embedding -> Encoder -> Decoder

    * Encoder : Multihead Self Attention -> Residual&normalization -> FFNN
    
    * Decoder : Masked Self Attention -> Residual&normalization -> Encoder/Decoder Multihead Attention(Cross Attention) -> Residual&normalization -> FFNN


In [None]:
# Install libraries
# Pins the cuDNN system package, removes the preinstalled TensorFlow stack,
# then installs protobuf 3.20.x, tensorflow_datasets, and upgraded
# tensorflow-text / tensorflow.
!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install protobuf~=3.20.3
!pip install -q tensorflow_datasets
!pip install -q -U tensorflow-text tensorflow

In [None]:
# Import libraries
import logging
import time

import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
import tensorflow as tf
import tensorflow_text

In [None]:
# Load the example dataset (TED talks, Portuguese -> English).
# with_info=True also returns the dataset metadata; as_supervised=True makes
# each example a (portuguese, english) pair.
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en',
                              with_info=True,
                              as_supervised=True)

# Keep the training and validation splits for the rest of the notebook.
train_examples, val_examples = examples['train'], examples['validation']

In [None]:
# Print three Portuguese sentences, then the three matching English sentences.
# NOTE: the loop variables stay defined after this cell; `en_examples` is
# reused by the tokenizer demo cells further down.
for pt_examples, en_examples in train_examples.batch(3).take(1):
    print('>Portuguese: ')
    for i, pt in enumerate(pt_examples.numpy()):
        # Each element is a UTF-8 encoded byte string.
        print("[%i]" % i, pt.decode('utf-8'))

    print('>English: ')
    for i, en in enumerate(en_examples.numpy()):
        print("[%i]" % i, en.decode('utf-8'))

In [None]:
# Download the tokenizer model.
# The zip archive is stored in the current directory (cache_dir='.',
# cache_subdir='') and extracted there.
model_name = 'ted_hrlr_translate_pt_en_converter'
tf.keras.utils.get_file(
    f'{model_name}.zip',
    f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip',
    cache_dir='.', cache_subdir='', extract=True
)

In [None]:
# Create the tokenizers by loading the extracted SavedModel; the notebook uses
# its `pt` and `en` members.
tokenizers = tf.saved_model.load(model_name)

In [None]:
# Inspect the tokenization result.
# Converts the English example sentences (left over from the preview loop)
# into token indices.
encoded = tokenizers.en.tokenize(en_examples)
encoded

In [None]:
# Compare the raw sentences with the result of decoding the encoded tokens
# (displayed together as a tuple).
en_examples, \
tokenizers.en.detokenize(encoded)

In [None]:
# Collect the token count of every training sentence (both languages).
length = []
for pt_examples, en_examples in train_examples.batch(1024):
    # Portuguese lengths are appended first, then English, for every batch.
    for tokenizer, sentences in ((tokenizers.pt, pt_examples),
                                 (tokenizers.en, en_examples)):
        length.append(tokenizer.tokenize(sentences).row_lengths())
    # One dot per processed batch as a lightweight progress indicator.
    print('.', end='', flush=True)

In [None]:
# Flatten the per-batch length tensors into one array and find the longest example.
all_lengths = np.concatenate(length)
max_length = all_lengths.max()

# Histogram of token counts using 100 equal-width bins over [0, 500].
bin_edges = np.linspace(0, 500, 101)
plt.hist(all_lengths, bin_edges)
# Re-apply the current y-limits so the marker line below does not rescale the axis.
plt.ylim(plt.ylim())
# Vertical line at the maximum length.
plt.plot([max_length, max_length], plt.ylim())
plt.title(f'Maximum tokens per example: {max_length}');

In [None]:
# Truncate token sequences to at most 128 tokens.
MAX_TOKENS = 128  # maximum sequence length fed to the model


def prepare_batch(pt, en):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    Args:
        pt: batch of Portuguese sentences accepted by ``tokenizers.pt.tokenize``.
        en: batch of English sentences accepted by ``tokenizers.en.tokenize``.

    Returns:
        ``((pt_tokens, en_inputs), en_labels)`` where every element is a
        0-padded dense tensor. ``en_inputs`` and ``en_labels`` are the same
        English sequence shifted by one token against each other.
    """
    # Tokenizer output is ragged; cap every row at MAX_TOKENS.
    pt_tokens = tokenizers.pt.tokenize(pt)[:, :MAX_TOKENS]

    # Keep one extra English token so that both shifted views below can be
    # MAX_TOKENS long.
    en_tokens = tokenizers.en.tokenize(en)[:, :MAX_TOKENS + 1]
    decoder_inputs = en_tokens[:, :-1].to_tensor()  # everything but the last token
    decoder_labels = en_tokens[:, 1:].to_tensor()   # everything but the first token

    return (pt_tokens.to_tensor(), decoder_inputs), decoder_labels

In [None]:
# Input-pipeline settings consumed by make_batches().
BUFFER_SIZE = 20000  # shuffle buffer size
BATCH_SIZE = 64      # sentence pairs per batch

In [None]:
def make_batches(ds):
  """Turn a dataset of raw sentence pairs into shuffled, tokenized batches.

  Args:
    ds: dataset yielding (portuguese, english) sentence pairs.

  Returns:
    The dataset after shuffle -> batch -> prepare_batch -> prefetch.
  """
  shuffled = ds.shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE)
  # Tokenization runs on whole batches, with parallelism chosen by tf.data.
  tokenized = batched.map(prepare_batch, tf.data.AUTOTUNE)
  return tokenized.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Create training and validation set batches.
# Both splits go through the same shuffle/batch/tokenize pipeline.
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)

In [None]:
# Positional encoding
def positional_encoding(length, depth):
    """Build the sinusoidal positional-encoding table.

    The first half of the feature axis holds the sine terms and the second
    half the cosine terms (concatenated, not interleaved).

    Args:
        length: number of positions (rows) in the table.
        depth: feature width of the table; expected to be even, since it is
            split into a sine half and a cosine half.

    Returns:
        float32 tensor of shape (length, depth).
    """
    half_depth = depth / 2

    position_idx = np.arange(length).reshape(-1, 1)                   # (length, 1)
    depth_fraction = np.arange(half_depth).reshape(1, -1) / half_depth  # (1, depth/2)

    # Each feature column gets its own rate: 1 / 10000**fraction.
    angles = position_idx * (1 / (10000 ** depth_fraction))           # (length, depth/2)

    table = np.concatenate((np.sin(angles), np.cos(angles)), axis=1)
    return tf.cast(table, dtype=tf.float32)

* 변수 positions - 현재 시퀀스 순서
    * array([[0],
       [1],
       [2],
       [3]])

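* 변수 depths : 임베딩 차원 인덱스를 depth/2로 나누어 0~1 범위로 정규화한 값(시퀀스 순서가 아님). 차원 인덱스가 커질수록 값이 커지고 angle_rates(= 1/10000^depths)는 작아져, 차원마다 서로 다른 주파수의 sin/cos 값으로 token의 위치(공간상 거리) 정보를 표현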
    * ex) array([[0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]])

* Embedding : 자연어처리에서 사람이 쓰는 자연어를 기계가 이해할 수 있도록 숫자형태인 vector로 바꾸는 과정 혹은 일련의 전체 과정
    * Input의 shape 변경, 차원의 확장
    * ex) (3,4) -> (3, 4, 512)

In [None]:
# Adds position information to the embedded input tokens.
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding followed by the addition of a positional encoding.

    Args:
        vocab_size: size of the token vocabulary.
        d_model: width of the embedding vectors.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        # mask_zero=True: token id 0 is treated as padding and masked.
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=d_model, mask_zero=True)
        # Encoding table precomputed for 2048 positions.
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed token ids `x`, scale by sqrt(d_model), add positional encoding."""
        seq_len = tf.shape(x)[1]  # input sequence length
        # The scale is applied to the embeddings, before the encoding is added.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = self.embedding(x) * scale
        # Slice the table to the sequence length and broadcast over the batch axis.
        return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

In [None]:
# Shared base for the attention blocks.
# Flow used by the subclasses: multi-head attention -> residual connection
# (block input + attention output) -> layer normalization.
class BaseAttention(tf.keras.layers.Layer):
    """Creates the sub-layers shared by the attention blocks in this notebook.

    All keyword arguments are forwarded unchanged to
    tf.keras.layers.MultiHeadAttention. This class defines no call(); the
    subclasses provide it.
    """
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [None]:
# Encoder-decoder (cross) attention block.
# context: the encoder output.
class CrossAttention(BaseAttention):
    """Attention with queries from `x` and keys/values from `context`."""

    def call(self, x, context):
        """Attend from `x` to `context`, then apply residual add and layer norm.

        The attention scores of the most recent call are stored on
        `self.last_attn_scores`.
        """
        attended, scores = self.mha(
            query=x,
            key=context,    # encoder output feeds the decoder as keys
            value=context,  # ... and as values
            return_attention_scores=True,
        )

        # Keep the scores so they can be inspected after the forward pass.
        self.last_attn_scores = scores

        # Residual connection followed by normalization.
        return self.layernorm(self.add([x, attended]))

In [None]:
# Encoder self-attention block.
class GlobalSelfAttention(BaseAttention):
    """Self-attention where `x` is used as query, key and value (no causal mask)."""

    def call(self, x):
        """Self-attend over `x`, then apply residual add and layer norm."""
        attended = self.mha(query=x, value=x, key=x)
        return self.layernorm(self.add([x, attended]))

In [None]:
# Decoder masked self-attention block.
# The input is the target sequence that is to be predicted.
class CausalSelfAttention(BaseAttention):
    """Self-attention over `x` with a causal mask enabled."""

    def call(self, x):
        """Causally self-attend over `x`, then apply residual add and layer norm."""
        attended = self.mha(
            query=x,
            value=x,
            key=x,
            use_causal_mask=True,  # masked self-attention
        )
        return self.layernorm(self.add([x, attended]))

In [None]:
# Feed-forward network block.
class FeedForward(tf.keras.layers.Layer):
    """Two dense layers with dropout, wrapped in a residual add and layer norm.

    Args:
        d_model: output width of the second dense layer.
        dff: width of the hidden (ReLU) dense layer.
        dropout_rate: rate of the dropout applied after the second dense layer.
    """

    def __init__(self, d_model, dff, dropout_rate = 0.1):
        super().__init__()
        hidden = tf.keras.layers.Dense(dff, activation='relu')
        projection = tf.keras.layers.Dense(d_model)
        dropout = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([hidden, projection, dropout])
        self.add = tf.keras.layers.Add()
        self.layernorm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Return layernorm(x + seq(x))."""
        transformed = self.seq(x)
        return self.layernorm(self.add([x, transformed]))

In [None]:
# Build one encoder layer:
# GlobalSelfAttention -> FeedForward
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder layer: self-attention followed by the feed-forward block.

    Args:
        d_model: model width; used as the attention `key_dim` and as the
            feed-forward output width.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward block.
        dropout_rate: dropout rate for both the attention and the
            feed-forward block.
    """

    def __init__(self, *, d_model, num_heads,
                 dff, dropout_rate=0.1):
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate
        )

        # Forward dropout_rate explicitly; otherwise FeedForward silently falls
        # back to its own default and ignores this layer's setting.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x):
        """Apply self-attention, then the feed-forward block, to `x`."""
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of `num_layers` EncoderLayers.

    Args:
        num_layers: number of stacked encoder layers.
        d_model: model width.
        num_heads: number of attention heads per layer.
        dff: hidden width of each feed-forward block.
        vocab_size: size of the input vocabulary.
        dropout_rate: dropout rate passed to the layers and applied after the
            embedding.
    """

    def __init__(self, *, num_layers, d_model, num_heads,
                 dff, vocab_size, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model)

        # Every layer in the stack is built with the same settings.
        layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
        self.enc_layers = [EncoderLayer(**layer_kwargs)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        """Encode token ids `x` by running them through the whole stack."""
        x = self.dropout(self.pos_embedding(x))

        for layer in self.enc_layers:
            x = layer(x)

        return x

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder layer: causal self-attention, cross-attention, feed-forward.

    Args:
        d_model: model width; used as the attention `key_dim` and as the
            feed-forward output width.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward block.
        dropout_rate: dropout rate for both attention blocks and the
            feed-forward block.
    """

    def __init__(self, *, d_model, num_heads,
                 dff, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()

        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate
        )

        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate
        )

        # Forward dropout_rate explicitly; otherwise FeedForward silently falls
        # back to its own default and ignores this layer's setting.
        # (The attribute name `fnn` is kept as-is for compatibility.)
        self.fnn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x, context):
        """Decode `x` while attending to the encoder output `context`."""
        x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)

        # Expose the cross-attention scores of this forward pass.
        self.last_attn_scores = self.cross_attention.last_attn_scores

        x = self.fnn(x)

        return x

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of `num_layers` DecoderLayers.

    After each call, `last_attn_scores` holds the cross-attention scores of
    the final decoder layer.

    Args:
        num_layers: number of stacked decoder layers.
        d_model: model width.
        num_heads: number of attention heads per layer.
        dff: hidden width of each feed-forward block.
        vocab_size: size of the target vocabulary.
        dropout_rate: dropout rate passed to the layers and applied after the
            embedding.
    """

    def __init__(self, *, num_layers, d_model, num_heads,
                 dff, vocab_size, dropout_rate = 0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        # Every layer in the stack is built with the same settings.
        layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
        self.dec_layers = [DecoderLayer(**layer_kwargs)
                           for _ in range(num_layers)]

        self.last_attn_scores = None

    def call(self, x, context):
        """Decode target token ids `x` given the encoder output `context`."""
        x = self.dropout(self.pos_embedding(x))

        for layer in self.dec_layers:
            x = layer(x, context)

        # Only the scores of the last layer are kept.
        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        return x

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer ending in a dense projection to the target vocabulary.

    Args:
        num_layers: number of layers in both the encoder and the decoder.
        d_model: model width.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward blocks.
        input_vocab_size: vocabulary size for the encoder embedding.
        target_vocab_size: vocabulary size for the decoder embedding and the
            output projection.
        dropout_rate: dropout rate used throughout the model.
    """

    def __init__(self, *, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()

        # Settings common to the encoder and the decoder.
        shared = dict(num_layers=num_layers, d_model=d_model,
                      num_heads=num_heads, dff=dff,
                      dropout_rate=dropout_rate)

        self.encoder = Encoder(vocab_size=input_vocab_size, **shared)
        self.decoder = Decoder(vocab_size=target_vocab_size, **shared)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs):
        """Run the model on `inputs`, a `(context, x)` pair.

        `context` goes to the encoder and `x` to the decoder; the return value
        is the logits produced by the final dense layer.
        """
        source_tokens, target_tokens = inputs

        encoded = self.encoder(source_tokens)
        decoded = self.decoder(target_tokens, encoded)
        logits = self.final_layer(decoded)

        # Drop the keras mask, so it doesn't scale the losses/metrics.
        # b/250038731
        try:
            del logits._keras_mask
        except AttributeError:
            # No mask attached to the logits; nothing to remove.
            pass

        # Only the logits are returned; attention scores are stored on the decoder.
        return logits

In [None]:
# Model hyperparameters passed to Transformer(...) below.
num_layers = 4       # encoder / decoder layers
d_model = 128        # model (embedding) width
dff = 512            # feed-forward hidden width
num_heads = 8        # attention heads
dropout_rate = 0.1

In [None]:
# Instantiate the model; vocabulary sizes are read from the loaded tokenizers
# (Portuguese for the encoder input, English for the decoder/target).
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizers.pt.get_vocab_size().numpy(),
    target_vocab_size=tokenizers.en.get_vocab_size().numpy(),
    dropout_rate=dropout_rate)

In [None]:
# Smoke-test the model on one real batch.
# `pt` / `en` must be token-id tensors from the batched pipeline. Without this
# loop they are only the raw byte strings left over from the example-preview
# cell, which the model cannot consume and which have no `.shape`.
for (pt, en), en_labels in train_batches.take(1):
    break

output = transformer((pt, en))

print(en.shape)
print(pt.shape)
print(output.shape)

In [None]:
# Cross-attention scores stored by the last decoder layer during the forward
# pass above.
attn_scores = transformer.decoder.dec_layers[-1].last_attn_scores
print(attn_scores.shape)  # (batch, heads, target_seq, input_seq)

In [None]:
# Print the layer / parameter summary of the model.
transformer.summary()

In [None]:
# Custom learning-rate schedule
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule with a linear warm-up and inverse-sqrt decay.

  lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

  Args:
    d_model: model width; the schedule is scaled by its inverse square root.
    warmup_steps: number of steps over which the rate grows linearly before
      the inverse-square-root decay takes over.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    # Keep the caller's value for get_config(); the float32 tensor below is
    # what the schedule computes with.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for `step` (scalar or tensor of steps)."""
    step = tf.cast(step, dtype=tf.float32)
    decay_term = tf.math.rsqrt(step)
    warmup_term = step * (self.warmup_steps ** -1.5)

    # The smaller term wins: warm-up first, decay afterwards.
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay_term, warmup_term)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return {
        "d_model": self._d_model_config,
        "warmup_steps": self.warmup_steps,
    }

In [None]:
# Adam optimizer driven by the custom warm-up / decay schedule.
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
# Plot the schedule over the first 40,000 training steps.
plt.plot(learning_rate(tf.range(40000, dtype=tf.float32)))
plt.ylabel('Learning Rate')
plt.xlabel('Train Step')

In [None]:
def masked_loss(label, pred):
  """Sparse categorical cross-entropy averaged over non-padding positions.

  Args:
    label: integer token ids; positions equal to 0 are treated as padding.
    pred: logits for each position.

  Returns:
    Scalar: sum of the per-position losses at non-padding positions divided
    by the number of non-padding positions.
  """
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')
  per_token_loss = cross_entropy(label, pred)

  # 1.0 where the label is a real token, 0.0 where it is padding.
  weights = tf.cast(label != 0, dtype=per_token_loss.dtype)

  return tf.reduce_sum(per_token_loss * weights) / tf.reduce_sum(weights)


def masked_accuracy(label, pred):
  """Token-level accuracy computed over non-padding positions only.

  Args:
    label: integer token ids; positions equal to 0 are treated as padding.
    pred: logits whose axis 2 is the vocabulary axis.

  Returns:
    Scalar: number of correctly predicted non-padding tokens divided by the
    number of non-padding tokens.
  """
  predicted_ids = tf.argmax(pred, axis=2)
  label = tf.cast(label, predicted_ids.dtype)

  is_real = label != 0
  # A position counts as a hit only if it is correct and not padding.
  is_hit = (label == predicted_ids) & is_real

  hits = tf.reduce_sum(tf.cast(is_hit, dtype=tf.float32))
  total = tf.reduce_sum(tf.cast(is_real, dtype=tf.float32))
  return hits / total

In [None]:
# Configure training with the padding-aware loss and accuracy defined above.
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
# Train for 20 epochs, using the validation batches as validation data.
transformer.fit(train_batches,
                epochs=20,
                validation_data=val_batches)