In [1]:
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, Input, Dropout, LayerNormalization

In [2]:
def get_angles(pos, k, d):
    i = k // 2
    angles = pos / 10000 ** (2 * i / d)

    return angles

In [3]:
def positional_encoding(positions, d):
    angle_rads = get_angles(np.arange(positions)[:, np.newaxis],
                            np.arange(d)[np.newaxis, :],
                            d)

    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [4]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(look_ahead_mask, dec_target_padding_mask)

    dec_padding_mask = create_padding_mask(inp)

    return enc_padding_mask, combined_mask, dec_padding_mask


In [5]:
def scaled_dot_product_attention(q, k, v, mask):
    """
    Calculate the attention weights.
      q, k, v must have matching leading dimensions.
      k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
      The mask has different shapes depending on its type(padding or look ahead)
      but it must be broadcastable for addition.

    Arguments:
        q -- query shape == (..., seq_len_q, depth)
        k -- key shape == (..., seq_len_k, depth)
        v -- value shape == (..., seq_len_v, depth_v)
        mask: Float tensor with shape broadcastable
              to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
        output -- attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)

    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (1 - mask) * (-1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    output = tf.matmul(attention_weights, v)

    return output, attention_weights

In [6]:
def FullyConnected(embedding_dim, fully_connected_dim):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(fully_connected_dim, activation='relu'),
        tf.keras.layers.Dense(embedding_dim)
    ])

In [7]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    The encoder layer is composed by a multi-head self-attention mechanism,
    followed by a simple, positionwise fully connected feed-forward network.
    This architecture includes a residual connection around each of the two
    sub-layers, followed by layer normalization.
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)

    def call(self, x, training, mask):
        """
        Forward pass for the Encoder Layer

        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Boolean mask to ensure that the padding is not
                    treated as part of the input
        Returns:
            encoder_layer_out -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """

        self_mha_output = self.mha(query=x, value=x, key=x, attention_mask=mask, training=training)

        skip_x_attention = self.layernorm1(x + self_mha_output)

        ffn_output = self.ffn(skip_x_attention)

        ffn_output = self.dropout_ffn(ffn_output, training=training)

        encoder_layer_out = self.layernorm2(skip_x_attention + ffn_output)

        return encoder_layer_out


In [8]:
class Encoder(tf.keras.layers.Layer):
    """
    The entire Encoder starts by passing the input to an embedding layer
    and using positional encoding to then pass the output through a stack of
    encoder Layers

    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Encoder, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, self.embedding_dim)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.embedding_dim)


        self.enc_layers = [EncoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps)
                           for _ in range(self.num_layers)]

        self.dropout = Dropout(dropout_rate)

    def call(self, x, training, mask):
        """
        Forward pass for the Encoder

        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Boolean mask to ensure that the padding is not
                    treated as part of the input
        Returns:
            x -- Tensor of shape (batch_size, input_seq_len, embedding_dim)
        """
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)

        x *= tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))

        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)

        return x

In [9]:
class DecoderLayer(tf.keras.layers.Layer):
    """
    The decoder layer is composed by two multi-head attention blocks,
    one that takes the new input and uses self-attention, and the other
    one that combines it with the output of the encoder, followed by a
    fully connected block.
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.mha2 = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm3 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """
        Forward pass for the Decoder Layer

        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            enc_output --  Tensor of shape(batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Boolean mask for the target_input
            padding_mask -- Boolean mask for the second multihead attention layer
        Returns:
            out3 -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attn_weights_block1 -- Tensor of shape(batch_size, num_heads, target_seq_len, input_seq_len)
            attn_weights_block2 -- Tensor of shape(batch_size, num_heads, target_seq_len, input_seq_len)
        """

        mult_attn_out1, attn_weights_block1 = self.mha1(query=x, value=x, key=x, attention_mask=look_ahead_mask, return_attention_scores=True)
        Q1 = self.layernorm1(x + mult_attn_out1)

        mult_attn_out2, attn_weights_block2 = self.mha2(query=Q1, value=enc_output, key=enc_output, attention_mask=padding_mask, return_attention_scores=True)

        ffn_output = self.ffn(mult_attn_out2)

        ffn_output = self.dropout_ffn(ffn_output)

        out3 = self.layernorm3(mult_attn_out2 + ffn_output)

        return out3, attn_weights_block1, attn_weights_block2


In [10]:
class Decoder(tf.keras.layers.Layer):
    """
    The entire Encoder starts by passing the target input to an embedding layer
    and using positional encoding to then pass the output through a stack of
    decoder Layers

    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, target_vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Decoder, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(target_vocab_size, self.embedding_dim)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.embedding_dim)

        self.dec_layers = [DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps)
                           for _ in range(self.num_layers)]
        self.dropout = Dropout(dropout_rate)

    def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
        """
        Forward  pass for the Decoder

        Arguments:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            enc_output --  Tensor of shape(batch_size, input_seq_len, embedding_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Boolean mask for the target_input
            padding_mask -- Boolean mask for the second multihead attention layer
        Returns:
            x -- Tensor of shape (batch_size, target_seq_len, embedding_dim)
            attention_weights - Dictionary of tensors containing all the attention weights
                                each of shape Tensor of shape (batch_size, num_heads, target_seq_len, input_seq_len)
        """

        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)

        x *= tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):

            x, block1, block2 = self.dec_layers[i](x, enc_output, training=training,
                                                 look_ahead_mask=look_ahead_mask, padding_mask=padding_mask)

            attention_weights['decoder_layer{}_block1_self_att'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2_decenc_att'.format(i+1)] = block2

        return x, attention_weights

In [11]:
class Transformer(tf.keras.Model):
    """
    Complete transformer with an Encoder and a Decoder
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size,
               target_vocab_size, max_positional_encoding_input,
               max_positional_encoding_target, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers=num_layers,
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               input_vocab_size=input_vocab_size,
                               maximum_position_encoding=max_positional_encoding_input,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.decoder = Decoder(num_layers=num_layers,
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               target_vocab_size=target_vocab_size,
                               maximum_position_encoding=max_positional_encoding_target,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.final_layer = Dense(target_vocab_size, activation='softmax')

    def call(self, inputs, training=False):
      input_sentence, output_sentence = inputs

      enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(input_sentence, output_sentence)

      enc_output = self.encoder(input_sentence, training=training, mask=enc_padding_mask)
      dec_output, attention_weights = self.decoder(
          output_sentence, enc_output, training=training,
          look_ahead_mask=look_ahead_mask, padding_mask=dec_padding_mask
      )

      final_output = self.final_layer(dec_output)

      return final_output

In [12]:
from datasets import load_dataset
dataset = load_dataset("bongsoo/news_talk_en_ko")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/89.0 [00:00<?, ?B/s]

news_talk_en_ko_train_130000.tsv:   0%|          | 0.00/345M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1299999 [00:00<?, ? examples/s]

In [13]:
import pandas as pd

df = pd.DataFrame(dataset['train'])

df = df.rename(columns={"Skinner's reward is mostly eye-watering.": "English", "스키너가 말한 보상은 대부분 눈으로 볼 수 있는 현물이다.": "Korean"})

In [14]:
all_chars = ''.join(df['English'])

result1 = ''.join(sorted(set(all_chars)))

In [15]:
all_chars = ''.join(df['Korean'])

result2 = ''.join(sorted(set(all_chars)))

In [16]:
print(result1)
print(result2)

 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz|~ £¥§¨¬­®°±²³´µ·¹º¼¾ÁÂÅÆÇÉÎÑÓÕÖ×ØÜÞàáâãäåæçèéêëìíîïðñòóôöøùúûüýāăąĆćČčĐēěğīĭİıŁłŌōŏőřŚŞşŠšũūŬůžơưǎǐǒǔǫǽțɾʻˈ˚̊ΔΛΟΠΩαβδθμСᝧḪṇạảầậặẻễịọỏốồổỗộớờợủứừỳỹ​‎‐‑–—―‘’“”•…‧‬‭′⁄₁₂₃₩€⃗℃℉ℓ№™⅓⅔ⅠⅡⅢⅩ∕∼≡ⓧ△○★☆♡♥➋➌➍➎➏「」『』いうかしろ・ㆍ㎃㎉㎍㎎㎏㎒㎓㎖㎘㎚㎛㎜㎝㎞㎠㎡㎢㎥㎸㎾㎿㏊一三上中久之九乳亂亡交享京人付令份伉低作來俱倉偉傳儷八公出列功勝北博古台史司吉吾命和唐啞國園團垠城報場士夏外夢大天夷妓妾娥婢子孝孫學安寒小山岩島川州平年度廣建德必恩情愛慷憑成所文新於旗日旭昇显時智曲書曾最有杆李東梁梅歲死殉殞殯殲毅母民水江汽沖治洞派流浪海港渾溪滅滴漠灣炫無然父物玄王理瓚生産畵癒着睛石砂社神祥票秘稷立納結經緣縣繩美義習老耳聖聞聾股臺與舞芳茶菅蘇號衆行街襄視觴記說謝象車轉车辰近逢進逸遊運采野金釣長长門附限陸隱雅集靑面頻風食香魚鷹黒點齋龍龙蘭ﬁﬂ️﻿＆－｢｣ﾎ￦
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz~ £¥§®°±²³·º¾ÇÎÓÕ×Øäçéøüİıŏşšž˚˜ΒΩδμᅟᅥᆪỏố​‐–―‘’“”•․…‧‪‬′‵₁₂€⃗℃℉ℓ™⅓⅔ⅠⅡⅢⅣⅥⅩ→∙∼≡≪≫ⓧ□△○★☆♡♥➡　〈〉《》「」『』〜〮いうかきしのはろ・ㄷㄹㅅㅆㅇㅈㅏㅔㅠㅡㆍ㈔㈜㍱㎃㎈㎉㎍㎎㎏㎐㎑㎒㎓㎔㎖㎗㎘㎚㎛㎜㎝㎞㎟㎠㎡㎢㎥㎧㎨㎩㎫㎳㎷㎸㎽㎾㎿㏃㏄㏈㏊㏏一丁七丈三上下不世丘中丸丹主丽乃久之乙九书乳亂事二云五井亘亞亡亢交亥亦享京亭亮人仁今从他付仙代令以仲任企伉伊伏休伸似位低住佐佑佛作佳使來例供侵俉俊保信修俯俱俵倉倍們倒候倚借倡倦倪倫假偉停健傅傘備傳傷僊像僑僕僞僧價儒儷允元兄充兆先光克免兎兒入內全兩八公六共兵具冀円冬冶凍凡凤凰凱出刀分刑列创初別利制刻剋前剛割創劃劇劉劍力功加助勅勇勒動務勝勞勢勸勿化北匠匡區十千升午卉半協南博

In [17]:
import re

def clean_english(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9.,?!'\s]", '', text)   # Keep English, digits, punctuation
    text = re.sub(r"\s+", " ", text).strip()       # Normalize spaces
    return text

def clean_korean(text):
    text = str(text)
    text = re.sub(r"[^가-힣0-9.,?!'\s]", '', text)  # Keep Korean, digits, punctuation
    text = re.sub(r"\s+", " ", text).strip()       # Normalize spaces
    return text

# Apply cleaning
df['English'] = df['English'].apply(clean_english)
df['Korean'] = df['Korean'].apply(clean_korean)

In [18]:
# Concatenate all rows into single string per language
english_text = ''.join(df['English'])
korean_text = ''.join(df['Korean'])

# Get sorted unique characters
eng_vocab = ''.join(sorted(set(english_text)))
kor_vocab = ''.join(sorted(set(korean_text)))

print("English vocab:", eng_vocab)
print("Korean vocab:", kor_vocab)

English vocab:  !',.0123456789?abcdefghijklmnopqrstuvwxyz
Korean vocab:  !',.0123456789?가각간갇갈갉감갑값갓갔강갖갗같갚갛개객갠갤갬갭갯갰갱갸갹걀걍걔걘거걱건걷걸검겁것겄겅겆겉겊겋게겐겔겜겝겟겠겡겨격겪견결겸겹겼경곁계곗곘고곡곤곧골곪곯곰곱곳공곶과곽관괂괄괌괏광괘괜괭괴괵괸굉교굣굥구국군굳굴굵굶굼굽굿궁궂궃궈궉권궐궜궤궬궷귀귄귈귐귑귓규균귤그극근귿글긁금급긋긍기긱긴긷길김깁깃깄깅깊까깍깎깐깔깜깝깟깠깡깥깨깬깰깸깹깻깼깽꺄꺅꺌꺠꺼꺽꺾껀껄껌껍껏껐껑께껫껴꼈꼐꼬꼭꼰꼴꼼꼽꼿꽁꽂꽃꽈꽉꽝꽤꽥꽹꾀꾄꾈꾳꾸꾹꾼꿀꿇꿈꿉꿋꿍꿎꿔꿨꿩꿰꿴꿸뀄뀌뀐뀔뀜뀝끄끈끊끌끓끔끕끗끙끝끼끽낀낄낌낍낑나낙낚난낟날낡남납낫났낭낮낯낱낳내낵낸낼냄냅냇냈냉냐냑냔냘냠냥너넉넋넌널넒넓넘넙넛넜넝넣네넥넨넬넴넵넷넸넹녀녁년녈념녔녕녘녜녠노녹논놀놈놉놋농높놓놔놨뇄뇌뇐뇔뇜뇝뇨뇰뇽누눅눈눌눔눕눗눙눠눴뉘뉜뉠뉩뉴뉼늄늉느늑는늗늘늙늠늡능늦늪늬니닉닌닏닐님닙닛닝닢다닥닦단닫달닭닮닳담답닷당닺닻닼닿대댁댄댈댐댑댓댔댕댜더덕덖던덜덞덟덤덥덧덩덫덮데덱덴델뎀뎁뎃뎄뎅뎌뎐뎠뎬도독돈돋돌돍돎돔돕돗동돛돝돠돤돨돼됀됍됐되된될됨됩됫됬됭됴두둑둔둘둠둡둣둥둬뒀뒤뒨뒴뒷뒹듀듄듈듐듕드득든듣들듦듬듭듯등듸디딕딘딛딜딤딥딧딨딩딪딫따딱딴딸땀땁땃땄땅땋때땐땔땜땠땡떄떠떡떤떨떫떱떳떴떵떻떼뗀뗄뗍뗏뗐뗑또똑똘똠똥똬뙈뙜뙤뚜뚝뚠뚤뚫뚱뛰뛴뛸뜀뜁뜨뜩뜬뜯뜰뜸뜹뜻띄띈띌띔띕띠띡띤띨띱띵라락란랄람랍랏랐랑랖랗래랙랜랠램랩랫랬랭랴략랸량러럭런럴럼럽럿렀렁렇레렉렌렐렘렙렛렜렝려력련렬렴렵렷렸령례롄로록론롣롤롬롭롯롱롹뢰뢴룀룅료룝룟룡루룩룬룰룸룹룻룽뤄뤘뤠뤼뤽륄륌륍륑류륙륜률륨륩륫륭르륵른를름릅릇릉릍릎릐리릭린릴림립릿링마막만많맏말맑맘맙맛망맞맟맡맣매맥맨맬맴맵맷맸맹맺먀먄먕머먹먼멀멈멉멋멍멎멓메멕멘멜멤멥멧멨멩며멱면멸몃몄명몆몇몐모목몫몬몰몸몹못몽뫼묀묄묏묘묠묫무묵묶문묻물묽묾뭄뭅뭇뭉뭍뭐뭔뭘뭡뭣뮈뮌뮐뮤뮨뮬뮴므믄믈믐믓믜미믹민믿밀밈밉밋밌밍밎및밑바박밖반받발밝밟밤밥밧방밭배백밴밸뱀뱁뱃뱄뱅뱉뱌뱡버벅번벋벌범법벗벙벚벛베벡벤벧벨벰벳벴벵벼벽변별볍볏볐병볕볜보복볶본볼봄봅봇봉봐봣봤봬뵀뵈뵌뵐뵙뵤부북분붇불붉붐붑붓붕붙뷔뷧뷰뷴뷸븀브븐블븕비빅빈빌빍빔빕빗빙빚빛빠빡빤빨빰빱

In [19]:
# English Tokenizer

vocabulary_size = 10000
max_len = 100

tokenizer_English = tf.keras.layers.TextVectorization(max_tokens=vocabulary_size, output_sequence_length=max_len)
tokenizer_English.adapt(df["English"])

In [20]:
# Korean Tokenizer

vocabulary_size = 10000
max_len = 100

tokenizer_Korean = tf.keras.layers.TextVectorization(max_tokens=vocabulary_size, output_sequence_length=max_len)
tokenizer_Korean.adapt([f"SOS {s} EOS" for s in df['Korean']])

In [21]:
X_train = tokenizer_English(df['English'][:1_000_000])
X_valid = tokenizer_English(df['English'][1_000_000:])
X_train_dec = tokenizer_Korean([f"SOS {s}" for s in df['Korean'][:1_000_000]])
X_valid_dec = tokenizer_Korean([f"SOS {s}" for s in df['Korean'][1_000_000:]])

Y_train = tokenizer_Korean([f"{s} EOS" for s in df['Korean'][:1_000_000]])
Y_valid = tokenizer_Korean([f"{s} EOS" for s in df['Korean'][1_000_000:]])

In [22]:
def map_fn(inputs, target):
    # inputs = (X_train, X_train_dec) -- X_train_dec is full target sequence with SOS and EOS tokens
    encoder_input, full_target = inputs
    # Split decoder input and target for teacher forcing
    decoder_input = full_target[:, :-1]
    decoder_target = full_target[:, 1:]
    return (encoder_input, decoder_input), decoder_target


BATCH_SIZE = 64

train_dataset = tf.data.Dataset.from_tensor_slices(((X_train, X_train_dec), Y_train))
train_dataset = train_dataset.shuffle(1000).batch(BATCH_SIZE).map(map_fn).prefetch(tf.data.AUTOTUNE)

valid_dataset = tf.data.Dataset.from_tensor_slices(((X_valid, X_valid_dec), Y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE).map(map_fn).prefetch(tf.data.AUTOTUNE)

In [27]:
# Vocabulary sizes
input_vocab_size = len(tokenizer_English.get_vocabulary())
target_vocab_size = len(tokenizer_Korean.get_vocabulary())

# Positional encoding max lengths (same as tokenizer max length)
max_positional_encoding_input = 100
max_positional_encoding_target = 100

transformer = Transformer(
    num_layers=4,
    embedding_dim=256,
    num_heads=8,
    fully_connected_dim=1024,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    max_positional_encoding_input=max_positional_encoding_input,
    max_positional_encoding_target=max_positional_encoding_target,
    dropout_rate=0.1,
    layernorm_eps=1e-6
)

In [28]:
transformer.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [29]:
history = transformer.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=1, # because it takes to much time to train the model for n epoch I trained on 1 epoch
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
        tf.keras.callbacks.ModelCheckpoint("best_model.h5", save_best_only=True)
    ]
)



[1m  932/15625[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2:14:20[0m 549ms/step - accuracy: 0.8836 - loss: 1.2736

KeyboardInterrupt: 

In [36]:
korean_sentences_with_tokens = [f"SOS {s} EOS" for s in df['Korean']]

temp_tokenizer = tf.keras.layers.TextVectorization(
    max_tokens=vocabulary_size - 4,
    output_sequence_length=max_len
)
temp_tokenizer.adapt(korean_sentences_with_tokens)

raw_vocab = temp_tokenizer.get_vocabulary()
special_tokens = ["", "[UNK]", "SOS", "EOS"]
final_vocab = special_tokens + [token for token in raw_vocab if token not in special_tokens]

tokenizer_Korean = tf.keras.layers.TextVectorization(
    max_tokens=vocabulary_size,
    output_sequence_length=max_len,
    vocabulary=final_vocab
)

In [37]:
vocab = tokenizer_Korean.get_vocabulary()
sos_id = vocab.index("SOS")
eos_id = vocab.index("EOS")

In [46]:
def translate(sentence, transformer, tokenizer_English, tokenizer_Korean, max_target_len=100):
    encoder_input = tf.cast(tokenizer_English([sentence]), tf.int32)
    vocab = tokenizer_Korean.get_vocabulary()
    sos_id = vocab.index("SOS")
    eos_id = vocab.index("EOS")
    decoder_input = tf.convert_to_tensor([[sos_id]], dtype=tf.int32)

    for _ in range(max_target_len):
        predictions = transformer([encoder_input, decoder_input], training=False)
        predicted_id = tf.argmax(predictions[:, -1:, :], axis=-1)
        predicted_id = tf.cast(predicted_id, tf.int32)
        decoder_input = tf.concat([decoder_input, predicted_id], axis=-1)
        if predicted_id[0][0].numpy() == eos_id:
            break

    output_tokens = decoder_input.numpy()[0][1:]
    id_to_word = dict(enumerate(tokenizer_Korean.get_vocabulary()))
    translated_tokens = [id_to_word[i] for i in output_tokens if i != eos_id]
    return " ".join(translated_tokens)

# Conclusion

Achieved 88% accuracy