In [1]:
class Config:
    def __init__(self):
        self.BATCH_SIZE = 50
        self.EPOCHS = 30
        self.NUM_OF_INPUTS = 300

        self.NUM_CHARACTER_LEVEL_LAYERS = 4
        self.NUM_WORD_LEVEL_LAYERS = 12

        self.CHARACTER_LEVEL_D_MODEL = 32
        self.WORD_LEVEL_D_MODEL = 128

        self.NUM_HEADS = 3
        self.DFF = 256

        self.MAX_WORD_LENGTH = 16
        self.MAX_SENTENCE_LENGTH = 64

        self.VOCAB_SIZE = 10000
        self.CHARACTER_VOCAB_SIZE = 1000


config = Config()

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-04-19 00:14:48.387575: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-19 00:14:48.724764: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Tokenizer

In [2]:
class CustomTokenizer:
    def __init__(self):
        self.tokenizer = Tokenizer()
        
    def texts_to_sequences(self, texts):
        return 
    def sequences_to_texts(self, sequences):
        return
    def fit_texts(self, texts):
        return 

# Global Attention and FeedForward

In [3]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [4]:
class GlobalSelfAttention(BaseAttention):
    def call(self, inputs):
        x, mask = inputs[0], inputs[1]

        attn_output = self.mha(query=x, value=x, key=x, attention_mask=mask)

        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

In [5]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

# Positional Embedding layer

In [6]:
def positional_encoding(length, depth):
    depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :] / depth

    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates

    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)


class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)

        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

# Encoder layers

In [7]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1, name='EncoderLayer'):
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads, key_dim=d_model, dropout=dropout_rate, name=name
        )

        self.ffn = FeedForward(d_model, dff)

    def call(self, inputs):
        x = self.self_attention(inputs)
        x = self.ffn(x)
        return x


In [8]:
class Encoder(tf.keras.layers.Layer):
    def __init__(
            self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1,
            name='Encoder'
    ):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.enc_layers = [
            EncoderLayer(
                d_model=d_model, num_heads=num_heads, dff=dff,
                dropout_rate=dropout_rate,
                name=f'encoder_layer_{i + 1}'
            )
            for i in range(num_layers)
        ]
        # self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs):
        x, mask = inputs
        print("Encoder start")
        print("input shape", x.shape)
        print("mask shape", mask.shape)

        for i in range(self.num_layers):
            inputs = self.enc_layers[i]((x, mask))
        print("Encoder end")
        return inputs  # Shape (batch_size, seq_len, d_model).

# Hierarchical Transformer Encoder model

In [9]:
class HierarchicalTransformerEncoder(tf.keras.models.Model):
    def __init__(self, *, num_character_level_layers, num_word_level_layers,
                 character_level_d_model, word_level_d_model, num_heads, dff,
                 max_word_length, max_sentence_length,
                 vocab_size, character_vocab_size, dropout_rate=0.1):
        super().__init__()

        self.word_pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                                      d_model=word_level_d_model)
        self.character_pos_embedding = PositionalEmbedding(vocab_size=character_vocab_size,
                                                           d_model=character_level_d_model)

        self.character_level_encoder = Encoder(num_layers=num_character_level_layers,
                                               d_model=character_level_d_model,
                                               num_heads=num_heads, dff=dff,
                                               vocab_size=character_vocab_size,
                                               dropout_rate=dropout_rate,
                                               name='character_level_encoder')

        self.flatten_layer = tf.keras.layers.Flatten(input_shape=(max_word_length, character_level_d_model),
                                                     name='flatten_character_level_encoders')
        self.linear_layer = tf.keras.layers.Dense(units=word_level_d_model, activation=None,
                                                  name='linear_character_level_encoders')

        self.combined_layer = tf.keras.layers.Concatenate(axis=-1, name='combined_character_and_word_level_encoders')

        self.word_level_encoder = Encoder(num_layers=num_word_level_layers,
                                          d_model=(word_level_d_model * 2),
                                          num_heads=num_heads, dff=dff,
                                          vocab_size=vocab_size,
                                          dropout_rate=dropout_rate,
                                          name='word_level_encoder')

        self.correction_layer = tf.keras.layers.Dense(vocab_size, activation='softmax', name='correction_layer', )

    def call(self, inputs):
        print("Start")
        (word_level_inputs, word_level_mask_inputs), (character_level_inputs, character_level_mask_inputs) = inputs

        word_embedding_outputs = self.word_pos_embedding(word_level_inputs)  # (batch_size, max_len, word_level_d_model)
        print("Shape của word_embedding_outputs:", word_embedding_outputs.shape)

        character_level_encoder_outputs = tf.map_fn(
            lambda sentence: self.character_level_encoder(
                (self.character_pos_embedding(sentence[0]), sentence[1])),
            (character_level_inputs, character_level_mask_inputs),
            dtype=tf.float32
        )
        # (batch_size, max_sen_len, max_word_length, character_level_d_model)

        character_level_encoder_outputs = tf.map_fn(
            lambda sentence: tf.map_fn(
                lambda word: tf.squeeze(self.linear_layer(self.flatten_layer(tf.expand_dims(word, axis=0))), axis=0),
                sentence,
                dtype=tf.float32
            ),
            character_level_encoder_outputs,
            dtype=tf.float32)
        # (batch_size, max_sen_len, word_level_d_model)
        print("Shape của character_level_encoder_outputs:", character_level_encoder_outputs.shape)

        concat_output = self.combined_layer([word_embedding_outputs, character_level_encoder_outputs])
        # (batch_size, max_sen_len, word_level_d_model * 2)
        print("Shape của concat_output:", concat_output.shape)

        word_level_output = self.word_level_encoder((concat_output, word_level_mask_inputs))
        # (batch_size, max_sen_len, word_level_d_model + character_level_d_model)
        print("Shape của word_level_output:", word_level_output.shape)

        correction_output = self.correction_layer(word_level_output)  # (batch_size, max_sen_len, vocab_size)
        print("Shape của correction_output:", correction_output.shape)

        # detection_output = tf.squeeze(self.detection_layer(word_level_output), axis=-1)  # (batch_size, max_sen_len)
        # print("Shape của detection_output:", detection_output.shape)
        print("End")
        return correction_output


@tf.function
def training_step(model, optimizer, x, y):
    with tf.GradientTape() as tape:
        correction_output = model(x)
        loss1 = correction_loss(y, correction_output)
        # loss2 = detection_loss(y[1], detection_output)
        total_loss = loss1

    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return total_loss


def correction_loss(true_outputs, pred_outputs):
    softmax_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)(true_outputs, pred_outputs)
    return softmax_loss


def detection_loss(true_detection_infos, pred_detection_infos):
    sigmoid_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)(true_detection_infos, pred_detection_infos)
    return sigmoid_loss

# Data preprocessing

In [10]:
data = pd.read_csv('data/vi_processed.csv')

input_sentences = []

for index, row in data.iterrows():
    if len(input_sentences) == config.NUM_OF_INPUTS: break
    input_sentences.append(row.correct_text)

correct_texts = input_sentences[:config.NUM_OF_INPUTS]

In [11]:
word_level_tokenizer = Tokenizer(num_words=config.VOCAB_SIZE, oov_token='<UNK>', lower=True, split=' ', )

character_level_tokenizer = Tokenizer(num_words=config.CHARACTER_VOCAB_SIZE, lower=True, char_level=True)

In [12]:
word_level_tokenizer.fit_on_texts(input_sentences)
character_level_tokenizer.fit_on_texts(input_sentences)

In [13]:
input_sequences = word_level_tokenizer.texts_to_sequences(input_sentences)

# Get character-level words lengths.
input_words_lengths = []

# Get character-level sequences.
character_level_input_sequences = []

for sequence in input_sequences:
    character_level_input_sequence = []
    words_lengths = []
    for word_token in sequence:
        word = word_level_tokenizer.index_word[word_token]
        word = character_level_tokenizer.texts_to_sequences(word)
        word_chars = [each[0] for each in word]
        character_level_input_sequence.append(word_chars)
        words_lengths.append((len(word_chars) if len(word_chars) <= config.MAX_WORD_LENGTH
                              else config.MAX_WORD_LENGTH))

    # Add padding for each word.
    character_level_input_sequence = pad_sequences(character_level_input_sequence, maxlen=config.MAX_WORD_LENGTH,
                                                   padding='post', truncating='post')

    character_level_input_sequences.append(character_level_input_sequence)

    input_words_lengths.append(words_lengths)

# Get word-level sentences lengths.
input_sentences_lengths = []
for sequence in input_sequences: input_sentences_lengths.append(
    (len(sequence) if len(sequence) <= config.MAX_SENTENCE_LENGTH
     else config.MAX_SENTENCE_LENGTH))

# Add padding for each.
input_sequences = pad_sequences(input_sequences, maxlen=config.MAX_SENTENCE_LENGTH, padding='post', truncating='post')

character_level_input_sequences = pad_sequences(character_level_input_sequences, maxlen=config.MAX_SENTENCE_LENGTH,
                                                padding='post', truncating='post')
input_words_lengths = pad_sequences(input_words_lengths, maxlen=config.MAX_SENTENCE_LENGTH, padding='post',
                                    truncating='post')

input_sequences_np = np.array(input_sequences)
character_level_input_sequences_np = np.array(character_level_input_sequences)

input_words_lengths_np = np.array(input_words_lengths)
input_sentences_lengths_np = np.array(input_sentences_lengths)

In [14]:
def generate_sequence(length, max_length):
    one_tensor_length = tf.cast(tf.cast(length, tf.float32) * 0.85, tf.int32)

    masked_sequence = tf.random.shuffle(tf.sequence_mask(one_tensor_length, maxlen=length, dtype=tf.float32))

    padded_sequence = tf.pad(masked_sequence, paddings=[[0, max_length - length]], mode='CONSTANT',
                             constant_values=0.0)

    return padded_sequence


# Character level mask
character_level_mask_tensor = tf.map_fn(
    lambda sentence: tf.map_fn(
        lambda length: generate_sequence(length, config.MAX_WORD_LENGTH),
        sentence,
        dtype=tf.float32),
    tf.convert_to_tensor(input_words_lengths_np),
    dtype=tf.float32)
character_level_mask_tensor = tf.expand_dims(character_level_mask_tensor, 2)

# Word level mask
word_level_mask_tensor = tf.map_fn(
    lambda length: generate_sequence(length, config.MAX_SENTENCE_LENGTH),
    tf.convert_to_tensor(input_sentences_lengths_np),
    dtype=tf.float32
)
word_level_mask_tensor = tf.expand_dims(word_level_mask_tensor, 1)

# Model building

In [15]:
model = HierarchicalTransformerEncoder(num_character_level_layers=config.NUM_CHARACTER_LEVEL_LAYERS,
                                       num_word_level_layers=config.NUM_WORD_LEVEL_LAYERS,
                                       character_level_d_model=config.CHARACTER_LEVEL_D_MODEL,
                                       word_level_d_model=config.WORD_LEVEL_D_MODEL,
                                       num_heads=config.NUM_HEADS, dff=config.DFF,
                                       max_word_length=config.MAX_WORD_LENGTH,
                                       max_sentence_length=config.MAX_SENTENCE_LENGTH,
                                       vocab_size=config.VOCAB_SIZE,
                                       character_vocab_size=config.CHARACTER_VOCAB_SIZE)

  super().__init__(**kwargs)


In [16]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        return {
            'd_model': float(self.d_model),
            'warmup_steps': self.warmup_steps
        }

    @classmethod
    def from_config(cls, config):
        return cls(d_model=config['d_model'], warmup_steps=config['warmup_steps'])


learning_rate = CustomSchedule(config.WORD_LEVEL_D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [17]:
# from tqdm import tqdm
# 
# for e in range(config.EPOCHS):
#     pbar = tqdm(range(0, len(input_sequences_np), config.BATCH_SIZE))
# 
#     for i in pbar:
#         input_batch = [
#             [input_sequences_np[i:i + config.BATCH_SIZE], input_sentences_lengths_np[i:i + config.BATCH_SIZE]],
#             [character_level_input_sequences_np[i:i + config.BATCH_SIZE],
#              input_words_lengths_np[i:i + config.BATCH_SIZE]]
#         ]
#         output_batch = input_sequences_np[i:i + config.BATCH_SIZE]
# 
#         total_loss = training_step(model, optimizer, input_batch, output_batch)
# 
#         pbar.set_description(f'Epoch {e + 1}/{config.EPOCHS}')
#         pbar.set_postfix_str(f'loss: {total_loss:.4f}')
#         pbar.refresh()


In [18]:
print('input_sequences_np shape:', input_sequences_np.shape)
print('character_level_input_sequences_np shape:', character_level_input_sequences_np.shape)
print('word_level_mask_tensor shape:', word_level_mask_tensor.shape)
print('character_level_mask_tensor shape: ', character_level_mask_tensor.shape)

input_sequences_np shape: (300, 64)
character_level_input_sequences_np shape: (300, 64, 16)
word_level_mask_tensor shape: (300, 1, 64)
character_level_mask_tensor shape:  (300, 64, 1, 16)


In [19]:
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['acc'])

model.fit(
    [[input_sequences_np, word_level_mask_tensor], [character_level_input_sequences_np, character_level_mask_tensor]],
    input_sequences_np, epochs=config.EPOCHS,
    batch_size=config.BATCH_SIZE)

Epoch 1/30
Start
Shape của word_embedding_outputs: (50, 64, 128)
Instructions for updating:
Use fn_output_signature instead
Encoder start
input shape (64, 16, 32)
mask shape (64, 1, 16)




Encoder end
Encoder start
input shape (64, 16, 32)
mask shape (64, 1, 16)
Encoder end
Shape của character_level_encoder_outputs: (50, 64, 128)
Shape của concat_output: (50, 64, 256)
Encoder start
input shape (50, 64, 256)
mask shape (50, 1, 64)
Encoder end
Encoder start
input shape (50, 64, 256)
mask shape (50, 1, 64)
Encoder end
Shape của word_level_output: (50, 64, 256)
Shape của correction_output: (50, 64, 10000)
End
Start
Shape của word_embedding_outputs: (50, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 1, 16)
Encoder end
Shape của character_level_encoder_outputs: (50, 64, 128)
Shape của concat_output: (50, 64, 256)
Encoder start
input shape (50, 64, 256)
mask shape (50, 1, 64)
Encoder end
Shape của word_level_output: (50, 64, 256)
Shape của correction_output: (50, 64, 10000)
End




Start
Shape của word_embedding_outputs: (50, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 1, 16)
Encoder end
Shape của character_level_encoder_outputs: (50, 64, 128)
Shape của concat_output: (50, 64, 256)
Encoder start
input shape (50, 64, 256)
mask shape (50, 1, 64)
Encoder end
Shape của word_level_output: (50, 64, 256)
Shape của correction_output: (50, 64, 10000)
End
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 916ms/step - acc: 0.0000e+00 - loss: 9.3030
Epoch 2/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 968ms/step - acc: 0.0000e+00 - loss: 9.2896
Epoch 3/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - acc: 7.2173e-05 - loss: 9.2534
Epoch 4/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 980ms/step - acc: 3.4970e-05 - loss: 9.1951
Epoch 5/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 957ms/step - acc: 1.4881e-05 - loss: 9.1165
Epoch 6/30
[1m6/6[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x73cbabdb8970>

In [23]:
test_input_sentences = [
    'Nếu làm được như vậy thì chắc chắn sẽ không còn trường nào tùy tiện thu tiền cao, gây sự lo lắng của phụ huynh và ai không có tiền thì không cần đóng.']
test_input_sequences = word_level_tokenizer.texts_to_sequences(test_input_sentences)

# Get character-level words lengths.
test_input_words_lengths = []

# Get character-level sequences.
test_character_level_input_sequences = []

for sequence in test_input_sequences:
    character_level_input_sequence = []
    test_words_lengths = []
    for word_token in sequence:
        word = word_level_tokenizer.index_word[word_token]
        word = character_level_tokenizer.texts_to_sequences(word)
        word_chars = [each[0] for each in word]
        character_level_input_sequence.append(word_chars)
        test_words_lengths.append((len(word_chars) if len(word_chars) <= config.MAX_WORD_LENGTH
                                   else config.MAX_WORD_LENGTH))

    # Add padding for each word.
    character_level_input_sequence = pad_sequences(character_level_input_sequence, maxlen=config.MAX_WORD_LENGTH,
                                                   padding='post', truncating='post')

    test_character_level_input_sequences.append(character_level_input_sequence)

    test_input_words_lengths.append(test_words_lengths)

# Get word-level sentences lengths.
test_input_sentences_lengths = []
for sequence in test_input_sequences: test_input_sentences_lengths.append(
    (len(sequence) if len(sequence) <= config.MAX_SENTENCE_LENGTH
     else config.MAX_SENTENCE_LENGTH))

# Add padding for each.
test_input_sequences = pad_sequences(test_input_sequences, maxlen=config.MAX_SENTENCE_LENGTH, padding='post',
                                     truncating='post')

test_character_level_input_sequences = pad_sequences(test_character_level_input_sequences,
                                                     maxlen=config.MAX_SENTENCE_LENGTH,
                                                     padding='post', truncating='post')
test_input_words_lengths = pad_sequences(test_input_words_lengths, maxlen=config.MAX_SENTENCE_LENGTH, padding='post',
                                         truncating='post')

test_input_sequences_np = np.array(test_input_sequences)
test_character_level_input_sequences_np = np.array(test_character_level_input_sequences)

test_input_words_lengths_np = np.array(test_input_words_lengths)
test_input_sentences_lengths_np = np.array(test_input_sentences_lengths)

In [1]:
# Character level mask
test_character_level_mask_tensor = tf.map_fn(
    lambda sentence: tf.map_fn(
        lambda length: generate_sequence(length, config.MAX_WORD_LENGTH),
        sentence,
        dtype=tf.float32),
    tf.convert_to_tensor(test_input_words_lengths_np),
    dtype=tf.float32)
character_level_mask_tensor = tf.expand_dims(character_level_mask_tensor, 2)

# Word level mask
test_word_level_mask_tensor = tf.map_fn(
    lambda length: generate_sequence(length, config.MAX_SENTENCE_LENGTH),
    tf.convert_to_tensor(test_input_sentences_lengths_np),
    dtype=tf.float32
)
test_word_level_mask_tensor = tf.expand_dims(test_word_level_mask_tensor, 1)

NameError: name 'tf' is not defined

In [21]:
print('test_input_sequences_np shape:', test_input_sequences_np.shape)
print('test_character_level_input_sequences_np shape:', test_character_level_input_sequences_np.shape)
print('test_input_sentences_lengths_np shape:', test_input_sentences_lengths_np.shape)
print('test_input_words_lengths_np shape:', test_input_words_lengths_np.shape)

test_input_sequences_np shape: (1, 64)
test_character_level_input_sequences_np shape: (1, 64, 16)
test_input_sentences_lengths_np shape: (1,)
test_input_words_lengths_np shape: (1, 64)


In [24]:
test_outputs = model.predict(
    [[input_sequences_np, word_level_mask_tensor], [character_level_input_sequences_np, character_level_mask_tensor]])

Start
Shape của word_embedding_outputs: (32, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 1, 16)




Encoder end
Shape của character_level_encoder_outputs: (32, 64, 128)
Shape của concat_output: (32, 64, 256)
Encoder start
input shape (32, 64, 256)
mask shape (32, 1, 64)
Encoder end
Shape của word_level_output: (32, 64, 256)
Shape của correction_output: (32, 64, 10000)
End
[1m 9/10[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 188ms/stepStart
Shape của word_embedding_outputs: (None, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 1, 16)
Encoder end
Shape của character_level_encoder_outputs: (None, 64, 128)
Shape của concat_output: (None, 64, 256)
Encoder start
input shape (None, 64, 256)
mask shape (None, 1, 64)
Encoder end
Shape của word_level_output: (None, 64, 256)
Shape của correction_output: (None, 64, 10000)
End
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 335ms/step


In [25]:
test_outputs

array([[[1.34092160e-02, 8.15533713e-05, 8.95445293e-04, ...,
         8.30802164e-05, 5.97702492e-05, 8.05788077e-05],
        [1.43980421e-02, 7.85101292e-05, 8.88241571e-04, ...,
         8.29801138e-05, 5.60226290e-05, 7.99706904e-05],
        [1.29829124e-02, 7.26265353e-05, 8.92338809e-04, ...,
         8.06297612e-05, 5.97115904e-05, 8.38059059e-05],
        ...,
        [2.03308202e-02, 8.03431467e-05, 7.39415758e-04, ...,
         8.45377435e-05, 5.80677406e-05, 7.99034751e-05],
        [2.03096252e-02, 8.02201321e-05, 7.40636024e-04, ...,
         8.36973486e-05, 5.73797806e-05, 7.68717582e-05],
        [2.02360619e-02, 7.96054228e-05, 7.44595891e-04, ...,
         8.25097959e-05, 5.75685845e-05, 7.47031081e-05]],

       [[1.34867355e-02, 7.52851483e-05, 9.59397294e-04, ...,
         8.10614511e-05, 5.40063447e-05, 8.01358547e-05],
        [1.34922639e-02, 7.41325057e-05, 9.10002855e-04, ...,
         8.62505112e-05, 5.93672303e-05, 8.35198953e-05],
        [1.32182389e-02, 

In [26]:
for sentence in test_outputs[:20]:
    out = ''
    for word in sentence:
        index = tf.argmax(word, axis=0).numpy()
        word_str = word_level_tokenizer.index_word.get(index)
        if word_str is not None:
            out += word_str + ' '
        else:
            out += '<UNK> '
    print(out)

<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 
<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 
<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <U