In [1]:
class Config:
    def __init__(self):
        self.BATCH_SIZE = 100
        self.EPOCHS = 10
        self.NUM_OF_INPUTS = 600

        self.NUM_CHARACTER_LEVEL_LAYERS = 4
        self.NUM_WORD_LEVEL_LAYERS = 12

        self.CHARACTER_LEVEL_D_MODEL = 32
        self.WORD_LEVEL_D_MODEL = 128

        self.NUM_HEADS = 3
        self.DFF = 256

        self.MAX_WORD_LENGTH = 16
        self.MAX_SENTENCE_LENGTH = 64

        self.VOCAB_SIZE = 10000
        self.CHARACTER_VOCAB_SIZE = 1000


config = Config()

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-04-20 16:52:34.390925: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-20 16:52:34.465008: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Global Attention and FeedForward

In [3]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [4]:
class GlobalSelfAttention(BaseAttention):
    def call(self, inputs):
        x, mask = inputs[0], inputs[1]

        attn_output = self.mha(query=x, value=x, key=x, attention_mask=mask)

        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

In [5]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

# Positional Embedding layer

In [6]:
def positional_encoding(length, depth):
    depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :] / depth

    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates

    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)


class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)

        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

# Encoder layers

In [7]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1, name='EncoderLayer'):
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads, key_dim=d_model, dropout=dropout_rate, name=name
        )

        self.ffn = FeedForward(d_model, dff)

    def call(self, inputs):
        x = self.self_attention(inputs)
        x = self.ffn(x)
        return x


In [8]:
class Encoder(tf.keras.layers.Layer):
    def __init__(
            self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1,
            name='Encoder'
    ):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.enc_layers = [
            EncoderLayer(
                d_model=d_model, num_heads=num_heads, dff=dff,
                dropout_rate=dropout_rate,
                name=f'encoder_layer_{i + 1}'
            )
            for i in range(num_layers)
        ]
        # self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs):
        x, mask = inputs
        print("Encoder start")
        print("input shape", x.shape)
        print("mask shape", mask.shape)

        for i in range(self.num_layers):
            inputs = self.enc_layers[i]((x, mask))
        print("Encoder end")
        return inputs  # Shape (batch_size, seq_len, d_model).

# Hierarchical Transformer Encoder model

In [9]:
from tensorflow import keras


@keras.utils.register_keras_serializable()
class HierarchicalTransformerEncoder(tf.keras.models.Model):
    def __init__(self, *, num_character_level_layers, num_word_level_layers,
                 character_level_d_model, word_level_d_model, num_heads, dff,
                 max_word_length, max_sentence_length,
                 vocab_size, character_vocab_size, dropout_rate=0.1, **kwargs):
        super(HierarchicalTransformerEncoder, self).__init__(**kwargs)

        self.num_character_level_layers = num_character_level_layers
        self.num_word_level_layers = num_word_level_layers
        self.character_level_d_model = character_level_d_model
        self.word_level_d_model = word_level_d_model
        self.num_heads = num_heads
        self.dff = dff
        self.max_word_length = max_word_length
        self.max_sentence_length = max_sentence_length
        self.vocab_size = vocab_size
        self.character_vocab_size = character_vocab_size
        self.dropout_rate = dropout_rate

        self.word_pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                                      d_model=word_level_d_model)
        self.character_pos_embedding = PositionalEmbedding(vocab_size=character_vocab_size,
                                                           d_model=character_level_d_model)

        self.character_level_encoder = Encoder(num_layers=num_character_level_layers,
                                               d_model=character_level_d_model,
                                               num_heads=num_heads, dff=dff,
                                               vocab_size=character_vocab_size,
                                               dropout_rate=dropout_rate,
                                               name='character_level_encoder')

        self.flatten_layer = tf.keras.layers.Flatten(input_shape=(max_word_length, character_level_d_model),
                                                     name='flatten_character_level_encoders')
        self.linear_layer = tf.keras.layers.Dense(units=word_level_d_model, activation=None,
                                                  name='linear_character_level_encoders')

        self.combined_layer = tf.keras.layers.Concatenate(axis=-1, name='combined_character_and_word_level_encoders')

        self.word_level_encoder = Encoder(num_layers=num_word_level_layers,
                                          d_model=(word_level_d_model * 2),
                                          num_heads=num_heads, dff=dff,
                                          vocab_size=vocab_size,
                                          dropout_rate=dropout_rate,
                                          name='word_level_encoder')

        self.correction_layer = tf.keras.layers.Dense(vocab_size, activation='softmax', name='correction_layer', )

    def call(self, inputs):
        print("Start")
        (word_level_inputs, word_level_mask_inputs), (character_level_inputs, character_level_mask_inputs) = inputs

        word_embedding_outputs = self.word_pos_embedding(word_level_inputs)  # (batch_size, max_len, word_level_d_model)
        print("Shape của word_embedding_outputs:", word_embedding_outputs.shape)

        character_level_encoder_outputs = tf.map_fn(
            lambda sentence: self.character_level_encoder(
                (self.character_pos_embedding(sentence[0]), sentence[1])),
            (character_level_inputs, character_level_mask_inputs),
            dtype=tf.float32
        )
        # (batch_size, max_sen_len, max_word_length, character_level_d_model)

        character_level_encoder_outputs = tf.map_fn(
            lambda sentence: tf.map_fn(
                lambda word: tf.squeeze(self.linear_layer(self.flatten_layer(tf.expand_dims(word, axis=0))), axis=0),
                sentence,
                dtype=tf.float32
            ),
            character_level_encoder_outputs,
            dtype=tf.float32)
        # (batch_size, max_sen_len, word_level_d_model)
        print("Shape của character_level_encoder_outputs:", character_level_encoder_outputs.shape)

        concat_output = self.combined_layer([word_embedding_outputs, character_level_encoder_outputs])
        # (batch_size, max_sen_len, word_level_d_model * 2)
        print("Shape của concat_output:", concat_output.shape)

        word_level_output = self.word_level_encoder((concat_output, word_level_mask_inputs))
        # (batch_size, max_sen_len, word_level_d_model + character_level_d_model)
        print("Shape của word_level_output:", word_level_output.shape)

        correction_output = self.correction_layer(word_level_output)  # (batch_size, max_sen_len, vocab_size)
        print("Shape của correction_output:", correction_output.shape)

        # detection_output = tf.squeeze(self.detection_layer(word_level_output), axis=-1)  # (batch_size, max_sen_len)
        # print("Shape của detection_output:", detection_output.shape)
        print("End")
        return correction_output

    def get_config(self):
        config = {
            'num_character_level_layers': self.num_character_level_layers,
            'num_word_level_layers': self.num_word_level_layers,
            'character_level_d_model': self.character_level_d_model,
            'word_level_d_model': self.word_level_d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'max_word_length': self.max_word_length,
            'max_sentence_length': self.max_sentence_length,
            'vocab_size': self.vocab_size,
            'character_vocab_size': self.character_vocab_size,
            'dropout_rate': self.dropout_rate
        }
        base_config = super(HierarchicalTransformerEncoder, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    @classmethod
    def from_config(cls, config):
        return cls(**config)


@tf.function
def training_step(model, optimizer, x, y):
    with tf.GradientTape() as tape:
        correction_output = model(x)
        loss1 = correction_loss(y, correction_output)
        # loss2 = detection_loss(y[1], detection_output)
        total_loss = loss1

    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return total_loss


def correction_loss(true_outputs, pred_outputs):
    softmax_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)(true_outputs, pred_outputs)
    return softmax_loss


def detection_loss(true_detection_infos, pred_detection_infos):
    sigmoid_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)(true_detection_infos, pred_detection_infos)
    return sigmoid_loss

# Data preprocessing

## Custom Tokenizer

In [10]:
class CustomTokenizer:
    def __init__(self, word_vocab_size, max_word_len, max_sentence_len):
        self.index_word = {1: '[UNK]'}
        self.word_count = {'[UNK]': 1}
        self.word_index = {'[UNK]': 1}
        self.word_vocab_size = word_vocab_size

        self.character_index = {'[UNK]': 1}
        self.index_character = {1: '[UNK]'}

        self.signs = [',', '.', '!', '\"', "\'", "?", ";"]

        self.max_word_len = max_word_len
        self.max_sentence_len = max_sentence_len

    def preprocess_texts(self, texts):
        texts_preprocessed = []
        for text in texts:
            for sign in self.signs:
                text = text.replace(sign, ' ')

            text = text.lower()
            texts_preprocessed.append(text)
        return texts_preprocessed

    def sort(self):
        self.word_count = sorted(list(self.word_count.items())[1:], key=lambda x: x[1], reverse=True)
        for i, (word, count) in enumerate(self.word_count):
            self.index_word[i + 2] = word
            self.word_index[word] = i + 2

        self.word_count = dict(self.word_count)

    def texts_to_sequences(self, texts):
        word_level_sequences = []
        character_level_sequences = []

        sentence_lengths = []
        word_lengths = []

        texts = self.preprocess_texts(texts)
        for text in texts:
            words = text.split(' ')
            words = [i for i in words if i != '']

            tokens = [self.word_index.get(word, 1)
                      if (self.word_index.get(word, 1) <= self.word_vocab_size) else 1
                      for word in words]
            word_level_sequences.append(tokens)
            sentence_lengths.append(min(len(tokens), self.max_sentence_len))

            sentence_tokens = []
            word_length = []
            for word in words:
                tokens = [self.character_index.get(i, 1) for i in word]
                word_length.append(min(len(tokens), self.max_word_len))
                sentence_tokens.append(tokens)
            character_level_sequences.append(sentence_tokens)
            word_lengths.append(word_length)

        return (word_level_sequences, sentence_lengths), (character_level_sequences, word_lengths)

    def sequences_to_texts(self, sequences):
        texts = []
        for sequence in sequences:
            texts.append([self.index_word.get(token, 1) for token in sequence])
        return texts

    def fit_on_texts(self, texts):

        texts = self.preprocess_texts(texts)

        for text in texts:

            # Update word vocab.
            words = text.split(' ')
            for word in words:
                if word == '': continue
                self.word_count[word] = self.word_count.get(word, 0) + 1

            # Update character vocab
            for character in text:
                if character == ' ': continue
                if self.character_index.get(character, -1) == -1:
                    self.character_index[character] = len(self.character_index) + 1

                self.index_character[self.character_index[character]] = character

        self.sort()

## Build Dataset.

In [11]:
import random


class Dataset:
    def __init__(self, sentences, tokenizer):
        self.sentences = sentences
        self.tokenizer = tokenizer

    def build_dataset(self):
        ((word_level_sequences, sentence_lengths),
         (character_level_sequences, word_lengths)) = word_level_tokenizer.texts_to_sequences(self.sentences)

        target_word_level_sequences = pad_sequences(word_level_sequences.copy(), maxlen=config.MAX_SENTENCE_LENGTH,
                                                    padding='post', truncating='post')

        tmp_word_level_sequences = []
        word_level_masks = []
        for sequence in word_level_sequences:
            sequence, mask = self.random_word(sequence)
            tmp_word_level_sequences.append(sequence)
            word_level_masks.append(mask)

        # Get word_level sequences
        word_level_sequences = tmp_word_level_sequences
        word_level_sequences = pad_sequences(word_level_sequences, maxlen=config.MAX_SENTENCE_LENGTH,
                                             padding='post', truncating='post')
        # Get word_level mask matrix.
        word_level_masks = pad_sequences(word_level_masks, maxlen=config.MAX_SENTENCE_LENGTH,
                                         padding='post', truncating='post')
        word_level_masks = tf.expand_dims(word_level_masks, 1)
        word_level_masks = tf.tile(word_level_masks, [1, config.MAX_SENTENCE_LENGTH, 1])

        # Get character_level mask matrix.
        tmp_character_level_sequences = []
        character_level_masks = []
        for character_level_sequence in character_level_sequences:
            sentence_tmp = []
            mask_tmp = []
            # With each sentence
            for word in character_level_sequence:
                # With each word
                character_level_sequence, mask = self.random_word(word, word_level=False)
                sentence_tmp.append(character_level_sequence)
                mask_tmp.append(mask)

            while len(sentence_tmp) < config.MAX_SENTENCE_LENGTH:
                sentence_tmp.append([])
                mask_tmp.append([])
            while len(sentence_tmp) > config.MAX_SENTENCE_LENGTH:
                del sentence_tmp[-1]
                del mask_tmp[-1]
            tmp_character_level_sequences.append(
                pad_sequences(sentence_tmp, maxlen=config.MAX_WORD_LENGTH,
                              padding='post', truncating='post'))
            character_level_masks.append(
                pad_sequences(mask_tmp, maxlen=config.MAX_WORD_LENGTH,
                              padding='post', truncating='post'))

        character_level_sequences = tf.convert_to_tensor(tmp_character_level_sequences)
        character_level_masks = tf.expand_dims(character_level_masks, 2)

        character_level_masks = tf.tile(character_level_masks, [1, 1, config.MAX_WORD_LENGTH, 1])

        return (word_level_sequences, word_level_masks), (
            character_level_sequences, character_level_masks), target_word_level_sequences

    def random_word(self, sequence, word_level=True):
        output = []
        mask = []

        # 15% of the tokens would be replaced
        for token in sequence:
            prob = random.random()

            if prob < 0.15:
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    if word_level:
                        output.append(0)
                    else:
                        output.append(0)
                    mask.append(0)

                # 10% chance change token to random token
                elif prob < 0.9:
                    if word_level:
                        random_token = random.randrange(len(self.tokenizer.word_index) + 1)
                        if random_token > self.tokenizer.word_vocab_size:
                            random_token = 0
                            mask.append(0)
                        else:
                            mask.append(1)
                    else:
                        random_token = random.randrange(len(self.tokenizer.character_index) + 1)
                        mask.append(1)

                    output.append(random_token)

                # 10% chance change token to current token
                else:
                    mask.append(1)
                    output.append(token)

            else:
                mask.append(1)
                output.append(token)

        mask = np.array(mask)

        return output, mask

In [12]:
data = pd.read_csv('data/vi_processed.csv')

input_sentences = []

for index, row in data.iterrows():
    if len(input_sentences) == config.NUM_OF_INPUTS: break
    input_sentences.append(row.correct_text)

correct_texts = input_sentences[:config.NUM_OF_INPUTS]

In [13]:
word_level_tokenizer = CustomTokenizer(word_vocab_size=config.VOCAB_SIZE, max_word_len=config.MAX_WORD_LENGTH,
                                       max_sentence_len=config.MAX_SENTENCE_LENGTH)
word_level_tokenizer.fit_on_texts(input_sentences)

In [14]:
dataset = Dataset(input_sentences, word_level_tokenizer)
data = dataset.build_dataset()

In [15]:
(input_word_level_sequences_np, word_level_mask_tensor), (
    input_character_level_sequences_np, character_level_mask_tensor), target_sequences = data

In [16]:
character_level_mask_tensor.shape

TensorShape([600, 64, 16, 16])

# Model building

In [17]:
model = HierarchicalTransformerEncoder(num_character_level_layers=config.NUM_CHARACTER_LEVEL_LAYERS,
                                       num_word_level_layers=config.NUM_WORD_LEVEL_LAYERS,
                                       character_level_d_model=config.CHARACTER_LEVEL_D_MODEL,
                                       word_level_d_model=config.WORD_LEVEL_D_MODEL,
                                       num_heads=config.NUM_HEADS, dff=config.DFF,
                                       max_word_length=config.MAX_WORD_LENGTH,
                                       max_sentence_length=config.MAX_SENTENCE_LENGTH,
                                       vocab_size=config.VOCAB_SIZE,
                                       character_vocab_size=config.CHARACTER_VOCAB_SIZE)

  super().__init__(**kwargs)


In [18]:
from tensorflow import keras


@keras.utils.register_keras_serializable()
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        return {
            'd_model': float(self.d_model),
            'warmup_steps': self.warmup_steps
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)


learning_rate = CustomSchedule(config.WORD_LEVEL_D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [19]:
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['acc'])

model.fit(
    [[input_word_level_sequences_np, word_level_mask_tensor],
     [input_character_level_sequences_np, character_level_mask_tensor]],
    target_sequences, epochs=config.EPOCHS,
    batch_size=config.BATCH_SIZE)

Epoch 1/10
Start
Shape của word_embedding_outputs: (100, 64, 128)
Instructions for updating:
Use fn_output_signature instead
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)




Encoder end
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Shape của character_level_encoder_outputs: (100, 64, 128)
Shape của concat_output: (100, 64, 256)
Encoder start
input shape (100, 64, 256)
mask shape (100, 64, 64)
Encoder end
Encoder start
input shape (100, 64, 256)
mask shape (100, 64, 64)
Encoder end
Shape của word_level_output: (100, 64, 256)
Shape của correction_output: (100, 64, 10000)
End
Start
Shape của word_embedding_outputs: (100, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Shape của character_level_encoder_outputs: (100, 64, 128)
Shape của concat_output: (100, 64, 256)
Encoder start
input shape (100, 64, 256)
mask shape (100, 64, 64)
Encoder end
Shape của word_level_output: (100, 64, 256)
Shape của correction_output: (100, 64, 10000)
End




Start
Shape của word_embedding_outputs: (100, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Shape của character_level_encoder_outputs: (100, 64, 128)
Shape của concat_output: (100, 64, 256)
Encoder start
input shape (100, 64, 256)
mask shape (100, 64, 64)
Encoder end
Shape của word_level_output: (100, 64, 256)
Shape của correction_output: (100, 64, 10000)
End
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 2s/step - acc: 0.0000e+00 - loss: 9.2092
Epoch 2/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - acc: 0.0000e+00 - loss: 9.1956
Epoch 3/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - acc: 0.0000e+00 - loss: 9.1564
Epoch 4/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - acc: 0.0000e+00 - loss: 9.0962
Epoch 5/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2s/step - acc: 0.0000e+00 - loss: 9.0133
Epoch 6/10
[1m6/6[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7ce2537fcd60>

# Test model

In [20]:
test_outputs = model.predict(
   [[input_word_level_sequences_np, word_level_mask_tensor],
     [input_character_level_sequences_np, character_level_mask_tensor]])

Start
Shape của word_embedding_outputs: (32, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Shape của character_level_encoder_outputs: (32, 64, 128)
Shape của concat_output: (32, 64, 256)
Encoder start
input shape (32, 64, 256)
mask shape (32, 64, 64)
Encoder end
Shape của word_level_output: (32, 64, 256)
Shape của correction_output: (32, 64, 10000)
End
[1m18/19[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 165ms/stepStart
Shape của word_embedding_outputs: (None, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Shape của character_level_encoder_outputs: (None, 64, 128)
Shape của concat_output: (None, 64, 256)
Encoder start
input shape (None, 64, 256)
mask shape (None, 64, 64)
Encoder end
Shape của word_level_output: (None, 64, 256)
Shape của correction_output: (None, 64, 10000)
End
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 244ms/step


In [21]:
for sentence in test_outputs[:20]:
    out = ''
    for word in sentence:
        index = tf.argmax(word, axis=0).numpy()
        word_str = word_level_tokenizer.index_word.get(index)
        if word_str is not None:
            out += word_str + ' '
        else:
            out += '<UNK> '
    print(out)

<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 
<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 
<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <U

In [22]:
model.save('model.keras')

In [23]:
new_model = tf.keras.models.load_model('model.keras')

Start
Shape của word_embedding_outputs: (100, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Shape của character_level_encoder_outputs: (100, 64, 128)
Shape của concat_output: (100, 64, 256)
Encoder start
input shape (100, 64, 256)
mask shape (100, 64, 64)
Encoder end
Encoder start
input shape (100, 64, 256)
mask shape (100, 64, 64)
Encoder end
Shape của word_level_output: (100, 64, 256)
Shape của correction_output: (100, 64, 10000)
End


In [24]:
test_outputs = new_model.predict(
    [[input_word_level_sequences_np, word_level_mask_tensor],
     [input_character_level_sequences_np, character_level_mask_tensor]])

Start
Shape của word_embedding_outputs: (32, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Shape của character_level_encoder_outputs: (32, 64, 128)
Shape của concat_output: (32, 64, 256)
Encoder start
input shape (32, 64, 256)
mask shape (32, 64, 64)
Encoder end
Shape của word_level_output: (32, 64, 256)
Shape của correction_output: (32, 64, 10000)
End
[1m18/19[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 155ms/stepStart
Shape của word_embedding_outputs: (None, 64, 128)
Encoder start
input shape (64, 16, 32)
mask shape (64, 16, 16)
Encoder end
Shape của character_level_encoder_outputs: (None, 64, 128)
Shape của concat_output: (None, 64, 256)
Encoder start
input shape (None, 64, 256)
mask shape (None, 64, 64)
Encoder end
Shape của word_level_output: (None, 64, 256)
Shape của correction_output: (None, 64, 10000)
End
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 225ms/step


In [25]:
for sentence in test_outputs[:20]:
    out = ''
    for word in sentence:
        index = tf.argmax(word, axis=0).numpy()
        word_str = word_level_tokenizer.index_word.get(index)
        if word_str is not None:
            out += word_str + ' '
        else:
            out += '<UNK> '
    print(out)

<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 
<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 
<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <U