In [1]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
import random
import os

2024-04-23 23:02:28.310856: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-23 23:02:28.364033: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def write_dict_data_to_file(file_path, data, indent=2):
    with open(file_path, "w", encoding='utf-8') as outfile:
        json.dump(data, outfile, ensure_ascii=False, indent=indent)

In [3]:
os.makedirs('model', exist_ok=True)
os.makedirs('model/tokenizer', exist_ok=True)

# Set up config.

In [4]:
class Config:
    def __init__(self):
        self.BATCH_SIZE = 100
        self.EPOCHS = 10
        self.NUM_OF_INPUTS = 5000

        self.NUM_CHARACTER_LEVEL_LAYERS = 4
        self.NUM_WORD_LEVEL_LAYERS = 12

        self.CHARACTER_LEVEL_D_MODEL = 64
        self.WORD_LEVEL_D_MODEL = 512

        self.NUM_HEADS = 3
        self.DFF = 256

        self.MAX_WORD_LENGTH = 8
        self.MAX_SENTENCE_LENGTH = 64

        self.VOCAB_SIZE = 8000
        self.CHARACTER_VOCAB_SIZE = 500

    def save_config(self):
        write_dict_data_to_file("model/config.json", {
            "BATCH_SIZE": self.BATCH_SIZE,
            "EPOCHS": self.EPOCHS,
            "NUM_OF_INPUTS": self.NUM_OF_INPUTS,

            "NUM_CHARACTER_LEVEL_LAYERS": self.NUM_CHARACTER_LEVEL_LAYERS,
            "NUM_WORD_LEVEL_LAYERS": self.NUM_WORD_LEVEL_LAYERS,

            "WORD_LEVEL_D_MODEL": self.WORD_LEVEL_D_MODEL,
            "CHARACTER_LEVEL_D_MODEL": self.CHARACTER_LEVEL_D_MODEL,

            "NUM_HEADS": self.NUM_HEADS,
            "DFF": self.DFF,

            "MAX_WORD_LENGTH": self.MAX_WORD_LENGTH,
            "MAX_SENTENCE_LENGTH": self.MAX_SENTENCE_LENGTH,

            "VOCAB_SIZE": self.VOCAB_SIZE,
            "CHARACTER_VOCAB_SIZE": self.CHARACTER_VOCAB_SIZE,
        }, indent=4)
        return self

    @staticmethod
    def load_config():
        instance = Config()
        config = json.load(open('model/config.json'))

        instance.BATCH_SIZE = config["BATCH_SIZE"]
        instance.EPOCHS = config["EPOCHS"]

        instance.NUM_OF_INPUTS = config["NUM_OF_INPUTS"]
        instance.NUM_CHARACTER_LEVEL_LAYERS = config["NUM_CHARACTER_LEVEL_LAYERS"]
        instance.NUM_WORD_LEVEL_LAYERS = config["NUM_WORD_LEVEL_LAYERS"]

        instance.WORD_LEVEL_D_MODEL = config["WORD_LEVEL_D_MODEL"]
        instance.CHARACTER_LEVEL_D_MODEL = config["CHARACTER_LEVEL_D_MODEL"]

        instance.NUM_HEADS = config["NUM_HEADS"]
        instance.DFF = config["DFF"]

        instance.MAX_WORD_LENGTH = config["MAX_WORD_LENGTH"]
        instance.MAX_SENTENCE_LENGTH = config["MAX_SENTENCE_LENGTH"]

        instance.VOCAB_SIZE = config["VOCAB_SIZE"]
        instance.CHARACTER_VOCAB_SIZE = config["CHARACTER_VOCAB_SIZE"]

        return instance


# Positional Embedding

In [5]:
def positional_encoding(length, depth):
    depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :] / depth

    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates

    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)


class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model, positional=True):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.positional = positional
        if positional is True:
            self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)

        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        if self.positional is True:
            x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

# Attention

In [6]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [7]:
class GlobalSelfAttention(BaseAttention):
    def call(self, inputs):
        x, mask = inputs[0], inputs[1]

        attn_output = self.mha(query=x, value=x, key=x, attention_mask=mask)

        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

# FeedForward and Encoders

In [8]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
            tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x)
        return x

In [9]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1, name='EncoderLayer'):
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads, key_dim=d_model, dropout=dropout_rate, name=name
        )

        self.ffn = FeedForward(d_model, dff)

    def call(self, inputs):
        x = self.self_attention(inputs)
        x = self.ffn(x)
        return x

In [10]:
class Encoder(tf.keras.layers.Layer):
    def __init__(
            self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1,
            name='Encoder'
    ):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.enc_layers = [
            EncoderLayer(
                d_model=d_model, num_heads=num_heads, dff=dff,
                dropout_rate=dropout_rate,
                name=f'encoder_layer_{i + 1}'
            )
            for i in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs):
        x, mask = inputs
        x = self.dropout(x)
        for i in range(self.num_layers):
            inputs = self.enc_layers[i]((x, mask))
        return inputs  # Shape (batch_size, seq_len, d_model).

# Construct Model.

In [11]:
@keras.utils.register_keras_serializable()
class HierarchicalTransformerEncoderModel(tf.keras.models.Model):
    def __init__(self, *, num_character_level_layers, num_word_level_layers,
                 character_level_d_model, word_level_d_model, num_heads, dff,
                 max_word_length, max_sentence_length,
                 vocab_size, character_vocab_size, dropout_rate=0.1, **kwargs):
        super(HierarchicalTransformerEncoderModel, self).__init__(**kwargs)

        self.num_character_level_layers = num_character_level_layers
        self.num_word_level_layers = num_word_level_layers
        self.character_level_d_model = character_level_d_model
        self.word_level_d_model = word_level_d_model
        self.num_heads = num_heads
        self.dff = dff
        self.max_word_length = max_word_length
        self.max_sentence_length = max_sentence_length
        self.vocab_size = vocab_size
        self.character_vocab_size = character_vocab_size
        self.dropout_rate = dropout_rate

        self.word_pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                                      d_model=word_level_d_model)
        self.character_pos_embedding = PositionalEmbedding(vocab_size=character_vocab_size,
                                                           d_model=character_level_d_model,
                                                           positional=False)

        self.character_level_encoder = Encoder(num_layers=num_character_level_layers,
                                               d_model=character_level_d_model,
                                               num_heads=num_heads, dff=dff,
                                               vocab_size=character_vocab_size,
                                               dropout_rate=dropout_rate,
                                               name='character_level_encoder')

        self.flatten_layer = tf.keras.layers.Flatten(input_shape=(max_word_length, character_level_d_model),
                                                     name='flatten_character_level_encoders')
        self.linear_layer = tf.keras.layers.Dense(units=word_level_d_model, activation=None,
                                                  name='linear_character_level_encoders')

        self.combined_layer = tf.keras.layers.Concatenate(axis=-1, name='combined_character_and_word_level_encoders')

        self.word_level_encoder = Encoder(num_layers=num_word_level_layers,
                                          d_model=(word_level_d_model * 2),
                                          num_heads=num_heads, dff=dff,
                                          vocab_size=vocab_size,
                                          dropout_rate=dropout_rate,
                                          name='word_level_encoder')

        self.correction_layer = tf.keras.layers.Dense(vocab_size, activation='softmax', name='correction_layer', )

    def call(self, inputs):
        # print("Start")
        word_level_inputs, character_level_inputs = inputs

        word_embedding_outputs = self.word_pos_embedding(word_level_inputs)  # (batch_size, max_len, word_level_d_model)
        word_level_mask = self.word_pos_embedding.compute_mask(word_level_inputs)
        word_level_mask = word_level_mask[:, tf.newaxis, :]

        character_level_encoder_outputs = tf.map_fn(
            lambda sentence: self.character_level_encoder(
                (self.character_pos_embedding(sentence),
                 self.character_pos_embedding.compute_mask(sentence)[:, tf.newaxis, :])),
            character_level_inputs,
            dtype=tf.float32
        )
        # (batch_size, max_sen_len, max_word_length, character_level_d_model)

        character_level_encoder_outputs = tf.map_fn(
            lambda sentence: tf.map_fn(
                lambda word: tf.squeeze(self.linear_layer(self.flatten_layer(tf.expand_dims(word, axis=0))), axis=0),
                sentence,
                dtype=tf.float32
            ),
            character_level_encoder_outputs,
            dtype=tf.float32)
        # (batch_size, max_sen_len, word_level_d_model)
        # print("Shape của character_level_encoder_outputs:", character_level_encoder_outputs.shape)

        concat_output = self.combined_layer([word_embedding_outputs, character_level_encoder_outputs])
        # (batch_size, max_sen_len, word_level_d_model * 2)
        # print("Shape của concat_output:", concat_output.shape)

        word_level_output = self.word_level_encoder((concat_output, word_level_mask))
        # (batch_size, max_sen_len, word_level_d_model + character_level_d_model)
        # print("Shape của word_level_output:", word_level_output.shape)

        correction_output = self.correction_layer(word_level_output)  # (batch_size, max_sen_len, vocab_size)
        # print("Shape của correction_output:", correction_output.shape)

        # detection_output = tf.squeeze(self.detection_layer(word_level_output), axis=-1)  # (batch_size, max_sen_len)
        # print("Shape của detection_output:", detection_output.shape)
        # print("End")
        return correction_output

    def get_config(self):
        config = {
            'num_character_level_layers': self.num_character_level_layers,
            'num_word_level_layers': self.num_word_level_layers,
            'character_level_d_model': self.character_level_d_model,
            'word_level_d_model': self.word_level_d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'max_word_length': self.max_word_length,
            'max_sentence_length': self.max_sentence_length,
            'vocab_size': self.vocab_size,
            'character_vocab_size': self.character_vocab_size,
            'dropout_rate': self.dropout_rate
        }
        base_config = super(HierarchicalTransformerEncoderModel, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    @classmethod
    def from_config(cls, config):
        return cls(**config)


@tf.function
def training_step(model, optimizer, x, y):
    with tf.GradientTape() as tape:
        correction_output = model(x)
        loss1 = correction_loss(y, correction_output)
        # loss2 = detection_loss(y[1], detection_output)
        total_loss = loss1

    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return total_loss


def correction_loss(true_outputs, pred_outputs):
    softmax_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)(true_outputs, pred_outputs)
    return softmax_loss


def detection_loss(true_detection_infos, pred_detection_infos):
    sigmoid_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)(true_detection_infos, pred_detection_infos)
    return sigmoid_loss

# Dataset control

In [12]:
class Dataset:
    def __init__(self, sentences, tokenizer, config):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.config = config

    def build_dataset(self):
        (word_level_sequences, character_level_sequences) = self.tokenizer.texts_to_sequences(self.sentences)

        target_word_level_sequences = pad_sequences(word_level_sequences.copy(), maxlen=self.config.MAX_SENTENCE_LENGTH,
                                                    padding='post', truncating='post')

        tmp_word_level_sequences = []
        for sequence in word_level_sequences:
            sequence = self.random_word(sequence)
            tmp_word_level_sequences.append(sequence)

        # Get word_level sequences
        word_level_sequences = tmp_word_level_sequences
        word_level_sequences = pad_sequences(word_level_sequences, maxlen=self.config.MAX_SENTENCE_LENGTH,
                                             padding='post', truncating='post')

        # Get character_level mask matrix.
        tmp_character_level_sequences = []
        for character_level_sequence in character_level_sequences:
            sentence_tmp = []
            # With each sentence
            for word in character_level_sequence:
                # With each word
                character_level_sequence = self.random_word(word, word_level=False)
                sentence_tmp.append(character_level_sequence)

            while len(sentence_tmp) < self.config.MAX_SENTENCE_LENGTH:
                sentence_tmp.append([])
            while len(sentence_tmp) > self.config.MAX_SENTENCE_LENGTH:
                del sentence_tmp[-1]
            tmp_character_level_sequences.append(
                pad_sequences(sentence_tmp, maxlen=self.config.MAX_WORD_LENGTH,
                              padding='post', truncating='post'))

        character_level_sequences = tf.convert_to_tensor(tmp_character_level_sequences)

        return (word_level_sequences, character_level_sequences), target_word_level_sequences

    def random_word(self, sequence, word_level=True):
        output = []

        # 15% of the tokens would be replaced
        for token in sequence:
            prob = random.random()

            if prob < 0.15:
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    output.append(0)

                # 10% chance change token to random token
                elif prob < 0.9:
                    if word_level:
                        random_token = random.randrange(len(self.tokenizer.word_index) + 1)
                        if random_token > self.tokenizer.word_vocab_size:
                            random_token = 0
                    else:
                        random_token = random.randrange(len(self.tokenizer.character_index) + 1)

                    output.append(random_token)

                # 10% chance change token to current token
                else:
                    output.append(token)

            else:
                output.append(token)

        return output

    def build_test_data(self, sentences):
        word_level_sequences, character_level_sequences = self.tokenizer.texts_to_sequences(sentences)

        word_level_sequences = pad_sequences(word_level_sequences, maxlen=self.config.MAX_SENTENCE_LENGTH,
                                             padding='post', truncating='post')

        tmp_character_level_sequences = []
        for character_level_sequence in character_level_sequences:
            while len(character_level_sequence) < self.config.MAX_SENTENCE_LENGTH:
                character_level_sequence.append([])
            while len(character_level_sequence) > self.config.MAX_SENTENCE_LENGTH:
                del character_level_sequence[-1]
            tmp_character_level_sequences.append(
                pad_sequences(character_level_sequence, maxlen=self.config.MAX_WORD_LENGTH,
                              padding='post', truncating='post'))

        character_level_sequences = tf.convert_to_tensor(tmp_character_level_sequences)
        return word_level_sequences, character_level_sequences


# Custom tokenizer.

In [13]:
class CustomTokenizer:

    def __init__(self, word_vocab_size=0, max_word_len=0, max_sentence_len=0):
        self.index_word = {1: '[UNK]'}
        self.word_count = {'[UNK]': 1}
        self.word_index = {'[UNK]': 1}
        self.word_vocab_size = word_vocab_size

        self.character_index = {'[UNK]': 1}
        self.index_character = {1: '[UNK]'}

        self.signs = [',', '.', '!', '\"', "\'", "?", ";", ")", "(", ':', "/", "+", "-", "=", "`", "~", "*", "^", "@",
                      "%", "&", "_"]

        self.max_word_len = max_word_len
        self.max_sentence_len = max_sentence_len

    def preprocess_texts(self, texts):
        texts_preprocessed = []
        for text in texts:
            for sign in self.signs:
                text = text.replace(sign, ' ')

            text = text.lower()
            texts_preprocessed.append(text)
        return texts_preprocessed

    def sort(self):
        self.word_count = sorted(list(self.word_count.items())[1:], key=lambda x: x[1], reverse=True)
        for i, (word, count) in enumerate(self.word_count):
            self.index_word[i + 2] = word
            self.word_index[word] = i + 2

        self.word_count = dict(self.word_count)

    def texts_to_sequences(self, texts):
        word_level_sequences = []
        character_level_sequences = []

        texts = self.preprocess_texts(texts)
        for text in texts:
            words = text.split(' ')
            words = [i for i in words if i != '']

            tokens = [self.word_index.get(word, 1)
                      if (self.word_index.get(word, 1) <= self.word_vocab_size) else 1
                      for word in words]
            word_level_sequences.append(tokens)

            sentence_tokens = []
            for word in words:
                tokens = [self.character_index.get(i, 1) for i in word]
                sentence_tokens.append(tokens)
            character_level_sequences.append(sentence_tokens)

        return word_level_sequences, character_level_sequences

    def sequences_to_texts(self, sequences):
        texts = []
        for sequence in sequences:
            texts.append([self.index_word.get(token, 1) for token in sequence])
        return texts

    def fit_on_texts(self, texts):

        texts = self.preprocess_texts(texts)

        for text in texts:

            # Update word vocab.
            words = text.split(' ')
            for word in words:
                if word == '': continue
                self.word_count[word] = self.word_count.get(word, 0) + 1

            # Update character vocab
            for character in text:
                if character == ' ': continue
                if self.character_index.get(character, -1) == -1:
                    self.character_index[character] = len(self.character_index) + 1

                self.index_character[self.character_index[character]] = character

        self.sort()
        self.save_tokenizer()

    def save_tokenizer(self):
        write_dict_data_to_file("model/tokenizer/word_index.json", self.word_index)
        write_dict_data_to_file("model/tokenizer/index_word.json", self.index_word)
        write_dict_data_to_file("model/tokenizer/index_character.json", self.index_character)
        write_dict_data_to_file("model/tokenizer/character_index.json", self.character_index)
        write_dict_data_to_file("model/tokenizer/config.json", {
            'word_vocab_size': self.word_vocab_size,
            'max_word_len': self.max_word_len,
            'max_sentence_len': self.max_sentence_len,
            'signs': self.signs
        }, indent=4)

    @staticmethod
    def load_tokenizer():
        instance = CustomTokenizer()

        config = json.load(open('model/tokenizer/config.json'))
        instance.word_vocab_size = config['word_vocab_size']
        instance.max_word_len = config['max_word_len']
        instance.max_sentence_len = config['max_sentence_len']
        instance.signs = config['signs']

        instance.word_index = json.load(open('model/tokenizer/word_index.json'))
        instance.index_word = json.load(open('model/tokenizer/index_word.json'))
        instance.character_index = json.load(open('model/tokenizer/character_index.json'))
        instance.index_character = json.load(open('model/tokenizer/index_character.json'))

        return instance

# Custom Schedule

In [14]:
@keras.utils.register_keras_serializable()
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        return {
            'd_model': float(self.d_model),
            'warmup_steps': self.warmup_steps
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)

# Init config and build dataset.

In [15]:
config = Config().save_config()

# Read data.
data = pd.read_csv('data/vi_processed.csv')
input_sentences = []
for index, row in data.iterrows():
    input_sentences.append(row.correct_text)

# Preprocessing and build dataset.

In [16]:
word_level_tokenizer = CustomTokenizer(word_vocab_size=config.VOCAB_SIZE, max_word_len=config.MAX_WORD_LENGTH,
                                       max_sentence_len=config.MAX_SENTENCE_LENGTH)
word_level_tokenizer.fit_on_texts(input_sentences)

input_sentences = input_sentences[:config.NUM_OF_INPUTS]
dataset = Dataset(input_sentences, word_level_tokenizer, config)
data = dataset.build_dataset()

(input_word_level_sequences, input_character_level_sequences), target_sequences = data

# Build and train model.

In [17]:
model = HierarchicalTransformerEncoderModel(num_character_level_layers=config.NUM_CHARACTER_LEVEL_LAYERS,
                                            num_word_level_layers=config.NUM_WORD_LEVEL_LAYERS,
                                            character_level_d_model=config.CHARACTER_LEVEL_D_MODEL,
                                            word_level_d_model=config.WORD_LEVEL_D_MODEL,
                                            num_heads=config.NUM_HEADS, dff=config.DFF,
                                            max_word_length=config.MAX_WORD_LENGTH,
                                            max_sentence_length=config.MAX_SENTENCE_LENGTH,
                                            vocab_size=config.VOCAB_SIZE,
                                            character_vocab_size=config.CHARACTER_VOCAB_SIZE)

learning_rate = CustomSchedule(config.WORD_LEVEL_D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['acc'])

  super().__init__(**kwargs)


In [18]:
train_size = int(config.NUM_OF_INPUTS * 0.8)
model.fit(
    [input_word_level_sequences[:train_size], input_character_level_sequences[:train_size]],
    target_sequences[:train_size], epochs=config.EPOCHS,
    batch_size=config.BATCH_SIZE,
    validation_data=([input_word_level_sequences[train_size:], input_character_level_sequences[train_size:]],
                     target_sequences[train_size:]))

model.save('model/model.keras')

Epoch 1/7
Instructions for updating:
Use fn_output_signature instead




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4s/step - acc: 0.0000e+00 - loss: 9.1180 - val_acc: 0.0000e+00 - val_loss: 9.0821
Epoch 2/7
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - acc: 0.0000e+00 - loss: 9.1100 - val_acc: 0.0000e+00 - val_loss: 9.0697
Epoch 3/7
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - acc: 0.0000e+00 - loss: 9.0964 - val_acc: 0.0000e+00 - val_loss: 9.0475
Epoch 4/7
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3s/step - acc: 0.0000e+00 - loss: 9.0792 - val_acc: 0.0000e+00 - val_loss: 9.0156
Epoch 5/7
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - acc: 0.0000e+00 - loss: 9.0447 - val_acc: 0.0000e+00 - val_loss: 8.9738
Epoch 6/7
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - acc: 0.0000e+00 - loss: 9.0082 - val_acc: 0.0000e+00 - val_loss: 8.9223
Epoch 7/7
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - acc

In [19]:
test_outputs = model.predict(
    [input_word_level_sequences[train_size:], input_character_level_sequences[train_size:]])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


In [20]:
def print_outputs(outputs):
    for sentence in outputs:
        out = ''
        for word in sentence:
            max_index = tf.argmax(word, axis=0).numpy()
            word_str = word_level_tokenizer.index_word.get(max_index)
            if word_str is not None:
                out += word_str + ' '
            else:
                out += '<UNK> '
        print(out)


print_outputs(test_outputs[:10])

alô mòn thấm thoát xua roi nhúng thấm thấm thấm 1989 hai 1m lôi lẽo linus hênh 146 khắp khắp đao đao đao đao đao dậy rehhagel đao đao đao 146 146 146 đao đao đao 146 146 đao đao đao đạt đạt tuân tuân tuân tuân tuân tuân rehhagel rehhagel rehhagel đạt đạt đạt khẳng khẳng thắm thắm thắm javier javier javier javier 
g 146 channel khiêm thấm đao torino javier thấm thinh lẽo hàn blood quẩn đao fi vé tobacco hênh tobacco nỗ đao caneira năng 146 caneira đao caneira đao đao dương mượn ăng đao spears dương aac đao coca đao đao đao dyer robot boot đàm linus đao reader đao ms chợ đao thấm cellulite chứ news kaka 190 javier sứ thắm javier đao 
blood đao thấm miết thấm kênh tuân thấm đao jeonbuk đàm jeonbuk quy javier coca javier 146 146 khắp khắp đao đao đao đao đao javier javier đao đao đao 146 146 146 đao đao đao 146 146 đao đao đạt đạt đạt tuân tuân tuân tuân tuân tuân rehhagel rehhagel rehhagel đạt đạt đạt khẳng khẳng thắm thắm thắm javier javier javier javier 
mòn nhỡ đàm dominik sít beige za

# Test loaded model.

In [21]:
tokenizer = CustomTokenizer.load_tokenizer()
loaded_model = tf.keras.models.load_model('model/model.keras')
config = Config.load_config()
dataset = Dataset(None, tokenizer, config)

In [22]:
def print_outputs(outputs):
    for sentence in outputs:
        out = ''
        for word in sentence:
            max_index = tf.argmax(word, axis=0).numpy()
            word_str = tokenizer.index_word.get(str(max_index))
            if word_str is not None:
                out += word_str + ' '
            else:
                out += '<UNK> '
        print(out)

In [23]:
test_sentences = [
    'Nếu làm được như vậy thì chắc chắn xẽ không còn trường nào tùy tiện thu tiền cao, gây sự lo lắng củb phụ huynh và ai không có tiền thì hhông cần đóng.']
test_inputs = dataset.build_test_data(test_sentences)
test_outputs = loaded_model.predict(test_inputs)
print_outputs(test_outputs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
tô lẫn vé chỉn đao thấm thấm thấm birmingham đao linus giàng linus đao caneira chài smart nkt coca hai nỗ [UNK] năng uae chỉn đao đao đao linus hành thấm nhđá đao đao đao đao 146 146 146 đao đao đạt đạt tuân tuân tuân tuân tuân tuân rehhagel rehhagel rehhagel đạt đạt đạt khẳng khẳng thắm thắm thắm javier javier javier javier 
