In [1]:
import json
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

from Config import Config

# Load the dataset: the file holds one JSON object per line (JSON Lines).
json_sentences = []

# The encoding is given explicitly so reading does not depend on the platform's
# default codec (the corpus contains non-ASCII text).
with open('data/VSEC.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip surrounding whitespace instead of blindly dropping the last
        # character: a final line without a trailing newline would otherwise
        # lose its closing brace and fail to parse.
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. an empty line at the end of the file).
            continue
        json_sentences.append(json.loads(line))

# Three parallel per-sentence lists built from the annotations:
#   error_texts   - the lower-cased syllables as they appear in the data,
#   correct_texts - the same syllables with each incorrect one replaced by its
#                   first alternative (lower-cased),
#   correct_infos - the 'is_correct' flag of every syllable.
error_texts, correct_texts, correct_infos = [], [], []

for record in json_sentences:
    annotations = record['annotations']

    flags = [item['is_correct'] for item in annotations]
    observed = [item['current_syllable'].lower() for item in annotations]

    # Keep the observed syllable when it is flagged correct, otherwise take the
    # first suggested alternative.
    corrected = [
        seen if item['is_correct'] is True else item['alternative_syllables'][0].lower()
        for item, seen in zip(annotations, observed)
    ]

    error_texts.append(observed)
    correct_texts.append(corrected)
    correct_infos.append(flags)
# Main
import itertools

config = Config()

# Word-level vocabulary, limited via num_words=config.VOCAB_SIZE and fitted on
# the corrected text.
word_level_tokenizer = Tokenizer(num_words=config.VOCAB_SIZE, oov_token='<UNK>', lower=True)
word_level_tokenizer.fit_on_texts(correct_texts)

# Unlimited word-level vocabulary fitted on the erroneous text; later cells use
# it to map token ids back to the word strings.
word_unk_level_tokenizer = Tokenizer(oov_token='<UNK>', lower=True)
word_unk_level_tokenizer.fit_on_texts(error_texts)

# Character-level vocabulary fitted on every word of both the erroneous and the
# corrected sentences (flattened into one list of word strings).
flattened_sentences = list(itertools.chain.from_iterable(error_texts + correct_texts))
character_level_tokenizer = Tokenizer(lower=True, char_level=True)
character_level_tokenizer.fit_on_texts(flattened_sentences)

2024-04-17 15:30:46.645818: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-17 15:30:46.754482: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Number of sentences parsed from the dataset (correct_texts holds one token list per sentence).
len(correct_texts)

9341

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level ids: the model input comes from the erroneous text and the target
# from the corrected text, both encoded with the same tokenizer.
input_sequences = word_level_tokenizer.texts_to_sequences(error_texts)
output_sequences = word_level_tokenizer.texts_to_sequences(correct_texts)

# Ids from the unlimited vocabulary, used only to recover each word's string.
unk_input_sequences = word_unk_level_tokenizer.texts_to_sequences(error_texts)

# Per sentence: a padded matrix of character ids (one row per word) ...
character_level_input_sequences = []
# ... and the character count of every word, capped at config.MAX_WORD_LENGTH.
input_words_lengths = []

for token_ids in unk_input_sequences:
    sentence_chars = []
    for token_id in token_ids:
        surface = word_unk_level_tokenizer.index_word[token_id]
        # Passing a bare string makes the tokenizer treat every character as its
        # own text, so the result is one single-id list per character.
        per_character = character_level_tokenizer.texts_to_sequences(surface)
        sentence_chars.append([ids[0] for ids in per_character])

    input_words_lengths.append([min(len(chars), config.MAX_WORD_LENGTH) for chars in sentence_chars])

    # Pad or cut every word to exactly config.MAX_WORD_LENGTH characters.
    character_level_input_sequences.append(
        pad_sequences(sentence_chars, maxlen=config.MAX_WORD_LENGTH, padding='post', truncating='post'))

# Word count of every (still unpadded) input sentence, capped at
# config.MAX_SENTENCE_LENGTH.
input_sentences_lengths = [min(len(token_ids), config.MAX_SENTENCE_LENGTH) for token_ids in input_sequences]

# Pad (at the end) or cut every per-sentence array to config.MAX_SENTENCE_LENGTH.
# All arrays must be cut on the same side so that position k refers to the same
# word in each of them.
input_sequences = pad_sequences(input_sequences, maxlen=config.MAX_SENTENCE_LENGTH, padding='post', truncating='post')
output_sequences = pad_sequences(output_sequences, maxlen=config.MAX_SENTENCE_LENGTH, padding='post', truncating='post')
character_level_input_sequences = pad_sequences(character_level_input_sequences, maxlen=config.MAX_SENTENCE_LENGTH,
                                                padding='post', truncating='post')
input_words_lengths = pad_sequences(input_words_lengths, maxlen=config.MAX_SENTENCE_LENGTH, padding='post',
                                    truncating='post')
# truncating='post' is required here as well: pad_sequences truncates from the
# front by default, which would keep the flags of the LAST words of an over-long
# sentence while the inputs above keep the FIRST words. Padding positions are
# filled with 1, the same value a True 'is_correct' flag is stored as.
correct_infos = pad_sequences(correct_infos, maxlen=config.MAX_SENTENCE_LENGTH, padding='post',
                              truncating='post', value=1)

# Materialise the model inputs/targets as NumPy arrays (np.array makes a copy).
input_sequences_np, character_level_input_sequences_np, output_sequences_np = (
    np.array(values) for values in (input_sequences, character_level_input_sequences, output_sequences)
)

# Auxiliary arrays: per-word character counts, per-sentence word counts and the
# per-word correctness flags.
input_words_lengths_np, input_sentences_lengths_np, correct_infos_np = (
    np.array(values) for values in (input_words_lengths, input_sentences_lengths, correct_infos)
)

In [4]:
from HierarchicalTransformerEncoder import HierarchicalTransformerEncoder

# Collect the encoder hyperparameters from the shared config in one mapping and
# build the model from it.
encoder_settings = dict(
    num_character_level_layers=config.NUM_CHARACTER_LEVEL_LAYERS,
    num_word_level_layers=config.NUM_WORD_LEVEL_LAYERS,
    character_level_d_model=config.CHARACTER_LEVEL_D_MODEL,
    word_level_d_model=config.WORD_LEVEL_D_MODEL,
    num_heads=config.NUM_HEADS,
    dff=config.DFF,
    max_word_length=config.MAX_WORD_LENGTH,
    max_sentence_length=config.MAX_SENTENCE_LENGTH,
    vocab_size=config.VOCAB_SIZE,
    character_vocab_size=config.CHARACTER_VOCAB_SIZE,
)
model = HierarchicalTransformerEncoder(**encoder_settings)

  super().__init__(**kwargs)


In [5]:
from CustomSchedule import CustomSchedule

# Learning-rate schedule parameterised by the word-level model width.
learning_rate = CustomSchedule(config.WORD_LEVEL_D_MODEL)

# Adam driven by the schedule above, with non-default beta_2 and epsilon.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [13]:
# word_input_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH)
# char_input_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH, config.MAX_WORD_LENGTH)
# sentence_lengths_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH)
# word_lengths_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH, config.MAX_WORD_LENGTH)

# model.build(input_shape=[[word_input_shape, sentence_lengths_shape], [char_input_shape, word_lengths_shape]])
# model.compile(optimizer=optimizer, metrics=['acc'])

In [6]:
from HierarchicalTransformerEncoder import training_step
from tqdm import tqdm

# Manual training loop: every epoch walks the training arrays in order, in
# consecutive windows of config.BATCH_SIZE sentences (the last window may be shorter).
for e in range(config.EPOCHS):
    pbar = tqdm(range(0, len(input_sequences_np), config.BATCH_SIZE))

    for start in pbar:
        window = slice(start, start + config.BATCH_SIZE)

        # Inputs are grouped as [word-level pair, character-level pair]; each
        # pair is [ids, lengths].
        word_level_inputs = [input_sequences_np[window], input_sentences_lengths_np[window]]
        character_level_inputs = [character_level_input_sequences_np[window], input_words_lengths_np[window]]
        input_batch = [word_level_inputs, character_level_inputs]

        # Targets: corrected token ids and the per-word correctness flags.
        output_batch = [output_sequences_np[window], correct_infos_np[window]]

        total_loss = training_step(model, optimizer, input_batch, output_batch)

        # Show the epoch counter and the loss of the batch just processed.
        pbar.set_description(f'Epoch {e + 1}/{config.EPOCHS}')
        pbar.set_postfix_str(f'loss: {total_loss:.4f}')
        pbar.refresh()


  0%|          | 0/47 [00:00<?, ?it/s]

Instructions for updating:
Use fn_output_signature instead


Epoch 1/10: 100%|██████████| 47/47 [03:35<00:00,  4.58s/it, loss: 9.3254] 
Epoch 2/10: 100%|██████████| 47/47 [02:52<00:00,  3.66s/it, loss: 8.6950]
Epoch 3/10: 100%|██████████| 47/47 [02:55<00:00,  3.74s/it, loss: 7.9965]
Epoch 4/10: 100%|██████████| 47/47 [03:09<00:00,  4.03s/it, loss: 7.3039]
Epoch 5/10: 100%|██████████| 47/47 [03:04<00:00,  3.93s/it, loss: 6.5428]
Epoch 6/10: 100%|██████████| 47/47 [03:05<00:00,  3.95s/it, loss: 5.7288]
Epoch 7/10: 100%|██████████| 47/47 [03:03<00:00,  3.90s/it, loss: 4.8023]
Epoch 8/10: 100%|██████████| 47/47 [03:07<00:00,  3.98s/it, loss: 3.8613]
Epoch 9/10: 100%|██████████| 47/47 [02:55<00:00,  3.74s/it, loss: 3.1106]
Epoch 10/10: 100%|██████████| 47/47 [03:03<00:00,  3.90s/it, loss: 2.4582]


In [None]:
# model.fit(
#     [[input_sequences_np, input_sentences_lengths_np], [character_level_input_sequences_np, input_words_lengths_np]],
#     [output_sequences_np, correct_infos_np], epochs=config.EPOCHS,
#     batch_size=config.BATCH_SIZE)

In [None]:
# Scratch sentence, tokenised at word level. Note that the prediction below is
# run on training data, not on this sentence.
test_sentence_inputs = ['hom nay troi dep qua']
test_word_level_sequences = word_level_tokenizer.texts_to_sequences(test_sentence_inputs)
test_sentence_lengths = [min(len(sentence), config.MAX_SENTENCE_LENGTH) for sentence in test_word_level_sequences]

# Predict on the first 100 training sentences.
training_word_inputs = [input_sequences_np[:100], input_sentences_lengths_np[:100]]
training_character_inputs = [character_level_input_sequences_np[:100], input_words_lengths_np[:100]]
test_output = model.predict([training_word_inputs, training_character_inputs])

# Decode the sentence at index 10 of the first model output: print the argmax id
# of every position, then the ids mapped back to words.
for sentence in test_output[0][10:11]:
    out = ''
    for word in sentence:
        index = tf.argmax(word, axis=0).numpy()
        print(index)
        word_str = word_level_tokenizer.index_word.get(index)
        # Ids without an entry in the vocabulary are rendered as '<UNK>'.
        out += '<UNK> ' if word_str is None else word_str + ' '
    print(out)

# Argmax id at position 5 of that same sentence.
print(tf.argmax(test_output[0][10][5], axis=0).numpy())


In [11]:
# Raw test sentence(s) to run through the trained model.
test_sentence_inputs = ['việc điều khiển xe sẽ trở nên rễ dàng hơn nếu các bánh xe được đặt theo một góc chính xác theo yêu cầu thiết kế, việc đặt saii góc sẽ tạo ra tính ổn định khi lái kém và khiến lốp xe nhanh mòn.']

# Word-level ids from the training vocabulary.
test_token_ids = word_level_tokenizer.texts_to_sequences(test_sentence_inputs)

# Word count per sentence (capped at config.MAX_SENTENCE_LENGTH), taken before padding.
test_sentence_lengths = [min(len(token_ids), config.MAX_SENTENCE_LENGTH) for token_ids in test_token_ids]

# Pad or cut every sentence to config.MAX_SENTENCE_LENGTH ids.
test_word_level_sequences = pad_sequences(
    test_token_ids,
    maxlen=config.MAX_SENTENCE_LENGTH,
    padding='post',
    truncating='post',
)

# Build the character-level inputs for the test sentences WITHOUT refitting any
# tokenizer. Calling fit_on_texts at inference time adds the test text to the
# vocabulary counts and rebuilds the index tables, so character/word ids could
# stop matching the ids used to prepare the training data (and would drift
# further on every re-run of this cell).
from tensorflow.keras.preprocessing.text import text_to_word_sequence

test_character_level_input_sequences = []
test_words_lengths = []
for sentence_text in test_sentence_inputs:
    # Split the raw string with the word-level tokenizer's own settings, so that
    # word k here corresponds to id k produced by
    # word_level_tokenizer.texts_to_sequences for the same sentence.
    sentence_words = text_to_word_sequence(sentence_text,
                                           filters=word_level_tokenizer.filters,
                                           lower=word_level_tokenizer.lower,
                                           split=word_level_tokenizer.split)

    character_level_input_sequence = []
    words_lengths = []
    for word in sentence_words:
        # One text in, one id list out. The character tokenizer has no OOV
        # token, so characters it has never seen are skipped rather than raising.
        word_chars = character_level_tokenizer.texts_to_sequences([word])[0]
        character_level_input_sequence.append(word_chars)
        words_lengths.append(min(len(word_chars), config.MAX_WORD_LENGTH))

    # Pad or cut every word to exactly config.MAX_WORD_LENGTH characters.
    character_level_input_sequence = pad_sequences(character_level_input_sequence, maxlen=config.MAX_WORD_LENGTH,
                                                   padding='post', truncating='post')

    test_character_level_input_sequences.append(character_level_input_sequence)

    test_words_lengths.append(words_lengths)

# Pad or cut every sentence to exactly config.MAX_SENTENCE_LENGTH words.
test_character_level_input_sequences = pad_sequences(test_character_level_input_sequences,
                                                     maxlen=config.MAX_SENTENCE_LENGTH,
                                                     padding='post', truncating='post')
test_words_lengths = pad_sequences(test_words_lengths, maxlen=config.MAX_SENTENCE_LENGTH, padding='post',
                                   truncating='post')

In [12]:
# Convert every prepared test input to a NumPy array, rebinding the same names.
(test_character_level_input_sequences,
 test_words_lengths,
 test_sentence_lengths,
 test_word_level_sequences) = (
    np.array(values)
    for values in (test_character_level_input_sequences,
                   test_words_lengths,
                   test_sentence_lengths,
                   test_word_level_sequences)
)

In [13]:
# Run the model on the test sentence(s), using the same nested input layout as
# the training batches: [[word ids, sentence lengths], [character ids, word lengths]].
test_word_inputs = [test_word_level_sequences, test_sentence_lengths]
test_character_inputs = [test_character_level_input_sequences, test_words_lengths]
test_outputs = model.predict([test_word_inputs, test_character_inputs])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


In [14]:
# Decode the first model output: take the argmax id at every position and map it
# back to a word, printing one line per sentence.
for sentence in test_outputs[0]:
    pieces = []
    for word in sentence:
        index = tf.argmax(word, axis=0).numpy()
        word_str = word_level_tokenizer.index_word.get(index)
        # Ids without an entry in the vocabulary are rendered as '<UNK>'.
        pieces.append('<UNK> ' if word_str is None else word_str + ' ')
    out = ''.join(pieces)
    print(out)

việc điều phân về sẽ trở nên đến chính hơn nếu các từ xe được đặt theo một góc chính xác theo yêu cầu thiết kế việc đặt <UNK> sự sẽ tạo ra tính ổn định khi lại về và không các về vào về <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> 


In [None]:
# Inspect the element at index 1 of the prediction result for the test sentence(s).
# NOTE(review): presumably the per-word correctness output, mirroring correct_infos
# in the training targets - confirm against HierarchicalTransformerEncoder.
test_outputs[1]

In [None]:
# Padded target token ids (from the corrected text) of the second training sentence.
output_sequences_np[1]

In [None]:
# Padded input token ids (from the erroneous text) of the second training sentence.
input_sequences_np[1]