In [1]:
import json
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

from Config import Config

# Read data.
json_sentences = []

with open('data/VSEC.jsonl', 'r') as file:
    for line in file:
        json_obj = json.loads(line[:-1])
        json_sentences.append(json_obj)

error_texts = []
correct_texts = []
correct_infos = []

for json_sentence in json_sentences:
    error_text = []
    correct_text = []
    correct_info = []
    for word in json_sentence['annotations']:
        correct_info.append(word['is_correct'])

        current_word = word['current_syllable'].lower()
        error_text.append(current_word)

        if word['is_correct'] is True:
            correct_text.append(current_word)
        else:
            correct_text.append(word['alternative_syllables'][0].lower())
            # if len(word['alternative_syllables']) > 1: print(word)
    error_texts.append(error_text)
    correct_texts.append(correct_text)
    correct_infos.append(correct_info)
# Main
config = Config()

word_level_tokenizer = Tokenizer(num_words=config.VOCAB_SIZE, oov_token='<UNK>', lower=True)
word_unk_level_tokenizer = Tokenizer(oov_token='<UNK>', lower=True)
character_level_tokenizer = Tokenizer(lower=True, char_level=True)

word_level_tokenizer.fit_on_texts(correct_texts)
word_unk_level_tokenizer.fit_on_texts(error_texts)

import itertools

flattened_sentences = list(itertools.chain(*(error_texts + correct_texts)))
character_level_tokenizer.fit_on_texts(flattened_sentences)

2024-04-06 21:46:08.048966: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-06 21:46:08.050581: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-06 21:46:08.084981: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-06 21:46:08.085704: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

input_sequences = word_level_tokenizer.texts_to_sequences(error_texts)
output_sequences = word_level_tokenizer.texts_to_sequences(correct_texts)
unk_input_sequences = word_unk_level_tokenizer.texts_to_sequences(error_texts)

# Get character-level words lengths.
input_words_lengths = []

# Get character-level sequences.
character_level_input_sequences = []

for sequence in unk_input_sequences:
    character_level_input_sequence = []
    words_lengths = []
    for word_token in sequence:
        word = word_unk_level_tokenizer.index_word[word_token]
        word = character_level_tokenizer.texts_to_sequences(word)
        word_chars = [each[0] for each in word]
        character_level_input_sequence.append(word_chars)
        words_lengths.append((len(word_chars) if len(word_chars) <= config.MAX_WORD_LENGTH
                              else config.MAX_WORD_LENGTH))

    # Add padding for each word.
    character_level_input_sequence = pad_sequences(character_level_input_sequence, maxlen=config.MAX_WORD_LENGTH,
                                                   padding='post', truncating='post')

    character_level_input_sequences.append(character_level_input_sequence)

    input_words_lengths.append(words_lengths)

# Get word-level sentences lengths.
input_sentences_lengths = []
for sequence in input_sequences: input_sentences_lengths.append(
    (len(sequence) if len(sequence) <= config.MAX_SENTENCE_LENGTH
     else config.MAX_SENTENCE_LENGTH))

# Add padding for each.
input_sequences = pad_sequences(input_sequences, maxlen=config.MAX_SENTENCE_LENGTH, padding='post', truncating='post')
output_sequences = pad_sequences(output_sequences, maxlen=config.MAX_SENTENCE_LENGTH, padding='post', truncating='post')
character_level_input_sequences = pad_sequences(character_level_input_sequences, maxlen=config.MAX_SENTENCE_LENGTH,
                                                padding='post', truncating='post')
input_words_lengths = pad_sequences(input_words_lengths, maxlen=config.MAX_SENTENCE_LENGTH, padding='post',
                                    truncating='post')
correct_infos = pad_sequences(correct_infos, maxlen=config.MAX_SENTENCE_LENGTH, padding='post', value=1)

input_sequences_np = np.array(input_sequences)
character_level_input_sequences_np = np.array(character_level_input_sequences)
output_sequences_np = np.array(output_sequences)

input_words_lengths_np = np.array(input_words_lengths)
input_sentences_lengths_np = np.array(input_sentences_lengths)
correct_infos_np = np.array(correct_infos)

In [3]:
correct_infos_np[1]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [4]:
output_sequences_np[1]

array([ 137,  313,   72,   76,   69,  453,  164,   29,  176,  171,   65,
         15,   67,  383, 2867,  513,  435,    2,  351,   61,  307,  535,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [5]:
from HierarchicalTransformerEncoder import HierarchicalTransformerEncoder

model = HierarchicalTransformerEncoder(num_character_level_layers=config.NUM_CHARACTER_LEVEL_LAYERS,
                                       num_word_level_layers=config.NUM_WORD_LEVEL_LAYERS,
                                       character_level_d_model=config.CHARACTER_LEVEL_D_MODEL,
                                       word_level_d_model=config.WORD_LEVEL_D_MODEL,
                                       num_heads=config.NUM_HEADS, dff=config.DFF,
                                       max_word_length=config.MAX_WORD_LENGTH,
                                       max_sentence_length=config.MAX_SENTENCE_LENGTH,
                                       vocab_size=config.VOCAB_SIZE,
                                       character_vocab_size=config.CHARACTER_VOCAB_SIZE)

In [6]:
from CustomSchedule import CustomSchedule

learning_rate = CustomSchedule(config.WORD_LEVEL_D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [7]:
from HierarchicalTransformerEncoder import custom_loss
# word_input_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH)
# char_input_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH, config.MAX_WORD_LENGTH)
# sentence_lengths_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH)
# word_lengths_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH, config.MAX_WORD_LENGTH)

# model.build(input_shape=[[word_input_shape, sentence_lengths_shape], [char_input_shape, word_lengths_shape]])
model.compile(optimizer=optimizer, loss=custom_loss, metrics=['acc'])

In [8]:
correct_infos_np[102]

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [9]:
model.fit(
    [[input_sequences_np, input_sentences_lengths_np], [character_level_input_sequences_np, input_words_lengths_np]],
    [output_sequences_np, correct_infos_np], epochs=config.EPOCHS,
    batch_size=config.BATCH_SIZE)

Epoch 1/10
Shape cá»§a word_embedding_outputs: (None, 40, 128)
Instructions for updating:
Use fn_output_signature instead
y_pred shape: (40, 10000)
y_true shape: (40,)
pred_detect_shape  Tensor("custom_loss/strided_slice_3:0", shape=(10000,), dtype=float32)
true_detect_shape  Tensor("custom_loss/strided_slice_1:0", shape=(), dtype=int32)
y_pred shape: (40,)
y_true shape: (40,)
pred_detect_shape  Tensor("custom_loss_1/strided_slice_3:0", shape=(), dtype=float32)
true_detect_shape  Tensor("custom_loss_1/strided_slice_1:0", shape=(), dtype=int32)


StagingError: in user code:

    File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/home/thanhan/Documents/projects/python/SpellingCorrection_HierarchicalTransformerEncoder/HierarchicalTransformerEncoder.py", line 91, in custom_loss  *
        softmax_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)(true_outputs, pred_outputs)
    File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/losses.py", line 142, in __call__  **
        losses = call_fn(y_true, y_pred)
    File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/losses.py", line 2354, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "/home/thanhan/.local/lib/python3.10/site-packages/keras/src/backend.py", line 5728, in sparse_categorical_crossentropy
        axis %= output_rank

    ZeroDivisionError: integer division or modulo by zero


In [None]:
test_output = model.predict([[input_sequences_np[:100], input_sentences_lengths_np[:100]],
                             [character_level_input_sequences_np[:100], input_words_lengths_np[:100]]])

for sentence in test_output[:3]:
    out = ''
    for word in sentence:
        index = tf.argmax(word, axis=0).numpy()
        print(index)
        word_str = word_level_tokenizer.index_word.get(index)
        if word_str is not None:
            out += word_str + ' '
        else:
            out += '<UNK> '
    print(out)