In [None]:
from Config import Config

# Single settings object for the notebook; later cells read its attributes
# (e.g. config.VOCAB_SIZE, config.MAX_SENTENCE_LENGTH, config.EPOCHS).
config = Config()

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

data = pd.read_csv('data/vi_processed.csv')

# Only the first 100 rows are considered; both lists are then trimmed to the
# configured number of inputs.
# NOTE(review): because of the 100-row cap, a config.NUM_OF_INPUTS above 100
# has no effect — confirm this is intended.
head_rows = data.head(100)

correct_texts = head_rows['correct_text'].tolist()[:config.NUM_OF_INPUTS]
error_texts = head_rows['error_text'].tolist()[:config.NUM_OF_INPUTS]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Word-level tokenizer limited to config.VOCAB_SIZE entries; anything outside
# the limit is encoded as the '<UNK>' token.
word_level_tokenizer = Tokenizer(
    num_words=config.VOCAB_SIZE,
    oov_token='<UNK>',
    lower=True,
    split=' ',
)

# Word-level tokenizer without a vocabulary limit; it is used later to map
# token ids back to the original word strings.
word_unk_level_tokenizer = Tokenizer(
    oov_token='<UNK>',
    lower=True,
    split=' ',
)

# Character-level tokenizer limited to config.CHARACTER_VOCAB_SIZE entries.
# It has no OOV token, so characters outside the limit are not encoded.
character_level_tokenizer = Tokenizer(
    num_words=config.CHARACTER_VOCAB_SIZE,
    lower=True,
    char_level=True,
)

In [None]:
# The output vocabulary is learned from the corrected texts, while the
# uncapped word tokenizer and the character tokenizer are learned from the
# erroneous texts.
for tokenizer, corpus in (
    (word_level_tokenizer, correct_texts),
    (word_unk_level_tokenizer, error_texts),
    (character_level_tokenizer, error_texts),
):
    tokenizer.fit_on_texts(corpus)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level ids. Erroneous sentences are the model input and corrected
# sentences are the target; both are encoded with the tokenizer fitted on the
# corrected texts.
input_sequences = word_level_tokenizer.texts_to_sequences(error_texts)
output_sequences = word_level_tokenizer.texts_to_sequences(correct_texts)
# Ids from the uncapped tokenizer, used only to recover each input word's
# spelling for the character-level features below.
unk_input_sequences = word_unk_level_tokenizer.texts_to_sequences(error_texts)

# Per sentence: the character count of every word, clipped to MAX_WORD_LENGTH.
input_words_lengths = []

# Per sentence: a (num_words, MAX_WORD_LENGTH) array of character ids.
character_level_input_sequences = []

for sequence in unk_input_sequences:
    character_level_input_sequence = []
    words_lengths = []
    for word_token in sequence:
        word = word_unk_level_tokenizer.index_word[word_token]
        # Encode the word as ONE text so the result is a flat list of
        # character ids. Encoding each character as its own text and taking
        # element [0] raises IndexError whenever a character is dropped
        # (the character tokenizer has a num_words cap and no OOV token).
        word_chars = character_level_tokenizer.texts_to_sequences([word])[0]
        character_level_input_sequence.append(word_chars)
        words_lengths.append(min(len(word_chars), config.MAX_WORD_LENGTH))

    # Pad or truncate every word to MAX_WORD_LENGTH characters (at the tail).
    character_level_input_sequence = pad_sequences(character_level_input_sequence, maxlen=config.MAX_WORD_LENGTH,
                                                   padding='post', truncating='post')

    character_level_input_sequences.append(character_level_input_sequence)

    input_words_lengths.append(words_lengths)

# Number of word tokens in each input sentence, clipped to MAX_SENTENCE_LENGTH.
# This must run before input_sequences is padded, otherwise every length
# would equal the padded width.
input_sentences_lengths = [
    min(len(sequence), config.MAX_SENTENCE_LENGTH)
    for sequence in input_sequences
]

# Pad or truncate every per-sentence structure to MAX_SENTENCE_LENGTH entries,
# always at the tail.
sentence_padding = dict(maxlen=config.MAX_SENTENCE_LENGTH, padding='post', truncating='post')

input_sequences = pad_sequences(input_sequences, **sentence_padding)
output_sequences = pad_sequences(output_sequences, **sentence_padding)
# Each element is already a 2-D (words x characters) array, so padding here
# appends whole all-zero word rows.
character_level_input_sequences = pad_sequences(character_level_input_sequences, **sentence_padding)
input_words_lengths = pad_sequences(input_words_lengths, **sentence_padding)

# NumPy arrays handed to the model.
input_sequences_np = np.array(input_sequences)
output_sequences_np = np.array(output_sequences)
character_level_input_sequences_np = np.array(character_level_input_sequences)

input_words_lengths_np = np.array(input_words_lengths)
input_sentences_lengths_np = np.array(input_sentences_lengths)

In [None]:
# Quick check: clipped word count of the first input sentence.
input_sentences_lengths[0]

In [None]:
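# NOTE: disabled draft of a batch-preparation helper built on tf.ragged; it is
# kept commented out and is not executed.
# def prepare_batch(input, output):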
#     word_level_input = word_level_tokenizer.texts_to_sequences(input)
#     word_level_input = tf.ragged.constant(word_level_input)
#     word_level_input = word_level_input[:, :MAX_WORD_LEVEL_TOKENS]
#     word_level_input = word_level_input.to_tensor()
# 
#     unk_input_sequences = word_unk_level_tokenizer.texts_to_sequences(input)
#     character_level_input_sequences = []
# 
#     for sequence in unk_input_sequences:
#         character_level_input_sequence = []
#         for word_token in sequence:
#             word = word_unk_level_tokenizer.index_word[word_token]
#             word = character_level_tokenizer.texts_to_sequences(word)
#             word_chars = [each[0] for each in word]
#             character_level_input_sequence.append(word_chars)
#         character_level_input_sequence = tf.ragged.constant(character_level_input_sequence)
#         character_level_input_sequence = character_level_input_sequence[
#                                          :MAX_SENTENCE_LENGTH,
#                                          :MAX_WORD_LENGTH]
#         character_level_input_sequence = character_level_input_sequence.to_tensor()
#         character_level_input_sequences.append(character_level_input_sequence)
# 
#     print(character_level_input_sequences)
#     character_level_input_sequences = tf.ragged.constant(character_level_input_sequences)
# 
#     output = word_level_tokenizer.texts_to_sequences(output)
#     output = tf.ragged.constant(output)
#     output = output[:, :MAX_SENTENCE_LENGTH]
#     output = output.to_tensor()
# 
#     return (word_level_input, character_level_input_sequences), output

In [None]:
from HierarchicalTransformerEncoder import HierarchicalTransformerEncoder

# Build the encoder; every hyper-parameter comes from the shared config.
model = HierarchicalTransformerEncoder(
    # Depth of the two encoder stacks.
    num_character_level_layers=config.NUM_CHARACTER_LEVEL_LAYERS,
    num_word_level_layers=config.NUM_WORD_LEVEL_LAYERS,
    # Model widths and attention / feed-forward sizes.
    character_level_d_model=config.CHARACTER_LEVEL_D_MODEL,
    word_level_d_model=config.WORD_LEVEL_D_MODEL,
    num_heads=config.NUM_HEADS,
    dff=config.DFF,
    # Sequence limits; the same values are used when padding the inputs.
    max_word_length=config.MAX_WORD_LENGTH,
    max_sentence_length=config.MAX_SENTENCE_LENGTH,
    # Vocabulary sizes; the same values cap the two tokenizers.
    vocab_size=config.VOCAB_SIZE,
    character_vocab_size=config.CHARACTER_VOCAB_SIZE,
)

In [None]:
from CustomSchedule import CustomSchedule

# Learning-rate schedule parameterised by the word-level model width.
learning_rate = CustomSchedule(config.WORD_LEVEL_D_MODEL)

# Adam driven by the schedule above, with explicit beta / epsilon values.
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [None]:
from HierarchicalTransformerEncoder import custom_loss

# word_input_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH)
# char_input_shape = (config.BATCH_SIZE, config.MAX_SENTENCE_LENGTH, config.MAX_WORD_LENGTH)
# 
# model.build(input_shape=[word_input_shape, char_input_shape])

# NOTE(review): `custom_loss` is imported in this cell, but the built-in
# sparse categorical cross-entropy is what gets compiled — confirm which loss
# is intended.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)
# model.summary()

In [None]:
# Each input group pairs the padded ids with the matching true lengths.
word_level_inputs = [input_sequences_np, input_sentences_lengths_np]
character_level_inputs = [character_level_input_sequences_np, input_words_lengths_np]

model.fit(
    [word_level_inputs, character_level_inputs],
    output_sequences_np,
    epochs=config.EPOCHS,
    batch_size=config.BATCH_SIZE,
)

# Test model

In [None]:
# Run the trained model on the same arrays it was fitted on.
word_level_inputs = [input_sequences_np, input_sentences_lengths_np]
character_level_inputs = [character_level_input_sequences_np, input_words_lengths_np]

test_output = model.predict([word_level_inputs, character_level_inputs])

In [None]:
# Decode the first 100 predictions back into text.
# A single vectorised argmax over the last (vocabulary) axis replaces one
# eager `tf.argmax(...).numpy()` call per token.
predicted_ids = np.argmax(test_output[:100], axis=-1)

for sentence_ids in predicted_ids.tolist():
    # Ids without a vocabulary entry (e.g. the padding id 0) are shown as
    # '<UNK>'.
    words = [word_level_tokenizer.index_word.get(index, '<UNK>') for index in sentence_ids]
    # Every word is followed by a single space, including the last one.
    out = ''.join(word + ' ' for word in words)
    print(out)

In [None]:
# Persist the trained model under the path 'model.tf'.
model.save('model.tf')

In [None]:
# Reload the saved model. CustomSchedule is passed through custom_objects so
# Keras can resolve it by name while deserialising.
loaded_model = tf.keras.models.load_model(
    'model.tf',
    custom_objects={'CustomSchedule': CustomSchedule},
)

In [None]:
# Run the reloaded model on the same inputs; the cell displays the raw
# prediction array.
word_level_inputs = [input_sequences_np, input_sentences_lengths_np]
character_level_inputs = [character_level_input_sequences_np, input_words_lengths_np]

loaded_model.predict([word_level_inputs, character_level_inputs])