In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

# Number of sentence pairs used by this notebook. The original loop stopped
# after 100 rows, so the later `[:10000]` slice never had any effect; a single
# named cap keeps that behaviour and makes the sample size easy to change.
NUM_SAMPLES = 100

data = pd.read_csv('data/vi_processed.csv')

# Take the first NUM_SAMPLES rows in one vectorised step instead of iterating
# row by row with `iterrows()`. If the file has fewer rows, all are used.
sample_rows = data.head(NUM_SAMPLES)

# Parallel lists: correct_texts[i] is the reference sentence for error_texts[i].
correct_texts = sample_rows['correct_text'].tolist()
error_texts = sample_rows['error_text'].tolist()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary caps handed to the tokenizers via `num_words`.
vocab_size = 100000
character_vocab_size = 10000

# Options shared by both word-level tokenizers: lowercase the text, split on
# single spaces and map unknown words to the '<UNK>' token.
word_tokenizer_options = dict(oov_token='<UNK>', lower=True, split=' ')

# Word-level tokenizer limited to `vocab_size` entries.
word_level_tokenizer = Tokenizer(num_words=vocab_size, **word_tokenizer_options)

# Word-level tokenizer without a `num_words` cap.
word_unk_level_tokenizer = Tokenizer(**word_tokenizer_options)

# Character-level tokenizer; note that it is created without an OOV token.
character_level_tokenizer = Tokenizer(
    num_words=character_vocab_size,
    lower=True,
    char_level=True,
)

In [None]:
# Build each tokenizer's vocabulary. The capped word-level tokenizer learns
# from the correct sentences; the uncapped word-level tokenizer and the
# character-level tokenizer learn from the erroneous sentences.
for tokenizer, corpus in (
    (word_level_tokenizer, correct_texts),
    (word_unk_level_tokenizer, error_texts),
    (character_level_tokenizer, error_texts),
):
    tokenizer.fit_on_texts(corpus)

In [None]:
# Number of character ids kept per word; per-word character sequences are
# padded/truncated to this length.
MAX_WORD_LENGTH = 16
# Number of word positions kept per sentence; word-level and character-level
# sentence sequences are padded/truncated to this length.
MAX_SENTENCE_LENGTH = 64

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level ids. Inputs (erroneous text) and targets (correct text) are both
# encoded with the tokenizer that was fitted on the correct sentences.
input_sequences = word_level_tokenizer.texts_to_sequences(error_texts)
output_sequences = word_level_tokenizer.texts_to_sequences(correct_texts)

# Ids from the uncapped tokenizer; these are only used to recover the surface
# form of every input word so that it can be spelled out character by character.
unk_input_sequences = word_unk_level_tokenizer.texts_to_sequences(error_texts)

# Cache of word id -> list of character ids, so each distinct word is encoded
# only once no matter how often it occurs.
char_ids_by_word_token = {}

character_level_input_sequences = []

for sequence in unk_input_sequences:
    for word_token in sequence:
        if word_token in char_ids_by_word_token:
            continue
        word = word_unk_level_tokenizer.index_word[word_token]
        # Encode the word as a one-element batch. The character tokenizer has
        # no OOV token, so characters it cannot map are simply dropped here,
        # whereas encoding character by character and indexing `[0]` raised
        # IndexError on the resulting empty list.
        char_ids_by_word_token[word_token] = character_level_tokenizer.texts_to_sequences([word])[0]

    # One row per word, each row holding MAX_WORD_LENGTH character ids.
    character_level_input_sequence = pad_sequences(
        [char_ids_by_word_token[word_token] for word_token in sequence],
        maxlen=MAX_WORD_LENGTH, padding='post', truncating='post')
    character_level_input_sequences.append(character_level_input_sequence)

# Pad/truncate every sentence to MAX_SENTENCE_LENGTH positions (at the end).
input_sequences = pad_sequences(input_sequences, maxlen=MAX_SENTENCE_LENGTH, padding='post', truncating='post')
output_sequences = pad_sequences(output_sequences, maxlen=MAX_SENTENCE_LENGTH, padding='post', truncating='post')

# Same along the sentence axis for the per-word character matrices.
character_level_input_sequences = pad_sequences(character_level_input_sequences, maxlen=MAX_SENTENCE_LENGTH,
                                                padding='post',
                                                truncating='post')

input_sequences_np = np.array(input_sequences)
character_level_input_sequences_np = np.array(character_level_input_sequences)
output_sequences_np = np.array(output_sequences)

# Every sample must appear in all three arrays.
assert len(input_sequences_np) == len(character_level_input_sequences_np) == len(output_sequences_np)

In [None]:
# Quick look at the padded word ids of the first target (correct) sentence.
output_sequences_np[0]

In [None]:
# NOTE(review): disabled draft of a tf.ragged-based batch builder. It is not
# executed anywhere in the visible cells. It references MAX_WORD_LEVEL_TOKENS,
# which is not defined in any visible cell, so it would need fixing before
# being re-enabled.
# def prepare_batch(input, output):
#     word_level_input = word_level_tokenizer.texts_to_sequences(input)
#     word_level_input = tf.ragged.constant(word_level_input)
#     word_level_input = word_level_input[:, :MAX_WORD_LEVEL_TOKENS]
#     word_level_input = word_level_input.to_tensor()
# 
#     unk_input_sequences = word_unk_level_tokenizer.texts_to_sequences(input)
#     character_level_input_sequences = []
# 
#     for sequence in unk_input_sequences:
#         character_level_input_sequence = []
#         for word_token in sequence:
#             word = word_unk_level_tokenizer.index_word[word_token]
#             word = character_level_tokenizer.texts_to_sequences(word)
#             word_chars = [each[0] for each in word]
#             character_level_input_sequence.append(word_chars)
#         character_level_input_sequence = tf.ragged.constant(character_level_input_sequence)
#         character_level_input_sequence = character_level_input_sequence[
#                                          :MAX_SENTENCE_LENGTH,
#                                          :MAX_WORD_LENGTH]
#         character_level_input_sequence = character_level_input_sequence.to_tensor()
#         character_level_input_sequences.append(character_level_input_sequence)
# 
#     print(character_level_input_sequences)
#     character_level_input_sequences = tf.ragged.constant(character_level_input_sequences)
# 
#     output = word_level_tokenizer.texts_to_sequences(output)
#     output = tf.ragged.constant(output)
#     output = output[:, :MAX_SENTENCE_LENGTH]
#     output = output.to_tensor()
# 
#     return (word_level_input, character_level_input_sequences), output

In [None]:
# NOTE(review): BUFFER_SIZE is not referenced by any visible cell; presumably a
# shuffle buffer size for a tf.data pipeline — confirm or remove.
BUFFER_SIZE = 20000
# Leading (batch) dimension of the input shapes given to `model.build`; also
# the batch size in the commented-out `model.fit` call.
BATCH_SIZE = 200

In [None]:
from HierarchicalTransformerEncoder import HierarchicalTransformerEncoder

# Hyper-parameters of the hierarchical encoder, collected in one place so they
# can be inspected or tweaked before the model is instantiated.
# NOTE(review): num_heads=3 divides neither character_level_d_model (64) nor
# word_level_d_model (128); some multi-head attention implementations require
# divisibility — confirm against HierarchicalTransformerEncoder.
model_config = dict(
    num_character_level_layers=4,
    num_word_level_layers=12,
    character_level_d_model=64,
    word_level_d_model=128,
    num_heads=3,
    dff=512,
    max_word_length=MAX_WORD_LENGTH,
    max_sentence_length=MAX_SENTENCE_LENGTH,
    vocab_size=vocab_size,
    character_vocab_size=character_vocab_size,
)

model = HierarchicalTransformerEncoder(**model_config)

In [None]:
from HierarchicalTransformerEncoder import custom_loss

# Shapes of the two model inputs: character ids are indexed by
# (batch, sentence position, character position); word ids drop the last axis.
char_input_shape = (BATCH_SIZE, MAX_SENTENCE_LENGTH, MAX_WORD_LENGTH)
word_input_shape = char_input_shape[:2]

# Create the weights, configure training and print the layer overview.
# NOTE(review): `custom_loss` is imported in this cell but the built-in sparse
# categorical cross-entropy is used instead — confirm which one is intended.
model.build(input_shape=[word_input_shape, char_input_shape])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
model.summary()

In [None]:
# model.fit([input_sequences_np, character_level_input_sequences_np], output_sequences_np, epochs=20, batch_size=BATCH_SIZE)

In [None]:
# Run the model on the preprocessed word-level and character-level inputs.
# NOTE(review): the `model.fit` call in the preceding cell is commented out and
# no visible cell loads weights, so these predictions presumably come from an
# untrained model — confirm this is intended (e.g. as a smoke test).
test_output = model.predict([input_sequences_np, character_level_input_sequences_np])

In [None]:
# Turn the model output back into text.
# `test_output` is indexed as (sentence, word position, vocabulary score); the
# most likely word id per position is taken with a single vectorised argmax
# instead of one `tf.argmax` call per word.
predicted_token_ids = np.argmax(test_output, axis=-1)

for sentence_token_ids in predicted_token_ids:
    decoded_words = []
    for token_id in sentence_token_ids:
        token_id = int(token_id)
        # Id 0 is the default fill value of `pad_sequences` and is never
        # assigned to a word by the tokenizer, so it carries no text.
        if token_id == 0:
            continue
        # Ids outside the fitted vocabulary have no entry in `index_word`.
        # The original `.get(...)` returned None for them and crashed with a
        # TypeError on `None + ' '`; show them as the OOV token instead.
        decoded_words.append(
            word_level_tokenizer.index_word.get(token_id, word_level_tokenizer.oov_token))
    print(' '.join(decoded_words))