In [3]:
import io
import os
import re
import time
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import wandb
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing
from deepcomedy.models.transformer import *

# Reload edited modules automatically before each cell executes, so changes to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Data preprocessing

In [4]:
# Paths (relative to the working directory) of the two parallel corpora.
# NOTE(review): the file names suggest plain verses (input) versus the same
# verses with syllable marks (target) — confirm against the data files.
input_file = "data/divina_textonly.txt"
target_file = "data/divina_syll_textonly.txt"

In [5]:
def _read_utf8(path):
    """Return the whole content of the file at ``path`` decoded as UTF-8.

    The file is opened in binary mode and decoded explicitly, so line endings
    are kept exactly as stored on disk. The ``with`` block guarantees the file
    handle is closed as soon as the content has been read.
    """
    with open(path, "rb") as handle:
        return handle.read().decode(encoding="utf-8")


input_text_raw = _read_utf8(input_file)
target_text_raw = _read_utf8(target_file)
print("Length of input text: {} characters".format(len(input_text_raw)))
print("Length of target text: {} characters".format(len(target_text_raw)))

Length of input text: 558637 characters
Length of target text: 873431 characters


In [6]:
# Distinct characters of each corpus in sorted order, and how many there are.
input_vocab = sorted({*input_text_raw})
target_vocab = sorted({*target_text_raw})
input_vocab_size, target_vocab_size = len(input_vocab), len(target_vocab)

In [7]:
# Report the number of distinct characters found in each corpus.
print(f"Input vocab size: {input_vocab_size}")
print(f"Target vocab size: {target_vocab_size}")

Input vocab size: 79
Target vocab size: 80


The *preprocess* function adds the start and end symbols to each line and eliminates the empty ones.

In [8]:
def preprocess(text):
    """
    Wrap every non-blank line of ``text`` in the start ("^") and end ("$") symbols.

    The text is split on newline characters; each line is stripped of leading
    and trailing whitespace, and lines that are empty after stripping are
    dropped. Returns the list of marked lines in their original order.
    """
    marked_lines = []
    for raw_line in text.split("\n"):
        content = raw_line.strip()
        if content:
            marked_lines.append("^" + content + "$")
    return marked_lines


# Add the start/end symbols to both corpora, line by line.
input_text_prepr, target_text_prepr = (
    preprocess(corpus) for corpus in (input_text_raw, target_text_raw)
)

The tokenizer encodes each line into a list of character indices. For simplicity it is fitted only on the target text, so any input character that never occurs in the target would be silently dropped during encoding.

In [9]:
# Character-level tokenizer: nothing is filtered out and case is preserved.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True, lower=False, filters=""
)
# The vocabulary is learned from the target lines only.
tokenizer.fit_on_texts(target_text_prepr)

# Encode both corpora with the same tokenizer so they share one id space.
input_text_lines_enc, target_text_lines_enc = (
    tokenizer.texts_to_sequences(lines)
    for lines in (input_text_prepr, target_text_prepr)
)

Padding is required in order to have a non-ragged tensor to feed to the neural network.

In [10]:
def pad(x):
    """Return the sequences in ``x`` padded at the end to a common length.

    Thin wrapper around Keras ``pad_sequences`` with ``padding="post"``; all
    other options keep their Keras defaults.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return pad_sequences(x, padding="post")

In [11]:
# Turn the ragged lists of ids into rectangular arrays, one row per line.
input_text, target_text = map(pad, (input_text_lines_enc, target_text_lines_enc))

In [12]:
# Fixed seed so that the train/test partition is identical on every run;
# without it each kernel restart evaluates on a different held-out set.
SPLIT_SEED = 42

# Inputs and targets are split together, so row i of input_* still matches
# row i of target_*.
input_train, input_test, target_train, target_test = train_test_split(
    input_text, target_text, random_state=SPLIT_SEED
)

## 2. The Transformer model


The dataset is created by grouping the lines in batches and by shuffling them.

Each input line is paired with its corresponding target line.

In [13]:
BATCH_SIZE = 64
# The shuffle buffer is as large as the training set.
BUFFER_SIZE = len(input_train)
steps_per_epoch = len(input_train) // BATCH_SIZE

# One extra slot on top of the tokenizer's vocabulary for id 0, used by padding.
vocab_size = len(tokenizer.word_index) + 1

# Padded line lengths (second dimension of the padded arrays).
max_length_inp = input_text.shape[1]
max_length_targ = target_text.shape[1]

# Shuffle the (input, target) pairs, then group them into fixed-size batches;
# a final batch smaller than BATCH_SIZE is discarded.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_train, target_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

## 3. Training

In [14]:
# Model hyperparameters, forwarded unchanged to the Transformer constructor.
# NOTE(review): meanings below are inferred from the names — confirm against
# deepcomedy.models.transformer.
num_layers = 6  # presumably the number of stacked encoder/decoder layers
d_model = 256  # presumably the embedding / hidden size
dff = 1024  # presumably the width of the feed-forward sub-layer
num_heads = 8  # presumably the number of attention heads
dropout_rate = 0.1  # passed as `rate`

In [15]:
# Both sides use `vocab_size` because a single tokenizer encodes input and
# target lines.
transformer = Transformer(
    # architecture
    num_layers=num_layers,
    num_heads=num_heads,
    d_model=d_model,
    dff=dff,
    rate=dropout_rate,
    # vocabularies
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    # positional-encoding sizes
    pe_input=1000,
    pe_target=1000,
)

In [16]:
# Wrap the model in the project's trainer and point it at the checkpoint
# location for this experiment.
transformer_trainer = TransformerTrainer(
    transformer,
    checkpoint_save_path="./checkpoints/char-level-syll",
)

In [17]:
# NOTE(review): the second argument (20) is presumably the number of epochs —
# confirm against TransformerTrainer.train.
transformer_trainer.train(dataset, 20)

Epoch 1 Batch 0 Loss 4.9835 Accuracy 0.0016


KeyboardInterrupt: 

To train the decoder we use teacher forcing, calculating the loss between the predicted logits and the real id of the character.

## 4. Syllabification

We define the *evaluate* function, which encodes the input sentence for the encoder and decodes the predicted translation one character at a time.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We start by feeding the decoder the id of the start symbol and, at each new step, we pass it the whole sequence it has generated so far.

The translation stops when the end symbol is reached.

In [37]:
def evaluate(sentence, max_length=200, separator=" "):
    """Greedily decode the model's output for ``sentence``, one character at a time.

    Parameters
    ----------
    sentence : str
        Text to feed to the encoder. Every character must be in the tokenizer's
        vocabulary. The start/end symbols are not added here, so the caller has
        to include them if the model expects them.
    max_length : int, optional
        Length the encoder input is padded to, and the maximum number of
        decoding steps.
    separator : str, optional
        String appended after every decoded character. The default ``" "``
        keeps the historical output format; pass ``""`` to get contiguous text
        in which real spaces are unambiguous.

    Returns
    -------
    tuple
        ``(result, attention_weights)``: the decoded text (including the end
        symbol if it was produced) and the attention weights returned by the
        last decoding step (``None`` if no step was executed).

    Raises
    ------
    KeyError
        If ``sentence`` contains characters unknown to the tokenizer.
    """
    chars = [str(item) for item in sentence]

    # Fail with an explicit message instead of a bare KeyError on one character.
    unknown = sorted({ch for ch in chars if ch not in tokenizer.word_index})
    if unknown:
        raise KeyError(
            "Characters not in the tokenizer vocabulary: {}".format(unknown)
        )

    encoder_input = [tokenizer.word_index[ch] for ch in chars]
    # NOTE: pad_sequences truncates from the front by default, so a sentence
    # longer than max_length loses its first characters.
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(
        [encoder_input], maxlen=max_length, padding="post"
    )
    encoder_input = tf.convert_to_tensor(encoder_input)

    # Loop-invariant lookups.
    start_id = tokenizer.word_index["^"]
    end_id = tokenizer.word_index["$"]

    # The decoder input starts as a single sequence holding only the start symbol.
    output = tf.convert_to_tensor([[start_id]], dtype=tf.int32)

    decoded = []
    attention_weights = None  # stays defined even if the loop body never runs

    for _ in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # Keep only the last position of the seq_len dimension.
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # The prediction is appended to the decoder input for the next step.
        output = tf.concat([output, predicted_id], axis=-1)

        token_id = int(predicted_id.numpy()[0][0])
        # Id 0 is reserved for padding and has no entry in index_word; skip it
        # rather than crash if the model happens to predict it.
        char = tokenizer.index_word.get(token_id)
        if char is not None:
            decoded.append(char + separator)

        # Stop as soon as the end symbol has been produced.
        if token_id == end_id:
            break

    return "".join(decoded), attention_weights

In [38]:
def print_translation(sentence, result, ground_truth):
    """Print the input, the prediction and the ground truth as aligned rows.

    Each label is left-aligned in a 15-character column followed by ": ", so
    the three values start in the same column.
    """
    # The label carries no colon of its own: the format already adds one, and
    # "Input:" used to render as "Input:         : ...".
    print(f'{"Input":15s}: {sentence}')
    print(f'{"Prediction":15s}: {result}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [39]:
# Example verse, wrapped in the same "^"/"$" symbols that preprocess adds to
# the training lines.
sentence = "^E come l’aere, quand’ è ben pïorno,$"
# Reference syllabification of the same verse, used only for visual comparison.
ground_truth = "|E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,"


translated_text, attention_weights = evaluate(sentence)
print_translation(sentence, translated_text, ground_truth)

Input:         : ^E come l’aere, quand’ è ben pïorno,$
Prediction     : | E   | c o | m e   | l ’   a e | r e ,   | q u a n | d ’   è   | b e n   | p ï | o r | n o , $ 
Ground truth   : |E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,
