In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import io
import os
import re
import time
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
#import wandb
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import load_verses

from deepcomedy.util.predicate import predicate
from nlgpoetry.hyphenation import *

## 1. Data loading and preprocessing

In [4]:
input_file = "data/divina_textonly.txt"
target_file = "data/divina_syll_textonly.txt"

The `load_verses` function loads the file, splits it into verses, prepends the start_symbol and appends the end_symbol to each verse, then pads each verse to the lenght of the longest verse so that the tensor can be fed to our model.

In [5]:
raw_input_text, input_text, input_tokenizer = load_verses(
    input_file, char_level=True, pad=True
)
raw_target_text, target_text, target_tokenizer = load_verses(
    target_file, char_level=True, pad=True
)

In [6]:
print("Length of input text: {} characters".format(len(raw_input_text)))
print("Length of target text: {} characters".format(len(raw_target_text)))

Length of input text: 578077 characters
Length of target text: 892871 characters


In [7]:
input_vocab = sorted(set(input_tokenizer.word_index.keys()))
target_vocab = sorted(set(target_tokenizer.word_index.keys()))
input_vocab_size = len(input_vocab)
target_vocab_size = len(target_vocab)

In [8]:
print("Input vocab size: {}".format(input_vocab_size))
print("Target vocab size: {}".format(target_vocab_size))

Input vocab size: 80
Target vocab size: 81


In [9]:
input_train, input_test, target_train, target_test = train_test_split(
    input_text, target_text
)

In [10]:
tokenizer = target_tokenizer

## 2. The Transformer model


The dataset is created by grouping the lines in batches and by shuffling them.

Each input's line is in correspondence with its target.

In [11]:
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_train) // BATCH_SIZE
num_layers = 6 #con 4 non cambia niente, ma allora dov'è il problema?
d_model = 256
dff = 1024
num_heads = 8
dropout_rate = 0.1

vocab_size = (
    len(tokenizer.word_index) + 1
)  # the +1 is added to take into account the padding token

max_length_targ, max_length_inp = target_text.shape[1], input_text.shape[1]

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(
    BUFFER_SIZE
)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [12]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate,
)

## 3. Training

In [13]:
transformer_trainer = TransformerTrainer(
    transformer, checkpoint_save_path="./checkpoints/char-level-syll"
)

In [14]:
transformer_trainer.train(dataset, 10)

Epoch 1 Batch 0 Loss 4.7963 Accuracy 0.0154
Epoch 1 Batch 50 Loss 3.9402 Accuracy 0.1653
Epoch 1 Batch 100 Loss 3.4893 Accuracy 0.1892
Epoch 1 Batch 150 Loss 3.2268 Accuracy 0.2136
Epoch 1 Loss 3.1475 Accuracy 0.2244
Time taken for 1 epoch: 53.46 secs

Epoch 2 Batch 0 Loss 2.2236 Accuracy 0.3658
Epoch 2 Batch 50 Loss 2.1287 Accuracy 0.3702
Epoch 2 Batch 100 Loss 2.0415 Accuracy 0.3868
Epoch 2 Batch 150 Loss 1.9786 Accuracy 0.3980
Epoch 2 Loss 1.9613 Accuracy 0.4013
Time taken for 1 epoch: 40.09 secs

Epoch 3 Batch 0 Loss 1.7778 Accuracy 0.4412
Epoch 3 Batch 50 Loss 1.7341 Accuracy 0.4489
Epoch 3 Batch 100 Loss 1.6955 Accuracy 0.4588
Epoch 3 Batch 150 Loss 1.6598 Accuracy 0.4677
Epoch 3 Loss 1.6502 Accuracy 0.4703
Time taken for 1 epoch: 40.06 secs

Epoch 4 Batch 0 Loss 1.5191 Accuracy 0.5018
Epoch 4 Batch 50 Loss 1.4800 Accuracy 0.5172
Epoch 4 Batch 100 Loss 1.4023 Accuracy 0.5433
Epoch 4 Batch 150 Loss 1.2776 Accuracy 0.5855
Epoch 4 Loss 1.2319 Accuracy 0.6006
Time taken for 1 epoch: 

To train the decoder we use teacher forcing, calculating the loss between the predicted logits and the real id of the character.

## 4. Syllabification

We define the *evaluate* function to preprocess the sentence in input to the encoder and to get the predicted ids of the translation.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We begin feeding the decoder with the id of the start symbol and, at each new step, we pass to the decoder the sequence it has just thrown out.

The translation stops when the end symbol is reached.

In [15]:
def evaluate(sentence, max_length=200):

    encoder_input = [tokenizer.word_index[i] for i in list(map(str, sentence))]
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(
        [encoder_input], maxlen=max_length, padding="post"
    )
    encoder_input = tf.convert_to_tensor(encoder_input)

    output = tf.convert_to_tensor([tokenizer.word_index["^"]])
    output = tf.expand_dims(output, 0)
    result = ""

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer.word_index["$"]:
            break

    # output.shape (1, tokens)

    return result

In [16]:
def print_translation(sentence, result, ground_truth):
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Prediction":15s}: {result}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [17]:
sentence = "^E come l’aere, quand’ è ben pïorno,$"
ground_truth = "|E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,"


translated_text = evaluate(sentence)
print_translation(sentence, translated_text, ground_truth)

Input:         : ^E come l’aere, quand’ è ben pïorno,$
Prediction     : | C e n | d a | p e t | g i a l | v a | b e | m i r | g e | ò | z a | r e n | L r l n | v u | v u $ 
Ground truth   : |E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,


In [None]:
sentence = "^stasera mi butto, mi butto con te$"
ground_truth = ""

translated_text = evaluate(sentence)
print_translation(sentence, translated_text, ground_truth)

Input:         : ^stasera mi butto, mi butto con te$
Prediction     : | c | s i | c a | l i e | p o m | m s n | v e | p o | e z | s n e | d n e | d r e | s a u | p o | z o $ 
Ground truth   : 


## 5. Apply the syllabification to the Orlando

In [18]:
orlando_path = 'data/orlando-textonly.txt'
orlando = open(orlando_path, "rb").read().decode(encoding="utf-8")
orlando = orlando.split('\n')

In [21]:
@predicate
def is_not_empty(string):
    """
    Checks string is not empty
    """
    return string is not None and string != ''

@predicate
def is_not_number(string):
    try:
        int(string)
        return False
    except:
        return True

@predicate
def is_not_chapter(string):
    return not re.match(r'CANTO .*', string)

In [22]:
orlando_textonly = list(map(lambda x: x.strip(), orlando))
orlando_textonly = list(filter(is_not_empty, orlando_textonly))
orlando_textonly = list(filter(is_not_number, orlando_textonly))
orlando_textonly = list(filter(is_not_chapter, orlando_textonly))
orlando_syll = list(map(hyphenation, orlando_textonly))

In [23]:
def preprocess(x):
  x = '^' + x + '$'
  return x

orlando_preprocessed = list(map(preprocess, orlando_textonly))

In [24]:
tot_verses = len(orlando_textonly)

In [25]:
indices = np.random.randint(0, tot_verses, size=1000)

In [None]:
neural_syll = []
for i in indices:
  neural_syll.append(evaluate(orlando_textonly[i]))