In [19]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import math
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *
from deepcomedy.utils import *
from deepcomedy.metrics import *
import tqdm

from nlgpoetry.hyphenation import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Data preprocessing

In [2]:
raw_text = open("./data/divina_textonly.txt", "rb").read().decode(encoding="utf-8")
raw_syll_text = (
    open("./data/divina_syll_textonly.txt", "rb").read().decode(encoding="utf-8")
)
syll_text = preprocess_text(raw_syll_text, end_of_tercet='')
text = preprocess_text(raw_text, end_of_tercet='')

Split preprocessed text into verses

In [3]:
sep = "<EOV>"
input_tercets = [x.lstrip() + sep for x in text.split(sep)][:-1]
target_tercets = [x.lstrip() + sep for x in syll_text.split(sep)][:-1]

Encode with input and target tokenizers

In [4]:
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
input_tokenizer.fit_on_texts(input_tercets)

target_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
target_tokenizer.fit_on_texts(target_tercets)

enc_input_tercets = input_tokenizer.texts_to_sequences(input_tercets)
enc_target_tercets = target_tokenizer.texts_to_sequences(target_tercets)

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

Get windows of three and four verses on the whole dataset.

In [5]:
input_text = []         # All windows of three verses (not-syll)
target_text = []        # All windows of four verses  (syll)
target_text_tercet = [] # All windows of three verses (syll)

for line in range(len(enc_input_tercets) - 2):
    input_text.append(list(chain(*enc_input_tercets[line : line + 3])))
    target_text_tercet.append(list(chain(*enc_target_tercets[line : line + 3])))
    target_text.append(list(chain(*enc_target_tercets[line : line + 4])))

Pad sequences

In [8]:
padded_input_text = tf.keras.preprocessing.sequence.pad_sequences(
    input_text, padding="post"
)
padded_target_text = tf.keras.preprocessing.sequence.pad_sequences(
    target_text, padding="post"
)
padded_target_text_tercet = tf.keras.preprocessing.sequence.pad_sequences(
    target_text_tercet, padding="post"
)

In [9]:
input_train, input_val, target_train, target_val, target_tercet_train, target_tercet_val = train_test_split(padded_input_text, padded_target_text, padded_target_text_tercet)

## 2. Load model

If needed, load the model changing the the parameters accordingly.

In [11]:
config = {
    "num_layers" : 4,
    "d_model" : 256,
    "num_heads" : 4,
    "dff" : 512,
}
transformer = load_transformer_model(config, input_vocab_size, target_vocab_size, target_tokenizer, './models/c2c-gen.h5')

## 3. Hyperparameter sweep

We use weights and biases to perform hyperparameter optimization.

In [30]:
sweep_config = {
    "name": "char2char-sweep-2",
    "method": "grid",
    "metric": {"name": "loss", "goal": "minimize"},
    "parameters": {
        "batch_size": {"value": 32},
        "epochs": {"value": 50},
        "num_layers": {"values": [4, 8, 12]},
        "num_heads": {"values": [4, 8]},
        "d_model": {"value": 256},
        "dff": {"value": 512},
    },
}

sweep_id = wandb.sweep(sweep_config, project='deepcomedy', entity='deepcomedy')

Create sweep with ID: 177t7xlp
Sweep URL: https://wandb.ai/deepcomedy/deepcomedy/sweeps/177t7xlp


In [None]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

# Input for generation
encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

def sweep():
    with wandb.init() as run:
        config = wandb.config
        dataset = make_dataset(input_train, target_train, batch_size=config["batch_size"])
        validation_dataset = make_dataset(input_val, target_val, batch_size=config["batch_size"])
        model, trainer = make_transformer_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path=None)
        trainer.train(dataset, config["epochs"], log_wandb=True, validation_dataset=validation_dataset, validation_every=5)
        
        # Generate
        result = generate(model, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 30, start_symbol, stop_symbol)
        html_result = re.sub(r'\n', '<br>', result)
        wandb.log({"generated": wandb.Html("<pre>" + html_result + "</pre>", inject=False)})
        
        path = './models/char2char-' + str(config['epochs']) + '-' + str(config['num_layers']) + '-' + str(config['num_heads']) + '-' + str(config['dff']) + '.h5'
        model.save_weights(path)
        
        wandb.save(path)
              
        # Generation metrics
        # avg_syll, hend_ratio, plagiarism, correctness, incorrectness, rhymeness = generation_metrics(result)
        avg_syll, hend_ratio, rhymeness, plagiarism, correctness, incorrectness = generation_metrics(result)        
        
        wandb.log({
            'avg_syll': avg_syll, 
            'hend_ratio': hend_ratio,
            'plagiarism': plagiarism,
            'correctness': correctness,
            'incorrectness': incorrectness,
            'rhymeness': rhymeness,
        })
        
wandb.agent(sweep_id, function=sweep)

[34m[1mwandb[0m: Agent Starting Run: 3vz1f5j4 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	d_model: 256
[34m[1mwandb[0m: 	dff: 512
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	num_heads: 4
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch 1 Batch 0 Loss 4.8135 Accuracy 0.0065
Epoch 1 Batch 50 Loss 3.9673 Accuracy 0.1197
Epoch 1 Batch 100 Loss 3.5294 Accuracy 0.1657
Epoch 1 Batch 150 Loss 3.3100 Accuracy 0.1875
Epoch 1 Batch 200 Loss 3.0889 Accuracy 0.2182
Epoch 1 Batch 250 Loss 2.9057 Accuracy 0.2444
Epoch 1 Batch 300 Loss 2.7665 Accuracy 0.2642
Epoch 1 Loss 2.6951 Accuracy 0.2742
Time taken for 1 epoch: 58.11 secs

Epoch 2 Batch 0 Loss 2.0080 Accuracy 0.3666
Epoch 2 Batch 50 Loss 1.9905 Accuracy 0.3739
Epoch 2 Batch 100 Loss 1.9731 Accuracy 0.3764
Epoch 2 Batch 150 Loss 1.9587 Accuracy 0.3791
Epoch 2 Batch 200 Loss 1.9450 Accuracy 0.3819
Epoch 2 Batch 250 Loss 1.9318 Accuracy 0.3847
Epoch 2 Batch 300 Loss 1.9192 Accuracy 0.3875
Epoch 2 Loss 1.9105 Accuracy 0.3897
Time taken for 1 epoch: 45.29 secs

Epoch 3 Batch 0 Loss 1.8013 Accuracy 0.4170
Epoch 3 Batch 50 Loss 1.8005 Accuracy 0.4177
Epoch 3 Batch 100 Loss 1.7819 Accuracy 0.4227
Epoch 3 Batch 150 Loss 1.7634 Accuracy 0.4276
Epoch 3 Batch 200 Loss 1.7435 Accurac

## 4. Training

In [10]:
dataset = make_dataset(input_train, target_train)
val_dataset = make_dataset(input_val, target_val)

In [11]:
config = {
    "num_layers" : 4,
    "d_model" : 256,
    "num_heads" : 4,
    "dff" : 512,
}
transformer, transformer_trainer = make_transformer_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path= None)

In [12]:
wandb.init()
transformer_trainer.train(dataset, 30, validation_dataset=val_dataset, validation_every=1, log_wandb=True)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch 1 Batch 0 Loss 5.1948 Accuracy 0.0104
Epoch 1 Batch 50 Loss 4.0714 Accuracy 0.1170
Epoch 1 Batch 100 Loss 3.5884 Accuracy 0.1623
Epoch 1 Batch 150 Loss 3.3615 Accuracy 0.1833
Epoch 1 Batch 200 Loss 3.1426 Accuracy 0.2121
Epoch 1 Batch 250 Loss 2.9530 Accuracy 0.2381
Epoch 1 Batch 300 Loss 2.8055 Accuracy 0.2585
Epoch 1 Batch 0 Validation Loss 1.9196 Validation Accuracy 0.3867
Epoch 1 Batch 50 Validation Loss 1.9345 Validation Accuracy 0.3875
Epoch 1 Batch 100 Validation Loss 1.9359 Validation Accuracy 0.3869
Epoch 1 Loss 2.7298 Accuracy 0.2690
Time taken for 1 epoch: 70.13 secs

Epoch 2 Batch 0 Loss 2.0107 Accuracy 0.3646
Epoch 2 Batch 50 Loss 1.9831 Accuracy 0.3727
Epoch 2 Batch 100 Loss 1.9686 Accuracy 0.3756
Epoch 2 Batch 150 Loss 1.9553 Accuracy 0.3784
Epoch 2 Batch 200 Loss 1.9429 Accuracy 0.3811
Epoch 2 Batch 250 Loss 1.9308 Accuracy 0.3840
Epoch 2 Batch 300 Loss 1.9175 Accuracy 0.3874
Epoch 2 Batch 0 Validation Loss 1.7672 Validation Accuracy 0.4215
Epoch 2 Batch 50 Valida

Save model.

In [None]:
save_transformer_model(transformer, 'models/c2c-gen-input_train.h5')

## 5. Generation

In [14]:
def generate(transformer, input_sequence, target_sequence, input_tokenizer, target_tokenizer, steps, start_symbol, stop_symbol):

    result = strip_tokens(target_tokenizer.sequences_to_texts(target_sequence)[0])
    
    encoder_input = input_sequence
    decoder_input = target_sequence

    for _ in tqdm.tqdm(range(steps)):

        encoder_input = tf.convert_to_tensor(encoder_input)
        decoder_input = tf.convert_to_tensor(decoder_input)
        output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, choose_next_token=choose_topk)

        # Detokenize output
        generated_text = target_tokenizer.sequences_to_texts(output.numpy())[0]
        
        # Remove structural tokens (<EOV>, <GO>, <SEP>)
        generated_text = strip_tokens(generated_text)
        
        # Split into verses
        generated_verses = generated_text.split('\n')
        
        # Append last generated verse to result
        result = '\n'.join([result, generated_verses[-1]])
        
        # Create input for next step by taking last three verses
        next_input = '\n'.join(generated_verses[-3:])
        next_input = preprocess_text(next_input, end_of_tercet='')
        
        decoder_input = target_tokenizer.texts_to_sequences([next_input])
        
        # The encoder input should not have syllable separators
        encoder_input = remove_syll_token(next_input)
        encoder_input = input_tokenizer.texts_to_sequences([encoder_input])
        
    return result

In [15]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

result = generate(transformer, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 30, start_symbol, stop_symbol)

100%|██████████| 30/30 [01:36<00:00,  3.22s/it]


In [11]:
print(result)

|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta
|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,
|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.
|E |quel|la |che |l’ a|ni|ma |di |Dio |cu|ra,
|del |mio |a|mor, |che |mi |fa |di|scer|ne
|di |quel|la |par|te |che |di |là |m’ ap|pu|ra.
|E |io |a|vea |la |vir|tù |che |l’ u|der|ne
|la |pri|ma |mi|se|ria |che |la |co|sa |giu|sti
|a |la |co|da |sua |per|cuo|ta |la |ger|ne.
|E |que|sta |lin|gua, |che |l’ un |po|co a|gu|sti
|più |che |po|tea |le |sue |ma|ni |di|scer|ne,
|co|me |sa|reb|be |lu|ce |si |ri|ciu|sti,
|se |non |po|tea |me|mo|ria |le|va|ter|ne,
|che |la |co|sa e |di |san|za |com’ |io |sce|sa
|che |non |si |può |sì |la |ve|ra|ce |ster|ne,
|e |l’ al|tra |mer|ta|re |sì |se|gue |stre|sa.
|E |se |la |vo|glia |che |la |par|te |guer|ne
|di |quel|la |sua |pa|ro|la |che |li |vol|se,
|che |non |si |fa |da |l’ al|tra |suo |con|ver|ne,
|e |per|ché ’l |sol |che |la |pa|ro|la |scol|se
|poi |che |la |mia |men|te |lui |si |ri|co|ne,
|e |poi |di|cea:« |Quan|to |l

### Generation metrics

The following function computes all generation metrics

In [25]:
original_text = preprocess_text(raw_text, end_of_verse='\n', end_of_tercet='', start_of_verse='', word_level=True)
original_text = re.sub(r' <SEP> ', ' ', original_text)
original_text

# Get the set of real words from the Divine Comedy to evaluate word correctness
# TODO create function to obtain word-level vocabulary from divine comedy
word_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='\n-:,?“‘)—»«!”(";.’ ', lower=False)
word_tokenizer.fit_on_texts([raw_text])
real_words = set(word_tokenizer.word_index.keys())

def generation_metrics(result):
    
    result_verses = result.split("\n")
    
    avg_syll = average_syllables(result_verses)
    hend_ratio = correct_hendecasyllables_ratio(result_verses)
    
    result_verses = re.sub(r'\|', '', result)
    result_verses = remove_punctuation(result_verses)

    plagiarism = ngrams_plagiarism(result_verses, original_text)
    
    gen_tokenizer = tfds.deprecated.text.Tokenizer()
    gen_words = gen_tokenizer.tokenize(result_verses)
    
    correctness, _ = correct_words_ratio(gen_words, real_words, return_errors=True)
    incorrectness_score = incorrectness(set(gen_words), real_words)
            
    result_verses = result_verses.split('\n')
    rhyme_ratio = chained_rhymes_ratio(result_verses)
    
    return avg_syll, hend_ratio, rhyme_ratio, plagiarism, correctness, incorrectness_score

In [19]:
avg_syll, hend_ratio, rhyme_ratio, plagiarism, correctness, incorrectness_score = generation_metrics(result)

del calcato fiamme la sua più fatta de l altro solo che la fiamma piaggia
come la faccia di quella raggia che la sua vita per la più di sua fige
sovra sopra la stretta che sarte si farà con questi fame convene
la mia carte de la virtù del mige con le sue parole parole sue piane
con le sue parole parole sue piane Io son questo falso che parlana
disposto saro e a l altre pensene di color che più sono a la fama


In [20]:
print("average syllables per verse: {}\nhendecasyllables ratio: {}\nrhyme_ratio: {}\nngrams plagiarism: {}\ncorrectness: {}\nincorrectness_score: {}\n"\
      .format(avg_syll, hend_ratio, rhyme_ratio, plagiarism, correctness, incorrectness_score))

average syllables per verse: 10.866666666666667
hendecasyllables ratio: 1.0
rhyme_ratio: 0.6842105263157895
ngrams plagiarism: 0.10185185185185185
correctness: 0.908675799086758
incorrectness_score: 0.19



### Hendecasyllabicness

In [None]:
result_verses = stripped_result.split("\n")

In [None]:
avg_syll = average_syllables(result_verses)

In [None]:
hend_ratio = correct_hendecasyllables_ratio(result_verses)

In [None]:
print('Average number of syllables per verse: {}'.format(avg_syll))
print('Ratio of hendecasyllables to total number of verses: {}'.format(hend_ratio))

Average number of syllables per verse: 11.0
Ratio of hendecasyllables to total number of verses: 1.0


### Ngrams plagiarism

In [None]:
original_text = preprocess_text(raw_text, end_of_verse='\n', end_of_tercet='', start_of_verse='', word_level=True)
original_text = re.sub(r' <SEP> ', ' ', original_text)

In [None]:
result_verses = re.sub(r'\|', '', strip_result)
result_verses = remove_punctuation(result_verses)

In [None]:
plagiarism = ngrams_plagiarism(result_verses, original_text)

In [None]:
print('Plagiarism: {}'.format(plagiarism))

Plagiarism: 0.425


### Word correctness

In [None]:
word_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='\n-:,?“‘)—»«!”(";.’ ', lower=False)
word_tokenizer.fit_on_texts([raw_text])
real_words = set(word_tokenizer.word_index.keys())

In [None]:
gen_tokenizer = tfds.deprecated.text.Tokenizer()
gen_words = gen_tokenizer.tokenize(result_verses)

In [None]:
correctness, errors = correct_words_ratio(gen_words, real_words, return_errors=True)
incorrectness_score = incorrectness(set(gen_words), real_words)

In [None]:
print('Correct words: {:.2f}%'.format(correctness * 100))

Correct words: 97.67%


The incorrectness score also considers "how wrong" a word is, by computing the edit distance between an incorrect word and the nearest correct word in the vocabulary.

In [None]:
print('Incorrectness score: {}'.format(incorrectness_score))

Incorrectness score: 0.06


### Rhymeness

In [None]:
result_verses = result_verses.split('\n')
rhyme_ratio = chained_rhymes_ratio(result_verses)

In [None]:
print('Correct rhymes ratio: {}'.format(rhyme_ratio))

Correct rhymes ratio: 1.0


## 6. Syllabification

### 6.1 Experiment on the first verses of the Divine Comedy

Let's see how the algorithm performs on the first verses of the Divine Comedy.

In [92]:
sep = "<EOV>"
input_verses = [x.lstrip() + sep for x in text.split(sep)][:-1]
target_verses = [x.lstrip() + sep for x in syll_text.split(sep)][:-1]

enc_input_verses = input_tokenizer.texts_to_sequences(input_verses)

In [98]:
correct_syll_text = list(map(lambda x: strip_tokens(x), target_verses))

In [23]:
padded_input_verses = tf.keras.preprocessing.sequence.pad_sequences(
    enc_input_verses, padding="post"
)

In [88]:
N=9

start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

encoder_input = tf.convert_to_tensor(padded_input_verses[:N])
decoder_input = tf.repeat([[start_symbol]], repeats=encoder_input.shape[0], axis=0)

output = evaluate(transformer, encoder_input, decoder_input,  stop_symbol, stopping_condition=stop_after_stop_symbol)

# Only take output before the first end of verse
stripped_output_N = list(map(lambda x: x.split('<EOV>')[0], target_tokenizer.sequences_to_texts(output.numpy())))
stripped_output_N = list(map(strip_tokens, stripped_output_N))

In [89]:
stripped_output_N

['|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta',
 '|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,',
 '|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.',
 '|Ahi |quan|to a |dir |qual |e|ra è |co|sa |du|ra',
 '|e|sta |sel|va |sel|va|gia e |a|spra e |for|te',
 '|che |nel |pen|sier |ri|no|va |la |pa|u|ra!',
 '|Tan|t’ è |a|ma|ra |che |po|co è |più |mor|te;',
 '|ma |per |trat|tar |del |ben |ch’ i’ |vi |tro|vai,',
 '|di|rò |de |l’ al|tre |co|se |ch’ i’ |v’ ho |scor|te.']

In [101]:
exact_matches, similarities = zip(*validate_syllabification(stripped_output_N, correct_syll_text))

accuracy = sum(exact_matches) / len(exact_matches)
avg_similarities = np.mean(similarities)

print('Syllabification exact matches: {:.2f}%'.format(accuracy * 100))
print('Average similarity: {:.2f}'.format(avg_similarities))

Syllabification exact matches: 88.89%
Average similarity: 1.00


### 6.2 Syllabification of the validation set
Validation of the syllabification on input_val

In [26]:
n_verses = len(input_val)

# Obtain tercets from input and target "windows"
syll_input = input_val[:n_verses:3]
syll_input_text = strip_tokens(' '.join(input_tokenizer.sequences_to_texts(syll_input)))
syll_input_text = preprocess_text(syll_input_text, end_of_tercet='')

correct_syll = target_tercet_val[:n_verses:3]
correct_syll_text = strip_tokens(' '.join(target_tokenizer.sequences_to_texts(correct_syll)))
correct_syll_text = correct_syll_text.split('\n')

In [27]:
sep = "<EOV>"
input_verses_val = [x.lstrip() + sep for x in syll_input_text.split(sep)][:-1]

enc_input_verses_val = input_tokenizer.texts_to_sequences(input_verses_val)

padded_input_verses_val = tf.keras.preprocessing.sequence.pad_sequences(
    enc_input_verses_val, padding="post"
)

The evaluate function can handle many syllabification tasks in parallel, generating each output sentence simultaneously until all outputs contain at least one \<EOV\> token. This is faster than handling one sentence at a time, however we found that giving the whole test set in parallel results in GPU out-of-memory, so we came up with this solution that seems to be a good trade-off between parallelism and memory consumption.

What we do is split the test set in batches of 100 verses, and call `evaluate` on one batch at a time passing the appropriate stopping condition.

As an empirical proof, try using a `window_size` of 1: you will see that the ETA will grow to several hours, while the whole process only took about 20 minutes in this experiment.

In [30]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

window_size = 100

result = []

for i in tqdm.tqdm(range(math.ceil(len(input_val) / window_size))):
    window = padded_input_verses_val[i*window_size:min((i + 1)*window_size, len(padded_input_verses_val))]
    
    encoder_input = tf.convert_to_tensor(window)
    decoder_input = tf.repeat([[start_symbol]], repeats=encoder_input.shape[0], axis=0)
    
    output = evaluate(transformer, encoder_input, decoder_input,  stop_symbol, stopping_condition=stop_after_stop_symbol)
    
    # Only take output before the first end of verse
    stripped_output = list(map(lambda x: x.split('<EOV>')[0], target_tokenizer.sequences_to_texts(output.numpy())))
    stripped_output = list(map(strip_tokens, stripped_output))
    
    result += stripped_output

100%|██████████| 36/36 [17:14<00:00, 28.73s/it]


In [33]:
exact_matches, similarities = zip(*validate_syllabification(result, correct_syll_text))

accuracy = sum(exact_matches) / len(exact_matches)
avg_similarities = np.mean(similarities)

print('Syllabification exact matches: {:.2f}%'.format(accuracy * 100))
print('Average similarity: {:.2f}'.format(avg_similarities))

Syllabification exact matches: 96.68%
Average similarity: 1.00


In [37]:
result = np.array(result)
correct_syll_text = np.array(correct_syll_text)
error_mask = ~np.array(exact_matches)

errors_output = result[error_mask]
errors_correct = correct_syll_text[error_mask]

In [38]:
errors_output

array(['|Ve|de|va |Tro|ia in |ce|ne|re e |in |ca|ver|ne;',
       '|le|gno è |più |sù |che |fu |mor|so |da E|va,',
       '|io |non |vi |di|scer|nea al|cu|na |co|sa.',
       '“ |Sum|mae |De|us |cle|men|tï|a”” |nel |se|no',
       '|di|ce|an:« |Chi |è |co|stui |che |san|za |mor|te',
       '|me|mo|ria, |in|tel|li|gen|za e |vo|lon|ta|de',
       '|con |ar|chi |e |a|sti|ciuo|le |pri|ma e|let|te;',
       '|Com’ |ei |par|la|va, e |Sor|del|lo a |sé |il |tras|se',
       '|le|gno è |più |sù |che |fu |mor|so |da E|va,',
       '|El |dis|se a |me:« |To|sto |ver|rà |di |sov|ra',
       '|è |Teg|ghia|io Al|do|bran|di, |la |cui |vo|ce',
       '|Mes|so |t’ ho |in|nan|zi: o|mai |per |te |ti |ci|ba;',
       '|con |tre |go|le |ca|ni|na|men|te |la|traa',
       '|Be|a|ti |pa|u|pe|res |spi|ri|tu!’ |vo|ci',
       '|e|ra o|no|ra|ta, es|sa e |suoi |con|sor|ti:',
       '|On|de, |pe|rò |che |a |l’ at|to |che |con|ce|pe',
       '|sì |che, |veg|gen|do|la |io |so|spe|sa e |va|ga,',
       '|quan|do |fu’ 

In [39]:
errors_correct

array(['|Ve|de|va |Tro|ia in |ce|ne|re e in |ca|ver|ne;',
       '|le|gno è |più |sù |che |fu |mor|so |da |E|va,',
       '|io |non |vi |di|scer|nea |al|cu|na |co|sa.',
       '“ |Sum|mae |De|us |cle|men|tï|ae” |nel |se|no',
       '|di|cean:« |Chi |è |co|stui |che |san|za |mor|te',
       '|me|mo|ria, in|tel|li|gen|za e |vo|lon|ta|de',
       '|con |ar|chi e |as|tic|ciuo|le |pri|ma e|let|te;',
       '|Com’ |ei |par|la|va, e |Sor|del|lo a |sé il |tras|se',
       '|le|gno è |più |sù |che |fu |mor|so |da |E|va,',
       '|El |dis|se a |me:« |To|sto |ver|rà |di |so|vra',
       '|è |Teg|ghiaio |Al|do|bran|di, |la |cui |vo|ce',
       '|Mes|so |t’ ho in|nan|zi: o|mai |per |te |ti |ci|ba;',
       '|con |tre |go|le |ca|ni|na|men|te |la|tra',
       '|‘be|a|ti |pau|pe|res |spi|ri|tu! ’ |vo|ci',
       '|e|ra o|no|ra|ta, es|sa |e |suoi |con|sor|ti:',
       '|On|de, |pe|rò |che a |l’ at|to |che |con|ce|pe',
       '|sì |che, |veg|gen|do|la io |so|spe|sa e |va|ga,',
       '|quan|do |fu’ |so

### 6.3 Syllabificatio of the Orlando Furioso
We performed the syllabification of the whole *Orlando Furioso* by Ludovico Ariosto and compared the results with the `hyphenation` algorithm from Neural Poetry.

In [30]:
def is_not_number(string):
    try:
        int(string)
        return False
    except:
        return True

def is_not_chapter(string):
    return not re.match(r'CANTO .*', string)

In [31]:
raw_text_ariosto = open("./data/orlando-textonly.txt", "rb").read().decode(encoding="utf-8")
raw_text_ariosto = raw_text_ariosto.split('\n')
raw_text_ariosto = list(map(lambda x: x.strip(), raw_text_ariosto))
raw_text_ariosto = list(filter(is_not_empty, raw_text_ariosto))
raw_text_ariosto = list(filter(is_not_number, raw_text_ariosto))
raw_text_ariosto = list(filter(is_not_chapter, raw_text_ariosto))

In [32]:
sep = "<EOV>"

raw_text_ariosto_joined = "\n".join(raw_text_ariosto)
text_ariosto = preprocess_text(raw_text_ariosto_joined, end_of_tercet='')
text_ariosto = re.sub("'", '’', text_ariosto)
ariosto_verses = [x.lstrip() + sep for x in text_ariosto.split(sep)][:-1]
enc_ariosto_verses = input_tokenizer.texts_to_sequences(ariosto_verses)

In [33]:
padded_enc_ariosto_verses = tf.keras.preprocessing.sequence.pad_sequences(
    enc_ariosto_verses, padding="post"
)

Actually produce the syllabification, the whole process requires a few hours. So we provide the results in the outputs folder. If you want to see the results just skip this cell and run the following ones.

In [None]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

window_size = 100

result = []

for i in tqdm.tqdm(range(math.ceil(len(ariosto_verses) / window_size))):
    window = padded_enc_ariosto_verses[i*window_size:min((i + 1)*window_size, len(padded_enc_ariosto_verses))]
    
    encoder_input = tf.convert_to_tensor(window)
    decoder_input = tf.repeat([[start_symbol]], repeats=encoder_input.shape[0], axis=0)
    
    output = evaluate(transformer, encoder_input, decoder_input,  stop_symbol, stopping_condition=stop_after_stop_symbol)
    
    # Only take output before the first end of verse
    stripped_output = list(map(lambda x: x.split('<EOV>')[0], target_tokenizer.sequences_to_texts(output.numpy())))
    stripped_output = list(map(strip_tokens, stripped_output))
    
    result += stripped_output

In [35]:
result = []
with open('outputs/orlando_syll.txt') as file:
    for line in file:
        result.append(line)

In [36]:
result

['|Le |don|ne, i |ca|val|lier, |l’ ar|me, |gli a|mo|ri,\n',
 '|le |cor|te|sie, |l’ au|da|ci im|pre|se io |can|to,\n',
 '|che |fu|ro al |tem|po |che |pas|sa|ro i |Mo|ri\n',
 '|d’ A|fri|ca il |ma|re, e |in |Fran|cia |noc|quer |tan|to,\n',
 '|se|guen|do |l’ i|re e i |gio|ve|nil |fu|ro|ri\n',
 '|d’ A|gra|man|te |lor |re, |che |si |diè |van|to\n',
 '|di |ven|di|car |la |mor|te |di |Tro|ia|no\n',
 '|so|pra |re |Car|lo im|pe|ra|tor |ro|ma|no.\n',
 '|Di|rò |d’ Or|lan|do in |un |me|de|smo |trat|to\n',
 '|co|sa |non |det|ta in |pro|sa |mai, |né |in |ri|ma:\n',
 '|che |per |a|mor |ven|ne in |fu|ro|re e |mat|to,\n',
 '|d’ uom |che |sì |sag|gio e|ra |sti|ma|to |pri|ma;\n',
 '|se |da |co|lei |che |tal |qua|si |m’ ha |fat|to,\n',
 '|che ’l |po|co in|ge|gno ad |or |ad |or |mi |li|ma,\n',
 '|me |ne |sa|rà |pe|rò |tan|to |con|ces|so,\n',
 '|che |mi |ba|sti a |fi|nir |quan|to ho |pro|mes|so.\n',
 '|Piac|cia|vi, |ge|ne|ro|sa Er|cu|lea |pro|le,\n',
 '|or|na|men|to e |splen|dor |del |se|col |no|stro,\n',
 '

Obtain alternative syllabification for the Orlando furioso (from Neural Poetry)

In [37]:
ariosto_alt_syll = list(map(hyphenation, raw_text_ariosto))
ariosto_alt_syll = list(map(lambda x: '|' + '|'.join(x), ariosto_alt_syll))

In [38]:
ariosto_alt_syll

["|Le |don|ne,| i |ca|val|lier|, l'ar|me,| gli| a|mo|ri,",
 "|le |cor|te|sie,| l'au|da|ci| im|pre|se| io |can|to,",
 '|che |fu|ro| al| tem|po |che |pas|sa|ro| i |Mo|ri',
 "|d'A|fri|ca| il| ma|re,| e| in| Fran|cia |noc|quer| tan|to,",
 "|se|guen|do |l'i|re| e| i |gio|ve|nil| fu|ro|ri",
 "|d'A|gra|man|te |lor| re,| che |si |di|è |van|to",
 '|di |ven|di|car| la |mor|te |di |Troi|a|no',
 '|so|pra |re |Car|lo| im|pe|ra|tor| ro|ma|no.',
 "|Di|rò |d'Or|lan|do| in| un| me|de|smo |trat|to",
 '|co|sa |non| det|ta| in| pro|sa |mai,| né| in| ri|ma:',
 '|che |per| a|mor| ven|ne| in| fu|ro|re| e |mat|to,',
 "|d'uom| che |sì |sag|gio| e|ra |sti|ma|to |pri|ma;",
 "|se |da |co|lei |che |tal| qua|si |m'ha |fat|to,",
 "|che |'l po|co| in|ge|gno| ad| or| ad| or| mi |li|ma,",
 '|me |ne |sa|rà |pe|rò |tan|to |con|ces|so,',
 '|che |mi |ba|sti| a |fi|nir| quan|to |ho |pro|mes|so.',
 '|Piac|cia|vi,| ge|ne|ro|sa| Er|cu|le|a |pro|le,',
 '|or|na|men|to| e |splen|dor| del| se|col| no|stro,',
 '|Ip|po|li|to,| ag|gr

Substitute quotes with the ones used in the Divine Comedy.

In [39]:
result = list(map(lambda x: re.sub('’', "'", x), result))

In [69]:
exact_matches, similarities = zip(*validate_syllabification(result, ariosto_alt_syll[:100]))

accuracy = sum(exact_matches) / len(exact_matches)
avg_similarities = np.mean(similarities)

print('Syllabification exact matches: {:.2f}%'.format(accuracy * 100))
print('Average similarity: {:.2f}'.format(avg_similarities))

Syllabification exact matches: 4.00%
Average similarity: 0.89


### 6.4 Syllabification of other poetry
We show once again that the model is able to handle text in metric forms other than the hendecasyllable.

In [40]:
arbitrary_verses = """
È una canzone senza titolo
Tanto pe’ cantà
Pe’ fa quarche cosa
Non è gnente de straordinario
È robba der paese nostro
Che se po’ cantà pure senza voce
Basta ’a salute
Quanno c'è 'a salute c'è tutto
Basta ’a salute e un par de scarpe nove
Poi girà tutto er monno
E m’a accompagno da me
Pe’ fa la vita meno amara
Me so’ comprato 'sta chitara
E quanno er sole scenne e more
Me sento ’n core cantatore
La voce e’ poca ma ’ntonata
Nun serve a fa ’na serenata
Ma solamente a fa 'n maniera
De famme ’n sogno a prima sera
Tanto pe’ cantà
Perché me sento un friccico ner core
Tanto pe’ sognà
Perché ner petto me ce naschi ’n fiore
Fiore de lillà
Che m'ariporti verso er primo amore
Che sospirava le canzoni mie
E m’aritontoniva de bucie
Canzoni belle e appassionate
Che Roma mia m’aricordate
Cantate solo pe’ dispetto
Ma co’ ’na smania dentro ar petto
Io nun ve canto a voce piena
Ma tutta l’anima è serena
E quanno er cielo se scolora
De me nessuna se ’nnamora
Tanto pe’ cantà
Perché me sento un friccico ner core
Tanto pe’ sognà
Perché ner petto me ce naschi un fiore
Fiore de lillà
Che m’ariporti verso er primo amore
Che sospirava le canzoni mie
E m’aritontoniva de bucie
"""

arbitrary_verses = preprocess_text(arbitrary_verses)
arbitrary_verses = [verse.strip() + ' <EOV>' for verse in arbitrary_verses.split('<EOV>')]

In [41]:
encoded_verses = input_tokenizer.texts_to_sequences(arbitrary_verses)
padded_verses = tf.keras.preprocessing.sequence.pad_sequences(
    encoded_verses, padding="post"
)

In [74]:
encoder_input = tf.convert_to_tensor(padded_verses)
decoder_input = tf.repeat([[start_symbol]], repeats=encoder_input.shape[0], axis=0)

output = evaluate(transformer, encoder_input, decoder_input,  stop_symbol, stopping_condition=stop_after_stop_symbol)

In [76]:
# Only take output before the first end of verse
stripped_output = list(map(lambda x: x.split('<EOV>')[0], target_tokenizer.sequences_to_texts(output.numpy())))
stripped_output = list(map(strip_tokens, stripped_output))

In [77]:
stripped_output

['|È |u|na |can|zo|ne |sen|za |ti|to|lo',
 '|Tan|to |pe’ |can|tà',
 '|Pe’ |fa |quar|che |co|sa',
 '|Non |è |gnen|te |de |stra|or|di|na|rio',
 '|È |rob|ba |der |pa|e|se |no|stro',
 '|Che |se |po’ |can|tà |pu|re |sen|za |vo|ce',
 '|Ba|sta ’a |sa|lu|te',
 '|Quan|no |cè |a |sa|lu|te |cè |tut|to',
 '|Ba|sta ’a |sa|lu|te e |un |par |de |scar|pe |no|ve',
 '|Poi |gi|rà |tut|to er |mon|no',
 '|E |m’ a |ac|com|pa|gno |da |me',
 '|Pe’ |fa |la |vi|ta |me|no a|ma|ra',
 '|Me |so’ |com|pra|to |sta |chi|ta|ra',
 '|E |quan|no er |so|le |scen|ne e |mo|ree',
 '|Me |sen|to ’n |co|re |can|ta|to|re',
 '|La |vo|ce e’ |po|ca |ma ’n|to|na|ta',
 '|Nun |ser|ve a |fa ’ |na |se|re|na|ta',
 '|Ma |so|la|men|te a |fa |na|ma|nie|ra',
 '|De |fam|me ’n |so|gno a |pri|ma |se|raa',
 '|Tan|to |pe’ |can|tà',
 '|Per|ché |me |sen|to un |fric|ci|co |ner |co|re',
 '|Tan|to |pe’ |so|gnà',
 '|Per|ché |ner |pet|to |me |ce |na|schi ’n |fio|re',
 '|Fio|re |de |lil|là',
 '|Che |ma|ri|por|ti |ver|so er |pri|mo a|mo|re',
 '|Che |so|spi