In [1]:
!tar zxvf deepcomedy.tar.gz
!tar zxvf data.tar.gz

deepcomedy/
deepcomedy/utils.py
deepcomedy/models/
deepcomedy/models/layers.py
deepcomedy/models/decoder_only.py
deepcomedy/models/transformer.py
deepcomedy/models/__pycache__/
deepcomedy/models/__pycache__/layers.cpython-37.pyc
deepcomedy/models/__pycache__/__init__.cpython-37.pyc
deepcomedy/models/__pycache__/transformer.cpython-37.pyc
deepcomedy/models/__init__.py
deepcomedy/models/.ipynb_checkpoints/
deepcomedy/models/.ipynb_checkpoints/transformer-checkpoint.py
deepcomedy/preprocessing.py
deepcomedy/__pycache__/
deepcomedy/__pycache__/utils.cpython-37.pyc
deepcomedy/__pycache__/__init__.cpython-37.pyc
deepcomedy/__pycache__/preprocessing.cpython-37.pyc
deepcomedy/metrics.py
deepcomedy/__init__.py
deepcomedy/.ipynb_checkpoints/
data/
data/orlando.txt
data/divina_textonly.txt
data/divina.txt
data/divina_syll_textonly.txt
data/orlando-textonly.txt
data/divina_syll.txt
data/.ipynb_checkpoints/
data/.ipynb_checkpoints/orlando-checkpoint.txt
data/.ipynb_checkpoints/orlando-textonly-chec

In [1]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *
from deepcomedy.utils import *
from deepcomedy.metrics import *
import tqdm

%load_ext autoreload
%autoreload 2

## 1. Data preprocessing

In [2]:
raw_text = open("./data/divina_textonly.txt", "rb").read().decode(encoding="utf-8")
raw_syll_text = (
    open("./data/divina_syll_textonly.txt", "rb").read().decode(encoding="utf-8")
)
syll_text = preprocess_text(raw_syll_text, end_of_tercet='')
text = preprocess_text(raw_text, end_of_tercet='', word_level= True)

Split preprocessed text into verses

In [3]:
sep = "<EOV>"
input_tercets = [x.lstrip() + sep for x in text.split(sep)][:-1]
target_tercets = [x.lstrip() + sep for x in syll_text.split(sep)][:-1]

Encode with input and target tokenizers

In [4]:
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
input_tokenizer.fit_on_texts(input_tercets)

target_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
target_tokenizer.fit_on_texts(target_tercets)

enc_input_tercets = input_tokenizer.texts_to_sequences(input_tercets)
enc_target_tercets = target_tokenizer.texts_to_sequences(target_tercets)

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [5]:
input_text = []
target_text = []
target_text_tercet = []

for line in range(len(enc_input_tercets) - 2):
    input_text.append(list(chain(*enc_input_tercets[line : line + 3])))
    target_text_tercet.append(list(chain(*enc_target_tercets[line : line + 3])))
    target_text.append(list(chain(*enc_target_tercets[line : line + 4])))

Pad sequences

In [6]:
padded_input_text = tf.keras.preprocessing.sequence.pad_sequences(
    input_text, padding="post"
)
padded_target_text = tf.keras.preprocessing.sequence.pad_sequences(
    target_text, padding="post"
)

In [7]:
input_train, input_val, target_train, target_val = train_test_split(padded_input_text, padded_target_text)

## 2. Hyperparameter sweep

In [9]:
sweep_config = {
    "name": "word2char-sweep-3",
    "method": "grid",
    "metric": {"name": "loss", "goal": "minimize"},
    "parameters": {
        "batch_size": {"value": 32},
        "epochs": {"value": 1},
        "num_layers": {"values": [4, 8, 12]},
        "num_heads": {"values": [4, 8]},
        "d_model": {"value": 256},
        "dff": {"value": 512},
    },
}

sweep_id = wandb.sweep(sweep_config, project='deepcomedy', entity='deepcomedy')

Create sweep with ID: hladvn8s
Sweep URL: https://wandb.ai/deepcomedy/deepcomedy/sweeps/hladvn8s


In [10]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

# Input for generation
encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]


def sweep():
    with wandb.init() as run:
        config = wandb.config
        dataset = make_dataset(input_train, target_train, batch_size=config["batch_size"])
        validation_dataset = make_dataset(input_val, target_val, batch_size=config["batch_size"])
        model, trainer = make_transformer_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path=None)
        trainer.train(dataset, config["epochs"], log_wandb=True, validation_dataset=validation_dataset, validation_every=5)

        result = generate(model, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 1, start_symbol, stop_symbol)
        html_result = re.sub(r'\n', '<br>', result)
        wandb.log({"generated": wandb.Html("<pre>" + html_result + "</pre>", inject=False)})
        
        path = './models/word2char-' + str(config['epochs']) + '-' + str(config['num_layers']) + '-' + str(config['num_heads']) + '-' + str(config['dff']) + '.h5'
        model.save_weights(path)
        
        wandb.save(path)
              
        # Generation metrics
        avg_syll, hend_ratio, rhymeness, plagiarism, correctness, incorrectness = generation_metrics(result)        
        
        wandb.log({
            'avg_syll': avg_syll, 
            'hend_ratio': hend_ratio,
            'plagiarism': plagiarism,
            'correctness': correctness,
            'incorrectness': incorrectness,
            'rhymeness': rhymeness,
        })
        
wandb.agent(sweep_id, function=sweep)

[34m[1mwandb[0m: Agent Starting Run: a9k46zp7 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	d_model: 256
[34m[1mwandb[0m: 	dff: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	num_heads: 4
[34m[1mwandb[0m: 	num_layers: 4
[34m[1mwandb[0m: Currently logged in as: [33malessandropacielli[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch 1 Batch 0 Loss 4.9843 Accuracy 0.0099
Epoch 1 Batch 50 Loss 4.0922 Accuracy 0.1314
Epoch 1 Batch 100 Loss 3.6105 Accuracy 0.1714
Epoch 1 Batch 150 Loss 3.3785 Accuracy 0.1917
Epoch 1 Batch 200 Loss 3.1525 Accuracy 0.2203
Epoch 1 Batch 250 Loss 2.9609 Accuracy 0.2450
Epoch 1 Batch 300 Loss 2.8147 Accuracy 0.2637
Epoch 1 Loss 2.7398 Accuracy 0.2731
Time taken for 1 epoch: 51.26 secs

['Nel mezzo del cammin di nostra vita', 'mi ritrovai per una selva oscura', 'ché la diritta via era smarrita', 'stosotrale toso che so sa lto tra seto so ta tose']
mi ritrovai per una selva oscura stosotrale toso che so sa lto tra seto so ta tose


VBox(children=(Label(value=' 16.63MB of 33.68MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.493688401…

0,1
train_loss,2.73982
train_accuracy,0.27313
_runtime,68.0
_timestamp,1622623540.0
_step,2.0
avg_syll,12.5
hend_ratio,0.75
plagiarism,0.59259
correctness,0.8
incorrectness,0.31


0,1
train_loss,▁
train_accuracy,▁
_runtime,▁█
_timestamp,▁█
_step,▁█
avg_syll,▁
hend_ratio,▁
plagiarism,▁
correctness,▁
incorrectness,▁


[34m[1mwandb[0m: Agent Starting Run: acja3ilv with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	d_model: 256
[34m[1mwandb[0m: 	dff: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	num_heads: 4
[34m[1mwandb[0m: 	num_layers: 8
[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


## 3. Training



In [11]:
dataset = make_dataset(input_train, target_train)
val_dataset = make_dataset(input_val, target_val)

In [20]:
config = {
    "num_layers" : 4,
    "d_model" : 256,
    "num_heads" : 4,
    "dff" : 1024,
}

In [11]:
transformer, transformer_trainer = make_transformer_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path=None)

In [36]:
transformer_trainer.train(dataset, 30, validation_dataset=val_dataset, validation_every=1)

Epoch 1 Batch 0 Loss 5.4091 Accuracy 0.0008
Epoch 1 Batch 50 Loss 4.2407 Accuracy 0.1238
Epoch 1 Batch 100 Loss 3.6921 Accuracy 0.1638
Epoch 1 Batch 150 Loss 3.4692 Accuracy 0.1800
Epoch 1 Batch 200 Loss 3.2968 Accuracy 0.1976
Epoch 1 Batch 250 Loss 3.1047 Accuracy 0.2225
Epoch 1 Batch 300 Loss 2.9436 Accuracy 0.2433
Epoch 1 Batch 0 Validation Loss 1.9496 Validation Accuracy 0.3826
Epoch 1 Batch 50 Validation Loss 1.9754 Validation Accuracy 0.3752
Epoch 1 Batch 100 Validation Loss 1.9726 Validation Accuracy 0.3754
Epoch 1 Loss 2.8584 Accuracy 0.2542
Time taken for 1 epoch: 96.52 secs

Epoch 2 Batch 0 Loss 2.0470 Accuracy 0.3592
Epoch 2 Batch 50 Loss 2.0151 Accuracy 0.3640
Epoch 2 Batch 100 Loss 1.9988 Accuracy 0.3661
Epoch 2 Batch 150 Loss 1.9869 Accuracy 0.3678
Epoch 2 Batch 200 Loss 1.9747 Accuracy 0.3702
Epoch 2 Batch 250 Loss 1.9627 Accuracy 0.3725
Epoch 2 Batch 300 Loss 1.9501 Accuracy 0.3754
Epoch 2 Batch 0 Validation Loss 1.8128 Validation Accuracy 0.4117
Epoch 2 Batch 50 Valida

KeyboardInterrupt: ignored

## 4. Generation

In [138]:
encoder_input = tf.convert_to_tensor(encoder_input)
decoder_input = tf.convert_to_tensor(decoder_input)

evaluate(transformer, encoder_input, decoder_input, stop_symbol, choose_next_token=choose_greedy)

<tf.Tensor: shape=(1, 188), dtype=int32, numpy=
array([[14,  1, 53,  3,  9,  2,  1, 17,  3, 28,  1, 28,  6,  2,  1, 13,
         3,  9,  2,  1, 12,  4, 17,  1, 17,  5,  7,  2,  1, 13,  5,  2,
         1,  7,  6,  1, 11, 10,  8,  4,  2,  1, 20,  5,  1, 10,  4, 15,
        14,  1, 17,  5,  2,  1,  8,  5,  1, 10,  8,  4,  1, 20,  5,  2,
         1, 18,  3,  8,  2,  1, 16,  1,  7,  4,  2,  1, 11,  3,  9,  1,
        20,  4,  2,  6,  1, 11, 12, 16,  1,  8,  4, 19, 15, 14,  1, 12,
        23, 37,  2,  1,  9,  4,  2,  1, 13,  5,  1, 20,  5,  1, 10,  4,
         2,  1, 20,  5,  4,  2,  1,  3,  1,  8,  4,  2,  1, 11, 17,  4,
         8,  1, 10,  5,  1, 10,  4, 25, 15, 14,  1, 48,  2,  1, 13,  5,
         2,  1, 26, 16,  3,  9,  2,  1, 12, 23,  3,  2,  1, 12, 23,  3,
         2,  1, 11,  5,  2,  1, 11, 10,  4,  1, 20,  4,  2,  1, 11,  5,
         2,  1,  8,  5,  1, 20,  6,  1, 11, 12,  3, 15]], dtype=int32)>

In [8]:
original_text = preprocess_text(raw_text, end_of_verse='\n', end_of_tercet='', start_of_verse='', word_level=True)
original_text = re.sub(r' <SEP> ', ' ', original_text)
original_text

# Get the set of real words from the Divine Comedy to evaluate word correctness
# TODO create function to obtain word-level vocabulary from divine comedy
word_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='\n-:,?“‘)—»«!”(";.’ ', lower=False)
word_tokenizer.fit_on_texts([raw_text])
real_words = set(word_tokenizer.word_index.keys())

def generation_metrics(result):
    
    result_verses = result.split("\n")
    
    avg_syll = average_syllables(result_verses)
    hend_ratio = correct_hendecasyllables_ratio(result_verses)
    
    result_verses = re.sub(r'\|', '', result)
    result_verses = remove_punctuation(result_verses)

    plagiarism = ngrams_plagiarism(result_verses, original_text)
    
    gen_tokenizer = tfds.deprecated.text.Tokenizer()
    gen_words = gen_tokenizer.tokenize(result_verses)
    
    correctness, _ = correct_words_ratio(gen_words, real_words, return_errors=True)
    incorrectness_score = incorrectness(set(gen_words), real_words)
            
    result_verses = result_verses.split('\n')
    print(result_verses)
    rhyme_ratio = chained_rhymes_ratio(result_verses)
    
    return avg_syll, hend_ratio, rhyme_ratio, plagiarism, correctness, incorrectness_score

def generate(transformer, input_sequence, target_sequence, input_tokenizer, target_tokenizer, steps, start_symbol, stop_symbol):

    result = strip_tokens(target_tokenizer.sequences_to_texts(target_sequence)[0])
    
    encoder_input = input_sequence
    decoder_input = target_sequence

    for _ in range(steps):

        encoder_input = tf.convert_to_tensor(encoder_input)
        decoder_input = tf.convert_to_tensor(decoder_input)
        output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, choose_next_token=choose_topk)

        # Detokenize output
        generated_text = target_tokenizer.sequences_to_texts(output.numpy())[0]
        
        # Remove structural tokens (<EOV>, <GO>, <SEP>)
        generated_text = strip_tokens(generated_text)
        
        # Split into verses
        generated_verses = generated_text.split('\n')
        
        # Append last generated verse to result
        result = '\n'.join([result, generated_verses[-1]])
        
        # Create input for next step by taking last three verses
        next_input = '\n'.join(generated_verses[-3:])
        next_input = preprocess_text(next_input, end_of_tercet='')
        
        decoder_input = target_tokenizer.texts_to_sequences([next_input])
        
        # The encoder input should not have syllable separators
        next_input = '\n'.join(generated_verses[-3:])
        next_input = remove_syll_token(next_input)
        next_input = preprocess_text(next_input, end_of_tercet='', word_level=True)
                
        encoder_input = input_tokenizer.texts_to_sequences([next_input])
        
    return result

In [67]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

result = generate(transformer, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 2, start_symbol, stop_symbol)

<GO> mi <SEP> ritrovai <SEP> per <SEP> una <SEP> selva <SEP> oscura ,  <EOV>  <GO> ché <SEP> la <SEP> diritta <SEP> via <SEP> era <SEP> smarrita .  <EOV>  <GO> Ne <SEP> la <SEP> folgór <SEP> de <SEP> l ’  <SEP> altra <SEP> selvaggia <SEP> vura <EOV>
<GO> ché <SEP> la <SEP> diritta <SEP> via <SEP> era <SEP> smarrita .  <EOV>  <GO> Ne <SEP> la <SEP> folgór <SEP> de <SEP> l ’  <SEP> altra <SEP> selvaggia <SEP> vura <EOV>  <GO> e <SEP> a <SEP> li <SEP> occhi <SEP> miei <SEP> andavan <SEP> sì <SEP> seda ,  <EOV>


In [47]:
result

'|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta\n|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,\n|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.\n|Co|sì |nel |suo |pec|ca|to e |nel |po|vu|ra\n|e |al |sel|vag|gian|te |de |la |stel|la\n|del |di|re et|ter|no e |fal|lo|ra |stel|le,\n|che |l’ u|na |vi|sta |se|co in |quel|la |bel|la\n|che |to|sto |nel |cor|po |d’ un |sa|li|re ac|quel|le\n|di|si|de|re e |a |la |spe|ran|za |stel|la;\n|e |poi |a |l’ oc|chio |suo |per |lo |spi|re\n|che |non |si |con|sor|men|ta |sen|tie |pec|ca;\n|e |quel|la |cie|la |se|con|da |tut|te |gi|ro,\n|se |non |po|tea |che |non |ve|dea |pec|ca|to\n|per |lo |mio |a|ves|se |quan|to |se|re|ro,\n|quan|do |la |boc|ca |tra|smu|ta|dia |ta|gno.\n|Per |lo |con|cor|do, |s’ a|vea |per |la |gran |fiu|na,\n|che |mi |fe|ce |di |là |da |qui |non |ve|nis|se,\n|sì |mi |vol|se |per |noi |non |si |mos|se,\n|e |non |m’ e|ra |chia|man|do |di |sen|sie|vi.\n|Co|me |vi |di|cea |e |non |mi |ri|chia|ro\n|pria |ch’ a |mez|zo |stel|la, |che |non |mi |vol

#### Ngrams plagiarism

In [49]:
x = re.sub(r'\|', '', result)
x = re.sub(r'[-:,?“‘\)—»«!”\(";.’]', " \g<0> ", x)
x

'Nel mezzo del cammin di nostra vita\nmi ritrovai per una selva oscura , \nché la diritta via era smarrita . \nCosì nel suo peccato e nel povura\ne al selvaggiante de la stella\ndel dire etterno e fallora stelle , \nche l ’  una vista seco in quella bella\nche tosto nel corpo d ’  un salire acquelle\ndisidere e a la speranza stella ; \ne poi a l ’  occhio suo per lo spire\nche non si consormenta sentie pecca ; \ne quella ciela seconda tutte giro , \nse non potea che non vedea peccato\nper lo mio avesse quanto serero , \nquando la bocca trasmutadia tagno . \nPer lo concordo ,  s ’  avea per la gran fiuna , \nche mi fece di là da qui non venisse , \nsì mi volse per noi non si mosse , \ne non m ’  era chiamando di sensievi . \nCome vi dicea e non mi richiaro\npria ch ’  a mezzo stella ,  che non mi volse , \nche mi sembiava mano a sé festia . \nAhi conoscendo il mio carcato colse , '

In [50]:
original_text = preprocess_text(raw_text, end_of_verse='\n', end_of_tercet='', start_of_verse='', word_level=True)
original_text = re.sub(r' <SEP> ', ' ', original_text)
original_text

'Nel mezzo del cammin di nostra vita\nmi ritrovai per una selva oscura , \nché la diritta via era smarrita . \nAhi quanto a dir qual era è cosa dura\nesta selva selvaggia e aspra e forte\nche nel pensier rinova la paura ! \nTant ’  è amara che poco è più morte ; \nma per trattar del ben ch ’ i ’  vi trovai , \ndirò de l ’ altre cose ch ’ i ’  v ’ ho scorte . \nIo non so ben ridir com ’  i ’  v ’ intrai , \ntant ’  era pien di sonno a quel punto\nche la verace via abbandonai . \nMa poi ch ’ i ’  fui al piè d ’ un colle giunto , \nlà dove terminava quella valle\nche m ’ avea di paura il cor compunto , \nguardai in alto e vidi le sue spalle\nvestite già de ’  raggi del pianeta\nche mena dritto altrui per ogne calle . \nAllor fu la paura un poco queta , \nche nel lago del cor m ’ era durata\nla notte ch ’ i ’  passai con tanta pieta . \nE come quei che con lena affannata , \nuscito fuor del pelago a la riva , \nsi volge a l ’ acqua perigliosa e guata , \ncosì l ’ animo mio ,  ch ’ ancor fu

In [52]:
ngrams_plagiarism(x, original_text)

0.10759493670886076

#### Word correctness

In [53]:
real_words = input_tokenizer.word_index.keys()

In [54]:
gen_tokenizer = tf.keras.preprocessing.text.Tokenizer()
gen_tokenizer.fit_on_texts([x])

In [55]:
gen_words = set(gen_tokenizer.word_index)

In [56]:
correct_words_ratio(gen_words, real_words, return_errors=True)

(0.8285714285714286,
 array(['selvaggiante', 'sensievi', 'festia', 'richiaro', 'conoscendo',
        'povura', 'consormenta', 'acquelle', 'ciela', 'fallora', 'serero',
        'concordo', 'fiuna', 'carcato', 'sentie', 'disidere', 'tagno',
        'trasmutadia'], dtype='<U12'))

In [57]:
incorrectness(gen_words, real_words)

0.28

#### Rhymeness

In [59]:
result_verses = result.split('\n')
rhyme_ratio = chained_rhymes_ratio(result_verses)

|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta |ché |la |di|rit|ta |via |e|ra |smar|ri|ta.
|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra, |Co|sì |nel |suo |pec|ca|to e |nel |po|vu|ra
|Co|sì |nel |suo |pec|ca|to e |nel |po|vu|ra |del |di|re et|ter|no e |fal|lo|ra |stel|le,
|che |l’ u|na |vi|sta |se|co in |quel|la |bel|la |di|si|de|re e |a |la |spe|ran|za |stel|la;
|che |to|sto |nel |cor|po |d’ un |sa|li|re ac|quel|le |e |poi |a |l’ oc|chio |suo |per |lo |spi|re
|e |poi |a |l’ oc|chio |suo |per |lo |spi|re |e |quel|la |cie|la |se|con|da |tut|te |gi|ro,
|che |non |si |con|sor|men|ta |sen|tie |pec|ca; |se |non |po|tea |che |non |ve|dea |pec|ca|to
|se |non |po|tea |che |non |ve|dea |pec|ca|to |quan|do |la |boc|ca |tra|smu|ta|dia |ta|gno.
|per |lo |mio |a|ves|se |quan|to |se|re|ro, |Per |lo |con|cor|do, |s’ a|vea |per |la |gran |fiu|na,
|Per |lo |con|cor|do, |s’ a|vea |per |la |gran |fiu|na, |sì |mi |vol|se |per |noi |non |si |mos|se,
|che |mi |fe|ce |di |là |da |qui |non |ve|nis|se, |e |non |

In [60]:
rhyme_ratio

0.07142857142857142

#### All generation metrics

In [68]:
avg_syll, hend_ratio, rhyme_ratio, plagiarism, correctness, incorrectness_score = generation_metrics(result)        

['Nel mezzo del cammin di nostra vita', 'mi ritrovai per una selva oscura', 'ché la diritta via era smarrita', 'Ne la folgór de l altra selvaggia vura', 'e a li occhi miei andavan sì seda']


## 5. Syllabification

In [109]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

In order to perform syllabification we pass the tercet to be syllabified to the encoder and the `start_symbol` to the decoder.

In [110]:
encoder_input = tf.convert_to_tensor([input_text[0]])
decoder_input = tf.convert_to_tensor([[start_symbol]])

In [94]:
syll_output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, max_length=400)

In [111]:
syll_output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, max_length=400)
syll_output = target_tokenizer.sequences_to_texts(syll_output.numpy())[0]
syll_output = strip_tokens(syll_output)
syll_output = syll_output.split('\n')[:3] # Only take first 3 produced verses
syll_output

['|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta',
 '|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,',
 '|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.']

In [112]:
syll_correct = target_text_tercet[0]
syll_correct = target_tokenizer.sequences_to_texts([syll_correct])[0]
syll_correct = strip_tokens(syll_correct)
syll_correct = syll_correct.split('\n')
syll_correct

['|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta',
 '|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,',
 '|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.']

In [72]:
for i in range(0, 10, 3):
    print(strip_tokens(target_tokenizer.sequences_to_texts([target_text_tercet[i]])[0]))

|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta
|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,
|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.
|Ahi |quan|to a |dir |qual |e|ra è |co|sa |du|ra
|e|sta |sel|va |sel|vag|gia e |a|spra e |for|te
|che |nel |pen|sier |ri|no|va |la |pa|u|ra!
|Tan|t’ è |a|ma|ra |che |po|co è |più |mor|te;
|ma |per |trat|tar |del |ben |ch’ i’ |vi |tro|vai,
|di|rò |de |l’ al|tre |co|se |ch’ i’ |v’ ho |scor|te.
|Io |non |so |ben |ri|dir |com’ |i’ |v’ in|trai,
|tan|t’ e|ra |pien |di |son|no |a |quel |pun|to
|che |la |ve|ra|ce |via |ab|ban|do|nai.


In [83]:
input_tokenizer.sequences_to_texts([input_text[0]])

['<GO> Nel <SEP> mezzo <SEP> del <SEP> cammin <SEP> di <SEP> nostra <SEP> vita <EOV> <GO> mi <SEP> ritrovai <SEP> per <SEP> una <SEP> selva <SEP> oscura, <EOV> <GO> ché <SEP> la <SEP> diritta <SEP> via <SEP> era <SEP> smarrita. <EOV>']

In [142]:
n = 10
N = 3 * n

output = []

for i in tqdm.tqdm(range(0, N, 3)):
    
    encoder_input = tf.convert_to_tensor([input_text[i]])
    decoder_input = tf.convert_to_tensor([[start_symbol]])
        
    syll_output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, max_length=400)
    syll_output = target_tokenizer.sequences_to_texts(syll_output.numpy())[0]
    syll_output = strip_tokens(syll_output)
    syll_output = syll_output.split('\n')[:3] # Only take first 3 produced verses 
        
    output += syll_output
    

100%|██████████| 10/10 [02:12<00:00, 13.21s/it]


In [148]:
syll_correct = target_tokenizer.sequences_to_texts(target_text_tercet[:N:3])
syll_correct = ''.join(syll_correct)
syll_correct = strip_tokens(syll_correct)
syll_correct = syll_correct.split('\n')
syll_correct

['|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta',
 '|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,',
 '|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.',
 '|Ahi |quan|to a |dir |qual |e|ra è |co|sa |du|ra',
 '|e|sta |sel|va |sel|vag|gia e |a|spra e |for|te',
 '|che |nel |pen|sier |ri|no|va |la |pa|u|ra!',
 '|Tan|t’ è |a|ma|ra |che |po|co è |più |mor|te;',
 '|ma |per |trat|tar |del |ben |ch’ i’ |vi |tro|vai,',
 '|di|rò |de |l’ al|tre |co|se |ch’ i’ |v’ ho |scor|te.',
 '|Io |non |so |ben |ri|dir |com’ |i’ |v’ in|trai,',
 '|tan|t’ e|ra |pien |di |son|no |a |quel |pun|to',
 '|che |la |ve|ra|ce |via |ab|ban|do|nai.',
 '|Ma |poi |ch’ i’ |fui |al |piè |d’ un |col|le |giun|to,',
 '|là |do|ve |ter|mi|na|va |quel|la |val|le',
 '|che |m’ a|vea |di |pa|u|ra il |cor |com|pun|to,',
 '|guar|dai |in |al|to e |vi|di |le |sue |spal|le',
 '|ve|sti|te |già |de’ |rag|gi |del |pia|ne|ta',
 '|che |me|na |drit|to al|trui |per |o|gne |cal|le.',
 '|Al|lor |fu |la |pa|u|ra un |po|co |que|ta,',
 '|che |nel |la|go |del 

In [146]:
output

['|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta',
 '|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,',
 '|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.',
 '|Ahi |quan|to a |dir |qual |e|ra è |co|sa |du|ra',
 '|e|sta |sel|va |sel|vag|gia e |a|spra e |for|te',
 '|che |nel |pen|sier |ri|no|va |la |pa|u|ra',
 '|Tan|t’ è |a|ma|ra |che |po|co è |più |mor|te;',
 '|ma |per |trat|tar |del |ben |ch’ i’ |vi |tro|vai,',
 '|di|rò |de |l’ al|tre |co|se |ch’ i’ |v’ ho |scor|te.',
 '|Io |non |so |ben |ri|dir |com’ |i’ |v’ in|trai,',
 '|tan|t’ e|ra |pien |di |son|no |a |quel |pun|to',
 '|che |la |ve|ra|ce |via |ab|ban|do|nai.',
 '|Ma |poi |ch’ i’ |fui |al |piè |d’ un |col|le |giun|to,',
 '|là |do|ve |ter|mi|na|va |quel|la |val|le',
 '|che |m’ a|vea |di |pa|u|ra il |cor |com|pun|to,',
 '|guar|dai |in |al|to e |vi|di |le |sue |spal|le',
 '|ve|sti|te |già |de’ |rag|gi |del |pia|ne|ta',
 '|che |me|na |drit|to al|trui |per |o|gne |cal|le.',
 '|Al|lor |fu |la |pa|u|ra un |po|co |que|ta,',
 '|che |nel |la|go |del |

In [151]:
def validate_syllabification(prod, target):
    """
    Evaluates the correctness of produced syllabification with a correct reference.
    
    prod: list[string] produced syllabification as a list of verses
    target: list[string] correct syllabification as a list of verses
    
    returns for each verse, whether or not the produced verse is correctly syllabified (exact match) 
        and the edit distance between the produced string and the target
    
    """
    
    levenshtein = NormalizedLevenshtein()
    return [(x == y, levenshtein.similarity(x, y)) for x, y in zip(prod, target)]

## 6. Save model

In [23]:
transformer.save_weights('models/w2c-gen.h5')

NameError: name 'transformer' is not defined

In [27]:
config = {
    "num_layers" : 4,
    "d_model" : 256,
    "num_heads" : 4,
    "dff" : 512,
}

new_transformer = Transformer(
        num_layers=config["num_layers"],
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        dff=config["dff"],
        input_vocab_size=input_vocab_size,
        target_vocab_size=target_vocab_size,
        pe_input=1000,
        pe_target=1000,
        rate=0.1,
    )

In [28]:
# In order to load the new weights the model should be called once for the variables to be initialized


start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

# Any inp, tar is ok here
inp = tf.convert_to_tensor([[start_symbol]])
tar = tf.convert_to_tensor([[start_symbol]])

enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp, tar)

new_transformer(inp, tar, False, enc_padding_mask, look_ahead_mask, dec_padding_mask);

In [29]:
new_transformer.load_weights('models/word2char-4-4-512.h5')

In [30]:
transformer = new_transformer

In [39]:
encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

result = generate(new_transformer, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 6, start_symbol, stop_symbol)

<GO> mi <SEP> ritrovai <SEP> per <SEP> una <SEP> selva <SEP> oscura ,  <EOV>  <GO> ché <SEP> la <SEP> diritta <SEP> via <SEP> era <SEP> smarrita .  <EOV>  <GO> Nel <SEP> segno <SEP> de <SEP> la <SEP> spella <SEP> si <SEP> divosca ,  <EOV>
<GO> ché <SEP> la <SEP> diritta <SEP> via <SEP> era <SEP> smarrita .  <EOV>  <GO> Nel <SEP> segno <SEP> de <SEP> la <SEP> spella <SEP> si <SEP> divosca ,  <EOV>  <GO> e <SEP> in <SEP> altra <SEP> voi <SEP> e <SEP> sempre <SEP> stellati ,  <EOV>
<GO> Nel <SEP> segno <SEP> de <SEP> la <SEP> spella <SEP> si <SEP> divosca ,  <EOV>  <GO> e <SEP> in <SEP> altra <SEP> voi <SEP> e <SEP> sempre <SEP> stellati ,  <EOV>  <GO> dinanzi <SEP> da <SEP> Piccala <SEP> e <SEP> Perdanise ;  <EOV>
<GO> e <SEP> in <SEP> altra <SEP> voi <SEP> e <SEP> sempre <SEP> stellati ,  <EOV>  <GO> dinanzi <SEP> da <SEP> Piccala <SEP> e <SEP> Perdanise ;  <EOV>  <GO> ma <SEP> quel <SEP> che <SEP> solo <SEP> a <SEP> me <SEP> non <SEP> vi <SEP> d ’  <SEP> ira ,  <EOV>
<GO> dinanzi <SEP>

In [41]:
print(result)

|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta
|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,
|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.
|Nel |se|gno |de |la |spel|la |si |di|vo|sca,
|e |in |al|tra |voi |e |sem|pre |stel|la|ti,
|di|nan|zi |da |Pic|ca|la e |Per|da|ni|se;
|ma |quel |che |so|lo a |me |non |vi |d’ i|ra,
|quan|do |la |ter|ra |di |sù |e |mi |vi|va,
|che |l’ al|ta |par|te |ve|ni|tà |si |gi|ra.
