In [2]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *
from deepcomedy.utils import *
from deepcomedy.metrics import *
import tqdm

from nlgpoetry.hyphenation import *

%load_ext autoreload
%autoreload 2

## 1. Data preprocessing

In [3]:
raw_text = open("./data/divina_textonly.txt", "rb").read().decode(encoding="utf-8")
raw_syll_text = (
    open("./data/divina_syll_textonly.txt", "rb").read().decode(encoding="utf-8")
)
syll_text = preprocess_text(raw_syll_text, end_of_tercet='')
text = preprocess_text(raw_text, end_of_tercet='')

Split preprocessed text into verses

In [4]:
sep = "<EOV>"
input_tercets = [x.lstrip() + sep for x in text.split(sep)][:-1]
target_tercets = [x.lstrip() + sep for x in syll_text.split(sep)][:-1]

Encode with input and target tokenizers

In [5]:
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
input_tokenizer.fit_on_texts(input_tercets)

target_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
target_tokenizer.fit_on_texts(target_tercets)

enc_input_tercets = input_tokenizer.texts_to_sequences(input_tercets)
enc_target_tercets = target_tokenizer.texts_to_sequences(target_tercets)

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [6]:
input_text = []
target_text = []
target_text_tercet = []

for line in range(len(enc_input_tercets) - 2):
    input_text.append(list(chain(*enc_input_tercets[line : line + 3])))
    target_text_tercet.append(list(chain(*enc_target_tercets[line : line + 3])))
    target_text.append(list(chain(*enc_target_tercets[line : line + 4])))

Pad sequences

In [7]:
padded_input_text = tf.keras.preprocessing.sequence.pad_sequences(
    input_text, padding="post"
)
padded_target_text = tf.keras.preprocessing.sequence.pad_sequences(
    target_text, padding="post"
)
padded_target_text_tercet = tf.keras.preprocessing.sequence.pad_sequences(
    target_text_tercet, padding="post"
)

In [8]:
input_train, input_val, target_train, target_val, target_tercet_train, target_tercet_val = train_test_split(padded_input_text, padded_target_text, padded_target_text_tercet)

In [16]:
transformer = load_transformer_model(config, input_vocab_size, target_vocab_size, target_tokenizer, './models/c2c-gen.h5')

## 2. Hyperparameter sweep

In [309]:
sweep_config = {
    "name": "char2char-sweep-2",
    "method": "grid",
    "metric": {"name": "loss", "goal": "minimize"},
    "parameters": {
        "batch_size": {"value": 32},
        "epochs": {"value": 70},
        "num_layers": {"values": [8, 12]},
        "num_heads": {"value": 4},
        "d_model": {"value": 256},
        "dff": {"value": 512},
    },
}

sweep_id = wandb.sweep(sweep_config, project='deepcomedy', entity='deepcomedy')

Create sweep with ID: i08xtnzs
Sweep URL: https://wandb.ai/deepcomedy/deepcomedy/sweeps/i08xtnzs


In [306]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

# Input for generation
encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

def sweep():
    with wandb.init() as run:
        config = wandb.config
        dataset = make_dataset(input_train, target_train, batch_size=config["batch_size"])
        validation_dataset = make_dataset(input_val, target_val, batch_size=config["batch_size"])
        model, trainer = make_transformer_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path=None)
        trainer.train(dataset, config["epochs"], log_wandb=True, validation_dataset=validation_dataset, validation_every=5)
        
        # Generate
        result = generate(model, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 1, start_symbol, stop_symbol)
        html_result = re.sub(r'\n', '<br>', result)
        wandb.log({"generated": wandb.Html("<pre>" + html_result + "</pre>", inject=False)})
        
        path = './models/char2char-' + str(config['epochs']) + '-' + str(config['num_layers']) + '-' + str(config['num_heads']) + '-' + str(config['dff']) + '.h5'
        model.save_weights(path)
        
        wandb.save(path)
              
        # Generation metrics
        print(result)
        avg_syll, hend_ratio, plagiarism, correctness, incorrectness, rhymeness = generation_metrics(result)
        
        wandb.log({
            'avg_syll': avg_syll, 
            'hend_ratio': hend_ratio,
            'plagiarism': plagiarism,
            'correctness': correctness,
            'incorrectness': incorrectness,
            'rhymeness': rhymeness,
        })
        
        
wandb.agent(sweep_id, function=sweep)

[34m[1mwandb[0m: Agent Starting Run: 6lj17gch with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	d_model: 256
[34m[1mwandb[0m: 	dff: 512
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	num_heads: 4
[34m[1mwandb[0m: 	num_layers: 8
[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch 1 Batch 0 Loss 5.0637 Accuracy 0.0283
Epoch 1 Batch 50 Loss 3.9751 Accuracy 0.1375
Epoch 1 Batch 100 Loss 3.5455 Accuracy 0.1719
Epoch 1 Batch 150 Loss 3.3695 Accuracy 0.1858
Epoch 1 Batch 200 Loss 3.2719 Accuracy 0.1937
Epoch 1 Batch 250 Loss 3.1430 Accuracy 0.2091
Epoch 1 Batch 300 Loss 2.9924 Accuracy 0.2302
Epoch 1 Loss 2.9071 Accuracy 0.2421
Time taken for 1 epoch: 104.98 secs

|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta
|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,
|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.
|di |di|l|li|ti |se |ti |tri |sta |tr|di |tr|che|di |char|di|la|le|ti|lal |ti |r|ti|tia|stre |li |tue |schiari |tre |to|de |ti|di |tri |tan|tr|di |tera |ste |sti |tria|scon|ti|stare |stri|ti|ti |ti |
Hend ok!
Plagiarism ok
Correctness ok
mi ritrovai per una selva oscura di dilliti se ti tri sta trdi trchedi chardilaletilal ti rtitiastre li tue schiari tre tode tidi tri tantrdi tera ste sti triascontistare strititi ti 
Rhymes ok


VBox(children=(Label(value=' 40.80MB of 40.80MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
[34m[1mwandb[0m: [32m[41mERROR[0m Problem finishing run
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/wandb/sdk/wandb_run.py", line 1532, in _atexit_cleanup
    self._on_finish()
  File "/usr/local/lib/python3.6/dist-packages/wandb/sdk/wandb_run.py", line 1709, in _on_finish
    self._backend.cleanup()
  File "/usr/local/lib/python3.6/dist-packages/wandb/sdk/backend/backend.py", line 155, in cleanup
    self.wandb_process.join()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 124, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 50, in wait
    return self.poll(os.WNOHANG if timeout == 0.0 else 0)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 28, in poll
    pid, sts = os.waitpid(self.pid, flag)
Exception


## 3. Training

In [10]:
dataset = make_dataset(input_train, target_train)
val_dataset = make_dataset(input_val, target_val)

In [11]:
config = {
    "num_layers" : 4,
    "d_model" : 256,
    "num_heads" : 4,
    "dff" : 512,
}

In [91]:
transformer, transformer_trainer = make_transformer_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path= None)

In [None]:
wandb.init()
transformer_trainer.train(dataset, 30, validation_dataset=val_dataset, validation_every=1, log_wandb=True)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch 1 Batch 0 Loss 5.2556 Accuracy 0.0096
Epoch 1 Batch 50 Loss 4.1858 Accuracy 0.1067
Epoch 1 Batch 100 Loss 3.6529 Accuracy 0.1588
Epoch 1 Batch 150 Loss 3.4032 Accuracy 0.1833
Epoch 1 Batch 200 Loss 3.1710 Accuracy 0.2137
Epoch 1 Batch 250 Loss 2.9775 Accuracy 0.2399
Epoch 1 Batch 300 Loss 2.8283 Accuracy 0.2600
Epoch 1 Batch 350 Loss 2.7127 Accuracy 0.2754
Epoch 1 Batch 400 Loss 2.6215 Accuracy 0.2877
Epoch 1 Batch 0 Validation Loss 1.8824 Validation Accuracy 0.3955
Epoch 1 Batch 50 Validation Loss 1.8906 Validation Accuracy 0.3962
Epoch 1 Batch 100 Validation Loss 1.8892 Validation Accuracy 0.3961
Epoch 1 Loss 2.5568 Accuracy 0.2964
Time taken for 1 epoch: 87.73 secs

Epoch 2 Batch 0 Loss 1.9097 Accuracy 0.3937
Epoch 2 Batch 50 Loss 1.9288 Accuracy 0.3828
Epoch 2 Batch 100 Loss 1.9184 Accuracy 0.3855
Epoch 2 Batch 150 Loss 1.9057 Accuracy 0.3885
Epoch 2 Batch 200 Loss 1.8911 Accuracy 0.3926
Epoch 2 Batch 250 Loss 1.8755 Accuracy 0.3968
Epoch 2 Batch 300 Loss 1.8590 Accuracy 0.40

## 4. Generation

In [271]:
def generate(transformer, input_sequence, target_sequence, input_tokenizer, target_tokenizer, steps, start_symbol, stop_symbol):

    result = strip_tokens(target_tokenizer.sequences_to_texts(target_sequence)[0])
    
    encoder_input = input_sequence
    decoder_input = target_sequence

    for _ in range(steps):

        encoder_input = tf.convert_to_tensor(encoder_input)
        decoder_input = tf.convert_to_tensor(decoder_input)
        output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, choose_next_token=choose_topk)

        # Detokenize output
        generated_text = target_tokenizer.sequences_to_texts(output.numpy())[0]
        
        # Remove structural tokens (<EOV>, <GO>, <SEP>)
        generated_text = strip_tokens(generated_text)
        
        # Split into verses
        generated_verses = generated_text.split('\n')
        
        # Append last generated verse to result
        result = '\n'.join([result, generated_verses[-1]])
        
        # Create input for next step by taking last three verses
        next_input = '\n'.join(generated_verses[-3:])
        next_input = preprocess_text(next_input, end_of_tercet='')
        
        decoder_input = target_tokenizer.texts_to_sequences([next_input])
        
        # The encoder input should not have syllable separators
        encoder_input = remove_syll_token(next_input)
        encoder_input = input_tokenizer.texts_to_sequences([encoder_input])
        
    return result

In [292]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

result = generate(transformer, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 1, start_symbol, stop_symbol)

In [299]:
print(result)

|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta
|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,
|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.
|E |quel|la |che |pa|ro|la |più |ch’ io |cu|ra


In [293]:
avg_syll, hend_ratio, rhyme_ratio, plagiarism, correctness, incorrectness_score = generation_metrics(result)

In [294]:
print("average syllables per verse: {}\nhendecasyllables ratio: {}\nrhyme_ratio: {}\nngrams plagiarism: {}\ncorrectness: {}\nincorrectness_score: {}\n"\
      .format(avg_syll, hend_ratio, rhyme_ratio, plagiarism, correctness, incorrectness_score))

average syllables per verse: 11.0
hendecasyllables ratio: 1.0
rhyme_ratio: 1.0
ngrams plagiarism: 0.6666666666666666
correctness: 1.0
incorrectness_score: 0.0



### Hendecasyllabicness

In [62]:
result_verses = stripped_result.split("\n")

In [64]:
avg_syll = average_syllables(result_verses)

In [65]:
hend_ratio = correct_hendecasyllables_ratio(result_verses)

In [69]:
print('Average number of syllables per verse: {}'.format(avg_syll))
print('Ratio of hendecasyllables to total number of verses: {}'.format(hend_ratio))

Average number of syllables per verse: 11.0
Ratio of hendecasyllables to total number of verses: 1.0


### Ngrams plagiarism

In [71]:
original_text = preprocess_text(raw_text, end_of_verse='\n', end_of_tercet='', start_of_verse='', word_level=True)
original_text = re.sub(r' <SEP> ', ' ', original_text)

In [72]:
result_verses = re.sub(r'\|', '', strip_result)
result_verses = remove_punctuation(result_verses)

In [73]:
plagiarism = ngrams_plagiarism(result_verses, original_text)

In [74]:
print('Plagiarism: {}'.format(plagiarism))

Plagiarism: 0.425


### Word correctness

In [75]:
word_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='\n-:,?“‘)—»«!”(";.’ ', lower=False)
word_tokenizer.fit_on_texts([raw_text])
real_words = set(word_tokenizer.word_index.keys())

In [76]:
gen_tokenizer = tfds.deprecated.text.Tokenizer()
gen_words = tokenizer.tokenize(result_verses)

In [78]:
correctness, errors = correct_words_ratio(gen_words, real_words, return_errors=True)
incorrectness_score = incorrectness(set(gen_words), real_words)

In [82]:
print('Correct words: {:.2f}%'.format(correctness * 100))

Correct words: 97.67%


The incorrectness score also considers "how wrong" a word is, by computing the edit distance between an incorrect word and the nearest correct word in the vocabulary.

In [83]:
print('Incorrectness score: {}'.format(incorrectness_score))

Incorrectness score: 0.06


### Rhymeness

In [86]:
result_verses = result_verses.split('\n')
rhyme_ratio = chained_rhymes_ratio(result_verses)

In [88]:
print('Correct rhymes ratio: {}'.format(rhyme_ratio))

Correct rhymes ratio: 1.0


### Generation metrics function

The following computes all generation metrics

In [307]:
original_text = preprocess_text(raw_text, end_of_verse='\n', end_of_tercet='', start_of_verse='', word_level=True)
original_text = re.sub(r' <SEP> ', ' ', original_text)
original_text

# Get the set of real words from the Divine Comedy to evaluate word correctness
# TODO create function to obtain word-level vocabulary from divine comedy
word_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='\n-:,?“‘)—»«!”(";.’ ', lower=False)
word_tokenizer.fit_on_texts([raw_text])
real_words = set(word_tokenizer.word_index.keys())

def generation_metrics(result):
    # Generation metrics
    # · Average syllables
    # · Hendecasyllabicness
    # · Correctness
    # · Ngrams-plagiarism
    # · Rhymeness
    
    result_verses = result.split("\n")
    
    avg_syll = average_syllables(result_verses)
    hend_ratio = correct_hendecasyllables_ratio(result_verses)
    
    result_verses = re.sub(r'\|', '', result)
    result_verses = remove_punctuation(result_verses)

    plagiarism = ngrams_plagiarism(result_verses, original_text)
    
    gen_tokenizer = tfds.deprecated.text.Tokenizer()
    gen_words = tokenizer.tokenize(result_verses)
    
    correctness, _ = correct_words_ratio(gen_words, real_words, return_errors=True)
    incorrectness_score = incorrectness(set(gen_words), real_words)
            
    result_verses = result_verses.split('\n')
    rhyme_ratio = chained_rhymes_ratio(result_verses)
    
    return avg_syll, hend_ratio, rhyme_ratio, plagiarism, correctness, incorrectness_score

## 5. Syllabification

In order to perform syllabification we pass the tercet to the encoder and the `start_symbol` to the decoder.

In [89]:
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

def syllabify_tercets(transformer, input_text):

    output = []
    
    for i in tqdm.tqdm(range(len(input_text))):

        encoder_input = tf.convert_to_tensor([input_text[i]])
        decoder_input = tf.convert_to_tensor([[start_symbol]])

        syll_output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, max_length=400)
        syll_output = target_tokenizer.sequences_to_texts(syll_output.numpy())[0]
        syll_output = strip_tokens(syll_output)
        syll_output = syll_output.split('\n')[:3] # Only take first 3 produced verses 

        output += syll_output
        
    return output

In [281]:
n_verses = 9

# Obtain tercets from input and target "windows"
syll_input = input_text[:n_verses:3]
syll_input_text = strip_tokens(' '.join(input_tokenizer.sequences_to_texts(syll_input)))

correct_syll = target_text_tercet[:n_verses:3]
correct_syll = ' '.join(target_tokenizer.sequences_to_texts(correct_syll))
correct_syll = strip_tokens(correct_syll)
correct_syll = correct_syll.split('\n')

In [282]:
syll_output = syllabify_tercets(transformer, syll_input)

100%|██████████| 3/3 [00:34<00:00, 11.34s/it]


In [283]:
validate_syllabification(syll_output, correct_syll)

[(True, 1.0),
 (True, 1.0),
 (True, 1.0),
 (True, 1.0),
 (True, 1.0),
 (True, 1.0),
 (True, 1.0),
 (True, 1.0),
 (True, 1.0)]

Validation of the syllabification on input_val

In [113]:
input_val_non_padded=[]
for i in range(len(input_val)):
    input_val_non_padded.append(input_val[i][input_val[i]!=0])

target_tercet_val_non_padded=[]
for i in range(len(target_tercet_val)):
    target_tercet_val_non_padded.append(target_tercet_val[i][target_tercet_val[i]!=0])

In [146]:
syll_input = input_val_non_padded[:10]
correct_syll = target_tercet_val_non_padded[:10]
correct_syll = target_tokenizer.sequences_to_texts(correct_syll)
correct_syll = strip_tokens(' '.join(correct_syll))
correct_syll = correct_syll.split('\n')

In [147]:
syll_output = syllabify_tercets(transformer, syll_input)

100%|██████████| 10/10 [02:05<00:00, 12.57s/it]


In [150]:
correct_verses, distances = zip(*validate_syllabification(syll_output, correct_syll))

In [155]:
print('Syllabification accuracy: {:.2f}%'.format(sum(correct_verses) / len(correct_verses) * 100))

Syllabification accuracy: 100.00%


In [159]:
print('Average levenshtein similarity to correct syllabification: {}'.format(np.mean(distances)))

Average levenshtein similarity to correct syllabification: 1.0


Syllabification of the *Orlando Furioso* and comparison with Neural Poetry

In [160]:
def is_not_number(string):
    try:
        int(string)
        return False
    except:
        return True

def is_not_chapter(string):
    return not re.match(r'CANTO .*', string)

In [161]:
raw_text_ariosto = open("./data/orlando-textonly.txt", "rb").read().decode(encoding="utf-8")
raw_text_ariosto = raw_text_ariosto.split('\n')
raw_text_ariosto = list(map(lambda x: x.strip(), raw_text_ariosto))
raw_text_ariosto = list(filter(is_not_empty, raw_text_ariosto))
raw_text_ariosto = list(filter(is_not_number, raw_text_ariosto))
raw_text_ariosto = list(filter(is_not_chapter, raw_text_ariosto))

In [173]:
sep = "<EOV>"

raw_text_ariosto_joined = "\n".join(raw_text_ariosto)
text_ariosto = preprocess_text(raw_text_ariosto_joined, end_of_tercet='')
ariosto_verses = [x.lstrip() + sep for x in text_ariosto.split(sep)][:-1]
enc_ariosto_verses = input_tokenizer.texts_to_sequences(ariosto_verses)

Divide into tercets

In [175]:
ariosto_tercets = []

for line in range(len(enc_ariosto_verses) - 2):
    ariosto_tercets.append(list(chain(*enc_ariosto_verses[line : line + 3])))

Produce syllabification

In [196]:
syll_output = syllabify_tercets(transformer, ariosto_tercets[:10])

100%|██████████| 10/10 [01:54<00:00, 11.48s/it]


Obtain alternative syllabification for the Orlando furioso (from Neural Poetry)

In [193]:
ariosto_alt_syll = list(map(hyphenation, raw_text_ariosto))
ariosto_alt_syll = list(map(lambda x: '|' + '|'.join(x), ariosto_alt_syll))

In [187]:
ariosto_alt_syll

["|Le |don|ne,| i |ca|val|lier|, l'ar|me,| gli| a|mo|ri,",
 "|le |cor|te|sie,| l'au|da|ci| im|pre|se| io |can|to,",
 '|che |fu|ro| al| tem|po |che |pas|sa|ro| i |Mo|ri',
 "|d'A|fri|ca| il| ma|re,| e| in| Fran|cia |noc|quer| tan|to,",
 "|se|guen|do |l'i|re| e| i |gio|ve|nil| fu|ro|ri",
 "|d'A|gra|man|te |lor| re,| che |si |di|è |van|to",
 '|di |ven|di|car| la |mor|te |di |Troi|a|no',
 '|so|pra |re |Car|lo| im|pe|ra|tor| ro|ma|no.',
 "|Di|rò |d'Or|lan|do| in| un| me|de|smo |trat|to",
 '|co|sa |non| det|ta| in| pro|sa |mai,| né| in| ri|ma:',
 '|che |per| a|mor| ven|ne| in| fu|ro|re| e |mat|to,',
 "|d'uom| che |sì |sag|gio| e|ra |sti|ma|to |pri|ma;",
 "|se |da |co|lei |che |tal| qua|si |m'ha |fat|to,",
 "|che |'l po|co| in|ge|gno| ad| or| ad| or| mi |li|ma,",
 '|me |ne |sa|rà |pe|rò |tan|to |con|ces|so,',
 '|che |mi |ba|sti| a |fi|nir| quan|to |ho |pro|mes|so.',
 '|Piac|cia|vi,| ge|ne|ro|sa| Er|cu|le|a |pro|le,',
 '|or|na|men|to| e |splen|dor| del| se|col| no|stro,',
 '|Ip|po|li|to,| ag|gr

In [197]:
validate_syllabification(syll_output, orlando_syll_NP[:10])

[(False, 0.8703703703703703),
 (False, 0.9038461538461539),
 (False, 0.9183673469387755),
 (False, 0.35593220338983056),
 (False, 0.36170212765957444),
 (False, 0.4285714285714286),
 (False, 0.3191489361702128),
 (False, 0.375),
 (False, 0.2978723404255319),
 (False, 0.23636363636363633)]

In [198]:
syll_output[0]

'|Le |don|ne, i |ca|val|lier, |lar|me, |gli a|mo|ri,'

## 6. Save model

In [None]:
save_transformer_model(transformer, 'models/c2c-gen.h5')