In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes (e.g. so edits to local packages are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
# Only when running inside Google Colab: open the interactive upload dialog so
# the required archives can be copied into the runtime's working directory.
# Outside Colab this cell does nothing.
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    
    files.upload()

Saving data.zip to data.zip
Saving deepcomedy.zip to deepcomedy.zip


In [2]:
# Unpack the project sources from deepcomedy.tar.gz in the working directory.
# NOTE(review): the upload cell's saved output reports data.zip and
# deepcomedy.zip, while the only active command here extracts
# deepcomedy.tar.gz — confirm which archives are expected and that ./data
# exists before the data-loading cell runs.
# !pip install wandb
!tar zxvf deepcomedy.tar.gz
# !unzip deepcomedy.zip
# #!tar zxvf data.tar.gz
# !unzip data.zip

deepcomedy/
deepcomedy/util/
deepcomedy/util/predicate.py
deepcomedy/util/__pycache__/
deepcomedy/util/__pycache__/predicate.cpython-37.pyc
deepcomedy/util/__pycache__/__init__.cpython-37.pyc
deepcomedy/util/__init__.py
deepcomedy/util/.ipynb_checkpoints/
deepcomedy/util/.ipynb_checkpoints/predicate-checkpoint.py
deepcomedy/models/
deepcomedy/models/layers.py
deepcomedy/models/decoder_only.py
deepcomedy/models/transformer.py
deepcomedy/models/__pycache__/
deepcomedy/models/__pycache__/layers.cpython-37.pyc
deepcomedy/models/__pycache__/__init__.cpython-37.pyc
deepcomedy/models/__pycache__/transformer.cpython-37.pyc
deepcomedy/models/__init__.py
deepcomedy/models/.ipynb_checkpoints/
deepcomedy/models/.ipynb_checkpoints/transformer-checkpoint.py
deepcomedy/preprocessing.py
deepcomedy/__pycache__/
deepcomedy/__pycache__/__init__.cpython-37.pyc
deepcomedy/__pycache__/preprocessing.cpython-37.pyc
deepcomedy/metrics.py
deepcomedy/__init__.py
deepcomedy/.ipynb_checkpoints/


In [1]:
import io
import os
import re
import time
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *
from deepcomedy.utils import *
from itertools import chain

## 1. Data preprocessing

In [54]:
def _read_utf8(path):
    """Return the full contents of `path` decoded as UTF-8.

    The file is read in binary mode and decoded explicitly (so no newline
    translation is applied), and the handle is closed as soon as the read
    finishes instead of being left open for the garbage collector.
    """
    with open(path, "rb") as handle:
        return handle.read().decode(encoding="utf-8")


# Plain text and its syllabified counterpart.
raw_text = _read_utf8("./data/divina_textonly.txt")
raw_syll_text = _read_utf8("./data/divina_syll_textonly.txt")

# Both corpora go through the same preprocessing with an empty end-of-verse
# marker, so they can later be split on the same separator.
syll_text = preprocess_text(raw_syll_text, end_of_verse="")
text = preprocess_text(raw_text, end_of_verse="")

Split preprocessed text into tercets (each one terminated by `<EOT>`)

In [55]:
# Every tercet ends with the separator token. Splitting on it leaves a trailing
# remainder after the last separator, which is discarded; the separator is then
# re-attached to each kept chunk.
sep = "<EOT>"
input_tercets = [chunk + sep for chunk in text.split(sep)[:-1]]
target_tercets = [chunk + sep for chunk in syll_text.split(sep)[:-1]]

Encode with tokenizer

In [4]:
# Whitespace-delimited tokenizer: no characters are filtered out and case is
# preserved, so every space-separated symbol in the preprocessed text is a token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    lower=False,
    filters="",
    char_level=False,
)

# The vocabulary is learned from the syllabified targets only.
# NOTE(review): no oov_token is set, so a token that occurs only in
# input_tercets would be silently dropped when encoding — confirm the input
# vocabulary is a subset of the target vocabulary.
tokenizer.fit_on_texts(target_tercets)

enc_input_tercets, enc_target_tercets = map(
    tokenizer.texts_to_sequences, (input_tercets, target_tercets)
)

# Keras never assigns index 0 to a token (it is reserved), so the id space
# holds one more entry than word_index.
vocab_size = 1 + len(tokenizer.word_index)

In [59]:
# Build (input, target) pairs where the target is the *next* tercet in
# syllabified form: input i is paired with target i + 1, so the last input
# tercet has no target and is left out.
pair_count = len(enc_input_tercets) - 1
input_text = [enc_input_tercets[idx] for idx in range(pair_count)]
target_text = [enc_target_tercets[idx + 1] for idx in range(pair_count)]

Pad sequences

In [64]:
# Right-pad ("post") each collection to the length of its own longest sequence.
# Inputs and targets are padded independently, so their widths may differ.
padded_input_text, padded_target_text = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post")
    for sequences in (input_text, target_text)
)

In [71]:
# Hold out part of the (input, target) pairs using scikit-learn's default
# split proportions. The fixed random_state makes the shuffle reproducible
# across kernel restarts; this matters because input_train[0] is later used as
# the seed sequence for generation.
input_train, input_test, target_train, target_test = train_test_split(
    padded_input_text, padded_target_text, random_state=42
)

## 2. The Transformer model


In [75]:
# Wrap the padded input/target arrays into the dataset object consumed by the trainer.
# NOTE(review): this uses the full padded arrays rather than
# input_train/target_train from the split above, so the held-out rows are also
# part of the data passed to training — confirm this is intended.
dataset = make_dataset(padded_input_text, padded_target_text)

In [80]:
# Hyperparameters handed to make_transformer_model.
config = dict(
    num_layers=4,
    d_model=256,
    num_heads=4,
    dff=1024,
)

# Location intended for model checkpoints.
# NOTE(review): the model-construction cell passes checkpoint_save_path=None,
# so this value is currently not used there.
checkpoint_save_path = "./checkpoints/char-level_gen"

In [81]:
# Inputs and targets are encoded with the same tokenizer, so the same
# vocab_size is passed for both vocabulary arguments.
# NOTE(review): checkpoint_save_path is explicitly None here even though a
# checkpoint_save_path variable is defined in the config cell — presumably
# this disables checkpointing; confirm that is intended.
transformer, transformer_trainer = make_transformer_model(
    config,
    vocab_size,
    vocab_size,
    checkpoint_save_path=None,
)

## 3. Training

In [None]:
# Run training on the dataset built above.
# The second argument is presumably the number of epochs — TODO confirm
# against the trainer's train() signature.
transformer_trainer.train(dataset, 10)

## 4. Generation

In [37]:
# Integer ids of the special tokens used during generation: the start marker,
# the end-of-tercet marker, and the syllable separator.
start_symbol, stop_symbol, syllable_separator = (
    tokenizer.word_index[token] for token in ("<GO>", "<EOT>", "|")
)

In [97]:
def generate(transformer, input_sequence, steps, stop_symbol):
    """Generate `steps` outputs in a row, feeding each one back as the next input.

    At every step the current encoder input is wrapped in a batch of one and
    passed to `evaluate` together with a decoder input holding only the start
    token. The produced ids, with the syllable separator removed, become the
    encoder input of the following step.

    Reads the module-level names `start_symbol`, `syllable_separator`,
    `tokenizer` and `evaluate`.

    Args:
        transformer: model forwarded unchanged to `evaluate`.
        input_sequence: encoded sequence used as the first encoder input.
        steps: number of generation rounds to run.
        stop_symbol: token id forwarded to `evaluate` as its stop symbol.

    Returns:
        A list with one decoded string per step (empty when `steps` is 0).
    """
    # The decoder is always primed with a single start token.
    decoder_input = tf.convert_to_tensor([[start_symbol]])

    generated_texts = []
    current_input = input_sequence

    for _ in range(steps):
        # Wrap the sequence in a list to form a batch of one.
        batched_input = tf.convert_to_tensor([current_input])
        output_ids = evaluate(
            transformer, batched_input, decoder_input, stop_symbol
        ).numpy()

        # Remove syllable separator so the output can serve as the next input.
        current_input = [
            token for token in output_ids[0] if token != syllable_separator
        ]

        generated_texts.append(tokenizer.sequences_to_texts(output_ids)[0])

    return generated_texts
        

In [101]:
# Seed generation with the first row of the training split (an encoded,
# post-padded tercet).
input_sequence = input_train[0]

# Produce 5 outputs in a row; each output is fed back as the next input.
out = generate(transformer, input_sequence, 5, stop_symbol)

In [102]:
# Display the raw generated strings (one list entry per generation step),
# special tokens and syllable separators included.
out

['<GO> | C o | m e <SEP> | s i <SEP> | r i | s p u o | s e <SEP> | l a <SEP> | m i a <SEP> | d i | s i | r a <GO> | d i <SEP> | q u e l <SEP> | c h e <SEP> | s i <SEP> | r i | m o | s t r a <SEP> | c o | m e <SEP> | s i <SEP> | r i | s t r a , <GO> | c h e <SEP> | l ’ <SEP> a | n i | m a <SEP> | d e <SEP> | l a <SEP> | m i a <SEP> | d i | s i | r a ; <EOT>',
 '<GO> | e <SEP> | s e <SEP> | t u <SEP> | l a <SEP> | v o | g l i a <SEP> | c h e <SEP> | s i <SEP> | r i | s p o | s a <GO> | l a <SEP> | v i | s t a <SEP> | d i <SEP> | q u e l | l a <SEP> | c h e <SEP> | s i <SEP> | r i | s p o | s t a <GO> | d i <SEP> | q u e l | l a <SEP> | c h e <SEP> | s i <SEP> | r i | m i | s e | r a <SEP> | s i <SEP> | r o | s a . <EOT>',
 '<GO> | C o | m e <SEP> | l ’ <SEP> a l | t r a <SEP> | c h e <SEP> | s i <SEP> | r i | m a | s e <SEP> | s t a | t a <GO> | d i <SEP> | q u e l <SEP> | c h e <SEP> | s i <SEP> | r i | m o | r a <SEP> | c o | m e <SEP> | s i <SEP> | s t a | g n a , <GO> | c h e <SEP> |

## 5. Metrics

In [125]:
# Concatenate the generated chunks into one string, pass it through
# strip_tokens, then trim leading/trailing whitespace.
generated = strip_tokens("".join(out)).strip()

In [122]:
# Show the cleaned, still syllabified text.
# NOTE(review): the saved output of this cell is a list repr, which does not
# match printing the stripped string built in the previous cell, and the
# execution counts are out of order — the output looks stale; re-run the
# notebook top to bottom before sharing.
print(generated)

['|Co|me |si |ri|spuo|se |la |mia |di|si|ra', '|di |quel |che |si |ri|mo|stra |co|me |si |ri|stra,', '|che |l’ a|ni|ma |de |la |mia |di|si|ra;', '|e |se |tu |la |vo|glia |che |si |ri|spo|sa', '|la |vi|sta |di |quel|la |che |si |ri|spo|sta', '|di |quel|la |che |si |ri|mi|se|ra |si |ro|sa.', '|Co|me |l’ al|tra |che |si |ri|ma|se |sta|ta', '|di |quel |che |si |ri|mo|ra |co|me |si |sta|gna,', '|che |l’ a|ni|ma |de |la |mia |so|lea |so|la.', '|Co|me |la |mia |don|na |che |si |ri|guar|da', '|di |quel|la |che |si |ri|mi|se|ra |si |ri|sta,', '|sì |che |la |sua |pa|ro|la |si |ri|mar|da.', '|Co|me |la |mia |don|na |che |si |ri|guar|da', '|di |quel|la |che |si |ri|mi|se|ra |si |ri|sta,', '|sì |che |la |sua |pa|ro|la |si |ri|mar|da.']


In [118]:
from deepcomedy.metrics import *

In [126]:
# Score the generated text line by line (split on newlines) with
# correct_hendecasyllables_ratio — presumably the fraction of lines that are
# well-formed hendecasyllables; confirm against deepcomedy.metrics.
correct_hendecasyllables_ratio(generated.split('\n'))

1.0

Preprocess generated text for rhyme metrics

In [146]:
# Scratch sanity check: '«' is not an ASCII letter, so re.match returns None
# and the expression evaluates to True.
not re.match(r'[a-zA-Z]', '«')

True

In [188]:
# Rhyme metrics work on plain verses: first pass the text through
# remove_syll_token, then through remove_punctuation, and show the result.
generated_nosyll = remove_punctuation(remove_syll_token(generated))
print(generated_nosyll)

Come si rispuose la mia disira
di quel che si rimostra come si ristra
che l anima de la mia disira
e se tu la voglia che si risposa
la vista di quella che si risposta
di quella che si rimisera si rosa
Come l altra che si rimase stata
di quel che si rimora come si stagna
che l anima de la mia solea sola
Come la mia donna che si riguarda
di quella che si rimisera si rista
sì che la sua parola si rimarda
Come la mia donna che si riguarda
di quella che si rimisera si rista
sì che la sua parola si rimarda


In [187]:
# Score the rhyme scheme of the generated verses, one verse per line.
# Judging by the saved output, the metric also prints verse pairs as a side
# effect — presumably the pairs it evaluated or flagged; confirm in
# deepcomedy.metrics.
chained_rhymes_ratio(generated_nosyll.split('\n'))

di quel che si rimostra come si ristra e se tu la voglia che si risposa
la vista di quella che si risposta Come l altra che si rimase stata
Come l altra che si rimase stata che l anima de la mia solea sola
di quel che si rimora come si stagna Come la mia donna che si riguarda
di quella che si rimisera si rista Come la mia donna che si riguarda


0.4444444444444444