In [None]:
!pip install wandb strsimpy

In [None]:
# Unpack the two archives into the working directory.
# NOTE(review): presumably these provide the `deepcomedy` package imported below
# and the `./data` files read in section 1 — confirm against the archive contents.
!tar zxvf deepcomedy.tar.gz
!tar zxvf data.tar.gz

In [21]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *
from deepcomedy.utils import *

%load_ext autoreload
%autoreload 2

## 1. Data preprocessing

In [4]:
# Read both corpus files as raw bytes and decode them as UTF-8. The context
# managers close the file handles as soon as the content has been read
# (the previous bare `open(...).read()` left them open).
with open("./data/divina_textonly.txt", "rb") as text_file:
    raw_text = text_file.read().decode(encoding="utf-8")
with open("./data/divina_syll_textonly.txt", "rb") as syll_file:
    raw_syll_text = syll_file.read().decode(encoding="utf-8")

# `preprocess_text` comes from the deepcomedy package.
# NOTE(review): judging by the decoded outputs further down, the syllabified
# text is tokenised character by character while `word_level=True` yields
# word tokens — confirm in deepcomedy.preprocessing.
syll_text = preprocess_text(raw_syll_text, end_of_tercet='')
text = preprocess_text(raw_text, end_of_tercet='', word_level=True)

Split preprocessed text into verses

In [5]:
# End-of-verse marker used to cut the preprocessed text into single verses.
sep = "<EOV>"

# Splitting on the marker leaves one trailing fragment after the last verse,
# which is dropped; every remaining verse gets its marker back.
# Despite the `_tercets` suffix, each list element is ONE verse.
input_tercets = [verse.lstrip() + sep for verse in text.split(sep)[:-1]]
target_tercets = [verse.lstrip() + sep for verse in syll_text.split(sep)[:-1]]

Encode with input and target tokenizers

In [6]:
# Two independent vocabularies: one for the encoder side (plain text) and one
# for the decoder side (syllabified text). Tokens are the space-separated
# items of each string; nothing is filtered out and case is preserved.
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters="", lower=False, char_level=False
)
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters="", lower=False, char_level=False
)

# Build each vocabulary from its own corpus.
input_tokenizer.fit_on_texts(input_tercets)
target_tokenizer.fit_on_texts(target_tercets)

# Turn every verse into a list of integer token ids.
enc_input_tercets = input_tokenizer.texts_to_sequences(input_tercets)
enc_target_tercets = target_tokenizer.texts_to_sequences(target_tercets)

# Keras word indices start at 1, so one extra slot is added for id 0
# (used as the padding value later on).
input_vocab_size = 1 + len(input_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

In [7]:
input_text = []  # encoder examples: 3 consecutive verses (input vocabulary)
target_text = []  # decoder examples: the same 3 verses plus the following one
target_text_tercet = []  # the 3 verses only (target vocabulary), used to seed generation

# Slide a window over the verses with a stride of one verse.
# The range stops three verses before the end so that EVERY window has a
# fourth verse to predict: with `- 2` the last `line : line + 4` slice was
# silently truncated to three verses, producing a target with no next verse.
for line in range(len(enc_input_tercets) - 3):
    input_text.append(list(chain(*enc_input_tercets[line : line + 3])))
    target_text_tercet.append(list(chain(*enc_target_tercets[line : line + 3])))
    target_text.append(list(chain(*enc_target_tercets[line : line + 4])))

Pad sequences

In [8]:
# Right-pad ("post") the encoder and the decoder examples, each collection
# independently, so that all sequences within a collection share one length.
padded_input_text, padded_target_text = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post")
    for sequences in (input_text, target_text)
)

In [13]:
# Hold out part of the examples for validation. A fixed `random_state` makes
# the shuffle, and therefore the reported metrics, reproducible across runs.
# NOTE(review): consecutive examples are windows that overlap by two verses,
# so a random split shares verses between training and validation data.
input_train, input_val, target_train, target_val = train_test_split(
    padded_input_text, padded_target_text, random_state=42
)

## 2. The Transformer model


In [14]:
# `make_dataset` comes from one of the deepcomedy star imports.
# NOTE(review): presumably it wraps the paired arrays in a batched
# tf.data.Dataset — confirm in the deepcomedy sources.
dataset = make_dataset(input_train, target_train)
val_dataset = make_dataset(input_val, target_val)

In [15]:
# Transformer hyper-parameters. The keys match the keyword arguments that are
# forwarded to `Transformer` when the model is rebuilt in section 6.
config = dict(
    num_layers=6,
    d_model=256,
    num_heads=4,
    dff=512,
)

In [35]:
# Build the model and its trainer object from the hyper-parameters and the two vocabulary sizes.
# NOTE(review): `checkpoint_save_path=None` presumably disables checkpointing — confirm in make_transformer_model.
transformer, transformer_trainer = make_transformer_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path=None)

## 3. Training

In [None]:
# Train on `dataset` and evaluate on `val_dataset` (validation_every=1).
# NOTE(review): the positional 30 is presumably the number of epochs — the log below counts epochs; confirm in the trainer.
transformer_trainer.train(dataset, 30, validation_dataset=val_dataset, validation_every=1)

Epoch 1 Batch 0 Loss 5.4091 Accuracy 0.0008
Epoch 1 Batch 50 Loss 4.2407 Accuracy 0.1238
Epoch 1 Batch 100 Loss 3.6921 Accuracy 0.1638
Epoch 1 Batch 150 Loss 3.4692 Accuracy 0.1800
Epoch 1 Batch 200 Loss 3.2968 Accuracy 0.1976
Epoch 1 Batch 250 Loss 3.1047 Accuracy 0.2225
Epoch 1 Batch 300 Loss 2.9436 Accuracy 0.2433
Epoch 1 Batch 0 Validation Loss 1.9496 Validation Accuracy 0.3826
Epoch 1 Batch 50 Validation Loss 1.9754 Validation Accuracy 0.3752
Epoch 1 Batch 100 Validation Loss 1.9726 Validation Accuracy 0.3754
Epoch 1 Loss 2.8584 Accuracy 0.2542
Time taken for 1 epoch: 96.52 secs

Epoch 2 Batch 0 Loss 2.0470 Accuracy 0.3592
Epoch 2 Batch 50 Loss 2.0151 Accuracy 0.3640
Epoch 2 Batch 100 Loss 1.9988 Accuracy 0.3661
Epoch 2 Batch 150 Loss 1.9869 Accuracy 0.3678
Epoch 2 Batch 200 Loss 1.9747 Accuracy 0.3702
Epoch 2 Batch 250 Loss 1.9627 Accuracy 0.3725
Epoch 2 Batch 300 Loss 1.9501 Accuracy 0.3754
Epoch 2 Batch 0 Validation Loss 1.8128 Validation Accuracy 0.4117
Epoch 2 Batch 50 Valida

## 4. Generation

In [None]:
def evaluate(
    transformer,
    encoder_input,
    decoder_input,
    stop_symbol,
    max_length=200,
    num_stop_symbols=4,
):
    """
    Greedily extends `decoder_input` with tokens predicted by `transformer`.

    `encoder_input` is encoded once by the Encoder. The Decoder is then run
    repeatedly on the sequence produced so far, and at every step the
    highest-scoring token for the last position is appended to that sequence.

    Decoding ends as soon as every sequence in the batch contains at least
    `num_stop_symbols` occurrences of `stop_symbol`, or after `max_length`
    decoding steps, whichever comes first. Occurrences already present in
    `decoder_input` are counted as well, so a three-verse prompt with the
    default of 4 yields exactly one new verse.

    Args:
        transformer: model exposing `encoder`, `decoder` and `final_layer`.
        encoder_input: batch of encoder token ids, one row per example.
        decoder_input: batch of decoder token ids used as the prompt.
        stop_symbol: token id that marks the end of a verse.
        max_length: maximum number of tokens to generate.
        num_stop_symbols: number of `stop_symbol` occurrences each sequence
            must contain for decoding to end early.

    Returns:
        The prompt followed by the generated token ids (int32 once at least
        one token has been generated). Rows that reach the stop count before
        the others keep being extended until all rows are done.
    """

    output = decoder_input

    # The encoder input never changes, so it is encoded a single time and
    # only the encoder padding mask is needed for that.
    enc_padding_mask, _, _ = create_masks(encoder_input, output)

    enc_output = transformer.encoder(
        encoder_input, False, enc_padding_mask
    )  # (batch_size, inp_seq_len, d_model)

    for _ in range(max_length):

        # The decoder masks depend on the current output length, so they are
        # rebuilt at every step.
        _unused_enc_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, _ = transformer.decoder(
            output, enc_output, False, combined_mask, dec_padding_mask
        )

        predictions = transformer.final_layer(dec_output)

        # Greedy choice: most likely token at the last position of each row.
        predicted_ids = tf.argmax(predictions[:, -1:, :], axis=-1)

        # Append the predicted ids; the result is the next decoder input.
        output = tf.concat(
            [
                tf.cast(output, dtype=tf.int32),
                tf.cast(predicted_ids, dtype=tf.int32),
            ],
            axis=-1,
        )

        # Count the stop symbols of every row, not just the first one, and
        # use `>=` so a prompt that already holds more than the required
        # number cannot make the loop run until `max_length`.
        stop_counts = (output.numpy() == stop_symbol).sum(axis=-1)
        if (stop_counts >= num_stop_symbols).all():
            print('Stopped')
            return output

    return output

In [None]:
def generate(transformer, input_sequence, target_sequence, input_tokenizer, target_tokenizer, steps, start_symbol, stop_symbol):
    """
    Generates `steps` new verses, one per step, continuing from a seed.

    At every step `evaluate` extends the current decoder input; the last
    verse of its decoded output is appended to the result. The last three
    verses of that output become the next decoder input, and a copy of them
    with the syllable markers removed and re-spaced around the `<...>` tokens
    is encoded with `input_tokenizer` as the next encoder input.

    Args:
        transformer: model forwarded to `evaluate`.
        input_sequence: seed encoder token ids as a batch of one, e.g. `[[...]]`.
        target_sequence: seed decoder token ids as a batch of one.
        input_tokenizer: tokenizer of the encoder vocabulary.
        target_tokenizer: tokenizer of the decoder vocabulary.
        steps: number of verses to generate.
        start_symbol: not used by this function; kept so existing calls keep working.
        stop_symbol: end-of-verse token id, forwarded to `evaluate`.

    Returns:
        str: the decoded seed followed by the generated verses, still
        containing the special tokens.
    """

    result = target_tokenizer.sequences_to_texts(target_sequence)[0]

    encoder_input = input_sequence
    decoder_input = target_sequence

    for _ in range(steps):

        output = evaluate(
            transformer,
            tf.convert_to_tensor(encoder_input),
            tf.convert_to_tensor(decoder_input),
            stop_symbol,
        )

        generated_text = target_tokenizer.sequences_to_texts(output.numpy())[0]

        # One entry per non-empty verse, each terminated by '<EOV> '.
        verses = [line.lstrip() + '<EOV> ' for line in generated_text.split('<EOV>') if line.strip() != '']

        # The decoded seed ends with '<EOV>' and no space; without a separator
        # the first generated verse was glued to it as '<EOV><GO>'.
        if result and not result.endswith(' '):
            result += ' '
        result += verses[-1]

        # The three most recent verses are the context for the next step.
        context = ''.join(verses[-3:])

        decoder_input = target_tokenizer.texts_to_sequences([context])

        # Rebuild the encoder-side text: drop syllable markers, remove every
        # space, then put a single space around each <...> token.
        # NOTE(review): texts_to_sequences skips tokens missing from the input
        # vocabulary (no oov_token is set), so unseen generated words vanish
        # from the encoder input — confirm this is acceptable.
        plain = remove_syll_token(context)
        plain = re.sub(r"[ ]+", "", plain)
        # Raw strings: "\g" is not a valid escape in a normal string literal.
        plain = re.sub(r"<[^>]*>", r" \g<0> ", plain)
        plain = re.sub("<EOV>  <GO>", "<EOV> <GO>", plain)
        plain = plain.strip()

        encoder_input = input_tokenizer.texts_to_sequences([plain])

    return result

In [None]:
# Ids of the start-of-verse and end-of-verse tokens in the target vocabulary.
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

# Seed with the first window of the corpus, wrapped in a list to form a batch of one:
# the three verses in the input vocabulary and the same three in the target vocabulary.
encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

# Run 6 generation steps starting from the seed.
result = generate(transformer, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 6, start_symbol, stop_symbol)

Stopped
Stopped
Stopped
Stopped
Stopped
Stopped


In [None]:
# `strip_tokens` comes from the deepcomedy package; the output below shows the verses
# without the <GO>/<EOV>/<SEP> markers, with only the `|` syllable separators left.
print(strip_tokens(result))

|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta
|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,
|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.
|El|la |so|pra |che ’l |vi|so a |quel|la |gen|te
|che |per |lo |suo |av|ver|sa|rio al|trui |man|to,
|e |al|tra |vo|ce |mi |pa|rea |più |rat|ta.
« |O |tu |che |se’ |che |sì |pres|so |di|sciol|ta»,
|dis|se ’l |ma|e|stro,« |quan|to |pos|so |po|co,
|se |non |che |tu |se’ |tem|pion |far |non |la|ti,


## 5. Syllabification

In [None]:
# Same lookups as in the Generation section: ids of the <GO> and <EOV> tokens.
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

In [None]:
# Encoder side: the first three-verse window as a batch of one.
encoder_input = tf.convert_to_tensor([input_text[0]])
# Decoder side: only a <GO> token, so the whole output is produced by the model.
decoder_input = tf.convert_to_tensor([[start_symbol]])

In [None]:
# Allow up to 400 generated tokens here instead of the default 200.
syll_output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, max_length=400)

Stopped


In [None]:
# Decode the generated ids back to token text (special tokens included).
print(target_tokenizer.sequences_to_texts(syll_output.numpy()))

['<GO> | c h e <SEP> | d i <SEP> | p e n | s i e r <SEP> | m i <SEP> | s t a | v a <SEP> i n <SEP> | u | n o <SEP> | s t r a | l e , <EOV> <GO> | e <SEP> | d i | c o <SEP> | d i <SEP> | g e n | t e <SEP> a l | t r o <SEP> | c h e <SEP> | p i ù <SEP> | d o l | c e » . <EOV> <GO> | N o i <SEP> | e | r a | v a m <SEP> | n e l <SEP> | s u o <SEP> | a | s p e t | t o <SEP> | b a n | d o <EOV> <GO> | c h e <SEP> | l ’ <SEP> a | n i | m a <SEP> | s u a <SEP> | a v | v e n | t a <SEP> | d i | s t a n | t e , <EOV>']


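The output above is not a syllabification of the input tercet ("Nel mezzo del cammin di nostra vita…"). Could the model be underfitting?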

## 6. Save model

In [None]:
# Save the trained weights as an HDF5 file. The target directory is created
# first, because writing the file fails when `models/` does not exist yet.
os.makedirs('models', exist_ok=True)
transformer.save_weights('models/w2c-gen.h5')

In [None]:
# Fresh, untrained model built from the same hyper-parameters and vocabulary sizes.
# NOTE(review): pe_input, pe_target and rate are hard-coded here; they presumably
# have to match what make_transformer_model used for the saved weights to load — confirm.
new_transformer = Transformer(
    # vocabulary sizes
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    # architecture, taken from the shared config
    num_layers=config["num_layers"],
    d_model=config["d_model"],
    num_heads=config["num_heads"],
    dff=config["dff"],
    # positional-encoding lengths and dropout rate
    pe_input=1000,
    pe_target=1000,
    rate=0.1,
)

In [None]:
# The model must be called once before loading weights, so that its variables exist.

# The values do not matter for this purpose: a single <GO> token on each side is enough.
inp = tf.convert_to_tensor([[start_symbol]])
tar = tf.convert_to_tensor([[start_symbol]])

# Masks for this dummy pair, in the order expected by the model call below.
enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp, tar)

# Forward pass with the third argument set to False; the trailing `;` suppresses the cell output.
new_transformer(inp, tar, False, enc_padding_mask, look_ahead_mask, dec_padding_mask);

In [None]:
# Restore the weights written by the save cell above into the freshly built model.
new_transformer.load_weights('models/w2c-gen.h5')

In [None]:
# Same seed as in the Generation section (first window, batch of one) ...
encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

# ... but generated with the reloaded model, as a check that the weights were restored.
result = generate(new_transformer, encoder_input, decoder_input, input_tokenizer, target_tokenizer, 6, start_symbol, stop_symbol)

Stopped
Stopped
Stopped
Stopped
Stopped
Stopped


In [None]:
# Show the raw generated text, special tokens included.
result

'<GO> | N e l <SEP> | m e z | z o <SEP> | d e l <SEP> | c a m | m i n <SEP> | d i <SEP> | n o | s t r a <SEP> | v i | t a <EOV> <GO> | m i <SEP> | r i | t r o | v a i <SEP> | p e r <SEP> | u | n a <SEP> | s e l | v a <SEP> o | s c u | r a , <EOV> <GO> | c h é <SEP> | l a <SEP> | d i | r i t | t a <SEP> | v i a <SEP> | e | r a <SEP> | s m a r | r i | t a . <EOV><GO> | E l | l a <SEP> | s o | p r a <SEP> | c h e <SEP> ’ l <SEP> | v i | s o <SEP> a <SEP> | q u e l | l a <SEP> | g e n | t e <EOV> <GO> | c h e <SEP> | p e r <SEP> | l o <SEP> | s u o <SEP> | a v | v e r | s a | r i o <SEP> a l | t r u i <SEP> | m a n | t o , <EOV> <GO> | e <SEP> | a l | t r a <SEP> | v o | c e <SEP> | m i <SEP> | p a | r e a <SEP> | p i ù <SEP> | r a t | t a . <EOV> <GO> « <SEP> | O <SEP> | t u <SEP> | c h e <SEP> | s e ’ <SEP> | c h e <SEP> | s ì <SEP> | p r e s | s o <SEP> | d i | s c i o l | t a » , <EOV> <GO> | d i s | s e <SEP> ’ l <SEP> | m a | e | s t r o , « <SEP> | q u a n | t o <SEP> | p o s | s o 