In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
# Only when running on Google Colab: open the upload dialog so the archives
# unpacked by the next cell can be provided.
# `files` is imported here and reused by the download cell at the end of the notebook.
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    
    files.upload()

Saving data.zip to data.zip
Saving deepcomedy.zip to deepcomedy.zip


In [3]:
!pip install wandb
!pip install strsimpy
#!tar zxvf deepcomedy.tar.gz
#!tar zxvf checkpoints.tar.gz
!unzip deepcomedy.zip
#!tar zxvf data.tar.gz
!unzip data.zip

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/6c/48/b199e2b3b341ac842108c5db4956091dd75d961cfa77aceb033e99cac20f/wandb-0.10.31-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 8.2MB/s 
[?25hCollecting configparser>=3.8.1
  Downloading https://files.pythonhosted.org/packages/fd/01/ff260a18caaf4457eb028c96eeb405c4a230ca06c8ec9c1379f813caa52e/configparser-5.0.2-py3-none-any.whl
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting pathtools
  Downloading https://files.pythonhosted.org/packages/e7/7f/470d6fcdf23f9f3518f6b0b76be9df16dcc8630ad409947f8be2eb0ed13a/pathtools-0.1.2.tar.gz
Collecting subprocess32>=3.5.3
[?25l  Downloading https://files.pythonhosted.org/packages/32/c8/564be4d12629b912ea431f1a50eb8b3b9d00f1a0b1ceff17f266be190007/subprocess32-3.5.4.tar.gz (97kB)
[K     |██████████████

In [4]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *
from deepcomedy.utils import *

## 1. Data preprocessing

In [5]:
def _read_utf8(path):
    """Return the content of the file at `path` decoded as UTF-8.

    The file is read as bytes and decoded explicitly (no newline translation),
    and the handle is closed as soon as the read is done.
    """
    with open(path, "rb") as handle:
        return handle.read().decode(encoding="utf-8")


# Two versions of the corpus; judging by the file names, the second one is the
# syllabified text.
raw_text = _read_utf8("./data/divina_textonly.txt")
raw_syll_text = _read_utf8("./data/divina_syll_textonly.txt")

# Both versions go through the same project preprocessing, with an empty
# end-of-tercet marker.
syll_text = preprocess_text(raw_syll_text, end_of_tercet='')
text = preprocess_text(raw_text, end_of_tercet='')

Split preprocessed text into verses

In [6]:
sep = "<EOV>"

# Cut each text at the end-of-verse marker. split() always yields one extra
# chunk after the last marker, which is dropped; every remaining chunk gets its
# leading whitespace removed and the marker re-attached.
# Despite the variable names, each list element is a single verse.
input_tercets = [verse.lstrip() + sep for verse in text.split(sep)[:-1]]
target_tercets = [verse.lstrip() + sep for verse in syll_text.split(sep)[:-1]]

Encode with input and target tokenizers

In [7]:
def _fit_whitespace_tokenizer(texts):
    """Fit a Keras tokenizer on `texts` without filtering or lower-casing.

    With `char_level=False` and `filters=""`, tokens are the
    whitespace-separated items of each text, kept exactly as written.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        char_level=False, filters="", lower=False
    )
    tokenizer.fit_on_texts(texts)
    return tokenizer


# Separate vocabularies for the encoder side (input) and decoder side (target).
input_tokenizer = _fit_whitespace_tokenizer(input_tercets)
target_tokenizer = _fit_whitespace_tokenizer(target_tercets)

# Verses as lists of integer token ids.
enc_input_tercets = input_tokenizer.texts_to_sequences(input_tercets)
enc_target_tercets = target_tokenizer.texts_to_sequences(target_tercets)

# word_index starts at 1; index 0 is left for padding, hence the +1.
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [8]:
# Build sliding-window examples over consecutive encoded verses.
# For every start index `line`:
#   input_text[i]         = input verses  line .. line+2  (3 verses, flattened)
#   target_text_tercet[i] = target verses line .. line+2  (3 verses, flattened)
#   target_text[i]        = target verses line .. line+3  (4 verses, flattened)
input_text = []
target_text = []
target_text_tercet = []

# Stop at len - 3 (not len - 2): the 4-verse target slice must fit entirely
# inside the list. With len - 2 the final slice ran past the end and silently
# produced a target holding only 3 verses.
for line in range(len(enc_input_tercets) - 3):
    input_text.append(list(chain(*enc_input_tercets[line : line + 3])))
    target_text_tercet.append(list(chain(*enc_target_tercets[line : line + 3])))
    target_text.append(list(chain(*enc_target_tercets[line : line + 4])))

Pad sequences

In [9]:
# Right-pad ("post") every sequence with zeros up to the longest sequence of
# its own list, turning the ragged lists into rectangular arrays.
padded_input_text, padded_target_text = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post")
    for sequences in (input_text, target_text)
)

## 2. The Transformer model


In [10]:
# Bundle the padded encoder inputs and decoder targets into the dataset object
# that is handed to the trainer below. make_dataset is not defined in this
# notebook; it comes from one of the project star imports.
dataset = make_dataset(padded_input_text, padded_target_text)

In [11]:
# Transformer hyperparameters, passed to the model factory and reused when the
# model is rebuilt for weight loading.
config = dict(
    num_layers=6,
    d_model=256,
    num_heads=4,
    dff=512,
)

# Location intended for training checkpoints.
checkpoint_save_path = "./checkpoints/char-input_char-output_gen"

In [12]:
# Build the model and its trainer from the hyperparameters and vocabulary sizes.
# Note that None is passed as the checkpoint path, not the
# `checkpoint_save_path` variable defined above.
# NOTE(review): presumably this disables checkpointing — confirm in make_transformer_model.
transformer, transformer_trainer = make_transformer_model(
    config,
    input_vocab_size,
    target_vocab_size,
    checkpoint_save_path=None,
)

## 3. Training

In [13]:
# Run training on the dataset. The second argument is presumably the number of
# epochs — confirm against the trainer's signature. Loss and accuracy are logged
# periodically per batch and once per epoch (see the output below).
transformer_trainer.train(dataset, 30)

Epoch 1 Batch 0 Loss 5.0481 Accuracy 0.0003
Epoch 1 Batch 50 Loss 3.9581 Accuracy 0.1262
Epoch 1 Batch 100 Loss 3.5209 Accuracy 0.1663
Epoch 1 Batch 150 Loss 3.3449 Accuracy 0.1823
Epoch 1 Batch 200 Loss 3.1761 Accuracy 0.2015
Epoch 1 Batch 250 Loss 2.9910 Accuracy 0.2272
Epoch 1 Batch 300 Loss 2.8416 Accuracy 0.2484
Epoch 1 Batch 350 Loss 2.7246 Accuracy 0.2650
Epoch 1 Batch 400 Loss 2.6314 Accuracy 0.2785
Epoch 1 Loss 2.5653 Accuracy 0.2883
Time taken for 1 epoch: 106.16 secs

Epoch 2 Batch 0 Loss 1.9573 Accuracy 0.3826
Epoch 2 Batch 50 Loss 1.9313 Accuracy 0.3835
Epoch 2 Batch 100 Loss 1.9163 Accuracy 0.3871
Epoch 2 Batch 150 Loss 1.9030 Accuracy 0.3903
Epoch 2 Batch 200 Loss 1.8881 Accuracy 0.3943
Epoch 2 Batch 250 Loss 1.8720 Accuracy 0.3982
Epoch 2 Batch 300 Loss 1.8551 Accuracy 0.4025
Epoch 2 Batch 350 Loss 1.8371 Accuracy 0.4074
Epoch 2 Batch 400 Loss 1.8186 Accuracy 0.4121
Epoch 2 Loss 1.8031 Accuracy 0.4160
Time taken for 1 epoch: 95.17 secs

Epoch 3 Batch 0 Loss 1.6437 Accur

## 4. Generation

In [14]:
def evaluate(
    transformer,
    encoder_input,
    decoder_input,
    stop_symbol,
    max_length=200,
    num_stop_symbols=4,
    verbose=True,
):
    """
    Greedily decode a batch of sequences with `transformer`.

    `encoder_input` is encoded once by the Encoder. The Decoder is then run
    repeatedly: at each step the most likely next token (argmax) is appended to
    the decoder input, which starts as `decoder_input`.

    Decoding stops as soon as every sequence of the batch contains at least
    `num_stop_symbols` occurrences of `stop_symbol` (tokens already present in
    `decoder_input` count too), or after `max_length` generated tokens,
    whichever comes first. Sequences that reach their quota before the others
    keep being extended until the whole batch is done.

    Args:
        transformer: model exposing `encoder`, `decoder` and `final_layer`.
        encoder_input: integer tensor of token ids, one row per sequence.
        decoder_input: integer tensor with the initial decoder tokens, one row
            per sequence.
        stop_symbol: id of the token that marks the end of a verse.
        max_length: maximum number of tokens to generate (not the total length).
        num_stop_symbols: number of stop tokens each sequence must contain
            before decoding stops early.
        verbose: when True, print 'Stopped' on early stop.

    Returns:
        Integer tensor holding `decoder_input` followed by the generated tokens.
    """

    output = decoder_input

    # The encoder input never changes, so it is encoded a single time.
    enc_padding_mask, _, _ = create_masks(encoder_input, output)

    enc_output = transformer.encoder(
        encoder_input, False, enc_padding_mask
    )  # (batch_size, inp_seq_len, d_model)

    for _step in range(max_length):

        # The decoder masks depend on the current output length, so they are
        # rebuilt at every step.
        _, combined_mask, dec_padding_mask = create_masks(encoder_input, output)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, _ = transformer.decoder(
            output, enc_output, False, combined_mask, dec_padding_mask
        )

        predictions = transformer.final_layer(dec_output)

        # keep only the prediction for the last position: (batch_size, 1)
        predicted_ids = tf.argmax(predictions[:, -1:, :], axis=-1)

        # append the predicted ids; the result is the next decoder input
        output = tf.concat(
            [
                tf.cast(output, dtype=tf.int32),
                tf.cast(predicted_ids, dtype=tf.int32),
            ],
            axis=-1,
        )

        # Count stop tokens per sequence and stop once every row has enough.
        # `>=` (rather than `==`) also terminates when the prompt already
        # contains more stop tokens than requested.
        stop_counts = np.count_nonzero(output.numpy() == stop_symbol, axis=-1)
        if np.all(stop_counts >= num_stop_symbols):
            if verbose:
                print('Stopped')
            return output

    return output

In [15]:
def generate(transformer, input_sequence, target_sequence, input_tokenizer, target_tokenizer, steps, start_symbol, stop_symbol):
    """
    Generate `steps` new verses, one verse per step, starting from a seed.

    At every step `evaluate` extends the current decoder input, the last verse
    of the decoded text is appended to the result, and the last three verses
    become the context (decoder and encoder input) of the next step.

    Args:
        transformer: model passed through to `evaluate`.
        input_sequence: batch (list of lists of token ids) with the seed
            encoded by `input_tokenizer`; only used for the first step.
        target_sequence: batch with the same seed encoded by `target_tokenizer`.
        input_tokenizer: tokenizer used to encode the encoder input.
        target_tokenizer: tokenizer used to decode/encode the decoder side.
        steps: number of verses to generate.
        start_symbol: unused; kept for interface compatibility.
        stop_symbol: id of the end-of-verse token, passed to `evaluate`.

    Returns:
        The seed text followed by the generated verses, in target-tokenizer
        text form (special tokens included).
    """

    result = target_tokenizer.sequences_to_texts(target_sequence)[0]

    encoder_input = input_sequence
    decoder_input = target_sequence

    for _ in range(steps):

        encoder_input = tf.convert_to_tensor(encoder_input)
        decoder_input = tf.convert_to_tensor(decoder_input)
        output = evaluate(transformer, encoder_input, decoder_input, stop_symbol)

        generated_text = target_tokenizer.sequences_to_texts(output.numpy())[0]

        # Split into verses and re-attach the marker to each non-empty piece.
        # If evaluate hit its length cap, the last piece is a partial verse and
        # still gets a marker here.
        verses = [line.lstrip() + '<EOV> ' for line in generated_text.split('<EOV>') if line.strip() != '']

        # Every appended verse ends with a space but the decoded seed does not:
        # add the missing separator so that verses are always joined as
        # "<EOV> <GO>" (the first join used to yield "<EOV><GO>").
        separator = '' if result.endswith(' ') else ' '
        result = ''.join([result, separator, verses[-1]])

        # The three most recent verses are the context for the next step.
        context = ''.join(verses[-3:])

        decoder_input = target_tokenizer.texts_to_sequences([context])

        # Derive the encoder-side text from the same context.
        plain = remove_syll_token(context)
        # NOTE(review): this removes every space, so afterwards only the <...>
        # markers are whitespace-delimited. Confirm this matches the format of
        # the text input_tokenizer was fitted on.
        plain = re.sub(r"[ ]+", "", plain)
        # put a space on both sides of every <...> marker
        plain = re.sub(r"<[^>]*>", r" \g<0> ", plain)
        plain = re.sub("<EOV>  <GO>", "<EOV> <GO>", plain)
        plain = plain.strip()

        encoder_input = input_tokenizer.texts_to_sequences([plain])

    return result

In [16]:
# Ids of the verse start / end markers in the target vocabulary.
start_symbol, stop_symbol = (
    target_tokenizer.word_index[token] for token in ("<GO>", "<EOV>")
)

# Seed: the first 3-verse window, as a batch of one, in both encodings.
encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

# Generate 6 verses after the seed.
result = generate(
    transformer,
    encoder_input,
    decoder_input,
    input_tokenizer,
    target_tokenizer,
    steps=6,
    start_symbol=start_symbol,
    stop_symbol=stop_symbol,
)

Stopped
Stopped
Stopped
Stopped
Stopped
Stopped


In [17]:
# Print the generated text after strip_tokens (an imported project helper) has
# processed it; in the output below the special tokens no longer appear.
print(strip_tokens(result))

|Nel |mez|zo |del |cam|min |di |no|stra |vi|ta
|mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,
|ché |la |di|rit|ta |via |e|ra |smar|ri|ta.
|E |quel|l’ om|bre, |che |più |non |si |di|scer|ne,
|co|me |fu |l’ uom |che |non |suo |più |si |puo|te
|da |tut|te |l’ al|tre |par|ti |già |di|vi|na.
|La |tua |cit|tà |co|sì |com’ |io |ti |fio|ri
|a |ri|guar|dar |de |le |sa|cre |che |be|ne,
|quan|do |non |so |chi |son |si |fuor |di|ser|ra».


## 5. Syllabification

In [18]:
# Ids of the verse start / end markers in the target vocabulary
# (repeats the lookups done in the Generation section).
start_symbol = target_tokenizer.word_index["<GO>"]
stop_symbol = target_tokenizer.word_index["<EOV>"]

In [19]:
# Encoder input: the first 3-verse input window, as a batch of one.
# Decoder input: a single <GO> token, so the whole output is produced by the model.
encoder_input = tf.convert_to_tensor([input_text[0]])
decoder_input = tf.convert_to_tensor([[start_symbol]])

In [20]:
# Decode starting from the lone <GO> token. max_length=400 raises the cap on
# generated tokens above evaluate's default of 200.
syll_output = evaluate(transformer, encoder_input, decoder_input, stop_symbol, max_length=400)

Stopped


In [21]:
# Turn the generated token ids back into text with the target tokenizer.
print(target_tokenizer.sequences_to_texts(syll_output.numpy()))

['<GO> | p e r <SEP> | c h e <SEP> | l a <SEP> | v i | s t a <SEP> | m i a <SEP> | s o n <SEP> | l i <SEP> | d i | s e r | r a . <EOV> <GO> | Q u a n | d o <SEP> | n ’ <SEP> a | s c o n | d e | r e <SEP> i l <SEP> | m i o <SEP> | c a | p o <SEP> | f i | g l i o , <EOV> <GO> | u | d i | r e <SEP> e <SEP> | a l | t r o <SEP> a <SEP> | n o i , <SEP> | s e <SEP> | t u <SEP> | a | v e | r e <EOV> <GO> | c o | m e <SEP> | t u <SEP> | a <SEP> | t e <SEP> | c h e <SEP> | s i <SEP> | f a <SEP> | c o n | t e n | t a . <EOV>']


Could this be underfitting? The output above is well-formed, syllabified verse, but it does not reproduce the input tercet.

## 6. Save model

In [23]:
# Make sure the target directory exists before writing (a fresh runtime may not
# have it), then store the trained weights in HDF5 format.
os.makedirs('models', exist_ok=True)
transformer.save_weights('models/c2c-gen.h5')

In [24]:
# Build a fresh, untrained Transformer with the same architecture
# hyperparameters and vocabulary sizes as the trained one, to receive the
# saved weights.
# NOTE(review): pe_input, pe_target and rate presumably have to match the values
# used inside make_transformer_model — confirm.
new_transformer = Transformer(
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=0.1,
    **{key: config[key] for key in ("num_layers", "d_model", "num_heads", "dff")},
)

In [25]:
# The model's variables only exist after it has been called once, and weights
# can only be loaded into existing variables: run a single dummy forward pass.

# The content of inp / tar is irrelevant here; a one-token batch is enough.
inp = tf.convert_to_tensor([[start_symbol]])
tar = tf.convert_to_tensor([[start_symbol]])

# Masks matching the dummy batch.
enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp, tar)

# Forward pass with the third argument set to False; the trailing `;`
# suppresses the cell output.
new_transformer(inp, tar, False, enc_padding_mask, look_ahead_mask, dec_padding_mask);

In [26]:
# Restore the weights saved above into the freshly built model.
new_transformer.load_weights('models/c2c-gen.h5')

In [27]:
# Same seed and number of steps as in the Generation section, but using the
# model rebuilt from the saved weights.
encoder_input = [input_text[0]]
decoder_input = [target_text_tercet[0]]

result = generate(
    new_transformer,
    encoder_input,
    decoder_input,
    input_tokenizer,
    target_tokenizer,
    steps=6,
    start_symbol=start_symbol,
    stop_symbol=stop_symbol,
)

Stopped
Stopped
Stopped
Stopped
Stopped
Stopped


In [28]:
# Display the raw generated string, special tokens included.
result

'<GO> | N e l <SEP> | m e z | z o <SEP> | d e l <SEP> | c a m | m i n <SEP> | d i <SEP> | n o | s t r a <SEP> | v i | t a <EOV> <GO> | m i <SEP> | r i | t r o | v a i <SEP> | p e r <SEP> | u | n a <SEP> | s e l | v a <SEP> o | s c u | r a , <EOV> <GO> | c h é <SEP> | l a <SEP> | d i | r i t | t a <SEP> | v i a <SEP> | e | r a <SEP> | s m a r | r i | t a . <EOV><GO> | E <SEP> | q u e l | l ’ <SEP> o m | b r e , <SEP> | c h e <SEP> | p i ù <SEP> | n o n <SEP> | s i <SEP> | d i | s c e r | n e , <EOV> <GO> | c o | m e <SEP> | f u <SEP> | l ’ <SEP> u o m <SEP> | c h e <SEP> | n o n <SEP> | s u o <SEP> | p i ù <SEP> | s i <SEP> | p u o | t e <EOV> <GO> | d a <SEP> | t u t | t e <SEP> | l ’ <SEP> a l | t r e <SEP> | p a r | t i <SEP> | g i à <SEP> | d i | v i | n a . <EOV> <GO> | L a <SEP> | t u a <SEP> | c i t | t à <SEP> | c o | s ì <SEP> | c o m ’ <SEP> | i o <SEP> | t i <SEP> | f i o | r i <EOV> <GO> | a <SEP> | r i | g u a r | d a r <SEP> | d e <SEP> | l e <SEP> | s a | c r e <SEP> | c 

In [29]:
# Only when running on Google Colab: download the saved weights to the local machine.
# `files` is the google.colab module imported by the upload cell at the top.
if 'google.colab' in str(get_ipython()):
    files.download('models/c2c-gen.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>