In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    
    files.upload()

Saving data.zip to data.zip
Saving deepcomedy.zip to deepcomedy.zip


In [3]:
!pip install wandb
#!tar zxvf deepcomedy.tar.gz
!unzip deepcomedy.zip
#!tar zxvf data.tar.gz
!unzip data.zip

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/98/5f/45439b4767334b868e1c8c35b1b0ba3747d8c21be77b79f09eed7aa3c72b/wandb-0.10.30-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 8.1MB/s 
[?25hCollecting pathtools
  Downloading https://files.pythonhosted.org/packages/e7/7f/470d6fcdf23f9f3518f6b0b76be9df16dcc8630ad409947f8be2eb0ed13a/pathtools-0.1.2.tar.gz
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting subprocess32>=3.5.3
[?25l  Downloading https://files.pythonhosted.org/packages/32/c8/564be4d12629b912ea431f1a50eb8b3b9d00f1a0b1ceff17f266be190007/subprocess32-3.5.4.tar.gz (97kB)
[K     |████████████████████████████████| 102kB 12.9MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2f

In [4]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *

## 1. Data preprocessing

In [5]:
raw_text = open("./data/divina_textonly.txt", "rb").read().decode(encoding="utf-8")
raw_syll_text = (
    open("./data/divina_syll_textonly.txt", "rb").read().decode(encoding="utf-8")
)
syll_text = preprocess_text(raw_syll_text, end_of_verse = "")
text = preprocess_text(raw_text, end_of_verse = "", word_level= True)

Split preprocessed text into verses

In [6]:
sep = "<EOT>"
input_tercets = [x + sep for x in text.split(sep)][:-1]
target_tercets = [x + sep for x in syll_text.split(sep)][:-1]

Encode with input and target tokenizers

In [7]:
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
input_tokenizer.fit_on_texts(input_tercets)

target_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
target_tokenizer.fit_on_texts(target_tercets)

enc_input_tercets = input_tokenizer.texts_to_sequences(input_tercets)
enc_target_tercets = target_tokenizer.texts_to_sequences(target_tercets)

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [8]:
input_text = []
target_text = []

for line in range(len(enc_input_tercets) - 2):
    input_text.append(list(chain(*enc_input_tercets[line : line + 2])))
    target_text.append(list(chain(*enc_target_tercets[line + 1 : line + 3])))

Pad sequences

In [9]:
padded_input_text = tf.keras.preprocessing.sequence.pad_sequences(
    input_text, padding="post"
)
padded_target_text = tf.keras.preprocessing.sequence.pad_sequences(
    target_text, padding="post"
)

In [10]:
input_train, input_test, target_train, target_test = train_test_split(
    padded_input_text, padded_target_text
)

## 2. The Transformer model


In [11]:
def make_dataset(input_verses, target_verses, batch_size):
    buffer_size = len(input_verses)

    steps_per_epoch = len(input_verses) // batch_size

    dataset = tf.data.Dataset.from_tensor_slices(
        (input_train, target_train)
    ).shuffle(buffer_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    return dataset

In [12]:
batch_size = 32
dataset = make_dataset(input_train, target_train, batch_size)

In [13]:
def make_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path = None):
    transformer = Transformer(
        num_layers=config["num_layers"],
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        dff=config["dff"],
        input_vocab_size=input_vocab_size,
        target_vocab_size=target_vocab_size,
        pe_input=1000,
        pe_target=1000,
        rate=0.1,
    )
    transformer_trainer = TransformerTrainer(transformer, checkpoint_save_path= checkpoint_save_path)

    return transformer, transformer_trainer

In [14]:
config = {
    "num_layers" : 6,
    "d_model" : 256,
    "num_heads" : 8,
    "dff" : 1024
}

checkpoint_save_path = "./checkpoints/word-input_char-output_gen"

In [15]:
transformer, transformer_trainer = make_model(config, input_vocab_size, target_vocab_size, checkpoint_save_path= checkpoint_save_path)

## 3. Training

In [16]:
transformer_trainer.train(dataset, 100)

Epoch 1 Batch 0 Loss 5.1266 Accuracy 0.0092
Epoch 1 Batch 50 Loss 3.9077 Accuracy 0.1471
Epoch 1 Batch 100 Loss 3.4774 Accuracy 0.1799
Epoch 1 Loss 3.4281 Accuracy 0.1836
Time taken for 1 epoch: 57.82 secs

Epoch 2 Batch 0 Loss 2.9811 Accuracy 0.2214
Epoch 2 Batch 50 Loss 2.8181 Accuracy 0.2403
Epoch 2 Batch 100 Loss 2.6015 Accuracy 0.2721
Epoch 2 Loss 2.5633 Accuracy 0.2777
Time taken for 1 epoch: 46.03 secs

Epoch 3 Batch 0 Loss 2.2175 Accuracy 0.3313
Epoch 3 Batch 50 Loss 2.1317 Accuracy 0.3420
Epoch 3 Batch 100 Loss 2.0948 Accuracy 0.3460
Epoch 3 Loss 2.0879 Accuracy 0.3467
Time taken for 1 epoch: 45.73 secs

Epoch 4 Batch 0 Loss 2.0237 Accuracy 0.3517
Epoch 4 Batch 50 Loss 2.0013 Accuracy 0.3578
Epoch 4 Batch 100 Loss 1.9907 Accuracy 0.3594
Epoch 4 Loss 1.9890 Accuracy 0.3595
Time taken for 1 epoch: 45.86 secs

Epoch 5 Batch 0 Loss 1.9899 Accuracy 0.3602
Epoch 5 Batch 50 Loss 1.9557 Accuracy 0.3658
Epoch 5 Batch 100 Loss 1.9474 Accuracy 0.3682
Saving checkpoint for epoch 5 at ./ch

KeyboardInterrupt: ignored

## 4. Generation

In [17]:
def generate_greedy(encoder_input, decoder_input):

    # encoder_input = tf.convert_to_tensor(encoder_input)
    encoder_input = tf.expand_dims(encoder_input, 0)

    # decoder_input = tf.convert_to_tensor(decoder_input)
    output = tf.expand_dims(decoder_input, 0)
    result = "<GO> "
    tokenized_result = [target_tokenizer.word_index["<GO>"]]

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += target_tokenizer.index_word[predicted_id.numpy()[0][0]] + " "
        tokenized_result.append(predicted_id.numpy()[0][0])

        if predicted_id == target_tokenizer.word_index["<EOT>"]:
            return result, tokenized_result

In [18]:
def generate_topk(encoder_input, decoder_input, k=5, temperature=0.5):

    encoder_input = tf.expand_dims(encoder_input, 0)

    output = tf.expand_dims(decoder_input, 0)

    result = "<GO> "
    tokenized_result = [target_tokenizer.word_index["<GO>"]]

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predictions, indices = tf.math.top_k(predictions, k=k)

        predictions /= temperature
        predictions = np.squeeze(predictions, axis=0)
        indices = np.squeeze(indices, axis=0)
        indices = np.squeeze(indices, axis=0)
        pred = tf.random.categorical(predictions, num_samples=1)
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        predicted_id = indices[predicted_id]

        predicted_id = tf.expand_dims(predicted_id, 0)
        predicted_id = tf.expand_dims(predicted_id, 0)
        output = tf.concat([output, predicted_id], axis=-1)

        result += target_tokenizer.index_word[predicted_id.numpy()[0][0]] + " "
        tokenized_result.append(predicted_id.numpy()[0][0])

        if predicted_id == target_tokenizer.word_index["<EOT>"]:
            return result, tokenized_result

Abbiamo provato due modi per generare:
1. Dare all'encoder in input una terzina e ottenere la terzina successiva (come abbiamo allenato la rete a fare fondamentalmente), poi passare la terzina generata sempre all'encoder per ottenere la successiva e così via.
1. Dare all'encoder in input uno start symbol e al decoder gli ultimi due versi della terzina generata. Il risultato dovrebbe tenere in considerazione esclusivamente il verso che ne esce fuori (TODO modificare generate greedy in modo tale che restituisca esclusivamente il next verse).
1. TODO next provare a dare qualcosa all'encoder e al decoder contemporaneamente (es. contesto di generazione per il decoder generato dall'encoder?)

## Feeding the encoder the last output

In [19]:
def clean(x):
  x = re.sub(r'\| \b', '', x)
  x = re.sub(r'\b \|', '', x)
  x = re.sub(r'\|', '', x)
  x = re.sub(r'[ ]+', ' ', x)
  x = re.sub(r'\b \b', '', x)
  return x

In [29]:
encoder_input = [input_tokenizer.word_index["<GO>"]]
decoder_input = [target_tokenizer.word_index["<GO>"]]

generated_text, generated_tokenized = generate_greedy(encoder_input, decoder_input)
print(generated_text)
print(clean(generated_text))

<GO> | E <SEP> | i o <SEP> | a <SEP> | l u i : « <SEP> | S e <SEP> | t u <SEP> | v u o ’ <SEP> | c h ’ <SEP> i o <SEP> | t i <SEP> | r i e | d i <GO> | l a <SEP> | p r i | m a <SEP> | c h e <SEP> | t a n | t o <SEP> | d i | s c o r | d e | r e , <GO> | p e r <SEP> | l o <SEP> | s e | g n o r <SEP> | d e l <SEP> | m o n | d o <SEP> | s i <SEP> | r i | c h i e | d i » . <EOT> 
<GO> E <SEP> io <SEP> a <SEP> lui : « <SEP> Se <SEP> tu <SEP> vuo ’ <SEP> ch ’ <SEP> io <SEP> ti <SEP> riedi <GO> la <SEP> prima <SEP> che <SEP> tanto <SEP> discordere , <GO> per <SEP> lo <SEP> segnor <SEP> del <SEP> mondo <SEP> si <SEP> richiedi » . <EOT> 


In [24]:
tokenized_generated = input_tokenizer.texts_to_sequences([clean(generated_text)])
print(tokenized_generated)

[[2, 30, 1, 26, 1, 8, 1, 70, 1, 281, 1, 29, 1, 216, 1, 216, 1, 26, 1, 37, 1, 2, 6, 1, 83, 1, 5, 1, 49, 1, 2, 10, 1, 27, 1, 748, 1, 25, 1, 95, 1, 12, 1, 643, 3]]


In [27]:
generated_text_2, _ = generate_greedy(tokenized_generated[0], decoder_input)
print(generated_text_2)
print(clean(generated_text_2))

<GO> | E <SEP> | i o <SEP> | a <SEP> | l u i : « <SEP> | S e <SEP> | t u <SEP> | t i <SEP> | t i <SEP> | t i <SEP> | c o | t a <GO> | d i m | m i , <SEP> | s e <SEP> | t u <SEP> | l a <SEP> | m e | m o | r i a <SEP> | t e m | p o <SEP> | c e n | n o <GO> | c h e <SEP> | l ’ <SEP> a | n i | m a <SEP> | t u a <SEP> | q u e | s t i o n <SEP> | t i <SEP> | r i | t o | t a » . <EOT> 
<GO> E <SEP> io <SEP> a <SEP> lui : « <SEP> Se <SEP> tu <SEP> ti <SEP> ti <SEP> ti <SEP> cota <GO> dimmi , <SEP> se <SEP> tu <SEP> la <SEP> memoria <SEP> tempo <SEP> cenno <GO> che <SEP> l ’ <SEP> anima <SEP> tua <SEP> question <SEP> ti <SEP> ritota » . <EOT> 


## Feeding the decoder the last output

In [30]:
print(generated_tokenized)

[14, 1, 39, 2, 1, 5, 6, 2, 1, 4, 2, 1, 9, 15, 5, 34, 32, 2, 1, 42, 3, 2, 1, 10, 15, 2, 1, 19, 15, 6, 20, 2, 1, 12, 22, 20, 2, 5, 6, 2, 1, 10, 5, 2, 1, 8, 5, 3, 1, 13, 5, 14, 1, 9, 4, 2, 1, 17, 8, 5, 1, 16, 4, 2, 1, 12, 22, 3, 2, 1, 10, 4, 7, 1, 10, 6, 2, 1, 13, 5, 1, 11, 12, 6, 8, 1, 13, 3, 1, 8, 3, 18, 14, 1, 17, 3, 8, 2, 1, 9, 6, 2, 1, 11, 3, 1, 21, 7, 6, 8, 2, 1, 13, 3, 9, 2, 1, 16, 6, 7, 1, 13, 6, 2, 1, 11, 5, 2, 1, 8, 5, 1, 12, 22, 5, 3, 1, 13, 5, 33, 25, 24]


In [31]:
generated_text_3, _ = generate_greedy(encoder_input, generated_tokenized)
print(generated_text_2)
print(clean(generated_text_2))

<GO> | E <SEP> | i o <SEP> | a <SEP> | l u i : « <SEP> | S e <SEP> | t u <SEP> | t i <SEP> | t i <SEP> | t i <SEP> | c o | t a <GO> | d i m | m i , <SEP> | s e <SEP> | t u <SEP> | l a <SEP> | m e | m o | r i a <SEP> | t e m | p o <SEP> | c e n | n o <GO> | c h e <SEP> | l ’ <SEP> a | n i | m a <SEP> | t u a <SEP> | q u e | s t i o n <SEP> | t i <SEP> | r i | t o | t a » . <EOT> 
<GO> E <SEP> io <SEP> a <SEP> lui : « <SEP> Se <SEP> tu <SEP> ti <SEP> ti <SEP> ti <SEP> cota <GO> dimmi , <SEP> se <SEP> tu <SEP> la <SEP> memoria <SEP> tempo <SEP> cenno <GO> che <SEP> l ’ <SEP> anima <SEP> tua <SEP> question <SEP> ti <SEP> ritota » . <EOT> 
