In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
!tar zxvf deepcomedy.tar.gz

deepcomedy/
deepcomedy/util/
deepcomedy/util/predicate.py
deepcomedy/util/__init__.py
deepcomedy/util/.ipynb_checkpoints/
deepcomedy/util/.ipynb_checkpoints/predicate-checkpoint.py
deepcomedy/models/
deepcomedy/models/layers.py
deepcomedy/models/transformer.py
deepcomedy/models/__pycache__/
deepcomedy/models/__pycache__/layers.cpython-37.pyc
deepcomedy/models/__pycache__/__init__.cpython-37.pyc
deepcomedy/models/__pycache__/transformer.cpython-37.pyc
deepcomedy/models/__init__.py
deepcomedy/models/.ipynb_checkpoints/
deepcomedy/models/.ipynb_checkpoints/transformer-checkpoint.py
deepcomedy/preprocessing.py
deepcomedy/__pycache__/
deepcomedy/__pycache__/__init__.cpython-37.pyc
deepcomedy/__pycache__/preprocessing.cpython-37.pyc
deepcomedy/__init__.py
deepcomedy/.ipynb_checkpoints/


In [None]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import load_verses

## 1. Data preprocessing

In [None]:
input_file = "data/divina_textonly.txt"
target_file = "data/divina_syll_textonly.txt"

The `load_verses` function loads the file, splits it into verses, prepends the start_symbol and appends the end_symbol to each verse, then pads each verse to the lenght of the longest verse so that the tensor can be fed to our model.

In [None]:
raw_input_text, input_text, input_tokenizer = load_verses(
    input_file, char_level=False, pad=False
)
raw_target_text, target_text, target_tokenizer = load_verses(
    target_file, char_level=True, pad=False
)

In [None]:
print("Length of input text: {} characters".format(len(raw_input_text)))
print("Length of target text: {} characters".format(len(raw_target_text)))

Length of input text: 558637 characters
Length of target text: 873431 characters


In [None]:
input_vocab = sorted(set(input_tokenizer.word_index.keys()))
target_vocab = sorted(set(target_tokenizer.word_index.keys()))

# + 1 to account for padding token "0"
input_vocab_size = len(input_vocab) + 1
target_vocab_size = len(target_vocab) + 1

In [None]:
print("Input vocab size: {}".format(input_vocab_size))
print("Target vocab size: {}".format(target_vocab_size))

Input vocab size: 20750
Target vocab size: 82


In [None]:
input_tercets = []
target_tercets = []

for line in range(len(input_text) - 6):
    input_tercets.append(list(chain(*input_text[line : line + 3])))
    target_tercets.append(list(chain(*target_text[line + 3 : line + 6])))

In [None]:
padded_input = tf.keras.preprocessing.sequence.pad_sequences(
    input_tercets, padding="post"
)
padded_target = tf.keras.preprocessing.sequence.pad_sequences(
    target_tercets, padding="post"
)

In [None]:
input_train, input_test, target_train, target_test = train_test_split(
    padded_input, padded_target
)

## 2. The Transformer model


In [None]:
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_train) // BATCH_SIZE

EPOCHS = 10

num_layers = 4
d_model = 256
dff = 1024
num_heads = 8
dropout_rate = 0.1

max_length_targ, max_length_inp = target_train.shape[1], input_train.shape[1]

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(
    BUFFER_SIZE
)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate,
)

## 3. Training

In [None]:
checkpoint_path = "./checkpoints/word-level-gen"

transformer_trainer = TransformerTrainer(
    transformer, checkpoint_save_path=checkpoint_path
)

In [45]:
transformer_trainer.train(dataset, EPOCHS)

Epoch 1 Batch 0 Loss 0.9069 Accuracy 0.6899
Epoch 1 Batch 50 Loss 0.9018 Accuracy 0.6934
Epoch 1 Batch 100 Loss 0.9042 Accuracy 0.6928
Epoch 1 Batch 150 Loss 0.9058 Accuracy 0.6924
Epoch 1 Loss 0.9064 Accuracy 0.6922
Time taken for 1 epoch: 79.02 secs

Epoch 2 Batch 0 Loss 0.8778 Accuracy 0.7035
Epoch 2 Batch 50 Loss 0.8791 Accuracy 0.7011
Epoch 2 Batch 100 Loss 0.8841 Accuracy 0.6996
Epoch 2 Batch 150 Loss 0.8861 Accuracy 0.6989
Epoch 2 Loss 0.8864 Accuracy 0.6988
Time taken for 1 epoch: 78.58 secs

Epoch 3 Batch 0 Loss 0.8495 Accuracy 0.7125
Epoch 3 Batch 50 Loss 0.8615 Accuracy 0.7067
Epoch 3 Batch 100 Loss 0.8645 Accuracy 0.7057
Epoch 3 Batch 150 Loss 0.8666 Accuracy 0.7050
Epoch 3 Loss 0.8674 Accuracy 0.7049
Time taken for 1 epoch: 78.41 secs

Epoch 4 Batch 0 Loss 0.8545 Accuracy 0.7082
Epoch 4 Batch 50 Loss 0.8369 Accuracy 0.7148
Epoch 4 Batch 100 Loss 0.8439 Accuracy 0.7125
Epoch 4 Batch 150 Loss 0.8478 Accuracy 0.7111
Epoch 4 Loss 0.8484 Accuracy 0.7110
Time taken for 1 epoch: 

## 4. Generation

TODO change this :)

We define the *evaluate* function to preprocess the sentence in input to the encoder and to get the predicted ids of the translation.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We begin feeding the decoder with the id of the start symbol and, at each new step, we pass to the decoder the sequence it has just thrown out.

The translation stops when the end symbol is reached.

In [102]:
def generate_greedy(encoder_input, decoder_input):

    # encoder_input = tf.convert_to_tensor(encoder_input)
    encoder_input = tf.expand_dims(encoder_input, 0)

    # decoder_input = tf.convert_to_tensor(decoder_input)
    output = tf.expand_dims(decoder_input, 0)
    result = ""
    tokenized_result = []

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += target_tokenizer.index_word[predicted_id.numpy()[0][0]] + " "
        tokenized_result.append(predicted_id.numpy()[0][0])


        # return the result if the predicted_id is equal to the end token
        if predicted_id == target_tokenizer.word_index["$"]:
            result += "\n"
        if result.count("$") == 3:
            
            return result, tokenized_result

    # output.shape (1, tokens)

In [93]:
def clean_detokenized(x):
  x = ''.join(x.split('\n'))
  x = re.sub(r'\b \b', '', x)
  x = re.sub(r'| \b', '', x)
  x = re.sub(r'\b |', '', x)
  x = re.sub(r'\|', '', x)
  x = re.sub(r'\$', ' $', x)
  x = re.sub(r'[ ]+', ' ', x)
  return x

In [73]:
raw_target_text.split('\n')[6]

' |Ahi |quan|to a |dir |qual |e|ra è |co|sa |du|ra           '

In [94]:
clean_detokenized(target_tokenizer.sequences_to_texts([target_tercets[0]])[0])

'^ Ahi quanto a dir qual era è cosa dura $ ^ esta selva selvaggia e aspra e forte $ ^ che nel pensier rinova la paura! $'

## Feeding the encoder the last output

In [99]:
encoder_input = [input_tokenizer.word_index["^"]]
decoder_input = [target_tokenizer.word_index["^"]]

generated_text, _ = generate_greedy(encoder_input, decoder_input)
print(clean_detokenized(generated_text))

che mi fu per esser tutto quanto fori, $ ^ perché non so li occhi miei si disira, $ ^ e potreva che sola terra fami. $ 


In [100]:

tokenized_output = input_tokenizer.texts_to_sequences(clean_detokenized(generated_text))
generated_text, _ = generate_greedy(tokenized_generated, decoder_input)
print(generated_text)

| c h e   | m i   | p a r | l a | r e   e   | s o | n o   i n | t e n | d e r   | f u i , $ 
^ | t a l   | m i   | f e | c e   | s ì ,   | c h e   | p e r   | l o   | s t r a | d i | t o $ 
^ | t u t | t o   | m i   | p a r | l a r   | s o | l a   | t u a   | v i | s t a , $ 



## Feeding the decoder the last output

In [122]:
def clean_detokenized(x):
  x = ''.join(x.split('\n'))
  x = re.sub(r'\b \b', '', x)
  x = re.sub(r'| \b', '', x)
  x = re.sub(r'\b |', '', x)
  x = re.sub(r'\$', '$', x)
  x = re.sub(r'\^ ', '^', x)
  x = re.sub(r'[ ]+', ' ', x)
  return x

In [109]:
encoder_input = [input_tokenizer.word_index["^"]]
decoder_input = [target_tokenizer.word_index["^"]]

generated_text, tokenized_generated = generate_greedy(encoder_input, decoder_input)
print(clean_detokenized(generated_text))

|che |mi |fu |per |es|ser |tut|to |quan|to |fo|ri, $ ^ |per|ché |non |so |li oc|chi |miei |si |di|si|ra, $ ^ |e |po|tre|va |che |so|la |ter|ra |fa|mi. $ 


In [None]:
x = clean_detokenized(generated_text).split('$')[1:]
x = list(map(lambda x: x.strip(), x))
x = list(filter(lambda x: x != '', x))
x = list(map(lambda x: x + '$' , x))
x = ''.join(x)
x = target_tokenizer.texts_to_sequences([x])[0]
x

In [137]:
generated_text, tokenized_generated = generate_greedy(encoder_input, x)
print(generated_text)

^ | E   | q u e | s t a   | v i r | t ù   | c h i a | m a   i n   | m e z | z a   | f r e | s t a , $ 
^ | c o | m e   | f o | r a $ 
^ | t e   | s t a | t a | t o r o ,   | t a   | t a | t o r a   | t o r a   | t o , $ 



Abbiamo provato due modi per generare:
1. Dare all'encoder in input una terzina e ottenere la terzina successiva (come abbiamo allenato la rete a fare fondamentalmente), poi passare la terzina generata sempre all'encoder per ottenere la successiva e così via.
1. Dare all'encoder in input uno start symbol e al decoder gli ultimi due versi della terzina generata. Il risultato dovrebbe tenere in considerazione esclusivamente il verso che ne esce fuori (TODO modificare generate greedy in modo tale che restituisca esclusivamente il next verse).
1. TODO next provare a dare qualcosa all'encoder e al decoder contemporaneamente (es. contesto di generazione per il decoder generato dall'encoder?)

In [None]:
def generate_topk(encoder_input, decoder_input, k=5, temperature=0.5):

    encoder_input = tf.expand_dims(encoder_input, 0)

    output = tf.expand_dims(decoder_input, 0)

    result = ""

    output_ = []

    terces = 0

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predictions, indices = tf.math.top_k(predictions, k=k)

        predictions /= temperature
        predictions = np.squeeze(predictions, axis=0)
        indices = np.squeeze(indices, axis=0)
        indices = np.squeeze(indices, axis=0)
        pred = tf.random.categorical(predictions, num_samples=1)
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        predicted_id = indices[predicted_id]

        predicted_id = tf.expand_dims(predicted_id, 0)
        predicted_id = tf.expand_dims(predicted_id, 0)
        output = tf.concat([output, predicted_id], axis=-1)

        result += tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer.word_index["$"]:
            result += "\n"

        if result.count("$") == 3:
            terces += 1

        if terces == 3:
            return result

    # output.shape (1, tokens)

In [None]:
encoder_input = [tokenizer.word_index["^"]]
decoder_input = [tokenizer.word_index["^"]]

generated_text = generate_topk(encoder_input, decoder_input)
print(generated_text)

| s o | v r a   | l e   | s t e l | l e   | s t e l | l e   | d e l   | s u o   | v e r | b o , $ 
^ | p e r   | c h e   | l a   | v i | v a   | l u | c e   | c o n | v i e n   | c a | r e . $ 
^ | E   | q u e | s t o   | s i   | f e | c e   | c o n | v i e n   | c h e   | v a n | n o $ 
^ ^ 


In [None]:
sentence = "ciao"
encoder_input = [tokenizer.word_index[i] for i in list(map(str, sentence))]
decoder_input = [tokenizer.word_index[i] for i in list(map(str, sentence))]

generated_text = generate_topk(encoder_input, decoder_input)
print(generated_text)

| v e   | g i à   | m a i   | n o n   | f u   | m a i   | n é   | r i | s t r e t | t a . $ 
^ | O r   | s a i   | t u   | d i e | t r o ,   e   | n o n   | t i   | p a r | l a | v a | r o : $ 
^ | p e r   | c h e   | l e   | s t e l | l e   | c h e   ’ n   | s u   | l a   | p r o | p r i a $ 
^ | 
