In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
!pip install wandb

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/98/5f/45439b4767334b868e1c8c35b1b0ba3747d8c21be77b79f09eed7aa3c72b/wandb-0.10.30-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 4.0MB/s 
[?25hCollecting GitPython>=1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/27/da/6f6224fdfc47dab57881fe20c0d1bc3122be290198ba0bf26a953a045d92/GitPython-3.1.17-py3-none-any.whl (166kB)
[K     |████████████████████████████████| 174kB 47.7MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting subprocess32>=3.5.3
[?25l  Downloading https://files.pythonhosted.org/packages/32/c8/564be4d12629b912ea

In [None]:
!tar zxvf deepcomedy.tar.gz

deepcomedy/
deepcomedy/util/
deepcomedy/util/predicate.py
deepcomedy/util/__pycache__/
deepcomedy/util/__pycache__/predicate.cpython-37.pyc
deepcomedy/util/__pycache__/__init__.cpython-37.pyc
deepcomedy/util/__init__.py
deepcomedy/util/.ipynb_checkpoints/
deepcomedy/util/.ipynb_checkpoints/predicate-checkpoint.py
deepcomedy/models/
deepcomedy/models/layers.py
deepcomedy/models/transformer.py
deepcomedy/models/__pycache__/
deepcomedy/models/__pycache__/layers.cpython-37.pyc
deepcomedy/models/__pycache__/__init__.cpython-37.pyc
deepcomedy/models/__pycache__/transformer.cpython-37.pyc
deepcomedy/models/__init__.py
deepcomedy/models/.ipynb_checkpoints/
deepcomedy/models/.ipynb_checkpoints/transformer-checkpoint.py
deepcomedy/preprocessing.py
deepcomedy/__pycache__/
deepcomedy/__pycache__/__init__.cpython-37.pyc
deepcomedy/__pycache__/preprocessing.cpython-37.pyc
deepcomedy/__init__.py
deepcomedy/.ipynb_checkpoints/


In [None]:
!tar zxvf data.tar.gz

data/
data/orlando.txt
data/divina_textonly.txt
data/divina.txt
data/divina_syll_textonly.txt
data/orlando-textonly.txt
data/divina_syll.txt
data/.ipynb_checkpoints/
data/.ipynb_checkpoints/orlando-checkpoint.txt
data/.ipynb_checkpoints/orlando-textonly-checkpoint.txt
data/.ipynb_checkpoints/divina_textonly-checkpoint.txt
data/.ipynb_checkpoints/divina_syll-checkpoint.txt
data/.ipynb_checkpoints/divina-checkpoint.txt
data/.ipynb_checkpoints/divina_syll_textonly-checkpoint.txt


In [None]:
import io
import os
import re
import time
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import wandb
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing
from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import load_verses

## 1. Data loading and preprocessing

In [None]:
input_file = "data/divina_textonly.txt"
target_file = "data/divina_syll_textonly.txt"

The `load_verses` function loads the file, splits it into verses, prepends the start_symbol and appends the end_symbol to each verse, then pads each verse to the lenght of the longest verse so that the tensor can be fed to our model.

In [None]:
raw_input_text, input_text = load_verses(input_file, char_level=True, pad=True, tokenize=False)
raw_target_text, target_text, target_tokenizer = load_verses(target_file, char_level=True, pad=True)

In [None]:
input_text = target_tokenizer.texts_to_sequences(input_text)

In [None]:
input_text = tf.keras.preprocessing.sequence.pad_sequences(input_text, padding='post')

In [None]:
print("Length of input text: {} characters".format(len(raw_input_text)))
print("Length of target text: {} characters".format(len(raw_target_text)))

Length of input text: 558637 characters
Length of target text: 873431 characters


In [None]:
input_vocab = sorted(set(input_tokenizer.word_index.keys()))
target_vocab = sorted(set(target_tokenizer.word_index.keys()))
input_vocab_size = len(input_vocab)
target_vocab_size = len(target_vocab)

In [None]:
print("Input vocab size: {}".format(input_vocab_size))
print("Target vocab size: {}".format(target_vocab_size))

Input vocab size: 80
Target vocab size: 81


In [None]:
input_train, input_test, target_train, target_test = train_test_split(
    input_text, target_text
    )

## 2. The Transformer model


The dataset is created by grouping the lines in batches and by shuffling them.

Each input's line is in correspondence with its target.

In [None]:
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_train) // BATCH_SIZE
num_layers = 6
d_model = 256
dff = 1024
num_heads = 8
dropout_rate = 0.1

vocab_size = (
    len(target_tokenizer.word_index) + 1
)  # the +1 is added to take into account the padding token

max_length_targ, max_length_inp = target_text.shape[1], input_text.shape[1]

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(
    BUFFER_SIZE
)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate
)

## 3. Training

In [None]:
transformer_trainer = TransformerTrainer(transformer, checkpoint_save_path='./checkpoints/char-level-syll_1')

In [None]:
transformer_trainer.train(dataset, 20)

Epoch 1 Batch 0 Loss 4.9341 Accuracy 0.0093
Epoch 1 Batch 50 Loss 3.8692 Accuracy 0.1576
Epoch 1 Batch 100 Loss 3.4398 Accuracy 0.1866
Epoch 1 Batch 150 Loss 3.2315 Accuracy 0.2046
Epoch 1 Loss 3.1646 Accuracy 0.2141
Time taken for 1 epoch: 26.92 secs

Epoch 2 Batch 0 Loss 2.4334 Accuracy 0.3245
Epoch 2 Batch 50 Loss 2.2203 Accuracy 0.3599
Epoch 2 Batch 100 Loss 2.1075 Accuracy 0.3770
Epoch 2 Batch 150 Loss 2.0340 Accuracy 0.3901
Epoch 2 Loss 2.0150 Accuracy 0.3937
Time taken for 1 epoch: 16.77 secs

Epoch 3 Batch 0 Loss 1.7998 Accuracy 0.4356
Epoch 3 Batch 50 Loss 1.7650 Accuracy 0.4421
Epoch 3 Batch 100 Loss 1.7216 Accuracy 0.4528
Epoch 3 Batch 150 Loss 1.6853 Accuracy 0.4623
Epoch 3 Loss 1.6751 Accuracy 0.4650
Time taken for 1 epoch: 16.77 secs

Epoch 4 Batch 0 Loss 1.5592 Accuracy 0.4948
Epoch 4 Batch 50 Loss 1.5289 Accuracy 0.5027
Epoch 4 Batch 100 Loss 1.5050 Accuracy 0.5082
Epoch 4 Batch 150 Loss 1.4860 Accuracy 0.5128
Epoch 4 Loss 1.4808 Accuracy 0.5145
Time taken for 1 epoch: 

KeyboardInterrupt: ignored

To train the decoder we use teacher forcing, calculating the loss between the predicted logits and the real id of the character.

## 4. Syllabification

We define the *evaluate* function to preprocess the sentence in input to the encoder and to get the predicted ids of the translation.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We begin feeding the decoder with the id of the start symbol and, at each new step, we pass to the decoder the sequence it has just thrown out.

The translation stops when the end symbol is reached.

In [None]:
raw_target_text

'\n\n |Nel |mez|zo |del |cam|min |di |no|stra |vi|ta             \n |mi |ri|tro|vai |per |u|na |sel|va o|scu|ra,               \n |ché |la |di|rit|ta |via |e|ra |smar|ri|ta.                \n\n |Ahi |quan|to a |dir |qual |e|ra è |co|sa |du|ra           \n |e|sta |sel|va |sel|vag|gia e |a|spra e |for|te            \n |che |nel |pen|sier |ri|no|va |la |pa|u|ra!                \n\n |Tan|t’ è |a|ma|ra |che |po|co è |più |mor|te;             \n |ma |per |trat|tar |del |ben |ch’ i’ |vi |tro|vai,         \n |di|rò |de |l’ al|tre |co|se |ch’ i’ |v’ ho |scor|te.      \n\n |Io |non |so |ben |ri|dir |com’ |i’ |v’ in|trai,           \n |tan|t’ e|ra |pien |di |son|no |a |quel |pun|to            \n |che |la |ve|ra|ce |via |ab|ban|do|nai.                    \n\n |Ma |poi |ch’ i’ |fui |al |piè |d’ un |col|le |giun|to,    \n |là |do|ve |ter|mi|na|va |quel|la |val|le                  \n |che |m’ a|vea |di |pa|u|ra il |cor |com|pun|to,           \n\n |guar|dai |in |al|to e |vi|di |le |sue |spal|le       

In [None]:

tokenizer.word_index['^']

KeyError: ignored

In [None]:
tokenizer = target_tokenizer

In [None]:
def evaluate(sentence, max_length=200):

    encoder_input = [tokenizer.word_index[i] for i in list(map(str, sentence))]
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(
        [encoder_input], maxlen=max_length, padding="post"
    )
    encoder_input = tf.convert_to_tensor(encoder_input)

    output = tf.convert_to_tensor([tokenizer.word_index["^"]])
    output = tf.expand_dims(output, 0)
    result = ""

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer.word_index["$"]:
            break

    # output.shape (1, tokens)

    return result

In [None]:
def print_translation(sentence, result, ground_truth):
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Prediction":15s}: {result}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [None]:
sentence = "^E come l’aere, quand’ è ben pïorno,$"
ground_truth = "|E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,"


translated_text = evaluate(sentence)
print_translation(sentence, translated_text, ground_truth)

Input:         : ^E come l’aere, quand’ è ben pïorno,$
Prediction     : | E   | c o | m e   | l ’   e | r e ,   | q u a n | d ’   è   | b e n   | p ï | o r | n o , $ 
Ground truth   : |E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,


In [None]:
!tar zcvf checkpoints.tar.gz checkpoints

checkpoints/
checkpoints/char-level-syll/
checkpoints/char-level-syll/ckpt-2.data-00000-of-00001
checkpoints/char-level-syll/ckpt-3.data-00000-of-00001
checkpoints/char-level-syll/checkpoint
checkpoints/char-level-syll/ckpt-4.index
checkpoints/char-level-syll/ckpt-4.data-00000-of-00001
checkpoints/char-level-syll/ckpt-2.index
checkpoints/char-level-syll/ckpt-1.index
checkpoints/char-level-syll/ckpt-3.index
checkpoints/char-level-syll/ckpt-1.data-00000-of-00001


In [None]:
from google.colab import files
files.download('checkpoints.tar.gz') 