In [2]:
if 'google.colab' in str(get_ipython()):
    from google.colab import files
    
    files.upload()

Saving deepcomedy.zip to deepcomedy.zip


In [3]:
!pip install wandb
#!tar zxvf deepcomedy.tar.gz
!unzip deepcomedy.zip
#!tar zxvf data.tar.gz
!unzip data.zip

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/98/5f/45439b4767334b868e1c8c35b1b0ba3747d8c21be77b79f09eed7aa3c72b/wandb-0.10.30-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 13.7MB/s 
[?25hCollecting pathtools
  Downloading https://files.pythonhosted.org/packages/e7/7f/470d6fcdf23f9f3518f6b0b76be9df16dcc8630ad409947f8be2eb0ed13a/pathtools-0.1.2.tar.gz
Collecting configparser>=3.8.1
  Downloading https://files.pythonhosted.org/packages/fd/01/ff260a18caaf4457eb028c96eeb405c4a230ca06c8ec9c1379f813caa52e/configparser-5.0.2-py3-none-any.whl
Collecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting GitPython>=1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/27/da/6f6224fdfc47dab57881fe20c0d1bc3122be290198ba0bf26a953a045d92/GitPython-3.1.17-py3-none-any.whl (166kB)
[K     |███████

In [4]:
import io
import os
import re
import time
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import wandb
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *

## 1. Data loading and preprocessing

In [5]:
raw_text = open("./data/divina_textonly.txt", "rb").read().decode(encoding="utf-8")
raw_syll_text = (
    open("./data/divina_syll_textonly.txt", "rb").read().decode(encoding="utf-8")
)
syll_text = preprocess_text(raw_syll_text, end_of_tercet="")
text = preprocess_text(raw_text, end_of_tercet="")

Split preprocessed text into verses

In [6]:
sep = "<EOV>"
input_verses = [x + sep for x in text.split(sep)][:-1]
target_verses = [x + sep for x in syll_text.split(sep)][:-1]

Encode with tokenizer

In [7]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=False, filters="", lower=False
)
tokenizer.fit_on_texts(target_verses)
enc_input_verses = tokenizer.texts_to_sequences(input_verses)
enc_target_verses = tokenizer.texts_to_sequences(target_verses)
vocab_size = len(tokenizer.word_index) + 1

Pad sequences

In [8]:
input_text = tf.keras.preprocessing.sequence.pad_sequences(
    enc_input_verses, padding="post"
)
target_text = tf.keras.preprocessing.sequence.pad_sequences(
    enc_target_verses, padding="post"
)

In [9]:
input_train, input_test, target_train, target_test = train_test_split(
    input_text, target_text
)

## 2. The Transformer model


The dataset is created by grouping the lines in batches and by shuffling them.

Each input's line is in correspondence with its target.

In [10]:
def make_dataset(input_verses, target_verses, batch_size):
    buffer_size = len(input_verses)

    steps_per_epoch = len(input_verses) // batch_size

    dataset = tf.data.Dataset.from_tensor_slices(
        (input_train, target_train)
    ).shuffle(buffer_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    return dataset

In [11]:
batch_size = 32
dataset = make_dataset(input_train, target_train, batch_size)

In [12]:
def make_model(config, vocab_size, checkpoint_save_path = None):
    transformer = Transformer(
        num_layers=config["num_layers"],
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        dff=config["dff"],
        input_vocab_size=vocab_size,
        target_vocab_size=vocab_size,
        pe_input=1000,
        pe_target=1000,
        rate=0.1,
    )
    transformer_trainer = TransformerTrainer(transformer, checkpoint_save_path)

    return transformer, transformer_trainer

In [14]:
config = {
    "num_layers" : 6,
    "d_model" : 256,
    "num_heads" : 8,
    "dff" : 1024
}

checkpoint_save_path = "./checkpoints/char-level-syll"

In [15]:
transformer, transformer_trainer = make_model(config, vocab_size) # checkpoint_save_path

## 3. Training

In [16]:
transformer_trainer.train(dataset, 10)

Epoch 1 Batch 0 Loss 6.0993 Accuracy 0.0006
Epoch 1 Batch 50 Loss 4.5459 Accuracy 0.1048
Epoch 1 Batch 100 Loss 3.8027 Accuracy 0.1580
Epoch 1 Batch 150 Loss 3.5138 Accuracy 0.1788
Epoch 1 Batch 200 Loss 3.2762 Accuracy 0.2061
Epoch 1 Batch 250 Loss 3.0547 Accuracy 0.2380
Epoch 1 Batch 300 Loss 2.8792 Accuracy 0.2645
Epoch 1 Loss 2.7856 Accuracy 0.2789
Time taken for 1 epoch: 85.60 secs

Epoch 2 Batch 0 Loss 1.8671 Accuracy 0.4147
Epoch 2 Batch 50 Loss 1.8407 Accuracy 0.4226
Epoch 2 Batch 100 Loss 1.8092 Accuracy 0.4292
Epoch 2 Batch 150 Loss 1.7736 Accuracy 0.4386
Epoch 2 Batch 200 Loss 1.7432 Accuracy 0.4465
Epoch 2 Batch 250 Loss 1.7144 Accuracy 0.4540
Epoch 2 Batch 300 Loss 1.6864 Accuracy 0.4606
Epoch 2 Loss 1.6695 Accuracy 0.4649
Time taken for 1 epoch: 68.43 secs

Epoch 3 Batch 0 Loss 1.4952 Accuracy 0.5052
Epoch 3 Batch 50 Loss 1.4940 Accuracy 0.5119
Epoch 3 Batch 100 Loss 1.4707 Accuracy 0.5172
Epoch 3 Batch 150 Loss 1.4539 Accuracy 0.5212
Epoch 3 Batch 200 Loss 1.4391 Accurac

## 4. Syllabification

We define the *translate* function to preprocess the sentence in input to the encoder and to get the predicted ids of the translation.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We begin feeding the decoder with the id of the GO symbol and, at each new step, we pass to the decoder the sequence it has just thrown out.

The translation stops when the EOV symbol is reached.

In [43]:
def translate(sentence, max_length=200):

    encoder_input = preprocess_text(sentence, end_of_tercet="")
    encoder_input = tokenizer.texts_to_sequences([encoder_input])
    print(encoder_input)
    encoder_input = tf.convert_to_tensor(encoder_input)

    output = tf.convert_to_tensor([tokenizer.word_index["<GO>"]])
    output = tf.expand_dims(output, 0)
    result = ""

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer.word_index["<EOV>"]:
            break

    # output.shape (1, tokens)

    return result

In [18]:
def print_translation(sentence, result, ground_truth):
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Prediction":15s}: {result}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [44]:
sentence = "E come l’aere, quand’ è ben pïorno,"
ground_truth = "|E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,"

translated_text = translate(sentence)
print_translation(sentence, translated_text, ground_truth)

[[14, 39, 2, 12, 6, 17, 3, 2, 9, 21, 4, 3, 8, 3, 19, 2, 26, 16, 4, 7, 13, 21, 2, 36, 2, 27, 3, 7, 2, 18, 43, 6, 8, 7, 6, 19, 15]]
Input:         : E come l’aere, quand’ è ben pïorno,
Prediction     : | E <SEP> | c o | m e <SEP> | l ’ <SEP> a | e | r e , <SEP> | q u a n | d ’ <SEP> è <SEP> | b e n <SEP> | p ï | o r | n o , <EOV> 
Ground truth   : |E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,


In [45]:
!tar zcvf checkpoints.tar.gz checkpoints

checkpoints/
checkpoints/train/
checkpoints/train/checkpoint
checkpoints/train/ckpt-2.data-00000-of-00001
checkpoints/train/ckpt-2.index
checkpoints/train/ckpt-1.data-00000-of-00001
checkpoints/train/ckpt-1.index


In [46]:
if 'google.colab' in str(get_ipython()):
    files.download('checkpoints.tar.gz')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>