In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Only when the notebook runs on Google Colab: open the browser upload dialog
# so the archives used by the following cells can be provided.
if "google.colab" in str(get_ipython()):
    from google.colab import files

    files.upload()

Saving data.zip to data.zip
Saving deepcomedy.zip to deepcomedy.zip


In [None]:
!pip install wandb
#!tar zxvf deepcomedy.tar.gz
!unzip deepcomedy.zip
#!tar zxvf data.tar.gz
!unzip data.zip

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/98/5f/45439b4767334b868e1c8c35b1b0ba3747d8c21be77b79f09eed7aa3c72b/wandb-0.10.30-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 2.9MB/s 
[?25hCollecting configparser>=3.8.1
  Downloading https://files.pythonhosted.org/packages/fd/01/ff260a18caaf4457eb028c96eeb405c4a230ca06c8ec9c1379f813caa52e/configparser-5.0.2-py3-none-any.whl
Collecting subprocess32>=3.5.3
[?25l  Downloading https://files.pythonhosted.org/packages/32/c8/564be4d12629b912ea431f1a50eb8b3b9d00f1a0b1ceff17f266be190007/subprocess32-3.5.4.tar.gz (97kB)
[K     |████████████████████████████████| 102kB 8.5MB/s 
[?25hCollecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/1c/4a/a54b254f67d8f4052338d54ebe90126f200693440a93ef76d254d581e3ec/sentry_sdk-1.1.0-py2.py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 20.6MB/s 
Collecting pathtools
  Downloading https://fi

In [None]:
import io
import os
import re
import time
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import *
from itertools import chain

## 1. Data preprocessing

In [None]:
# Read both corpora as bytes and decode them as UTF-8. The context managers
# make sure the file handles are closed as soon as the content is loaded.
with open("./data/divina_textonly.txt", "rb") as text_file:
    raw_text = text_file.read().decode(encoding="utf-8")
with open("./data/divina_syll_textonly.txt", "rb") as syll_file:
    raw_syll_text = syll_file.read().decode(encoding="utf-8")

# Both texts go through the same preprocessing, with an empty end-of-verse marker.
syll_text = preprocess_text(raw_syll_text, end_of_verse="")
text = preprocess_text(raw_text, end_of_verse="")

Split the preprocessed text into tercets (each one keeps its trailing `<EOT>` separator)

In [None]:
sep = "<EOT>"
# split() leaves whatever follows the final separator as its last item; that
# trailing piece is dropped, and the separator is re-attached to every other piece.
input_tercets = [tercet + sep for tercet in text.split(sep)[:-1]]
target_tercets = [tercet + sep for tercet in syll_text.split(sep)[:-1]]

Encode with tokenizer

In [None]:
# Tokens are kept exactly as written: no characters are filtered out, case is
# preserved, and the tokenizer does not operate at character level.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    lower=False, filters="", char_level=False
)

# The vocabulary is learned from the target tercets only; the input tercets
# are then encoded with that same vocabulary.
tokenizer.fit_on_texts(target_tercets)
enc_target_tercets = tokenizer.texts_to_sequences(target_tercets)
enc_input_tercets = tokenizer.texts_to_sequences(input_tercets)

# One extra slot on top of the learned vocabulary (index 0 is used for padding).
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Sliding window over consecutive tercets: the encoder sees tercets
# (i, i + 1) and the decoder target is the window shifted by one, (i + 1, i + 2).
input_text, target_text = [], []

for start in range(len(enc_input_tercets) - 2):
    source_window = enc_input_tercets[start : start + 2]
    target_window = enc_target_tercets[start + 1 : start + 3]
    # Flatten each two-tercet window into a single token-id sequence.
    input_text.append(list(chain.from_iterable(source_window)))
    target_text.append(list(chain.from_iterable(target_window)))

Pad sequences

In [None]:
# Pad at the end ("post") of every sequence. Inputs and targets are padded
# independently of each other.
_pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
padded_input_text = _pad_sequences(input_text, padding="post")
padded_target_text = _pad_sequences(target_text, padding="post")

In [None]:
# A fixed seed makes the partition reproducible, so that re-running the
# notebook (for example to resume training from a checkpoint) trains on the
# same examples and keeps the held-out set held out.
input_train, input_test, target_train, target_test = train_test_split(
    padded_input_text, padded_target_text, random_state=42
)

## 2. The Transformer model


In [None]:
def make_dataset(input_train, target_train, batch_size=32):
    """Build a shuffled, batched ``tf.data.Dataset`` of (input, target) pairs.

    Args:
        input_train: encoder sequences, sliced along their first dimension.
        target_train: decoder sequences aligned with ``input_train``.
        batch_size: number of pairs per batch; an incomplete final batch is
            dropped.

    Returns:
        The ``tf.data.Dataset`` yielding ``(input_batch, target_batch)``.
    """
    pairs = tf.data.Dataset.from_tensor_slices((input_train, target_train))
    # The shuffle buffer is as large as the number of input examples.
    shuffled = pairs.shuffle(len(input_train))
    return shuffled.batch(batch_size, drop_remainder=True)

In [None]:
# Shuffled and batched training set, using make_dataset's default batch size.
dataset = make_dataset(input_train, target_train)

In [None]:
def make_transformer_model(
    config,
    input_vocab_size,
    target_vocab_size,
    checkpoint_save_path=None,
    pe_input=1000,
    pe_target=1000,
    rate=0.1,
):
    """Create a ``Transformer`` together with the trainer that wraps it.

    Args:
        config: mapping providing ``"num_layers"``, ``"d_model"``,
            ``"num_heads"`` and ``"dff"``.
        input_vocab_size: forwarded to ``Transformer`` as ``input_vocab_size``.
        target_vocab_size: forwarded to ``Transformer`` as ``target_vocab_size``.
        checkpoint_save_path: forwarded to ``TransformerTrainer``; ``None`` by
            default.
        pe_input: forwarded to ``Transformer`` as ``pe_input`` (presumably the
            maximum encoder positional-encoding length; confirm against
            ``deepcomedy.models.transformer``).
        pe_target: forwarded to ``Transformer`` as ``pe_target`` (presumably
            the decoder counterpart of ``pe_input``).
        rate: forwarded to ``Transformer`` as ``rate`` (presumably the dropout
            rate).

    Returns:
        A ``(transformer, transformer_trainer)`` tuple.

    Raises:
        KeyError: if ``config`` lacks one of the required keys.
    """
    transformer = Transformer(
        num_layers=config["num_layers"],
        d_model=config["d_model"],
        num_heads=config["num_heads"],
        dff=config["dff"],
        input_vocab_size=input_vocab_size,
        target_vocab_size=target_vocab_size,
        pe_input=pe_input,
        pe_target=pe_target,
        rate=rate,
    )
    transformer_trainer = TransformerTrainer(
        transformer, checkpoint_save_path=checkpoint_save_path
    )

    return transformer, transformer_trainer

In [None]:
# Hyper-parameters read by make_transformer_model.
config = dict(num_layers=6, d_model=256, num_heads=8, dff=1024)

# Directory handed to the trainer for its checkpoints.
checkpoint_save_path = "./checkpoints/char-level_gen"

In [None]:
# Inputs and targets were encoded with the same tokenizer, so one vocabulary
# size is passed for both the input and the target side.
transformer, transformer_trainer = make_transformer_model(config, vocab_size, vocab_size, checkpoint_save_path= checkpoint_save_path)

## 3. Training

In [None]:
# NOTE(review): the second argument is presumably the number of epochs -
# confirm against TransformerTrainer.train.
transformer_trainer.train(dataset, 50)

Epoch 1 Batch 0 Loss 5.3591 Accuracy 0.0009
Epoch 1 Batch 50 Loss 4.1508 Accuracy 0.1494
Epoch 1 Batch 100 Loss 3.6142 Accuracy 0.1795
Epoch 1 Loss 3.5534 Accuracy 0.1830
Time taken for 1 epoch: 153.76 secs

Epoch 2 Batch 0 Loss 2.9743 Accuracy 0.2160
Epoch 2 Batch 50 Loss 2.8866 Accuracy 0.2321
Epoch 2 Batch 100 Loss 2.6464 Accuracy 0.2682
Epoch 2 Loss 2.6055 Accuracy 0.2743
Time taken for 1 epoch: 134.99 secs

Epoch 3 Batch 0 Loss 2.1914 Accuracy 0.3351
Epoch 3 Batch 50 Loss 2.1387 Accuracy 0.3418
Epoch 3 Batch 100 Loss 2.0983 Accuracy 0.3467
Epoch 3 Loss 2.0907 Accuracy 0.3477
Time taken for 1 epoch: 134.40 secs

Epoch 4 Batch 0 Loss 2.0155 Accuracy 0.3615
Epoch 4 Batch 50 Loss 2.0054 Accuracy 0.3582
Epoch 4 Batch 100 Loss 1.9949 Accuracy 0.3597
Epoch 4 Loss 1.9920 Accuracy 0.3602
Time taken for 1 epoch: 134.51 secs

Epoch 5 Batch 0 Loss 1.9939 Accuracy 0.3549
Epoch 5 Batch 50 Loss 1.9572 Accuracy 0.3673
Epoch 5 Batch 100 Loss 1.9522 Accuracy 0.3685
Saving checkpoint for epoch 5 at 

KeyboardInterrupt: ignored

## 4. Generation

In [None]:
def generate_greedy(encoder_input, decoder_input, max_length=200):
    """Generate tokens one at a time, always taking the most likely one.

    Uses the notebook-level ``transformer``, ``tokenizer`` and
    ``create_masks``. Generation stops as soon as ``<EOT>`` is produced or
    after ``max_length`` tokens, whichever comes first.

    Args:
        encoder_input: 1-D sequence of token ids fed to the encoder.
        decoder_input: 1-D sequence of token ids used as the decoder prefix.
        max_length: maximum number of tokens to generate.

    Returns:
        A ``(result, tokenized_result)`` tuple. ``result`` is the string
        ``"<GO> "`` followed by the newly generated tokens, each followed by a
        space; ``tokenized_result`` is the ``<GO>`` id followed by the newly
        generated ids. Tokens of ``decoder_input`` are not echoed in either.
        The tuple is returned even if no ``<EOT>`` was produced, so callers
        can always unpack it.
    """
    go_id = tokenizer.word_index["<GO>"]
    eot_id = tokenizer.word_index["<EOT>"]

    # Add the batch dimension; the decoder sequence is kept as int32 so the
    # predicted ids can be concatenated onto it.
    encoder_input = tf.expand_dims(encoder_input, 0)
    output = tf.cast(tf.expand_dims(decoder_input, 0), dtype=tf.int32)

    result = "<GO> "
    tokenized_result = [go_id]

    for _ in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        # The third positional argument (False) is presumably the training flag.
        predictions, _attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # Keep only the last position of the seq_len dimension.
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.argmax(predictions, axis=-1)

        # Append the prediction to the sequence fed to the decoder next step.
        output = tf.concat([output, tf.cast(predicted_id, dtype=tf.int32)], axis=-1)

        token_id = predicted_id.numpy()[0][0]
        result += tokenizer.index_word[token_id] + " "
        tokenized_result.append(token_id)

        if token_id == eot_id:
            break

    return result, tokenized_result

In [None]:
def generate_topk(encoder_input, decoder_input, k=5, temperature=0.5, max_length=200):
    """Generate tokens by sampling among the ``k`` highest-scoring candidates.

    Uses the notebook-level ``transformer``, ``tokenizer`` and
    ``create_masks``. Generation stops as soon as ``<EOT>`` is produced or
    after ``max_length`` tokens, whichever comes first.

    Args:
        encoder_input: 1-D sequence of token ids fed to the encoder.
        decoder_input: 1-D sequence of token ids used as the decoder prefix.
        k: number of top-scoring candidates to sample from at each step.
        temperature: divisor applied to the candidate scores before sampling;
            smaller values concentrate the sampling on the best candidates.
        max_length: maximum number of tokens to generate.

    Returns:
        A ``(result, tokenized_result)`` tuple. ``result`` is the string
        ``"<GO> "`` followed by the newly generated tokens, each followed by a
        space; ``tokenized_result`` is the ``<GO>`` id followed by the newly
        generated ids. Tokens of ``decoder_input`` are not echoed in either.
        The tuple is returned even if no ``<EOT>`` was produced, so callers
        can always unpack it.
    """
    go_id = tokenizer.word_index["<GO>"]
    eot_id = tokenizer.word_index["<EOT>"]

    # Add the batch dimension. The decoder sequence is cast to int32 (as in
    # generate_greedy) so that int64 prefixes can still be concatenated with
    # the int32 ids returned by top_k.
    encoder_input = tf.expand_dims(encoder_input, 0)
    output = tf.cast(tf.expand_dims(decoder_input, 0), dtype=tf.int32)

    result = "<GO> "
    tokenized_result = [go_id]

    for _ in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        # The third positional argument (False) is presumably the training flag.
        predictions, _attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # Scores of the last position only: (batch_size, vocab_size).
        last_scores = predictions[:, -1, :]
        top_scores, top_indices = tf.math.top_k(last_scores, k=k)  # (batch_size, k)

        # Sample once; the sample is a position inside the top-k list, which
        # is then mapped back to the vocabulary id.
        sampled = tf.random.categorical(top_scores / temperature, num_samples=1)
        choice = int(sampled.numpy()[0, 0])
        token_id = top_indices.numpy()[0, choice]

        # Append the sampled id to the sequence fed to the decoder next step.
        output = tf.concat(
            [output, tf.constant([[token_id]], dtype=tf.int32)], axis=-1
        )

        result += tokenizer.index_word[token_id] + " "
        tokenized_result.append(token_id)

        if token_id == eot_id:
            break

    return result, tokenized_result

In [None]:
def clean(x):
    """Remove the "|" markers from space-separated token text.

    The substitutions run in order: a marker followed by a space and a word
    character is removed together with that space; a marker preceded by a
    word character and a space is removed together with that space; any
    remaining marker is removed on its own; finally every run of spaces is
    collapsed into a single space.

    Args:
        x: the text to clean.

    Returns:
        The cleaned text.
    """
    substitutions = (
        (r'\| \b', ''),
        (r'\b \|', ''),
        (r'\|', ''),
        (r'[ ]+', ' '),
    )
    for pattern, replacement in substitutions:
        x = re.sub(pattern, replacement, x)
    return x

Feeding the encoder the last output

In [None]:
# Start both the encoder and the decoder from a single <GO> token.
encoder_input = [tokenizer.word_index["<GO>"]]
decoder_input = [tokenizer.word_index["<GO>"]]

generated_text, generated_tokenized = generate_greedy(encoder_input, decoder_input)
# Show the raw generation first, then the same text with the "|" markers removed.
print(generated_text)
print(clean(generated_text))

<GO> | e <SEP> | q u e l | l a <SEP> | c h e <SEP> | s i <SEP> | p i a n | t e <SEP> | d i | s c e r | n a | t a <GO> | d i <SEP> | q u e l | l a <SEP> | c h e <SEP> | p i ù <SEP> | d i <SEP> | l a | s c i a | t a <SEP> | s p e | r a <GO> | d i <SEP> | q u e l | l a <SEP> | c h e <SEP> | p i ù <SEP> | d i <SEP> | l u | c i <SEP> | s i <SEP> | p i a | t a . <EOT> 
<GO> e <SEP> q u e l l a <SEP> c h e <SEP> s i <SEP> p i a n t e <SEP> d i s c e r n a t a <GO> d i <SEP> q u e l l a <SEP> c h e <SEP> p i ù <SEP> d i <SEP> l a s c i a t a <SEP> s p e r a <GO> d i <SEP> q u e l l a <SEP> c h e <SEP> p i ù <SEP> d i <SEP> l u c i <SEP> s i <SEP> p i a t a . <EOT> 


In [None]:
# Re-encode the cleaned generation (without "|" markers) so that it can be
# used as encoder input in the next step.
tokenized_generated = tokenizer.texts_to_sequences([clean(generated_text)])
print(tokenized_generated)

[14, 3, 2, 26, 15, 3, 9, 9, 4, 2, 12, 22, 3, 2, 11, 5, 2, 17, 5, 4, 7, 10, 3, 2, 13, 5, 11, 12, 3, 8, 7, 4, 10, 4, 14, 13, 5, 2, 26, 15, 3, 9, 9, 4, 2, 12, 22, 3, 2, 17, 5, 31, 2, 13, 5, 2, 9, 4, 11, 12, 5, 4, 10, 4, 2, 11, 17, 3, 8, 4, 14, 13, 5, 2, 26, 15, 3, 9, 9, 4, 2, 12, 22, 3, 2, 17, 5, 31, 2, 13, 5, 2, 9, 15, 12, 5, 2, 11, 5, 2, 17, 5, 4, 10, 4, 25, 24]


In [None]:
# The previous generation becomes the encoder input; the decoder starts again
# from the single <GO> token.
generated_text_2, _ = generate_greedy(tokenized_generated[0], decoder_input)
print(generated_text_2)
print(clean(generated_text_2))

<GO> | c h é <SEP> | l a <SEP> | p i ù <SEP> | d i | s t a n | t a <SEP> | p i ù <SEP> | d i | s t a n | t a <GO> | d i <SEP> | q u e l | l a <SEP> | c h e <SEP> | p i ù <SEP> | d i <SEP> | q u e l | l a <SEP> | c h e <SEP> | p i ù <SEP> | p i a n | t a <GO> | c h e <SEP> | p i ù <SEP> | d i <SEP> | s é <SEP> | l a <SEP> | p i a n | g e <SEP> | s i <SEP> | s p i | g a n | t a . <EOT> 
<GO> c h é <SEP> l a <SEP> p i ù <SEP> d i s t a n t a <SEP> p i ù <SEP> d i s t a n t a <GO> d i <SEP> q u e l l a <SEP> c h e <SEP> p i ù <SEP> d i <SEP> q u e l l a <SEP> c h e <SEP> p i ù <SEP> p i a n t a <GO> c h e <SEP> p i ù <SEP> d i <SEP> s é <SEP> l a <SEP> p i a n g e <SEP> s i <SEP> s p i g a n t a . <EOT> 


Feeding the decoder the last output

In [None]:
# The encoder keeps the single <GO> token; the decoder is primed with the ids
# of the first generation. Print the result of THIS call (generated_text_3),
# not the output of the previous experiment.
generated_text_3, _ = generate_greedy(encoder_input, generated_tokenized)
print(generated_text_3)
print(clean(generated_text_3))

<GO> | c h é <SEP> | l a <SEP> | p i ù <SEP> | d i | s t a n | t a <SEP> | p i ù <SEP> | d i | s t a n | t a <GO> | d i <SEP> | q u e l | l a <SEP> | c h e <SEP> | p i ù <SEP> | d i <SEP> | q u e l | l a <SEP> | c h e <SEP> | p i ù <SEP> | p i a n | t a <GO> | c h e <SEP> | p i ù <SEP> | d i <SEP> | s é <SEP> | l a <SEP> | p i a n | g e <SEP> | s i <SEP> | s p i | g a n | t a . <EOT> 
<GO> c h é <SEP> l a <SEP> p i ù <SEP> d i s t a n t a <SEP> p i ù <SEP> d i s t a n t a <GO> d i <SEP> q u e l l a <SEP> c h e <SEP> p i ù <SEP> d i <SEP> q u e l l a <SEP> c h e <SEP> p i ù <SEP> p i a n t a <GO> c h e <SEP> p i ù <SEP> d i <SEP> s é <SEP> l a <SEP> p i a n g e <SEP> s i <SEP> s p i g a n t a . <EOT> 


In [None]:
# Pack the checkpoints directory into a single gzip-compressed archive.
!tar zcvf checkpoints.tar.gz checkpoints

checkpoints/
checkpoints/char-level_gen/
checkpoints/char-level_gen/ckpt-3.index
checkpoints/char-level_gen/ckpt-2.index
checkpoints/char-level_gen/ckpt-1.index
checkpoints/char-level_gen/ckpt-2.data-00000-of-00001
checkpoints/char-level_gen/ckpt-1.data-00000-of-00001
checkpoints/char-level_gen/checkpoint
checkpoints/char-level_gen/ckpt-3.data-00000-of-00001
checkpoints/char-level_gen/ckpt-4.data-00000-of-00001
checkpoints/char-level_gen/ckpt-4.index


In [None]:
# Only when running on Google Colab: send the checkpoint archive to the browser.
if 'google.colab' in str(get_ipython()):
    # Imported here as well so this cell also works when the upload cell at the
    # top of the notebook has not been run in the current session.
    from google.colab import files

    files.download('checkpoints.tar.gz')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>