In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import load_verses

In [None]:
# TODO
if 'google.colab' in str(get_ipython()):
  print('Running on CoLab')
else:
  print('Not running on CoLab')

## 1. Data preprocessing

In [3]:
input_file = "data/divina_textonly.txt"
target_file = "data/divina_syll_textonly.txt"

The `load_verses` function loads the file, splits it into verses, prepends the start_symbol and appends the end_symbol to each verse, then pads each verse to the lenght of the longest verse so that the tensor can be fed to our model.

In [4]:
raw_input_text, input_text, input_tokenizer = load_verses(
    input_file, char_level=False, pad=False
)
raw_target_text, target_text, target_tokenizer = load_verses(
    target_file, char_level=True, pad=False
)

In [5]:
print("Length of input text: {} characters".format(len(raw_input_text)))
print("Length of target text: {} characters".format(len(raw_target_text)))

Length of input text: 558637 characters
Length of target text: 873431 characters


In [6]:
input_vocab = sorted(set(input_tokenizer.word_index.keys()))
target_vocab = sorted(set(target_tokenizer.word_index.keys()))

# + 1 to account for padding token "0"
input_vocab_size = len(input_vocab) + 1
target_vocab_size = len(target_vocab) + 1

In [7]:
print("Input vocab size: {}".format(input_vocab_size))
print("Target vocab size: {}".format(target_vocab_size))

Input vocab size: 20749
Target vocab size: 81


In [8]:
input_tercets = []
target_tercets = []

for line in range(len(input_text) - 6):
    input_tercets.append(list(chain(*input_text[line : line + 3])))
    target_tercets.append(list(chain(*target_text[line + 3 : line + 6])))

In [9]:
padded_input = tf.keras.preprocessing.sequence.pad_sequences(
    input_tercets, padding="post"
)
padded_target = tf.keras.preprocessing.sequence.pad_sequences(
    target_tercets, padding="post"
)

In [10]:
input_train, input_test, target_train, target_test = train_test_split(
    padded_input, padded_target
)

## 2. The Transformer model


In [12]:
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_train) // BATCH_SIZE

EPOCHS = 50

num_layers = 4
d_model = 256
dff = 1024
num_heads = 8
dropout_rate = 0.1

max_length_targ, max_length_inp = target_train.shape[1], input_train.shape[1]

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(
    BUFFER_SIZE
)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [13]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate,
)

## 3. Training

In [14]:
checkpoint_path = "./checkpoints/word-level-gen"

transformer_trainer = TransformerTrainer(
    transformer, checkpoint_save_path=checkpoint_path
)

In [None]:
transformer_trainer.train(dataset, EPOCHS)

## 4. Generation

TODO change this :)

We define the *evaluate* function to preprocess the sentence in input to the encoder and to get the predicted ids of the translation.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We begin feeding the decoder with the id of the start symbol and, at each new step, we pass to the decoder the sequence it has just thrown out.

The translation stops when the end symbol is reached.

In [54]:
def translate(sentence, max_length=200):

    encoder_input = [tokenizer.word_index[i] for i in list(map(str, sentence))]
    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(
        [encoder_input], maxlen=max_length, padding="post"
    )
    encoder_input = tf.convert_to_tensor(encoder_input)

    output = tf.convert_to_tensor([tokenizer.word_index["^"]])
    output = tf.expand_dims(output, 0)
    result = ""

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer.word_index["$"]:

            break

    # output.shape (1, tokens)

    return result

In [83]:
def print_translation(sentence, result, ground_truth):
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Prediction":15s}: {result}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [84]:
sentence = "^E come l’aere, quand’ è ben pïorno,$"
ground_truth = "|E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,"


translated_text = translate(sentence)
print_translation(sentence, translated_text, ground_truth)

Input:         : ^E come l’aere, quand’ è ben pïorno,$
Prediction     : | c h e   | l a   | m i a   | v i | s t a   | m i   | f é   | p a | r e a   | m a | l e $ 
Ground truth   : |E |co|me |l’ ae|re, |quan|d’ è |ben |pï|or|no,


In [94]:
def generate_greedy(encoder_input, decoder_input):

    # encoder_input = tf.convert_to_tensor(encoder_input)
    encoder_input = tf.expand_dims(encoder_input, 0)

    # decoder_input = tf.convert_to_tensor(decoder_input)
    output = tf.expand_dims(decoder_input, 0)
    result = ""

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer.word_index["$"]:
            result += "\n"
        if result.count("$") == 3:
            return result

    # output.shape (1, tokens)

In [95]:
encoder_input = [tokenizer.word_index["^"]]
decoder_input = [tokenizer.word_index["^"]]

generated_text = generate_greedy(encoder_input, decoder_input)
print(generated_text)

| c h e   | l a   | m i a   | v i | s t a   | m i   | f é   | p a | r e a   | m a | l e $ 
^ | c h e   | l ’   a | b i | t o   | d e   | l ’   a r | g o | m e n | t o   | s e | g n o . $ 
^ | E   | q u e l | l a   | c h e   | p r o | p r i e | n e   i l   | c a | l o r   | m i o $ 



In [114]:
def generate_topk(encoder_input, decoder_input, k=5, temperature=0.5):

    encoder_input = tf.expand_dims(encoder_input, 0)

    output = tf.expand_dims(decoder_input, 0)

    result = ""

    output_ = []

    terces = 0

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predictions, indices = tf.math.top_k(predictions, k=k)

        predictions /= temperature
        predictions = np.squeeze(predictions, axis=0)
        indices = np.squeeze(indices, axis=0)
        indices = np.squeeze(indices, axis=0)
        pred = tf.random.categorical(predictions, num_samples=1)
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        predicted_id = indices[predicted_id]

        predicted_id = tf.expand_dims(predicted_id, 0)
        predicted_id = tf.expand_dims(predicted_id, 0)
        output = tf.concat([output, predicted_id], axis=-1)

        result += tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer.word_index["$"]:
            result += "\n"

        if result.count("$") == 3:
            terces += 1

        if terces == 3:
            return result

    # output.shape (1, tokens)

In [113]:
encoder_input = [tokenizer.word_index["^"]]
decoder_input = [tokenizer.word_index["^"]]

generated_text = generate_topk(encoder_input, decoder_input)
print(generated_text)

| s o | v r a   | l e   | s t e l | l e   | s t e l | l e   | d e l   | s u o   | v e r | b o , $ 
^ | p e r   | c h e   | l a   | v i | v a   | l u | c e   | c o n | v i e n   | c a | r e . $ 
^ | E   | q u e | s t o   | s i   | f e | c e   | c o n | v i e n   | c h e   | v a n | n o $ 
^ ^ 


In [105]:
sentence = "ciao"
encoder_input = [tokenizer.word_index[i] for i in list(map(str, sentence))]
decoder_input = [tokenizer.word_index[i] for i in list(map(str, sentence))]

generated_text = generate_topk(encoder_input, decoder_input)
print(generated_text)

| v e   | g i à   | m a i   | n o n   | f u   | m a i   | n é   | r i | s t r e t | t a . $ 
^ | O r   | s a i   | t u   | d i e | t r o ,   e   | n o n   | t i   | p a r | l a | v a | r o : $ 
^ | p e r   | c h e   | l e   | s t e l | l e   | c h e   ’ n   | s u   | l a   | p r o | p r i a $ 
^ | 
