In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import load_verses

In [6]:
# TODO
if 'google.colab' in str(get_ipython()):
  print('Running on CoLab')
else:
  print('Not running on CoLab')

Running on CoLab


## 1. Data preprocessing

In [7]:
input_file = "data/divina_textonly.txt"
target_file = "data/divina_syll_textonly.txt"

The `load_verses` function loads the file, splits it into verses, prepends the start_symbol and appends the end_symbol to each verse, then pads each verse to the lenght of the longest verse so that the tensor can be fed to our model.

In [8]:
raw_input_text, _, _= load_verses(
    input_file, char_level=False, pad=False
)
raw_target_text, _, _ = load_verses(
    target_file, char_level=False, pad=False
)

In [9]:
def preprocess_target(x):
    x = re.sub(r'([,’.;«»:?!“”—‘\-"()])', r" \1 ", x)
    x = x.replace(' ', ' <SEP> ')
    x = x.replace('|', ' <SYL> ')
    x = x.replace('  ', ' ')
    x = x.strip()
    x = '<GO> ' + x
    return x

def preprocess_input(x):
    x = re.sub(r'([,’.;«»:?!“”—‘\-"()])', r" \1 ", x)
    x = x.replace('  ', ' ')
    x = x.strip()
    x = '<GO> ' + x
    return x

In [10]:
input_text = [verse.strip() for verse in raw_input_text.split('\n') if verse.strip() != '']
input_text = list(map(preprocess_input, input_text))

In [11]:
target_text = [verse.strip() for verse in raw_target_text.split('\n') if verse.strip() != '']
target_text = list(map(preprocess_target, target_text))

In [12]:
input_tercets = []
target_tercets = []

for line in range(len(input_text) - 6):
    input_tercets.append(' '.join(input_text[line:line+3]) + ' <EOT>')
    target_tercets.append(' '.join(target_text[line+3:line+6]) + ' <EOT>')

In [13]:
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters="", char_level=False, lower=False
)
input_tokenizer.fit_on_texts(input_tercets)
input_text = input_tokenizer.texts_to_sequences(input_tercets)

input_vocab = set(input_tokenizer.word_index.keys())
input_vocab_size = len(input_vocab) + 1

In [14]:
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters="", char_level=False, lower=False
)
target_tokenizer.fit_on_texts(target_tercets)
target_text = target_tokenizer.texts_to_sequences(target_tercets)

target_vocab = set(target_tokenizer.word_index.keys())
target_vocab_size = len(target_vocab) + 1

In [15]:
padded_input = tf.keras.preprocessing.sequence.pad_sequences(
    input_text, padding="post"
)
padded_target = tf.keras.preprocessing.sequence.pad_sequences(
    target_text, padding="post"
)

In [16]:
input_train, input_test, target_train, target_test = train_test_split(
    padded_input, padded_target
)

## 2. The Transformer model


In [17]:
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_train) // BATCH_SIZE

EPOCHS = 50

num_layers = 4
d_model = 256
dff = 1024
num_heads = 8
dropout_rate = 0.1

max_length_targ, max_length_inp = target_train.shape[1], input_train.shape[1]

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(
    BUFFER_SIZE
)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [18]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate,
)

## 3. Training

In [19]:
checkpoint_path = "./checkpoints/word-syll-gen"

transformer_trainer = TransformerTrainer(
    transformer, checkpoint_save_path=checkpoint_path
)

In [20]:
transformer_trainer.train(dataset, EPOCHS)

Epoch 1 Batch 0 Loss 7.9046 Accuracy 0.0000
Epoch 1 Batch 50 Loss 6.9916 Accuracy 0.1424
Epoch 1 Batch 100 Loss 6.0812 Accuracy 0.2255
Epoch 1 Batch 150 Loss 5.5818 Accuracy 0.2563
Epoch 1 Loss 5.4629 Accuracy 0.2630
Time taken for 1 epoch: 91.69 secs

Epoch 2 Batch 0 Loss 4.1807 Accuracy 0.3384
Epoch 2 Batch 50 Loss 3.9072 Accuracy 0.4051
Epoch 2 Batch 100 Loss 3.5955 Accuracy 0.4377
Epoch 2 Batch 150 Loss 3.3668 Accuracy 0.4530
Epoch 2 Loss 3.3121 Accuracy 0.4563
Time taken for 1 epoch: 79.03 secs

Epoch 3 Batch 0 Loss 2.7288 Accuracy 0.4849
Epoch 3 Batch 50 Loss 2.6522 Accuracy 0.4931
Epoch 3 Batch 100 Loss 2.5942 Accuracy 0.4984
Epoch 3 Batch 150 Loss 2.5416 Accuracy 0.5073
Epoch 3 Loss 2.5269 Accuracy 0.5098
Time taken for 1 epoch: 78.64 secs

Epoch 4 Batch 0 Loss 2.3562 Accuracy 0.5393
Epoch 4 Batch 50 Loss 2.3385 Accuracy 0.5428
Epoch 4 Batch 100 Loss 2.3177 Accuracy 0.5472
Epoch 4 Batch 150 Loss 2.2993 Accuracy 0.5504
Epoch 4 Loss 2.2936 Accuracy 0.5513
Time taken for 1 epoch: 

## 4. Generation

TODO change this :)

We define the *evaluate* function to preprocess the sentence in input to the encoder and to get the predicted ids of the translation.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We begin feeding the decoder with the id of the start symbol and, at each new step, we pass to the decoder the sequence it has just thrown out.

The translation stops when the end symbol is reached.

In [26]:
def generate_greedy(encoder_input, decoder_input):

    # encoder_input = tf.convert_to_tensor(encoder_input)
    encoder_input = tf.expand_dims(encoder_input, 0)

    # decoder_input = tf.convert_to_tensor(decoder_input)
    output = tf.expand_dims(decoder_input, 0)
    result = ""

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += target_tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == target_tokenizer.word_index["<EOT>"]:
            return result

    # output.shape (1, tokens)

In [38]:
def postprocess(x):
    x = x.replace('<SEP>', ' ')
    x = x.replace('<SYL>', '|')
    x = x.replace(' <GO> ', '\n')
    return x

In [32]:
encoder_input = [input_tokenizer.word_index["<GO>"]]
decoder_input = [target_tokenizer.word_index["<GO>"]]

generated_text = generate_greedy(encoder_input, decoder_input)
print(generated_text)

<SYL> e <SEP> <SYL> nel <SEP> <SYL> suo <SEP> <SYL> lu <SYL> me <SEP> <SYL> die <SYL> tro <SEP> a <SEP> <SYL> le <SEP> <SYL> ser <SYL> pi <SYL> ne <SEP> , <SEP> <GO> <SYL> di <SEP> <SYL> quel <SYL> la <SEP> <SYL> fie <SYL> ra <SEP> <SYL> che <SEP> <SYL> to <SYL> sto <SEP> <SYL> s <SEP> ’ <SEP> <SEP> a <SYL> spet <SYL> ta <SEP> ; <SEP> <GO> <SYL> e <SEP> <SYL> io <SEP> , <SEP> <SEP> <SYL> se <SEP> <SYL> non <SEP> <SYL> fos <SYL> si <SEP> <SYL> pur <SEP> <SYL> a <SEP> <SYL> sé <SEP> <SYL> no <SYL> ta <SEP> . <SEP> <EOT> 


In [39]:
print(postprocess(generated_text))

| e   | nel   | suo   | lu | me   | die | tro   a   | le   | ser | pi | ne   ,  
| di   | quel | la   | fie | ra   | che   | to | sto   | s   ’     a | spet | ta   ;  
| e   | io   ,     | se   | non   | fos | si   | pur   | a   | sé   | no | ta   .   <EOT> 


In [49]:
def generate_topk(encoder_input, decoder_input, k=5, temperature=0.5):

    encoder_input = tf.expand_dims(encoder_input, 0)

    output = tf.expand_dims(decoder_input, 0)

    result = ""

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predictions, indices = tf.math.top_k(predictions, k=k)

        predictions /= temperature
        predictions = np.squeeze(predictions, axis=0)
        indices = np.squeeze(indices, axis=0)
        indices = np.squeeze(indices, axis=0)
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy() #qui potrebbe anche essere [0,0]
        predicted_id = indices[predicted_id]

        predicted_id = tf.expand_dims(predicted_id, 0)
        predicted_id = tf.expand_dims(predicted_id, 0)
        output = tf.concat([output, predicted_id], axis=-1)

        result += target_tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == target_tokenizer.word_index["<EOT>"]:
            return result

    # output.shape (1, tokens)

In [50]:
encoder_input = [input_tokenizer.word_index["<GO>"]]
decoder_input = [target_tokenizer.word_index["<GO>"]]

generated_text = generate_topk(encoder_input, decoder_input)
print(generated_text)

<SYL> e <SEP> <SYL> poi <SEP> <SYL> che <SEP> <SYL> di <SEP> <SYL> là <SEP> <SYL> mi <SEP> <SYL> ven <SYL> det <SYL> ta <SEP> <SYL> pun <SYL> to <SEP> , <SEP> <GO> <SYL> con <SYL> vien <SEP> <SYL> ch <SEP> ’ <SEP> <SEP> i <SEP> ’ <SEP> <SEP> <SYL> cre <SYL> do <SEP> <SYL> che <SEP> <SYL> l <SEP> ’ <SEP> <SEP> al <SYL> to <SEP> <SYL> mon <SYL> do <SEP> , <SEP> <GO> <SYL> con <SEP> <SYL> tut <SYL> te <SEP> <SYL> le <SEP> <SYL> co <SYL> se <SEP> <SYL> che <SEP> <SYL> qui <SEP> <SYL> si <SEP> <SYL> mon <SYL> da <SEP> . <SEP> <EOT> 


In [51]:
print(postprocess(generated_text))

| e   | poi   | che   | di   | là   | mi   | ven | det | ta   | pun | to   ,  
| con | vien   | ch   ’     i   ’     | cre | do   | che   | l   ’     al | to   | mon | do   ,  
| con   | tut | te   | le   | co | se   | che   | qui   | si   | mon | da   .   <EOT> 
