In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import io
import os
import re
import time
import unicodedata
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing

from deepcomedy.models.transformer import *
from deepcomedy.preprocessing import load_verses

In [4]:
# TODO
if 'google.colab' in str(get_ipython()):
  print('Running on CoLab')
else:
  print('Not running on CoLab')

Running on CoLab


## 1. Data preprocessing

In [5]:
input_file = "data/divina_textonly.txt"
target_file = "data/divina_syll_textonly.txt"

The `load_verses` function loads the file, splits it into verses, prepends the start_symbol and appends the end_symbol to each verse, then pads each verse to the lenght of the longest verse so that the tensor can be fed to our model.

In [6]:
raw_input_text, _, _= load_verses(
    input_file, char_level=False, pad=False
)
raw_target_text, _, _ = load_verses(
    target_file, char_level=False, pad=False
)

In [7]:
def preprocess_target(x):
    x = re.sub(r'([,’.;«»:?!“”—‘\-"()])', r" \1 ", x)
    x = x.replace(' ', ' <SEP> ')
    x = x.replace('|', ' <SYL> ')
    x = x.replace('  ', ' ')
    x = x.strip()
    x = '<GO> ' + x
    return x

def preprocess_input(x):
    x = re.sub(r'([,’.;«»:?!“”—‘\-"()])', r" \1 ", x)
    x = x.replace('  ', ' ')
    x = x.strip()
    x = '<GO> ' + x
    return x

In [8]:
input_text = [verse.strip() for verse in raw_input_text.split('\n') if verse.strip() != '']
input_text = list(map(preprocess_input, input_text))

In [9]:
target_text = [verse.strip() for verse in raw_target_text.split('\n') if verse.strip() != '']
target_text = list(map(preprocess_target, target_text))

In [10]:
input_tercets = []
target_tercets = []

for line in range(len(input_text) - 6):
    input_tercets.append(' '.join(input_text[line:line+3]) + ' <EOT>')
    target_tercets.append(' '.join(target_text[line+3:line+6]) + ' <EOT>')

In [11]:
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters="", char_level=False, lower=False
)
input_tokenizer.fit_on_texts(input_tercets)
input_text = input_tokenizer.texts_to_sequences(input_tercets)

input_vocab = set(input_tokenizer.word_index.keys())
input_vocab_size = len(input_vocab) + 1

In [12]:
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters="", char_level=False, lower=False
)
target_tokenizer.fit_on_texts(target_tercets)
target_text = target_tokenizer.texts_to_sequences(target_tercets)

target_vocab = set(target_tokenizer.word_index.keys())
target_vocab_size = len(target_vocab) + 1

In [13]:
padded_input = tf.keras.preprocessing.sequence.pad_sequences(
    input_text, padding="post"
)
padded_target = tf.keras.preprocessing.sequence.pad_sequences(
    target_text, padding="post"
)

In [14]:
input_train, input_test, target_train, target_test = train_test_split(
    padded_input, padded_target
)

## 2. The Transformer model


In [15]:
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_train) // BATCH_SIZE

EPOCHS = 50

num_layers = 4
d_model = 256
dff = 1024
num_heads = 8
dropout_rate = 0.1

max_length_targ, max_length_inp = target_train.shape[1], input_train.shape[1]

dataset = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(
    BUFFER_SIZE
)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [16]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate,
)

## 3. Training

In [17]:
checkpoint_path = "./checkpoints/word-syll-gen"

transformer_trainer = TransformerTrainer(
    transformer, checkpoint_save_path=checkpoint_path
)

In [111]:
transformer_trainer.train(dataset, EPOCHS)

Epoch 1 Batch 0 Loss 0.0858 Accuracy 0.9725
Epoch 1 Batch 50 Loss 0.0840 Accuracy 0.9726
Epoch 1 Batch 100 Loss 0.0870 Accuracy 0.9715
Epoch 1 Batch 150 Loss 0.0897 Accuracy 0.9706
Epoch 1 Loss 0.0904 Accuracy 0.9704
Time taken for 1 epoch: 36.26 secs

Epoch 2 Batch 0 Loss 0.0925 Accuracy 0.9688
Epoch 2 Batch 50 Loss 0.0838 Accuracy 0.9722
Epoch 2 Batch 100 Loss 0.0859 Accuracy 0.9717
Epoch 2 Batch 150 Loss 0.0870 Accuracy 0.9713
Epoch 2 Loss 0.0877 Accuracy 0.9712
Time taken for 1 epoch: 36.05 secs

Epoch 3 Batch 0 Loss 0.0946 Accuracy 0.9698
Epoch 3 Batch 50 Loss 0.0831 Accuracy 0.9729
Epoch 3 Batch 100 Loss 0.0841 Accuracy 0.9724
Epoch 3 Batch 150 Loss 0.0859 Accuracy 0.9719
Epoch 3 Loss 0.0867 Accuracy 0.9716
Time taken for 1 epoch: 35.82 secs

Epoch 4 Batch 0 Loss 0.0849 Accuracy 0.9704
Epoch 4 Batch 50 Loss 0.0826 Accuracy 0.9732
Epoch 4 Batch 100 Loss 0.0842 Accuracy 0.9726
Epoch 4 Batch 150 Loss 0.0859 Accuracy 0.9721
Epoch 4 Loss 0.0864 Accuracy 0.9719
Time taken for 1 epoch: 

## 4. Generation

TODO change this :)

We define the *evaluate* function to preprocess the sentence in input to the encoder and to get the predicted ids of the translation.

The ids of the translation are obtained by applying *argmax* to the predicted logits of the decoder.

We begin feeding the decoder with the id of the start symbol and, at each new step, we pass to the decoder the sequence it has just thrown out.

The translation stops when the end symbol is reached.

In [112]:
def generate_greedy(encoder_input, decoder_input):

    encoder_input = tf.expand_dims(encoder_input, 0)

    output = tf.expand_dims(decoder_input, 0)
    result = ""

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.argmax(predictions, axis=-1)

        # concatenate the predicted_id to the output which is given to the decoder as its input.
        output = tf.concat(
            [tf.cast(output, dtype=tf.int32), tf.cast(predicted_id, dtype=tf.int32)],
            axis=-1,
        )
        result += target_tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == target_tokenizer.word_index["<EOT>"]:
            return result

    # output.shape (1, tokens)

In [56]:
def postprocess(x):
    x = x.replace('<SEP>', ' ')
    x = x.replace(' <SYL> ', '|')
    x = x.replace('<SYL> ', '|')
    x = x.replace(' <SYL>', '|')
    x = x.replace('<GO>', '\n')
    x = x.replace('<EOT>', '\n\n')
    x = x.replace('  ', ' ')
    x = x.strip()
    return x

In [40]:
def clean_encoder(x):
    x = x.replace('<SEP>', ' ')
    x = x.replace(' <SYL> ', '')
    x = x.replace('<SYL> ', '')
    x = x.replace(' <SYL>', '')
    x = x.replace('<GO>', ' <GO> ')
    x = x.replace('  ', ' ')
    x = x.strip()
    return x

In [45]:
def clean_decoder(x):
    x = x.replace('<GO>', ' <GO> ')
    x = x.replace('  ', ' ')
    x = x.strip()
    return x

In [113]:
encoder_input = [input_tokenizer.word_index["<GO>"]]
decoder_input = [target_tokenizer.word_index["<GO>"]]

generated_text = generate_greedy(encoder_input, decoder_input)
print(generated_text)

<SYL> e <SEP> <SYL> dis <SYL> se <SEP> : <SEP> <SEP> « <SEP> <SEP> <SYL> Che <SEP> <SYL> hai <SEP> ? <SEP> <SEP> <SYL> sì <SEP> <SYL> che <SEP> , <SEP> <SEP> <SYL> se <SEP> <SYL> tu <SEP> <SYL> m <SEP> ’ <SEP> <SEP> ac <SYL> cor <SYL> si <GO> <SYL> ne <SEP> <SYL> la <SEP> <SYL> men <SYL> te <SEP> <SYL> sua <SEP> <SYL> se <SYL> men <SYL> za <SEP> <SYL> non <SEP> <SYL> con <SYL> du <SYL> ce <SEP> , <SEP> <GO> <SYL> ma <SEP> <SYL> fia <SEP> <SYL> d <SEP> ’ <SEP> <SEP> un <SEP> <SYL> al <SYL> tro <SEP> <SYL> cir <SYL> cun <SYL> scrit <SYL> to <SEP> , <SEP> <SEP> <SYL> se <SYL> gno <SEP> . <SEP> <EOT> 


In [114]:
print(postprocess(generated_text))

|e |dis|se  :   «  |Che |hai  ?  |sì |che  ,  |se |tu |m  ’   ac|cor|si 
|ne |la |men|te |sua |se|men|za |non |con|du|ce  ,  
|ma |fia |d  ’   un |al|tro |cir|cun|scrit|to  ,  |se|gno  .


In [115]:
print(clean_encoder(generated_text))

e disse  :   «  Che hai  ?  sì che  ,  se tu m  ’   accorsi <GO> ne la mente sua semenza non conduce  ,  <GO> ma fia d  ’   un altro circunscritto  ,  segno  .  <EOT>


In [116]:
print(clean_decoder(generated_text))

<SYL> e <SEP> <SYL> dis <SYL> se <SEP> : <SEP> <SEP> « <SEP> <SEP> <SYL> Che <SEP> <SYL> hai <SEP> ? <SEP> <SEP> <SYL> sì <SEP> <SYL> che <SEP> , <SEP> <SEP> <SYL> se <SEP> <SYL> tu <SEP> <SYL> m <SEP> ’ <SEP> <SEP> ac <SYL> cor <SYL> si <GO> <SYL> ne <SEP> <SYL> la <SEP> <SYL> men <SYL> te <SEP> <SYL> sua <SEP> <SYL> se <SYL> men <SYL> za <SEP> <SYL> non <SEP> <SYL> con <SYL> du <SYL> ce <SEP> , <SEP> <GO> <SYL> ma <SEP> <SYL> fia <SEP> <SYL> d <SEP> ’ <SEP> <SEP> un <SEP> <SYL> al <SYL> tro <SEP> <SYL> cir <SYL> cun <SYL> scrit <SYL> to <SEP> , <SEP> <SEP> <SYL> se <SYL> gno <SEP> . <SEP> <EOT>


In [117]:
tokenized_enc = [input_tokenizer.texts_to_sequences(clean_encoder(generated_text))]
tokenized_enc = tokenized_enc[0]
tokenized_enc = list(chain.from_iterable(tokenized_enc))
generated_text_1 = generate_greedy(tokenized_enc, decoder_input)
print(generated_text_1)
print(postprocess(generated_text_1))

<SYL> di <SEP> <SYL> quel <SEP> <SYL> che <SEP> <SYL> cre <SYL> di <SEP> <SYL> set <SYL> ta <SEP> il <SEP> <SYL> cer <SYL> chio <SEP> <SYL> pri <SYL> ma <GO> <SYL> il <SEP> <SYL> qual <SEP> <SYL> tu <SEP> <SYL> se <SEP> ’ <SEP> <SEP> <SYL> di <SEP> <SYL> quel <SEP> <SYL> che <SEP> <SYL> ti <SEP> <SYL> ri <SYL> ce <SYL> ve <SEP> » <SEP> <SEP> . <SEP> <GO> <SYL> Co <SYL> sì <SEP> <SYL> par <SYL> lar <SEP> , <SEP> <SEP> <SYL> co <SYL> me <SEP> il <SEP> <SYL> be <SYL> ne <SEP> <SYL> det <SYL> to <EOT> 
|di |quel |che |cre|di |set|ta  il |cer|chio |pri|ma 
|il |qual |tu |se  ’  |di |quel |che |ti |ri|ce|ve  »   .  
|Co|sì |par|lar  ,  |co|me  il |be|ne |det|to


In [104]:
def last_2_decoder(x):
  x = x.split('<GO>')[1:]
  x = list(map(lambda x: x.strip(), x))
  x = list(map(lambda x: x + '<GO>' , x))
  x = ''.join(x)
  x = target_tokenizer.texts_to_sequences([x])[0]
  return x

In [118]:
tokenized_dec = last_2_decoder(generated_text)
generated_text_2 = generate_greedy(encoder_input, tokenized_dec)
print(generated_text_2)
print(postprocess(generated_text_2))

<GO> <SYL> Que <SYL> sti <SEP> <SYL> fuor <SEP> <SYL> Lu <SYL> ci <SEP> , <SEP> <SEP> <SYL> ri <SYL> ma <SYL> se <SEP> e <SEP> <SYL> non <SEP> <SYL> spe <SYL> sa <EOT> 
|Que|sti |fuor |Lu|ci  ,  |ri|ma|se  e |non |spe|sa


In [119]:
generated_text_3 = generate_greedy(tokenized_enc, tokenized_dec)
print(generated_text_3)
print(postprocess(generated_text_3))

<GO> <SYL> Que <SYL> sti <SEP> <SYL> che <SEP> <SYL> la <SEP> <SYL> Chie <SYL> sa <SEP> , <SEP> <SEP> <SYL> lui <SEP> <SYL> tra <SEP> <SYL> quel <SEP> <SYL> cer <SYL> to <EOT> 
|Que|sti |che |la |Chie|sa  ,  |lui |tra |quel |cer|to


In [122]:
def generate_topk(encoder_input, decoder_input, k=5, temperature=0.5):

    encoder_input = tf.expand_dims(encoder_input, 0)

    output = tf.expand_dims(decoder_input, 0)

    result = ""

    for i in range(200):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output
        )

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # select the last character from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predictions, indices = tf.math.top_k(predictions, k=k)

        predictions /= temperature
        predictions = np.squeeze(predictions, axis=0)
        indices = np.squeeze(indices, axis=0)
        indices = np.squeeze(indices, axis=0)
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy() #qui potrebbe anche essere [0,0]
        predicted_id = indices[predicted_id]

        predicted_id = tf.expand_dims(predicted_id, 0)
        predicted_id = tf.expand_dims(predicted_id, 0)
        output = tf.concat([output, predicted_id], axis=-1)

        result += target_tokenizer.index_word[predicted_id.numpy()[0][0]] + " "

        # return the result if the predicted_id is equal to the end token
        if predicted_id == target_tokenizer.word_index["<EOT>"]:
            return result

    # output.shape (1, tokens)

In [123]:
encoder_input = [input_tokenizer.word_index["<GO>"]]
decoder_input = [target_tokenizer.word_index["<GO>"]]

generated_text = generate_topk(encoder_input, decoder_input)
print(generated_text)

<SYL> e <SEP> <SYL> dis <SYL> se <SEP> : <SEP> <SEP> « <SEP> <SEP> <SYL> Che <SEP> <SYL> hai <SEP> ? <SEP> <SEP> <SYL> sì <SEP> <SYL> che <SEP> , <SEP> <SEP> <SYL> se <SEP> <SYL> non <SEP> <SYL> con <SYL> te <GO> <SYL> da <SEP> <SYL> l <SEP> ’ <SEP> <SEP> al <SYL> tra <SEP> <SYL> par <SYL> te <SEP> <SYL> vi <SYL> di <SEP> , <SEP> <SEP> <SYL> si <SEP> <SYL> ri <SYL> tras <SYL> se <GO> <SYL> lo <SEP> <SYL> bel <SYL> lo <SEP> <SYL> d <SEP> ’ <SEP> <SEP> o <SYL> gne <SEP> <SYL> par <SYL> te <SEP> <SYL> si <SEP> <SYL> con <SYL> fes <SYL> sa <SEP> ; <SEP> <EOT> 


In [124]:
print(postprocess(generated_text))

|e |dis|se  :   «  |Che |hai  ?  |sì |che  ,  |se |non |con|te 
|da |l  ’   al|tra |par|te |vi|di  ,  |si |ri|tras|se 
|lo |bel|lo |d  ’   o|gne |par|te |si |con|fes|sa  ;
