Generiamo del testo dando in input un dataset txt 


In [None]:
!pip install tensorflow
!pip install numpy


In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time


In [None]:
# Alternative corpus (disabled); swap it with the active line below to use it.
#path_to_file = tf.keras.utils.get_file("digital.txt", "https://raw.githubusercontent.com/carloocchiena/rnn_text_generation/main/dataset/digitransf.txt" )

# Fetch the training corpus and keep the local path returned by get_file.
path_to_file = tf.keras.utils.get_file("metal.txt", "https://raw.githubusercontent.com/carloocchiena/rnn_text_generation/main/dataset/metal.txt" )

# When refreshing the dataset, also clear the Keras datasets folder
# (e.g. C:\Users\Carlo\.keras\datasets) -- presumably because get_file would
# otherwise keep using the previously downloaded copy; confirm if in doubt.

In [None]:
# Read the corpus as raw bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open until garbage collection.
with open(path_to_file, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding="utf-8")

# Quick sanity check: total size and the first 250 characters.
print(f"Lenght of text: {len(text)} characthers")
print(text[:250])

In [None]:
# Distinct characters of the corpus, in sorted order.
vocab = sorted(set(text))
print (f"{len(vocab)} unique characters")

In [None]:
# Toy strings used to demonstrate the lookup layers; each string is split
# into its individual UTF-8 characters.
example_texts =["abcdefghijklmnopqrstuvw", "xyz"]
chars = tf.strings.unicode_split(example_texts, input_encoding="UTF-8")

In [None]:
# Lookup layer mapping characters to integer IDs, built from the corpus
# vocabulary with no mask token.
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab), mask_token=None)

In [None]:
# Convert the example characters to their integer IDs.
ids = ids_from_chars(chars)

In [None]:
# Inverse lookup layer (IDs -> characters). It reuses the vocabulary reported
# by ids_from_chars so both directions agree on every index.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None,
)

In [None]:
# Round trip: map the example IDs back to characters (rebinds `chars`).
chars  = chars_from_ids(ids)

In [None]:
# Join the characters back into strings along the last axis and display them.
tf.strings.reduce_join(chars, axis=-1).numpy()

In [None]:
def text_from_ids(ids):
    """Convert character IDs back into text.

    The IDs are looked up with the ``chars_from_ids`` layer and the resulting
    characters are joined along the last axis.

    Args:
        ids: Character IDs accepted by ``chars_from_ids``.

    Returns:
        The result of ``tf.strings.reduce_join`` over the looked-up characters.
    """
    characters = chars_from_ids(ids)
    return tf.strings.reduce_join(characters, axis=-1)

In [None]:
# Encode the whole corpus: split it into characters and map each to its ID.
all_ids = ids_from_chars(tf.strings.unicode_split(text, "UTF-8"))
# Display the encoded corpus.
all_ids

In [None]:
# Wrap the encoded corpus in a tf.data.Dataset sliced along its first axis.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [None]:
# Preview the first ten characters of the corpus. The loop variable is named
# `char_id` so it does not overwrite the notebook-level `ids` tensor that
# other cells read.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

In [None]:
# Number of characters in each training input.
seq_length = 100
# Number of whole (seq_length + 1)-character chunks that fit in the corpus.
example_per_epoch = len(text)//(seq_length+1)

In [None]:
# Group consecutive IDs into chunks of seq_length + 1; drop_remainder=True
# discards a final chunk that would be shorter.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk as individual characters.
for seq in sequences.take(1):
    print(chars_from_ids(seq))

In [None]:
# Show the first five chunks joined back into text.
for seq in sequences.take(5):
    print(text_from_ids(seq).numpy())

In [None]:
def split_input_target(sequence):
    """Split one chunk into an (input, target) training pair.

    The input is the chunk without its last element; the target is the chunk
    without its first element, i.e. the same sequence shifted by one position.

    Args:
        sequence: Any sliceable sequence.

    Returns:
        A tuple ``(sequence[:-1], sequence[1:])``.
    """
    return sequence[:-1], sequence[1:]

In [None]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [None]:
# Print the first (input, target) pair as text to check the one-step shift.
for input_example, target_example in dataset.take(1):
    print("Input: ", text_from_ids(input_example).numpy())
    print("Target: ",text_from_ids(target_example).numpy())

In [None]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64

# Size of the buffer used when shuffling the pairs.
BUFFER_SIZE = 10000

# Shuffle, batch (dropping a final incomplete batch) and prefetch, one
# pipeline stage per statement.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Display the prepared dataset object.
dataset

In [None]:
# Vocabulary size as seen by the lookup layer. This is the size the model is
# built with, so it is the value to compare exp(mean loss) against; using
# len(vocab) would ignore any extra entries the layer adds to the vocabulary.
vocab_size = len(ids_from_chars.get_vocabulary())
# Width of the character embedding vectors.
embedding_dim = 256
# Number of units in the GRU layer.
rnn_units = 1024

In [None]:
class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    The Dense layer has no activation, so the model outputs raw logits over
    the vocabulary for every input position.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct token IDs; used both as the
                embedding input size and as the number of output logits.
            embedding_dim: Size of each embedding vector.
            rnn_units: Number of units in the GRU layer.
        """
        # super() already binds the instance; passing `self` again would hand
        # the base initializer a stray positional argument.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences gives an output per position; return_state also
        # returns the final state so generation can carry it across calls.
        self.gru = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of token IDs.

        Args:
            inputs: Token IDs fed to the embedding layer
                (presumably shaped (batch, sequence) -- confirm with caller).
            states: Initial GRU state; when None the GRU's own initial state
                for this input is used.
            return_state: When True, also return the final GRU state.
            training: Forwarded to every layer.

        Returns:
            The logits, or a ``(logits, states)`` tuple when ``return_state``
            is True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [None]:
# Build the model; its vocabulary size is taken from the lookup layer's
# vocabulary so that every ID the layer can produce has a logit.
model = MyModel(vocab_size=len(ids_from_chars.get_vocabulary()), embedding_dim=embedding_dim, rnn_units=rnn_units)

In [None]:
# Run a single batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print (example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Sample one ID per position from the logits of the first sequence in the
# batch, then drop the trailing num_samples axis and convert to NumPy.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [None]:
# Display the sampled IDs.
sampled_indices

In [None]:
# Decode the first input sequence and the IDs sampled from the untrained
# model, to compare them as text.
print ("input: \n", text_from_ids(input_example_batch[0].numpy()))
print ()
print ("Next Char Predictions: \n", text_from_ids(sampled_indices).numpy())

In [None]:
# Cross-entropy over integer targets; from_logits=True because the model's
# final Dense layer has no activation and therefore returns raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
# Loss of the untrained model on the example batch, reduced to a mean value.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print ("Prediction shape: ", example_batch_predictions.shape, "# (batch_size, sequence_lenght, vocab_size)")
print ("Mean loss:        ", mean_loss)

In [None]:
# Exponential of the mean loss, to be compared with the vocabulary size.
tf.exp(mean_loss).numpy()

A newly initialized model shouldn't be too sure of itself: the output logits should all have similar magnitudes. To confirm this, check that the exponential of the mean loss is approximately equal to the vocabulary size. A much higher loss means the model is confident in its wrong answers and is badly initialized:

In [None]:
# Display the vocabulary size for comparison with exp(mean_loss).
vocab_size   

In [None]:
# Configure training: Adam optimizer, the `loss` object, accuracy metric.
model.compile(optimizer="adam", loss=loss, metrics=["accuracy"])

In [None]:
# Directory where checkpoints are saved.
checkpoint_dir = "./training_checkpoints"
# Checkpoint file name pattern; the {epoch} placeholder is left for the
# callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [None]:
# Number of passes over the training dataset.
EPOCHS = 600


# Train the model, saving checkpoints through the callback; the returned
# object is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [None]:
class OneStep(tf.keras.Model):
    """Single-step text generator built around a trained character model.

    Each call to ``generate_one_step`` samples one next character per input
    string and returns it together with the model state, so the caller can
    feed both back in to continue generating.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        """Store the collaborators and precompute the "[UNK]" mask.

        Args:
            model: Model called with ``inputs``, ``states`` and
                ``return_state=True``; must return ``(logits, states)``.
            chars_from_ids: Layer mapping token IDs to characters.
            ids_from_chars: Layer mapping characters to token IDs.
            temperature: Divisor applied to the logits before sampling.
        """
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # Additive mask over the vocabulary: -inf at the "[UNK]" index so that
        # token can never be sampled; every other position is left unset in
        # the sparse tensor before densifying.
        vocabulary_size = len(ids_from_chars.get_vocabulary())
        unk_positions = self.ids_from_chars(["[UNK]"])[:, None]
        neg_inf_entries = [-float("inf")] * len(unk_positions)
        unk_mask = tf.SparseTensor(
            indices=unk_positions,
            values=neg_inf_entries,
            dense_shape=[vocabulary_size],
        )
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for each input string.

        Args:
            inputs: String tensor holding the text generated so far (or the
                most recent characters when ``states`` carries the history).
            states: Model state from the previous step, or None to start.

        Returns:
            A tuple ``(predicted_chars, states)``: the sampled characters and
            the updated model state.
        """
        # Strings -> characters -> token IDs, converted to a dense tensor.
        token_ids = self.ids_from_chars(
            tf.strings.unicode_split(inputs, "UTF-8")
        ).to_tensor()

        # Logits come back shaped [batch, char, next_char_logits].
        logits, states = self.model(inputs=token_ids, states=states, return_state=True)

        # Keep only the last position, scale by the temperature, then add the
        # mask so that "[UNK]" cannot be generated.
        last_step_logits = logits[:, -1, :] / self.temperature
        masked_logits = last_step_logits + self.prediction_mask

        # Draw one token ID per batch entry and drop the samples axis.
        sampled = tf.random.categorical(masked_logits, num_samples=1)
        next_ids = tf.squeeze(sampled, axis=1)

        # Hand back the characters for the sampled IDs plus the model state.
        return self.chars_from_ids(next_ids), states
    

In [None]:
# Wrap the model for character-by-character generation (temperature is left
# at its default of 1.0).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
# Generate text: seed with a prompt, then repeatedly feed the last sampled
# character and the model state back into the one-step generator.
start = time.time()
states = None
next_char = tf.constant(["3. Vanity"])
result = [next_char]

# Each iteration appends one more sampled character.
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
    
# Concatenate the prompt and all sampled characters, then print the text
# followed by a separator line and the elapsed wall-clock time.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode("utf-8"), "\n\n" + "_"*80)
print("\nRun time:", end - start)

#### Export the generator
This single-step model can be saved and restored, allowing you to use it anywhere a `tf.saved_model` is accepted.
Run these cells only when needed.

In [None]:
# Disabled cell, kept as a string literal: remove the surrounding quotes to
# save the one-step generator and load it back.
'''
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')
'''

In [None]:
# Disabled cell, kept as a string literal: remove the surrounding quotes to
# generate 100 characters with the reloaded generator.
'''
states = None
next_char = tf.constant(["Digital"])
result = [next_char]

for n in range(100):
    next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
    result.append(next_char)
    
print (tf.strings.join(result)[0].numpy().decode("utf-8"))
'''