## Setup

In [None]:
# The string literal below is an inert note (in Spanish) on how to mount
# Google Drive when running in Colab; it is not executed.
'''
Para importar de drive:

from google.colab import drive
drive.mount('/content/drive')

'''

# Placeholders: fill in the directory that holds the corpus and the file name.
path_to_directory = " _ "
file_name = " _ "
# Full path to the corpus: plain concatenation, no separator is inserted.
path_to_file = f"{path_to_directory}{file_name}"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import TensorFlow and other libraries

In [None]:
import tensorflow as tf
import numpy as np
import os
import time


In [None]:
# Load the corpus directory as a Keras text dataset.
# NOTE(review): the returned dataset is not assigned to any name, so this cell
# only displays its repr — confirm whether it is still needed.
tf.keras.utils.text_dataset_from_directory(
    path_to_directory,
    labels="inferred",
    label_mode="int",
    class_names=None,
    batch_size=32,
    max_length=None,
    shuffle=True,
    seed=None,
    validation_split=None,
    subset=None,
    follow_links=False,
)

Found 1 files belonging to 1 classes.


<_BatchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

### Read the data

First, take a look at the text:

In [None]:
# Read the corpus as raw bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the contents are in memory.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# Length of the corpus in characters
print(f'Length of text: {len(text)} characters')
# Show the first 116 characters of the corpus
print(text[:116])
# Vocabulary: sorted list of the distinct characters found in the corpus
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

Length of text: 673492 characters
It is a truth universally acknowledged, that a single man in possession
of a good fortune must be in want of a wife.
80 unique characters


## Process the text

In [None]:
# Layer that maps each character to a unique integer id so the text can be
# handled numerically
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab),
                                              mask_token=None)
# Inverse layer: recovers the characters from their ids, built from the
# vocabulary reported by the forward layer
chars_from_ids = tf.keras.layers.StringLookup(
    invert=True,
    mask_token=None,
    vocabulary=ids_from_chars.get_vocabulary())


def text_from_ids(ids):
  """Turn ids back into characters and join them along the last axis."""
  chars = chars_from_ids(ids)
  return tf.strings.reduce_join(chars, axis=-1)

In [None]:
# Encode the whole corpus: split the text into UTF-8 characters and map each
# character to its id. Displaying the tensor also shows the corpus length.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(673492,), dtype=int64, numpy=array([28, 66,  2, ..., 65,  9, 80])>

In [None]:
# Dataset that yields the encoded corpus one character id at a time
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [None]:
# Take the first 116 ids from the dataset and translate each one back into a
# readable character with chars_from_ids.
# This recovers and prints the first sentence of Pride_and_Prejudice.txt.
first_sentence = []
for ids in ids_dataset.take(116):
    # Decode once per element and reuse it for both the list and the printout
    # (the lookup and decode used to run twice per character).
    char = chars_from_ids(ids).numpy().decode('utf-8')
    first_sentence.append(char)
    print(char, end= "")


It is a truth universally acknowledged, that a single man in possession
of a good fortune must be in want of a wife.

In [None]:
# Length, in characters, of the sequences used for training
seq_length = 100


The `batch` method lets you easily convert these individual characters to sequences of the desired size.

In [None]:
# Split the id stream into chunks of seq_length + 1 ids, dropping the last
# incomplete chunk
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))
# This prints one byte string per character because the characters have not been joined yet

tf.Tensor(
[b'I' b't' b' ' b'i' b's' b' ' b'a' b' ' b't' b'r' b'u' b't' b'h' b' '
 b'u' b'n' b'i' b'v' b'e' b'r' b's' b'a' b'l' b'l' b'y' b' ' b'a' b'c'
 b'k' b'n' b'o' b'w' b'l' b'e' b'd' b'g' b'e' b'd' b',' b' ' b't' b'h'
 b'a' b't' b' ' b'a' b' ' b's' b'i' b'n' b'g' b'l' b'e' b' ' b'm' b'a'
 b'n' b' ' b'i' b'n' b' ' b'p' b'o' b's' b's' b'e' b's' b's' b'i' b'o'
 b'n' b'\n' b'o' b'f' b' ' b'a' b' ' b'g' b'o' b'o' b'd' b' ' b'f' b'o'
 b'r' b't' b'u' b'n' b'e' b' ' b'm' b'u' b's' b't' b' ' b'b' b'e' b' '
 b'i' b'n' b' '], shape=(101,), dtype=string)


It's easier to see what this is doing if you join the tokens back into strings:

In [None]:
# Print the first three sequences as joined byte strings
for seq in sequences.take(3):
  print(text_from_ids(seq).numpy())

b'It is a truth universally acknowledged, that a single man in possession\nof a good fortune must be in '
b'want of a wife.\n\nHowever little known the feelings or views of such a man may be on his\nfirst enterin'
b'g a neighbourhood, this truth is so well fixed in the minds\nof the surrounding families, that he is c'


For training you'll need a dataset of `(input, label)` pairs, where `input` and
`label` are sequences. At each time step the input is the current character and the label is the next character.

Here's a function that takes a sequence as input, duplicates it, and shifts the copy by one position to align the input and label for each timestep:

In [None]:

def split_input_target(sequence):
    """
    Build an (input, target) pair from one sequence for next-character
    prediction.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so target[i] is the element that
    follows input[i]. This is why the sequences were built with one extra
    element (seq_length + 1).

    Returns a tuple (input_text, target_text).
    """
    return sequence[:-1], sequence[1:]

# Example: split the first 15 characters recovered from the corpus
split_input_target(first_sentence[:15])


(['I', 't', ' ', 'i', 's', ' ', 'a', ' ', 't', 'r', 'u', 't', 'h', ' '],
 ['t', ' ', 'i', 's', ' ', 'a', ' ', 't', 'r', 'u', 't', 'h', ' ', 'u'])

In [None]:
# Working dataset: (input, target) pairs, where the target is the input
# sequence shifted one character ahead
dataset = sequences.map(split_input_target)
# Example: show the first pair as joined byte strings
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())


Input : b'It is a truth universally acknowledged, that a single man in possession\nof a good fortune must be in'
Target: b't is a truth universally acknowledged, that a single man in possession\nof a good fortune must be in '


In [None]:
# Batch size used for training
BATCH_SIZE = 120

# Number of elements held in the shuffle buffer
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from the previous
# value of `dataset`, so re-running this cell does not batch an already
# batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the non-experimental name of the same constant
    .prefetch(tf.data.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(120, 100), dtype=tf.int64, name=None), TensorSpec(shape=(120, 100), dtype=tf.int64, name=None))>

## Build The Model

In [None]:
# Vocabulary size as reported by the lookup layer
vocab_size = len(ids_from_chars.get_vocabulary())

# Size of the vector produced by the embedding layer
embedding_dim = 256

# Number of units in the RNN layer (author's note: works well on this dataset from about 700-1000)
rnn_units = 2000

In [None]:
# Neural network definition
class MyModel(tf.keras.Model):
  """Character-level model: Embedding -> GRU -> Dense.

  The Dense output layer has no activation, so the model returns one raw score
  (logit) per vocabulary entry for every time step.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Args:
      vocab_size: number of entries in the vocabulary; used as the embedding
        input size and as the number of output units.
      embedding_dim: size of the embedding vectors.
      rnn_units: number of units in the GRU layer.
    """
    # `super()` already binds `self`; passing it again would forward it as an
    # extra positional argument to tf.keras.Model.__init__.
    super().__init__()
    # Input layer with the embedding size defined above
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # The RNN layer
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   # Return the whole output sequence, not only the last step
                                   return_sequences=True,
                                   # Return the final state in addition to the output
                                   return_state=True)

    # Output layer: linear activation, vocab_size units (one per character that can be predicted)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the network on `inputs`.

    Args:
      inputs: ids to feed to the embedding layer.
      states: initial GRU state; when None the GRU's own initial state is used.
      return_state: when True, also return the GRU state after the last step.
      training: forwarded to every layer.

    Returns:
      The Dense output, or the tuple (output, states) when `return_state` is
      True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      # NOTE(review): this passes the embedded inputs; some Keras versions may
      # expect a batch size here — confirm against the installed version.
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
# Instantiate the model from the class above with the chosen hyperparameters
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
  # Run one batch through the untrained model and print the output shape next
  # to the expected one: (batch size, sequence length, vocabulary size)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape,  (BATCH_SIZE, seq_length, vocab_size))


(120, 100, 81) (120, 100, 81)


In [None]:
# Print the layers and parameter counts of the model
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  20736     
                                                                 
 gru (GRU)                   multiple                  13548000  
                                                                 
 dense (Dense)               multiple                  162081    
                                                                 
Total params: 13730817 (52.38 MB)
Trainable params: 13730817 (52.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Loss function. from_logits=True because the model's Dense output layer has
# no activation, so its outputs are raw scores rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
# Compile the model with the Adam optimizer and the loss above (prepares it for training)
model.compile(optimizer='adam', loss=loss)

In [None]:
# Number of training epochs (author's note: between 20 and 30 works best)
EPOCHS = 30

In [None]:
# Train the model; verbose=2 prints one line per epoch
history = model.fit(dataset, epochs=EPOCHS,verbose= 2)

Epoch 1/30
55/55 - 21s - loss: 1.8875 - 21s/epoch - 374ms/step
Epoch 2/30
55/55 - 23s - loss: 1.6869 - 23s/epoch - 422ms/step
Epoch 3/30
55/55 - 22s - loss: 1.5263 - 22s/epoch - 401ms/step
Epoch 4/30
55/55 - 21s - loss: 1.4004 - 21s/epoch - 375ms/step
Epoch 5/30
55/55 - 21s - loss: 1.3076 - 21s/epoch - 379ms/step
Epoch 6/30
55/55 - 21s - loss: 1.2349 - 21s/epoch - 383ms/step
Epoch 7/30
55/55 - 21s - loss: 1.1727 - 21s/epoch - 385ms/step
Epoch 8/30
55/55 - 21s - loss: 1.1145 - 21s/epoch - 387ms/step
Epoch 9/30
55/55 - 21s - loss: 1.0640 - 21s/epoch - 378ms/step
Epoch 10/30
55/55 - 22s - loss: 1.0165 - 22s/epoch - 399ms/step
Epoch 11/30
55/55 - 21s - loss: 0.9645 - 21s/epoch - 382ms/step
Epoch 12/30
55/55 - 21s - loss: 0.9126 - 21s/epoch - 388ms/step
Epoch 13/30
55/55 - 23s - loss: 0.8569 - 23s/epoch - 411ms/step
Epoch 14/30
55/55 - 21s - loss: 0.7949 - 21s/epoch - 378ms/step
Epoch 15/30
55/55 - 22s - loss: 0.7304 - 22s/epoch - 401ms/step
Epoch 16/30
55/55 - 21s - loss: 0.6619 - 21s/epoc

In [None]:
# OneStep wraps the trained model so it can generate one character at a time
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=.3):
    """Store the model and lookup layers and build the '[UNK]' mask.

    Args:
      model: model called in generate_one_step with `inputs`, `states` and
        `return_state=True`.
      chars_from_ids: layer mapping ids back to characters.
      ids_from_chars: layer mapping characters to ids.
      temperature: divisor applied to the logits before sampling.
    """
    # Temperature controls how random the sampling is
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Build a mask that keeps the '[UNK]' token from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(

        # -inf at every index that must be skipped
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary so the mask can be added to the logits
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)
  # Wrap the method in a tf.function
  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: string tensor; each string is split into UTF-8 characters.
      states: model state from the previous call, or None on the first call.

    Returns:
      Tuple (predicted_chars, states): the sampled characters and the state
      returned by the model.
    """
    # Split the strings into characters and convert them to ids
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model and keep only the prediction for the last time step
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    predicted_logits = predicted_logits[:, -1, :]
    # Apply the temperature
    predicted_logits = predicted_logits/self.temperature
    # Apply the mask so '[UNK]' is not sampled
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample one id per row from the logits
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert the sampled ids to readable characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and the model state
    return predicted_chars, states

In [None]:
# Wrap the trained model and the lookup layers for character-by-character generation
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

Run it in a loop to generate some text. Looking at the generated text, you'll see the model knows when to capitalize, make paragraphs and imitates the Jane Austen-like vocabulary of its training text (*Pride and Prejudice*). With the small number of training epochs, it has not yet learned to form coherent sentences.

In [None]:
# Start with no model state
states = None
# Seed text (prompt)
next_char = tf.constant(['Elizabeth'])
# Generated pieces, starting with the prompt
result = [next_char]

# Generate 1000 characters with the trained model, feeding each sampled
# character and the returned state back into the next step

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)

# Print the generated text between two horizontal rules
print('_'*80,'\n\n',result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)


________________________________________________________________________________ 

 Elizabeth looked archly, and turned away. Her resistance had not injured
her with the gentleman, and the only face
whose felt in the power of choice. I do not know anybody who seems
more to enjoy them in a desirable match for Jane,” said she, “I should not care about it myself; but the
grounds are delightful. They have at least knew that she
could not speak a word, especially to Miss Darcy, who had been concerned in the measures
taken to separate Mr. Bingley on the enjoyment of it had been
little. Eager to be alone, and fearful of its being the most
remarkable charm of the evening, and might now come to inquire particularly after her. But
this information, the misery she would be serious, however, to increase her vexations by dwelling on them.
She was confident of her being presentage to her feelings, capable of
conversation, which had been pleased to find that she
had satisfied the door. He then sat do

In [None]:
# Save the one-step generator to the 'one_step' directory and load it back
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')



In [None]:
# Descargar el modelo
nombre = "nombre_del_modelo"
model_name = "Trained_" + nombre + ".zip"
!zip -r model_name /content/one_step
from google.colab import files
files.downloadmodel_name)


updating: content/one_step/ (stored 0%)
updating: content/one_step/saved_model.pb (deflated 90%)
updating: content/one_step/variables/ (stored 0%)
updating: content/one_step/variables/variables.data-00000-of-00001 (deflated 7%)
updating: content/one_step/variables/variables.index (deflated 58%)
updating: content/one_step/assets/ (stored 0%)
updating: content/one_step/fingerprint.pb (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>