<a href="https://colab.research.google.com/github/Yasmine-ChemP/GMCtraining/blob/main/txt_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import os
import time
from tensorflow.keras import layers

In [None]:
# Fetch the Shakespeare corpus; get_file returns the local path of the file.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
# Show where the corpus file is stored locally.
print(path_to_file)

/root/.keras/datasets/shakespeare.txt


In [None]:
# Read the corpus as bytes and decode it as UTF-8. The `with` block closes the
# file handle as soon as the read is done; a bare open(...).read() leaves it
# open until the object happens to be garbage-collected.
with open(path_to_file, "rb") as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print(text[0:250])
# Length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

Length of text: 1115394 characters


In [None]:
# Number of newline-separated segments in the corpus (newline count plus one).
print(text.count('\n') + 1)

40001


In [None]:
#the unique characters in the file
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

65 unique characters


In [None]:
# Forward lookup layer: character -> integer id.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab),
    mask_token=None,
)

# Inverse lookup layer: integer id -> character. It reuses the forward layer's
# vocabulary so that both directions agree on the id assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None,
)


def text_from_ids(ids):
  """Look up the character for each id and join them along the last axis."""
  id_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(id_chars, axis=-1)

In [None]:
# Split the corpus into characters and encode all of them as one tensor of ids.
all_ids = ids_from_chars(
    tf.strings.unicode_split(input=text, input_encoding='UTF-8'))
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1])>

In [None]:
# Wrap the id tensor in a tf.data dataset that yields one id per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
ids_dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int64, name=None)>

In [None]:
# Peek at the first ten ids next to the characters they decode to.
for ids in ids_dataset.take(10):
    decoded_char = chars_from_ids(ids).numpy().decode('utf-8')
    print(f'{ids} : {decoded_char}')

19 : F
48 : i
57 : r
58 : s
59 : t
2 :  
16 : C
48 : i
59 : t
48 : i


In [None]:
# Cut the id stream into chunks of seq_length + 1 consecutive ids; a trailing
# chunk that would be shorter is dropped. The extra id lets each chunk be split
# into an input and a target of seq_length ids, the target shifted by one.
seq_length = 100
sequences = ids_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

# Decode the first five chunks as a visual check.
for seq in sequences.take(5):
  chunk_text = text_from_ids(seq).numpy().decode('utf-8')
  print(chunk_text)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 
are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k
now Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us ki
ll him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be d
one: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citi


In [None]:
# From every chunk we derive an input (x) and a label (y): y is x shifted one
# position forward, so at each position the label is the character that follows.
def split_input_target(sequence):
    """Return (sequence without its last element, sequence without its first)."""
    return sequence[:-1], sequence[1:]

# Example
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [None]:
# The corpus is now fully pre-processed: map every chunk to its
# (input, label) pair to obtain the training dataset.
dataset = sequences.map(map_func=split_input_target)

In [None]:
# Decode the first five (input, target) pairs to confirm the one-character shift.
for input_example, target_example in dataset.take(5):
    input_str = text_from_ids(input_example).numpy().decode('utf-8')
    target_str = text_from_ids(target_example).numpy().decode('utf-8')
    print("Input :", input_str)
    print("Target:", target_str)
    print('-------------------------------------------------------------')

Input : First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
Target: irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 
-------------------------------------------------------------
Input : are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you 
Target: re all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you k
-------------------------------------------------------------
Input : now Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us k
Target: ow Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us ki
-------------------------------------------------------------
Input : ll him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be 
Target: l him, and we'll have corn at our own price

In [None]:
# Shuffle-buffer size and batch size of the training pipeline.
BUFFER_SIZE = 1000
BATCH_SIZE = 32

# Build the training pipeline from `sequences` rather than from `dataset`
# itself: `dataset = dataset.shuffle(...).batch(...)` re-batched the already
# batched data whenever this cell was executed a second time. Starting from
# `sequences` makes the cell idempotent while producing the same pipeline.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the non-deprecated name of tf.data.experimental.AUTOTUNE.
    .prefetch(tf.data.AUTOTUNE)
)

# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [None]:
# NOTE(review): this is one more than the number of unique characters printed
# earlier — presumably the extra entry is the lookup layer's '[UNK]' token; confirm.
print(vocab_size)

66


In [None]:
class text_generator(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits.

  Args:
    vocab_size: Number of embedding rows and number of output logits.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units of the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # The base constructor takes no positional arguments here. The previous
    # `super().__init__(self)` handed the instance over a second time as a
    # stray argument, which newer Keras releases reject.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: one output per time step (a prediction for every
    # position); return_state: also hand back the final recurrent state.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Compute next-character logits for every position of `inputs`.

    Args:
      inputs: Batch of id sequences.
      states: Recurrent state to start from; when None the GRU's own initial
        state is used.
      return_state: When True, the final GRU state is returned as well.
      training: Forwarded to every layer.

    Returns:
      The logits, or the tuple `(logits, states)` when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

model = text_generator(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
# Sanity check: run the model on a single batch and inspect the output shape
# and the raw predictions for the first sequence of the batch.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    print(example_batch_predictions[0])

(32, 100, 66) # (batch_size, sequence_length, vocab_size)
tf.Tensor(
[[-1.58620952e-03 -8.99900869e-03 -2.95061478e-03 ... -3.41817294e-03
  -7.65823806e-03  5.20007452e-05]
 [ 1.07349325e-02 -7.77952699e-03 -1.33121377e-02 ...  1.90391354e-02
  -6.85085426e-04 -7.31240259e-03]
 [ 1.72874816e-02 -4.24209330e-03 -1.88576393e-02 ...  2.96124835e-02
   2.87965825e-03 -1.28748771e-02]
 ...
 [-2.68900907e-03  8.39425344e-03 -8.42684414e-04 ... -1.69341289e-03
  -1.08202994e-02  1.80203393e-02]
 [-2.49829376e-03 -4.31051245e-03  2.66899494e-03 ...  1.02829784e-02
   1.40545852e-02 -4.36308514e-03]
 [-1.48637388e-02 -9.44947265e-03 -3.83418170e-04 ...  3.59617546e-03
   1.63448602e-02 -6.72300067e-03]], shape=(100, 66), dtype=float32)


In [None]:
#à chaque timestamp (caractère) on obtient la prédiction du prochain caractére
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [None]:
# Show the first input sequence of the batch next to the characters sampled
# from the model's predictions (the model has not been trained at this point).
print("Input:\n", text_from_ids(input_example_batch[0]).numpy().decode('utf-8'))
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy().decode('utf-8'))

Input:
 !

VOLUMNIA:
One on 's father's moods.

VALERIA:
Indeed, la, 'tis a noble child.

VIRGILIA:
A crack,

Next Char Predictions:
 ZmA;zaDk?sCYngzXUt:Ty,ulYw&
dyVqfFLpe$ .?Stl'UdylCGLItTvFcBwulcXuyXcpwsZNXKD.Q! I'bFODCFAY!YydCxdvlL


In [None]:
#On passe à de l'apprentissage.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (32, 100, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.1899853, shape=(), dtype=float32)


In [None]:
# Exponential of the mean loss. NOTE(review): for the untrained model this
# comes out close to the vocabulary size — presumably because the initial
# predictions are near-uniform; confirm before relying on it as a check.
tf.exp(example_batch_mean_loss).numpy()

66.02182

In [None]:
#configuration du model
model.compile(optimizer='adam', loss=loss , metrics=["accuracy"])

In [None]:
#Configuration du modèle.
model.compile(optimizer='adam', loss=loss,metrics=['accuracy'])
# On peut enregistrer l'apprentissage de notre modèle en réalisant des checkpoints
# Cela nous permettra de pouvoir résumer l'apprentissage à un temps voulu
# Ou même de revenir en arriére.
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True, verbose=1)

In [None]:
# Train for 30 epochs with the checkpoint callback attached; the value returned
# by fit is kept in `history`.
history = model.fit(dataset, epochs=30, callbacks=[checkpoint_callback])

Epoch 1/30
Epoch 1: saving model to ./training_checkpoints/ckpt_1
Epoch 2/30
Epoch 2: saving model to ./training_checkpoints/ckpt_2
Epoch 3/30
Epoch 3: saving model to ./training_checkpoints/ckpt_3
Epoch 4/30
Epoch 4: saving model to ./training_checkpoints/ckpt_4
Epoch 5/30
Epoch 5: saving model to ./training_checkpoints/ckpt_5
Epoch 6/30
Epoch 6: saving model to ./training_checkpoints/ckpt_6
Epoch 7/30
Epoch 7: saving model to ./training_checkpoints/ckpt_7
Epoch 8/30
Epoch 8: saving model to ./training_checkpoints/ckpt_8
Epoch 9/30
Epoch 9: saving model to ./training_checkpoints/ckpt_9
Epoch 10/30
Epoch 10: saving model to ./training_checkpoints/ckpt_10
Epoch 11/30
Epoch 11: saving model to ./training_checkpoints/ckpt_11
Epoch 12/30
Epoch 12: saving model to ./training_checkpoints/ckpt_12
Epoch 13/30
Epoch 13: saving model to ./training_checkpoints/ckpt_13
Epoch 14/30
Epoch 14: saving model to ./training_checkpoints/ckpt_14
Epoch 15/30
Epoch 15: saving model to ./training_checkpoints/

In [None]:
class OneStep(tf.keras.Model):
  """Samples one next character at a time from a trained character model.

  Args:
    model: Model called with `inputs`, `states` and `return_state=True`; it
      must return `(logits, states)`.
    chars_from_ids: Lookup layer mapping ids to characters.
    ids_from_chars: Lookup layer mapping characters to ids.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask over the vocabulary: -inf at the id of "[UNK]" and zero
    # everywhere else, so that "[UNK]" can never be sampled.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf') for _ in range(len(unk_ids))],
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: Batch of strings to continue.
      states: State forwarded to the wrapped model (None on the first call).

    Returns:
      Tuple `(predicted_chars, states)`: the sampled characters and the state
      returned by the wrapped model.
    """
    # Strings -> characters -> token ids, as a dense tensor.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits.shape is [batch, char, next_char_logits]
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the prediction for the last position, apply the temperature,
    # then add the mask that rules out "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature
    masked_logits = last_logits + self.prediction_mask

    # Draw one token id per batch entry and drop the sample axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(masked_logits, num_samples=1), axis=-1)

    # Token ids -> characters, returned together with the model state.
    return self.chars_from_ids(sampled_ids), states
# Wrap the trained model in the single-step sampler.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)
# Wall-clock timing of the whole generation loop starts here.
start = time.time()
states = None
# Seed text; every generated character is fed back in as the next input.
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Generate 10000 characters one at a time, carrying the returned state forward.
for n in range(10000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and all generated characters element-wise.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

ROMEO:
Thou wast not, but your father Capulet: 'tis gone a gentleman;
Which to remained by the noise of it?

SICINIUS:
This is strange of him; how seemed you?

TRANIO:
Mistress, we will cry it out my fitting and no good.

HORTENSIO:
Petruchio, since my heart
I have a ripe: lust you in justice, if my life
Scapting our ancient gracious queen,
A most shame in queshio.'
Crewas, brother, if thou hast advented
Our partial 'er, knee with you;
Since when we were tricks, and merry waves! Bolan! pray,
Doth chois me, and I am now given me merry:
I am no a gentleman's face; I am
courted and pry my carry within.

Sendat add him, he had something near me yet,
Sir Viechous which honest malice or
arisonable burst, or who loved his love
To make again seems without innocence, which should
not scorn for fault. To call thee seeath
As now incessage there,
Without come and Pertiss, and now away so far
To look and Norfolk, Warwick, what a stranger in
the nobility of toman torment?
It is a good dinner-piery e