# Setup

## Step 1: Importing Libraries

In [27]:
import tensorflow as tf
import numpy as np
import os
import time

## Step 2: Dataset Install

In [28]:
# Fetch the Shakespeare corpus from the TensorFlow download bucket on Google Cloud Storage.
# For our own project, both the file name and the URL below would point at our data instead.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

# Read Data

In [29]:
# Read the raw bytes of the downloaded file. The `with` block guarantees the
# file handle is closed as soon as reading finishes, even if read() raises.
with open(path_to_file, 'rb') as f:
    raw_data = f.read()

# Decode the bytes into a Python string
decoded_text = raw_data.decode(encoding='utf-8')

# Collect the unique characters of the text, sorted, to serve as the vocabulary
vocab = sorted(set(decoded_text))


In [30]:
# Quick sanity check: report the corpus size and preview its first 250 characters
print(f"Length of text in characters: {len(decoded_text)}")
print("\nFirst 250 chars:\n")
print(decoded_text[:250])

Length of text in characters: 1115394

First 250 chars:

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



# Process the Text

## Vectorize Text

We need to find a way to numerically represent the text

Tensorflow's `keras.layers.StringLookup` can get a numerical representation of a character, but we first need to split up the string

In [31]:
# Break the corpus into a tensor of individual UTF-8 characters
chars = tf.strings.unicode_split(decoded_text, input_encoding='UTF-8')

# StringLookup is a preprocessing layer that maps each character to an integer id.
# It is built from our vocabulary, with no mask token.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab),
    mask_token=None,
)

# Apply the layer to obtain the id of every character
ids = ids_from_chars(chars)

In [32]:
# To go back from ids to characters, build a second StringLookup layer with
# `invert=True` that reuses the vocabulary of the forward layer
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None,
)

# Round trip: recover the characters from the ids
chars = chars_from_ids(ids)

In [33]:
# We can define the following function to join characters back into strings
def text_from_ids(ids):
    """Look up the character for each id and join them along the last axis."""
    looked_up = chars_from_ids(ids)
    return tf.strings.reduce_join(looked_up, axis=-1)


## Training Examples and Targets

Let's divide the text into example sequences, each of which has `seq_length` characters.
For each input sequence, the corresponding target contains the same length of text, shifted one character to the right.
Example: if the text is "hello" and `seq_length = 4`, the input sequence is "hell" and the corresponding target is "ello".

In [34]:
# Turn the whole corpus into one long tensor of character ids ...
all_ids = ids_from_chars(
    tf.strings.unicode_split(decoded_text, 'UTF-8')
)
# ... and wrap it in a dataset that yields one id at a time
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [35]:
# Preview the first 10 characters of the id stream.
# The loop variable is deliberately not called `ids`: that name already holds
# the full id tensor built in the vectorization cell, and reusing it here would
# silently overwrite it for any cell that is (re-)run afterwards.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))


F
i
r
s
t
 
C
i
t
i


In [36]:
# Number of characters in each training input
seq_length = 100

# Group the character stream into chunks of seq_length + 1 ids;
# a trailing chunk that is too short is dropped
sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

In [37]:
# We need input/label pairs for training, so we use this function
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    The input is everything except the last element; the target is
    everything except the first element.
    """
    return sequence[:-1], sequence[1:]

In [38]:
# Apply the split to every sequence, yielding (input, target) pairs
dataset = sequences.map(map_func=split_input_target)

## Create Training Batches

In [39]:
# Number of sequences per training batch
BATCH_SIZE = 64

# Size of the buffer within which dataset shuffling occurs
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than from `dataset` itself, so
# that re-running this cell never shuffles/batches an already batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the stable (non-experimental) name for this constant
    .prefetch(tf.data.AUTOTUNE))

# Build the Model

We need to define a model with three layers:
1. Input layer (`tf.keras.layers.Embedding`): a trainable lookup table that will map each character-ID to a vector with `embedding_dim` dimensions
2. `tf.keras.layers.GRU`: a type of RNN with size `units=rnn_units`
3. `tf.keras.layers.Dense`: output layer (the number of outputs is `vocab_size`). It outputs one logit for each character in the vocabulary. According to the model, these are the log-likelihoods of each character

In [40]:
# Sizes and hyperparameters used to build the model

# Number of entries in the StringLookup vocabulary, as reported by get_vocabulary()
vocab_size = len(ids_from_chars.get_vocabulary())

# Size of the vector each character id is embedded into
embedding_dim = 256

# Number of units in the recurrent (GRU) layer
rnn_units = 1024

In [41]:
# Now, let's make a model class
class MyModel(tf.keras.Model):
    """Character-level language model built from Embedding -> GRU -> Dense layers.

    Args:
        vocab_size: Number of distinct ids; used as the embedding input size
            and as the number of logits produced by the output layer.
        embedding_dim: Size of the vector each id is embedded into.
        rnn_units: Number of units in the GRU layer.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        # `super()` already binds `self`; passing it again would hand the Keras
        # base initializer a stray positional argument.
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences: emit an output for every timestep.
        # return_state: also return the final state so callers can carry it forward.
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       return_sequences=True,
                                       return_state=True)
        # No activation: the layer outputs raw logits, one per vocabulary entry.
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of id sequences.

        Args:
            inputs: Tensor of character ids.
            states: Optional GRU state to start from; when None, the state
                returned by the GRU's `get_initial_state` is used.
            return_state: When True, also return the GRU state.
            training: Forwarded to every layer call.

        Returns:
            The logits, or a `(logits, states)` tuple when `return_state` is True.
        """
        x = self.embedding(inputs, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

In [42]:
# Instantiate the model with the sizes defined for the vocabulary, embedding and GRU
model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)


According to the tutorial:
> For each character the model looks up the embedding, runs the GRU one timestep with the embedding as input, and applies the dense layer to generate logits predicting the log-likelihood of the next character

# Trying the Model

In [43]:
# Run a single batch through the model and check the shape of its output
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 66) # (batch_size, sequence_length, vocab_size)


In [44]:
# Print the model's layers and their parameter counts
model.summary()


Model: "my_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     multiple                  16896     
                                                                 
 gru_1 (GRU)                 multiple                  3938304   
                                                                 
 dense_1 (Dense)             multiple                  67650     
                                                                 
Total params: 4,022,850
Trainable params: 4,022,850
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Sample one character id per timestep from the logits of the first sequence in the batch
sampled_indices = tf.random.categorical(
    example_batch_predictions[0],
    num_samples=1,
)
# Drop the trailing num_samples axis and convert the result to a NumPy array
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [46]:
# sampled_indices now holds one sampled next-character id for each timestep
print(sampled_indices)

[19 32  6 51 48 18 33 22 61 39 28  6  9 11 12 62 22 24 61  1 52  3 53 61
 62  2 10 24 32 36 33 16 40 17 39 25 61 16 26 55 23 34 48 50 32 46  4  7
 59 49 36 53  1 30 20 49 31 58 55 15 52 50 37 44 54 38 17 53 53  6 37 18
 57 26 37 63 13 10 38 32 13 23 53 31 25 16 37 45 64 15 26 45 55  1 60 39
 43 39 17 60]


In [47]:
# Decode the first input sequence and the ids sampled from the model's output, to compare them
print("Input:")
print(text_from_ids(input_example_batch[0]).numpy())
print("Model:")
print(text_from_ids(sampled_indices).numpy())

Input:
b"s summer by this sun of York;\nAnd all the clouds that lour'd upon our house\nIn the deep bosom of the"
Model:
b"FS'liETIvZO'.:;wIKv\nm!nvw 3KSWTCaDZLvCMpJUikSg$,tjWn\nQGjRspBmkXeoYDnn'XErMXx?3YS?JnRLCXfyBMfp\nuZdZDu"


Clearly, the model isn't doing too great. So let's move on to training it

# Training the Model
The first step in training the model is choosing a loss function.
The standard `tf.keras.losses.sparse_categorical_crossentropy` loss serves our purpose well; because the model outputs logits, we set `from_logits=True`.

In [49]:
# from_logits=True: the loss receives raw logits (the model's Dense output layer applies no activation)
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [50]:
# Evaluate the loss on the example batch's predictions against its targets
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)


Prediction shape:  (64, 100, 66)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.191824, shape=(), dtype=float32)


In [51]:
# Using the loss defined above and the Adam optimizer, we can configure training
model.compile(optimizer='adam', loss=loss)

In [52]:
# We can configure training so that checkpoints are saved along the way
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# NOTE(review): "{epoch}" is presumably filled in by Keras with the epoch number — confirm
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True: store only the model's weights
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [53]:
# Now, we execute training

# Number of passes over the dataset
EPOCHS = 10

# Fit the model on the batched dataset, with the checkpoint callback attached
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Generate Text
The above training takes a while to execute, but once it's done, we can generate text!

In [54]:
class OneStep(tf.keras.Model):
  """Wraps a model and its lookup layers to predict one next character per call."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and build the "[UNK]" logit mask.

    Args:
      model: Model called with `inputs`, `states` and `return_state=True`;
        it must return `(logits, states)`.
      chars_from_ids: Layer mapping ids back to characters.
      ids_from_chars: Layer mapping characters to ids.
      temperature: Value the logits are divided by before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    # skip_ids gets an extra trailing axis so it can serve as SparseTensor indices.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    # Dense vector that is added to the logits in generate_one_step.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: Strings to continue (the notebook passes a 1-element string tensor).
      states: State forwarded to the wrapped model; None on the first call.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      state returned by the wrapped model.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    # Scale the logits by the temperature before sampling.
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    # Drop the trailing num_samples axis.
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states


In [55]:
# Wrap the model and both lookup layers for single-step generation (temperature left at its default)
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)


In [56]:
# Time the whole generation run
start = time.time()
# No state yet on the first step
states = None
# Seed text: a 1-element string tensor
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Run 1000 generation steps, feeding each prediction and the returned state back in
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join the seed and all generated characters element-wise into the final text
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

ROMEO:
Heaven and did son, espare defend.

GLOUCESTER:

KING EDWARD IV:
To thee,
That trues are illest eat: and there by thee
as he would give not known inprinity.
That would be look'd with her, or they
were soon so slain?
Once more of you to cry,
Dentile thee; thousand fiery?

GONZALO:
You duke, do not witness that the sentence
That should not burill in our
fainth: some it mine enemy?
I think, thy free ut tafen,
The sostions a happy moletainted
Though I the earth; and to 't in hope to you to pass,
And fit her elentage, and as to grant
Hereafter'd of nothing gizent
To saint in refender to chy horses! Ofthing early him
To hize as wencefter in my jogs,
My nature you cannot army,
Ground so carry gives, married to the Tircules
Of enbert'st the issue; and no more but that about him;
So and dost nature scope their work:
Why, leth he of King Richard.

DUKE OF AUMERLE:
You have to-four among a father.

GONZALO:
That's a greater grow it think for care.
Mountant the banished? O, methoughts, and 

# Export Generator

In [57]:
# Save the one-step generator to the 'one_step' directory, then load it back
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')






INFO:tensorflow:Assets written to: one_step/assets


INFO:tensorflow:Assets written to: one_step/assets


In [58]:
# Repeat the generation loop with the reloaded generator
states = None
# Seed text: a 1-element string tensor
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Run 100 generation steps, feeding each prediction and the returned state back in
for n in range(100):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join the seed and generated characters and print the first (only) entry
print(tf.strings.join(result)[0].numpy().decode("utf-8"))


ROMEO:
Come, good arm them; every thousand men,
Before a set how his prove as hate me,
If you ragker cause
