# RNN Text Generator Notebook
## Setup
### Import Tensorflow and other libraries

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
tf.enable_eager_execution()
import numpy as np
import os
import time

### Download dataset

In [3]:
path_to_file = tf.keras.utils.get_file('input.txt', 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


### Read first data

In [4]:
# read, then decode for py2 compatibility
text = open(path_to_file, 'rb').read().decode(encoding = 'utf-8')
# length of text is the number of characters on it
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [5]:
# see first 200 characters
print(text[:300])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us


In [6]:
# Check the unique characters in the file
vocab = sorted(set(text))
print('unique characters {}'.format(len(vocab)))
print(vocab)

unique characters 65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


### Vectorize the text
As map strings to numerical list. Create two vlookup tables as one mapping the characters to numbers, and the other from numbers to characters.

In [7]:
char2idx= {u:i for i, u in enumerate(vocab)}
idx2char=np.array(vocab)
print(char2idx)
print(idx2char)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


In [8]:
text_as_int=np.array([char2idx[c] for c in text])
text_as_int

array([18, 47, 56, ..., 45,  8,  0])

#### We mapped char to int and we mapped the character as indexes from 0 to len(unique)

In [9]:
print('{')
for char,_ in zip(char2idx, range(23)):
    print(' {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
 '\n':   0,
 ' ' :   1,
 '!' :   2,
 '$' :   3,
 '&' :   4,
 "'" :   5,
 ',' :   6,
 '-' :   7,
 '.' :   8,
 '3' :   9,
 ':' :  10,
 ';' :  11,
 '?' :  12,
 'A' :  13,
 'B' :  14,
 'C' :  15,
 'D' :  16,
 'E' :  17,
 'F' :  18,
 'G' :  19,
 'H' :  20,
 'I' :  21,
 'J' :  22,
  ...
}


In [10]:
#print how the first 10 chars from text are mapped to integ
print('{} ---- char mapped to int ---> {}'.format(repr(text[:10]), text_as_int[:10]))

'First Citi' ---- char mapped to int ---> [18 47 56 57 58  1 15 47 58 47]


### Prediction task
The model we try to perform is: Given a char, o sequence of char, what is the most probable next char? 

Inputs: sequence of char

Train: model to predict the output

Output: following char at each time step

| Model Specif | - Recurrent depends on the previously seen elements, given all char computed until this time.

### Create training examples and targets
Next divide the text into examples of sequences. Each input sequence will contain seq_length characters from the text.

For each input sequence, the corresponding targets contain the same length of text, except shifted one char to the right.

So break the text into chunks of seq_length+1. For instance, say seq_length is 4 and our text is "Hello". The input sequence would be "Hell" and target is "ello".

To do so, use ```tf.data.Dataset.from_tensor_slices``` function to convert vectorize text into a stream of character indices.


In [12]:
# The max length sentence we want for a single input of char
seq_length = 100
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

F
i
r
s
t


The ```batch``` method allowed us to convert these individual chars to sequences of the desired size.

In [13]:
sequences = char_dataset.batch(seq_length+1,drop_remainder = True)

for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


For each sequence, duplicate and shift it to form the input and target text using ```map``` method to apply a simple function to each batch

In [14]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text
dataset= sequences.map(split_input_target)

Print examples input and target values:

In [16]:
for input_example, target_example in dataset.take(1):
    print('Input data:', repr(''.join(idx2char[input_example.numpy()])))
    print('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data: 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


Each index of these vectors are processed as one time step. For input step 0, model recieves index for 'T' and tries to predict index 'h' as the next character. 

In [17]:
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print('Step {:4d}'.format(i))
    print('  input: {}  ({:s})'.format(input_idx, repr(idx2char[input_idx])))
    print('  expected output: {}  ({:s})'.format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 18  ('F')
  expected output: 47  ('i')
Step    1
  input: 47  ('i')
  expected output: 56  ('r')
Step    2
  input: 56  ('r')
  expected output: 57  ('s')
Step    3
  input: 57  ('s')
  expected output: 58  ('t')
Step    4
  input: 58  ('t')
  expected output: 1  (' ')


### Creating training batches
It uses ```tf.data``` to split the text into manageable sequences. We need to shuffle data and pack it into batches and eventually feeding this data into model.

In [18]:
# Batch size
BATCH_SIZE = 64
# buffer size to shuffle the dataset. Amount of time allocate to process the data
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset


<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

### Build The Model
Use ```tf.keras.Sequential``` to define the model. In that case, three layers are used to define the model:
* ```tf.keras.layers.Embedding``` : The input layer. A training vloopup table that will map numbers of each chars to a vector with ```embedding_dim``` dimensions;
* ```tf.keras.layers.GRU```: A type of RNN with size ```units=rnn_units```
* ```tf.keras.layers.Dense```: The output layer with ```vocab_size``` outputs.

In [19]:
# Length of the vocabulary in chars
vocab_size = len(vocab)
# Embedding dimension
embedding_dim = 256
# number of RNN units
rnn_units=1024

In [20]:
def build_model(vocab_size,embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size,embedding_dim,
                                 batch_input_shape=[batch_size,None]),
        tf.keras.layers.GRU(rnn_units,
                           return_sequences=True,
                           stateful=True,
                           recurrent_initializer = 'glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [21]:
model = build_model(
vocab_size=len(vocab),
embedding_dim=embedding_dim,
rnn_units=rnn_units,
batch_size=BATCH_SIZE)

### Try the model
It is time to run the model and check if it behaves as expected. First check the shape of the output.

In [22]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


Check model summary

In [23]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3935232   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,018,497
Trainable params: 4,018,497
Non-trainable params: 0
_________________________________________________________________


To get the actual predictions from the model, we need to sample from the output distribution, to get actual indices. The distribution is defined by the logits over the character vocabulary. Try for the first example of the batch:

In [24]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis = -1).numpy()

That will give us, at each timestep, a prediction of the next character index

In [25]:
sampled_indices

array([ 4,  4, 44, 48, 56,  5,  1,  3, 26, 32, 51, 19, 37, 26, 48, 31, 12,
       40, 12, 31, 52, 32, 45, 64, 55,  8, 12, 11, 23,  4, 26, 51,  9, 45,
       54, 39, 48, 34, 64, 24, 27, 56, 47, 22, 21, 63, 43,  8, 26,  3, 54,
        1,  7, 53, 18, 57, 43, 32,  6, 60, 49, 56, 12, 44, 10,  2, 50,  6,
       59, 41, 61,  6, 60, 42, 20, 18, 58, 33, 24, 26, 61, 47, 11,  2, 40,
       25, 29,  5, 47, 59, 63,  4, 41, 37, 39, 45, 60,  5, 21, 31],
      dtype=int64)

Now we should decode this prediction by this untrained model:

In [26]:
print("Input: \n", repr(''.join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n",repr(''.join(idx2char[sampled_indices ])))

Input: 
 'io.\nTrow you whither I am going? To Baptista Minola.\nI promised to inquire carefully\nAbout a schoolm'

Next Char Predictions: 
 "&&fjr' $NTmGYNjS?b?SnTgzq.?;K&Nm3gpajVzLOriJIye.N$p -oFseT,vkr?f:!l,ucw,vdHFtULNwi;!bMQ'iuy&cYagv'IS"


### Train the Model
At this point the model can be treated as typical classification model. Given RNN state, and the input this time step, predict the next class of the character.

#### Attach an optimizer, and a loss function
```tf.keras.losses.sparse_categorical_crossentropy``` loss function works. Also ```from logits``` need to be set cause return logits

In [29]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print('Prediction shape:', example_batch_predictions.shape, '# (batch_size, sequence_length, vocab_size)')
print('Scalar loss:      ', example_batch_loss.numpy().mean())

Prediction shape: (64, 100, 65) # (batch_size, sequence_length, vocab_size)
Scalar loss:       4.173152


In [30]:
model.compile(optimizer = 'adam', loss=loss)

#### Configure Checkpoints
Use ```tf.keras.callbacks.ModelCheckpoint``` to ensure check are done and sabed during training.

In [36]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

#### Execute the training as EPOCHS = 10

In [37]:
EPOCHS = 10

In [38]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

ValueError: When using iterators as input to a model, you should specify the `steps_per_epoch` argument.

In [None]:
tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [None]:
dataset