In [1]:
import tensorflow as tf

# Generating Text Using Character RNN

In [2]:
# Getting the Shakespeare text

# Download the corpus and get the path of the local copy
shakespeare_url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)

# Decode explicitly as UTF-8 so the text does not depend on the platform's default encoding
with open(filepath, encoding="utf-8") as f:
  shakespeare_text = f.read()

In [3]:
# Peek at the beginning of the corpus (its first 80 characters)
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [32]:
# Processing the text

# Character-level vectorization: every (lower-cased) character is mapped to an integer id
text_vec_layer = tf.keras.layers.TextVectorization(
  standardize="lower",
  split="character",
)
text_vec_layer.adapt([shakespeare_text])

# TextVectorization reserves id 0 for padding and id 1 for unknown characters. Neither is
# needed here, so every id is shifted down by 2 to make the character ids start at 0.
encoded_text = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded_text)

print("Total number of characters: ", dataset_size)
print("Number of unique characters: ", n_tokens)

Total number of characters:  1115394
Number of unique characters:  39


In [5]:
# Creating the training set

# We're going to create windows from the text. For example,
#   A training example can be - "to be or not to b"
#   And its corresponding label - "o be or not to be"

def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
  """Turn a sequence of ids into batches of (input, target) windows.

  Every window holds `length + 1` consecutive elements of `sequence`. The
  input is the window without its last element and the target is the window
  without its first element, i.e. the input shifted one step ahead.

  Args:
    sequence: the encoded text to slice into windows.
    length: number of elements in each input (and each target).
    shuffle: when true, shuffle the windows before batching.
    seed: seed passed to the shuffle step.
    batch_size: number of windows per batch.

  Returns:
    A `tf.data.Dataset` yielding `(inputs, targets)` batches.
  """
  window_size = length + 1

  # Slide a window over the sequence one element at a time, dropping the
  # trailing windows that would be shorter than `window_size`. Each window is
  # a nested dataset, so it is flattened into a single tensor with `batch`.
  windows = (
    tf.data.Dataset.from_tensor_slices(sequence)
    .window(window_size, shift=1, drop_remainder=True)
    .flat_map(lambda nested: nested.batch(window_size))
  )

  if shuffle:
    windows = windows.shuffle(buffer_size=100_000, seed=seed)

  def split_inputs_targets(batch):
    # Inputs: all but the last element; targets: all but the first element
    return batch[:, :-1], batch[:, 1:]

  # Prefetch so the next batch can be prepared while the current one is used
  return windows.batch(batch_size).map(split_inputs_targets).prefetch(1)

In [19]:
# Train / valid / test split

# Seed TensorFlow's global random generator before the datasets are built
tf.random.set_seed(42)
length = 100

# First 1,000,000 characters -> training (the only split that is shuffled),
# next 60,000 -> validation, everything after that -> test
train_set = to_dataset(encoded_text[:1_000_000], length, shuffle=True)
valid_set = to_dataset(encoded_text[1_000_000:1_060_000], length)
test_set = to_dataset(encoded_text[1_060_000:], length)

In [None]:
# Building the model

# Character ids -> 16-dimensional embeddings -> GRU that emits an output at every
# time step -> softmax over the n_tokens possible next characters
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16))
model.add(tf.keras.layers.GRU(128, return_sequences=True))
model.add(tf.keras.layers.Dense(n_tokens, activation="softmax"))

model.compile(
  loss="sparse_categorical_crossentropy",
  optimizer="nadam",
  metrics=["accuracy"],
)

# Only keep the checkpoint with the best validation accuracy
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
  "my_shakespeare_model",
  monitor="val_accuracy",
  save_best_only=True,
)
history = model.fit(
  train_set,
  validation_data=valid_set,
  epochs=10,
  callbacks=[model_ckpt],
)

The model was trained on Kaggle; the next cell loads the saved model from `models/my_shakespeare_model`.

In [23]:
# Load the trained model
loaded_model = tf.keras.models.load_model("models/my_shakespeare_model")

# Put the preprocessing in front of it so the combined model accepts raw strings:
# text -> character ids -> ids shifted down by 2 (no PAD or UNKNOWN tokens) -> trained model
shakespeare_model = tf.keras.Sequential()
shakespeare_model.add(text_vec_layer)
shakespeare_model.add(tf.keras.layers.Lambda(lambda char_ids: char_ids - 2))
shakespeare_model.add(loaded_model)




In [30]:
# Generating text

# Since this model predicts one character at a time, we can append the predicted character to the
# seed text and feed the result back to the model in a loop. Always picking the most likely character
# is called "greedy decoding", and in practice it often just repeats the same words over and over.

# Instead we'll sample the next character from the predicted probabilities, rescaled by a
# parameter called "temperature". Temperatures close to 0 favour the highest-probability characters,
# a temperature of 1 samples from the model's probabilities as they are, and higher temperatures
# give low-probability characters more of a chance, adding to the randomness.

def next_char(text_model, text, temperature=1):
  """Pick the character that should follow `text`.

  Args:
    text_model: object whose `predict([text])` returns next-character
      probabilities for every position of the input; only the last position
      of the first (and only) batch entry is used.
    text: the prompt string.
    temperature: divisor applied to the log-probabilities before sampling.
      1 samples from the predicted probabilities as they are, values between
      0 and 1 sharpen the distribution towards the most likely character, and
      values above 1 flatten it. 0 deterministically returns the most likely
      character.

  Returns:
    The chosen character, looked up in the vocabulary of the module-level
    `text_vec_layer`.
  """
  y_proba = text_model.predict([text])[0, -1:]

  if temperature == 0:
    # Dividing by zero would turn the logits into -inf/NaN. The limit of the
    # sampling distribution as the temperature goes to 0 is the most likely
    # character, so pick it directly.
    char_id = tf.argmax(y_proba, axis=-1)[0]
  else:
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]

  # Add 2 to skip the first two vocabulary entries, which the model's ids do not cover
  return text_vec_layer.get_vocabulary()[char_id + 2]

def extend_text(text_model, text, n_chars=50, temperature=1):
  """Append `n_chars` generated characters to `text`.

  Each new character is obtained from `next_char`, which is given everything
  generated so far, and is appended before the next one is requested.

  Args:
    text_model: model forwarded to `next_char`.
    text: the seed text to extend.
    n_chars: number of characters to generate.
    temperature: sampling temperature forwarded to `next_char`.

  Returns:
    The seed text followed by the generated characters.
  """
  generated = text
  for _step in range(n_chars):
    generated = generated + next_char(text_model, generated, temperature)

  return generated

# A temperature of 0.01 makes sampling pick the most likely character almost every time
print(extend_text(
  shakespeare_model,
  "To be or not to be",
  n_chars=100,
  temperature=0.01,
))


To be or not to be the death.

duke vincentio:
i have a signior man of the death,
and that i shall be so long and so l
