<a href="https://colab.research.google.com/github/bitblayde/Machine-and-Deep-learning-projects/blob/main/NLP/generate_shakespearean_text/statefulRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I used Aurélien Géron's book *Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow* as a reference for this notebook.

In [4]:
import numpy as np
import os
import sklearn
import tensorflow as tf
from tensorflow import keras

# Source of the training corpus (tiny-shakespeare from the char-rnn repository).
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# get_file downloads the URL and returns the path of the local copy.
file = keras.utils.get_file("shakespeare.txt", url)

# Decode explicitly as UTF-8 so the loaded text does not depend on the
# platform's default encoding.
with open(file, encoding="utf-8") as f:
  text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [5]:
# Character-level tokenizer: each distinct character becomes one vocabulary entry.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# The raw string (not a list containing it) is passed here. The dataset cell
# later sizes its training split from `tokenizer.document_count`, so keep this
# call form in sync with that cell if it is ever changed.
tokenizer.fit_on_texts(text)

In [6]:
# Sanity check: encode a sample word into its sequence of character IDs.
tokenizer.texts_to_sequences(["Hello"])

[[7, 2, 12, 12, 4]]

In [7]:
# Sanity check: decode the IDs back to text. Only one sequence is passed in,
# so the single decoded string is taken directly from the returned list.
tokenizer.sequences_to_texts([[7, 2, 12, 12, 4]])[0]

'h e l l o'

In [8]:
# Vocabulary size: number of distinct characters recorded while fitting the tokenizer.
n_characters = len(tokenizer.word_counts)
# Show the vocabulary size together with the per-character occurrence counts.
n_characters, tokenizer.word_counts

(39,
 OrderedDict([('f', 17567),
              ('i', 57369),
              ('r', 53758),
              ('s', 54219),
              ('t', 74024),
              (' ', 169892),
              ('c', 19443),
              ('z', 554),
              ('e', 100652),
              ('n', 53608),
              (':', 10316),
              ('\n', 40000),
              ('b', 14082),
              ('o', 71279),
              ('w', 21115),
              ('p', 12449),
              ('d', 33447),
              ('a', 63326),
              ('y', 22166),
              ('u', 29897),
              ('h', 54378),
              (',', 19846),
              ('m', 25083),
              ('k', 8672),
              ('.', 7885),
              ('l', 37215),
              ('v', 8591),
              ('?', 2462),
              ("'", 6187),
              ('g', 15755),
              (';', 3628),
              ('!', 2172),
              ('j', 948),
              ('-', 1897),
              ('q', 840),
              ('x', 641),


In [23]:
# Encode the whole corpus with a single tokenizer call. Wrapping the text in a
# list makes the tokenizer process it as one document and return one flat list
# of character IDs, instead of treating every character as its own text and
# producing an (N, 1) nested list that then has to be flattened.
text_encoded = np.array(tokenizer.texts_to_sequences([text])[0])

In [24]:
# Shift the tokenizer's IDs down by one so they start at 0, as tf.one_hot
# expects. The array is rebuilt from the tokenizer rather than decremented in
# place: an in-place `-= 1` shifts the IDs again every time this cell is re-run.
text_encoded = np.array(tokenizer.texts_to_sequences([text])[0]) - 1

In [31]:
# --- Hyperparameters ---------------------------------------------------------
window_size = 101    # each window is later split into 100 inputs and 100 shifted targets
buffer_size = 20000  # not referenced by the other cells visible in this notebook
batch_size = 64


def _window_to_tensor(window):
  """Turn one window (a nested dataset) into a single tensor of window_size IDs."""
  return window.batch(window_size)


def _stack_rows(*rows):
  """Stack one window from each sub-dataset into a (batch_size, window_size) tensor."""
  return tf.stack(rows)


def _split_inputs_targets(windows):
  """Inputs are all but the last character; targets are the same windows shifted by one."""
  return windows[:, :-1], windows[:, 1:]


def _one_hot_inputs(inputs, targets):
  """One-hot encode the inputs only; targets stay as integer IDs."""
  return tf.one_hot(inputs, n_characters), targets


# --- Train split -------------------------------------------------------------
# Use the first 80% of the encoded text and cut it into batch_size contiguous
# pieces, one per row of the batch.
train_size = 80*tokenizer.document_count // 100
split_dataset = np.array_split(text_encoded[:train_size], batch_size)

# --- One windowed dataset per piece ------------------------------------------
# shift = window_size - 1 makes consecutive windows overlap by exactly one
# character, so the inputs of window k+1 begin where the inputs of window k end.
full_dataset = []
for current_dataset in split_dataset:
  dataset_window = (
      tf.data.Dataset.from_tensor_slices(current_dataset)
      .window(window_size, shift=window_size-1, drop_remainder=True)
      .flat_map(_window_to_tensor)
  )
  full_dataset.append(dataset_window)

# --- Assemble batches --------------------------------------------------------
# Zipping the per-piece datasets means row i of every batch continues row i of
# the previous batch, which is what a stateful RNN needs.
dataset = (
    tf.data.Dataset.zip(tuple(full_dataset))
    .map(_stack_rows)
    .map(_split_inputs_targets)
    .map(_one_hot_inputs)
    .prefetch(tf.data.AUTOTUNE)
)

In [37]:
class ResetStates(keras.callbacks.Callback):
  """Keras callback that resets the model's recurrent states when an epoch begins.

  Each epoch restarts from the beginning of the text, so the states left over
  from the end of the previous epoch must not be carried into it.
  """

  def on_epoch_begin(self, epoch, logs=None):
    """Reset the states of the attached model.

    Args:
      epoch: Index of the epoch that is starting (unused).
      logs: Optional metrics dict supplied by Keras (unused). It defaults to
        None to match the signature of `keras.callbacks.Callback.on_epoch_begin`.
    """
    self.model.reset_states()

In [39]:
epochs = 8

# Stateful character model: two stacked GRU layers followed by a per-time-step
# softmax over the vocabulary. The first layer fixes the batch dimension to
# batch_size through batch_input_shape.
model = keras.Sequential()
model.add(keras.layers.GRU(
    128,
    stateful=True,
    return_sequences=True,
    dropout=0.2,
    batch_input_shape=[batch_size, None, n_characters],
))
model.add(keras.layers.GRU(
    256,
    stateful=True,
    return_sequences=True,
    dropout=0.2,
))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(n_characters)))
model.add(keras.layers.Activation("softmax"))

# Targets are integer character IDs, hence the sparse cross-entropy loss.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# ResetStates clears the recurrent states at the start of every epoch.
model.fit(dataset, callbacks=[ResetStates()], epochs=epochs)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7ff34cd8c650>

The stateful model only accepts input batches whose size equals the batch size it was built with. To avoid this limitation on the input, we can create a stateless network with the same architecture and copy the trained weights into it.

In [43]:
# Same architecture as the trained model but without stateful=True, and with
# the batch dimension left unspecified (None).
stateless_model = keras.Sequential()
stateless_model.add(keras.layers.GRU(
    128,
    return_sequences=True,
    dropout=0.2,
    batch_input_shape=[None, None, n_characters],
))
stateless_model.add(keras.layers.GRU(256, return_sequences=True, dropout=0.2))
stateless_model.add(keras.layers.TimeDistributed(keras.layers.Dense(n_characters)))
stateless_model.add(keras.layers.Activation("softmax"))

# Build the model, then copy the weights from the trained stateful model.
stateless_model.build([None, None, n_characters])
stateless_model.set_weights(model.get_weights())

In [49]:
def preprocess(input):
  """One-hot encode a list of strings for the character model.

  Args:
    input: List of strings to encode with the notebook's fitted `tokenizer`.

  Returns:
    The one-hot encoding (depth `n_characters`) of the tokenizer IDs shifted
    down by one.
  """
  char_ids = np.array(tokenizer.texts_to_sequences(input))
  zero_based_ids = char_ids - 1
  return tf.one_hot(zero_based_ids, n_characters)

def next_character(model, input, t=1):
  """Sample the character that follows `input` according to `model`.

  Args:
    model: Model whose `predict` output holds per-step character probabilities.
    input: Seed string.
    t: Temperature; the log-probabilities are divided by it before sampling.

  Returns:
    The sampled character, decoded with the notebook's `tokenizer`.
  """
  encoded = preprocess([input])
  probabilities = model.predict(encoded)
  # Keep only the last time step of the single sequence; the -1: slice keeps
  # the leading axis that tf.random.categorical expects.
  last_step = probabilities[0, -1:, :]
  scaled_logits = tf.math.log(last_step) / t
  # +1 maps the sampled 0-based class back to the tokenizer's ID.
  sampled_id = tf.random.categorical(scaled_logits, num_samples=1) + 1
  decoded = tokenizer.sequences_to_texts(sampled_id.numpy())
  return decoded[0]

# Quick check: extend a short seed by two sampled characters, feeding each
# newly sampled character back in as part of the next seed.
string = "My na"
string = string + next_character(stateless_model, string)
string = string + next_character(stateless_model, string)
print(string)

My name


In [50]:
def generate_text(model, n_characters = 100, text = "a", t = 1):
  """Extend `text` one sampled character at a time.

  Args:
    model: Model passed through to `next_character`.
    n_characters: Number of characters to append. Inside this function the
      name shadows the notebook-level vocabulary size.
    text: Seed text to extend.
    t: Sampling temperature passed through to `next_character`.

  Returns:
    The seed text followed by the generated characters.
  """
  for _step in range(n_characters):
    # Each new character is sampled from the full text generated so far.
    text = text + next_character(model, text, t=t)
  return text


# Generate 100 characters from the seed "A" at temperature 1 with the stateless model.
print(generate_text(stateless_model, text = "A", n_characters = 100, t = 1))

Am less,'ed's batier.
for that pas ne's, are spyerm'd that eeem arm'd out stay
do masting un his whoc
