In [22]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

In [2]:
# Fetch the Shakespeare corpus; get_file returns the path of the local copy.
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file(
    fname="shakespeare.txt",
    origin=shakespeare_url,
)

Downloading data from https://homl.info/shakespeare


In [3]:
# Read the whole corpus into memory as a single string. The encoding is
# stated explicitly so decoding does not depend on the platform's default
# locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [6]:
# Character-level tokenizer: every distinct character (rather than every
# word) is mapped to its own integer ID.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# The full corpus is passed as one document inside a one-element list.
tokenizer.fit_on_texts(texts=[shakespeare_text])

In [7]:
# Encode a sample word into character IDs (one list of IDs per input text).
# Expected result: [[20, 6, 9, 8, 3]]
tokenizer.texts_to_sequences(texts=["First"])

[[20, 6, 9, 8, 3]]

In [9]:
# Decode the IDs back into text; the characters come back lower-cased and
# separated by spaces. Expected result: ['f i r s t']
tokenizer.sequences_to_texts(sequences=[[20, 6, 9, 8, 3]])

['f i r s t']

In [11]:
# Number of distinct characters, i.e. the vocabulary size.
max_id = len(tokenizer.word_index)

# Total number of characters in the corpus.
# NOTE: `tokenizer.document_count` must not be used here. The tokenizer was
# fitted on a one-element list, so it has seen exactly one document and
# `document_count` is 1, which would make the 90% training split empty.
# Summing the per-character occurrence counts gives the real corpus length.
dataset_size = sum(tokenizer.word_counts.values())

In [13]:
# Encode the full corpus (passed as a single text, so the result has a single
# row), take that row, and subtract 1 from every character ID.
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1

In [14]:
# Inspect the encoded corpus: an array of character IDs after the -1 shift
# (NumPy abbreviates the repr of long arrays with "...").
encoded

array([19,  5,  8, ..., 20, 26, 10])

In [21]:
# Keep the first 90% of the characters for training (integer arithmetic,
# rounded down).
train_size = (90 * dataset_size) // 100

In [30]:
# Build a dataset that yields the training characters' IDs one at a time.
raw_dataset = tf.data.Dataset.from_tensor_slices(
    tensors=encoded[:train_size]
)

In [31]:
# Number of characters in each input sequence.
n_steps = 100
# Each window holds one extra character because the target sequence is the
# input sequence shifted one character ahead.
window_length = n_steps + 1

In [32]:
# Slide a window of `window_length` characters over the text, advancing one
# character at a time; drop_remainder=True discards the trailing windows that
# would be shorter than `window_length`.
windowed_dataset = raw_dataset.window(
    size=window_length,
    shift=1,
    drop_remainder=True,
)

In [33]:
# Each window is itself a small dataset. Batching it to its full length and
# flattening the result turns every window into a single tensor of
# `window_length` character IDs.
flattened_dataset = windowed_dataset.flat_map(
    lambda window_ds: window_ds.batch(window_length)
)

In [34]:
# Shuffle the windows, group them into batches, then split every window into
# an (input, target) pair: the input is all characters but the last, the
# target is all characters but the first (the input shifted one step ahead).
batch_size = 32
shuffled_dataset = (
    flattened_dataset
    .shuffle(buffer_size=10000)
    .batch(batch_size)
)
labeled_dataset = shuffled_dataset.map(
    lambda batch: (batch[:, :-1], batch[:, 1:])
)

In [37]:
# One-hot encode the input characters over `max_id` categories; the targets
# are passed through unchanged as integer IDs.
prepared_dataset = labeled_dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)

In [38]:
# Prefetch with a buffer of one element (here, one batch) on the final pipeline.
dataset = prepared_dataset.prefetch(buffer_size=1)