<a href="https://colab.research.google.com/github/Zilleplus/MachineLearning/blob/main/Shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

In [None]:
# Download the Shakespeare corpus (keras.utils.get_file caches it locally)
# and read the whole file into memory as one string.
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespear.text", shakespeare_url)
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding (which differs between e.g. Linux and Windows).
with open(filepath, encoding="utf-8") as f:
  shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [None]:
# Character-level tokenizer: char_level=True makes every distinct character
# (rather than every word) a token with its own integer id.
tokenizer = keras.preprocessing.text.Tokenizer(
    char_level=True,
)
# Build the character vocabulary from the corpus.
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
# Sanity check: encode a word to character ids, then decode those ids back.
# The decoded text comes back lower-cased and space-separated.
print(tokenizer.texts_to_sequences(["First"]))
print(tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]]))

[[20, 6, 9, 8, 3]]
['f i r s t']


In [None]:
# Encode the full corpus as one 1-D array of character ids.
# The Keras Tokenizer numbers tokens from 1 (0 is reserved), while the rest of
# this notebook one-hot encodes with depth=max_id and trains a Dense(max_id)
# softmax with sparse_categorical_crossentropy, both of which expect ids in
# [0, max_id). Subtract 1 so the ids are zero-based; otherwise the character
# with id == max_id becomes an all-zero input and an out-of-range label.
# NOTE: when decoding ids produced from `encoded` (or predicted by the model)
# back to text with tokenizer.sequences_to_texts, add 1 again first.
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [None]:
# A windowed dataset is a dataset whose elements are themselves datasets.
# Only the first window is inspected here.
for e in tf.data.Dataset.from_tensor_slices([1, 2, 3]).window(2).take(1):
  print("This is a dataset:")
  print(e)
  print("The dataset contains 2 elements:")
  for k in e:
    print(k)
print("----")
# flat_map turns each window dataset into a tensor: batch(2) yields a dataset
# holding a single batched element, and flat_map unravels it to that element.
flat_data_example = (
    tf.data.Dataset.from_tensor_slices([1, 2, 3])
    .window(2)
    .flat_map(lambda win: win.batch(2))
)
for e in flat_data_example:
  print(e)
print(type(flat_data_example))
# Notice the last tensor, [3]: it is shorter than the window length of 2.
# This is why drop_remainder is enabled when building the real dataset.

In [None]:
# Use the first 90% of the encoded text for training.
dataset_size = encoded.size
train_size = (dataset_size * 90) // 100

# Each window holds n_steps input characters plus one extra character,
# because the target sequence is the input shifted one character ahead.
n_steps = 100
window_length = n_steps + 1

# Slide a window over the text one character at a time, dropping the short
# windows at the end, and flatten each window dataset into a single tensor.
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .window(window_length, shift=1, drop_remainder=True)
    .flat_map(lambda win: win.batch(window_length))
)

In [None]:
# Show only the first window as a quick sanity check.
for d in dataset.take(1):
  print(d)

In [None]:
# Shuffle the windows, group them into batches, then split every batch into
# (inputs, targets). Axis 0 is the batch and axis 1 is the position in the
# series: inputs drop the last character, targets drop the first, so each
# target is the character that follows the corresponding input.
batch_size = 32
dataset = (
    dataset
    .shuffle(10000)
    .batch(batch_size)
    .map(lambda batch: (batch[:, :-1], batch[:, 1:]))
)

In [None]:
# One-hot encode the input characters; the targets stay as integer ids.
# max_id is the number of distinct characters the tokenizer has seen.
# NOTE(review): tf.one_hot maps ids outside [0, depth) to an all-zero vector,
# so this assumes the ids in the dataset are zero-based -- confirm upstream.
max_id = len(tokenizer.word_index)
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)

In [None]:
# Prefetch one batch so the next batch can be prepared while the current one
# is being consumed.
dataset = dataset.prefetch(1)

In [None]:
# Two stacked GRU layers followed by a per-time-step softmax over characters.
model = keras.models.Sequential()
# Inputs are one-hot vectors of width max_id; the sequence length is left
# unspecified (None). recurrent_dropout=0.2 is intentionally not enabled.
model.add(keras.layers.GRU(
    units=128,
    return_sequences=True,
    dropout=0.2,
    input_shape=[None, max_id],
))
# Second recurrent layer, again without recurrent_dropout=0.2.
model.add(keras.layers.GRU(
    units=128,
    return_sequences=True,
    dropout=0.2,
))
# The softmax turns each time step's output into a probability distribution
# over the max_id characters.
model.add(keras.layers.TimeDistributed(
    keras.layers.Dense(max_id, activation='softmax')
))

# Targets are integer character ids, hence the sparse cross-entropy loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
)
history = model.fit(dataset, epochs=20)

In [None]:
# Draw 10 random class ids from the distribution p(0)=0.7, p(1)=0.3.
# tf.random.categorical takes (unnormalized) log-probabilities, which is why
# the probabilities go through tf.math.log first.
tf.random.categorical(
    tf.math.log([[0.7, 0.3]]),
    num_samples=10,
)

<tf.Tensor: shape=(1, 10), dtype=int64, numpy=array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])>