In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
# download texts from Shakespeare
shakespeare_url = "https://homl.info/shakespeare"
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

Downloading data from https://homl.info/shakespeare


In [48]:
# encode every character to an integer (by default converts everything to lower case)
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)  # character level instead of word-leve encoding
tokenizer.fit_on_texts(shakespeare_text)

In [49]:
# convert word to the integer tokens of its characters
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [50]:
# convert a sequence of characters to a word
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [51]:
max_id = len(tokenizer.word_index)  # number of distinct characters
max_id

39

In [56]:
dataset_size = tokenizer.document_count  # total number of characters
dataset_size


1115394

In [53]:
# encode all the characters in the text
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1  # -1 for the character indices to start in 0

In [54]:
encoded.shape

(1115394,)

In [57]:
# use the first 90% of the string as training set
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])  # equivalent to a single time series with 1M characters
len(list(dataset))

1003854

In [58]:
# split the continue time series of characters into windows, each shifted one character (i.e. overlapping windows)
n_steps = 100
window_length = n_steps + 1  # target = input shifted 1 characted ahead
dataset = dataset.window(window_length, 
                         shift=1, 
                         drop_remainder=True)  # discard the last window if the size is now window_length

In [59]:
# Dataset of Datasets
dataset

<WindowDataset element_spec=DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([]))>

In [60]:
# flatten the dataset so that every element has a tensor of size window_length
# now the dataset contains consecutive windows of 101 characters each
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [65]:
# shuffle the dataset and split the inputs from the targets.
# Note that we will eventually make an RNN and we want the targets to be sequences
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [68]:
# one hot vector encoding
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [69]:
# prefetch
dataset = dataset.prefetch(1)

In [70]:
# RNN model
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# history = model.fit(dataset, epochs=20)



In [71]:
# preprocessing function
def preprocess(texts):
    X = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(X, max_id)

In [None]:
# predict the next letter in some text
X_new = preprocess(["How are yo"])
Y_pred = model.predict_classes(X_new)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

In [72]:
# instead of making a direct prediction, we compute the probabilities of encountering each character next and then we get a random character based on that class probabilities
# if we just took the most likely character, the mode would repeat the same word over and over
def next_char(text, temperature=1):
    X_new = preprocess([text])
    y_proba = model.predict(X_new)[0, -1:, :]  # probabilities for each character
    rescaled_logits = tf.math.log(y_proba) / temperature  # convert probability to logits (scaled by a factor)
    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1  # sample a random character based on the respective classes logits
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [73]:
def complete_text(text, n_chars=50, temperature=1):
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

# Stateful RNN

In [74]:
# make sequential non-overlapping input sequences
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)  # make sequential windows
dataset = dataset.flat_map(lambda window: window.batch(window_length))  # Flatten the whoe dataset and make sequences of length window_length
dataset = dataset.batch(1)  # make each minibatch have only one window
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))  # split the window into inputs and outputs
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))  # encode the inputs
dataset = dataset.prefetch(1)  # prefetch


In [76]:
# stateful RNN
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, recurrent_dropout=0.2, batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation="softmax"))
])



In [77]:
# reset the state with a callback
class ResetStatesCallback(keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs):
        self.model.reset_states()
        

In [78]:
# compile and train
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# model.fit(dataset, epochs=50, callbacks=[ResetStatesCallback()])

# Sentiment Analysis

In [79]:
# load IMDB dataset with movie reviews already hot encoded
# the values 1, 2, and 3 are special values for padding, start of sequence, and unknown words
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
X_train[0][:10]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [80]:
# decode a review
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token

" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


'<sos> this film was just brilliant casting location scenery story'

# Raw Preprocessing

In [2]:
# if we had to do the preprocessing from scratch, first download the data
import tensorflow_datasets as tfds
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-02-28 16:00:08.500763: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-02-28 16:00:08.501235: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)  # truncates reviews at 300 characters
    X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")  # replace "<br/>" tags with spaces
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z]", b" ")  # replace any characters other than letters and quotes with spaces
    X_batch = tf.strings.split(X_batch)   # split into words
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch 

In [4]:
# make a vocabulary
from collections import Counter
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

2023-02-28 16:00:16.422011: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [5]:
# most common words
vocabulary.most_common()[:3]

[(b'<pad>', 224494), (b'the', 61156), (b'a', 38569)]

In [7]:
# truncate vocabulary
vocab_size = 10000
truncated_vocab = [word for word, count in vocabulary.most_common()[:vocab_size]]

In [8]:
# encode the words into integers with lookup tables
words = tf.constant(truncated_vocab)
word_ids = tf.range(len(truncated_vocab), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets=num_oov_buckets)

In [11]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   24,    12,    13, 10053]])>

In [12]:
# encode the words with the table defined above
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

train_set = datasets["train"].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [13]:
# create model to decide if a review is positive or negative and train it
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5)

Epoch 1/5


2023-02-28 16:07:42.808968: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-28 16:07:43.243785: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-28 16:07:43.865250: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-28 16:07:44.397582: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-02-28 16:07:45.514726: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Pretrained Embeddings

In [None]:
# load the nnlm-en-dim40 embedding and use it in a model
# import tensorflow_hub as hub

# model = keras.Sequential([
#     hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1", dtype=tf.string, input_shape=[], output_shape=[50]),
#     keras.layers.Dense(128, activation="relu"),
#     keras.layers.Dense(1, activation="sigmoid")
# ])
# model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])