# Shakespeare Character Prediction (TensorFlow)

[![Open in Colab](https://lab.aef.me/files/assets/colab-badge.svg)](https://colab.research.google.com/github/adamelliotfields/lab/blob/main/files/tf/shakespeare.ipynb)
[![Open in Kaggle](https://lab.aef.me/files/assets/open-in-kaggle.svg)](https://kaggle.com/kernels/welcome?src=https://github.com/adamelliotfields/lab/blob/main/files/tf/shakespeare.ipynb)
[![Render nbviewer](https://lab.aef.me/files/assets/nbviewer_badge.svg)](https://nbviewer.org/github/adamelliotfields/lab/blob/main/files/tf/shakespeare.ipynb)
[![W&B](https://img.shields.io/badge/Weights_&_Biases-FFCC33?logo=WeightsAndBiases&logoColor=black)](https://wandb.ai/adamelliotfields/shakespeare)
[![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-md-dark.svg)](https://huggingface.co/adamelliotfields/shakespeare)

Tiny Shakespeare is a dataset created by Andrej Karpathy in his blog post, [The Unreasonable Effectiveness of Recurrent Neural Networks](https://karpathy.github.io/2015/05/21/rnn-effectiveness/).

This notebook trains an LSTM model on the dataset, logs the run to W&B, and pushes the trained model to Hugging Face.

In [None]:
import subprocess
import sys
from importlib.util import find_spec

# Install wandb only when it cannot be imported in the current environment.
if find_spec("wandb") is None:
    # Invoke pip through the running interpreter so the package is installed
    # into the environment this kernel actually uses; check=True makes a failed
    # install raise here instead of surfacing later as an ImportError.
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "wandb"], check=True)

In [None]:
import os

# Both variables are set here, before the cell that imports TensorFlow/Keras.
# NOTE(review): level "2" is expected to hide TensorFlow's INFO and WARNING
# native logs — confirm against the TensorFlow docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["KERAS_BACKEND"] = "tensorflow"

# Only the import decides whether we are on Colab; keeping the try body to that
# single line means an ImportError raised while reading secrets is not mistaken
# for "not running on Colab".
try:
    from google.colab import userdata
except ImportError:
    # Not on Colab: save next to the notebook and leave credentials and cache
    # locations to whatever the environment already provides.
    SAVE_DIR = ""
else:
    # On Colab: persist artifacts and caches under Google Drive
    # (assumes Drive is already mounted at /content/drive — TODO confirm)
    # and copy API credentials from Colab secrets into the environment.
    SAVE_DIR = "/content/drive/MyDrive/"
    os.environ["WANDB_DISABLE_GIT"] = "true"
    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
    os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")
    os.environ["TFDS_DATA_DIR"] = f"{SAVE_DIR}tensorflow_datasets"
    os.environ["HF_HUB_CACHE"] = f"{SAVE_DIR}huggingface/hub"

In [None]:
import keras
import wandb
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from huggingface_hub import from_pretrained_keras, push_to_hub_keras
from wandb.integration.keras import WandbMetricsLogger

In [None]:
# !nvidia-smi --query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu,utilization.memory --format=csv

In [None]:
# Training hyper-parameters shared by the cells below.
EPOCHS = 4  # number of epochs passed to model.fit
VERBOSE = 1  # verbosity passed to model.fit
DROPOUT = 0.5  # dropout rate recorded in the W&B run config
BATCH_SIZE = 1024  # batch size of the tf.data pipeline
SEQUENCE_LENGTH = 100  # characters per input window; the next character is the target
WEIGHT_DECAY = 0.001  # AdamW weight decay
LEARNING_RATE = 0.00025  # AdamW learning rate

# NOTE: same username and project on wandb and huggingface
WANDB_PROJECT = "shakespeare"
WANDB_ENTITY = "adamelliotfields"

# TODO: loading keras models is very slow; serialize however huggingface does
# File name (joined onto SAVE_DIR) the model is saved to and reloaded from.
MODEL_FILENAME = "lstm-shakespeare.model.keras"

In [None]:
def extract_text(dataset):
    """Return the ``"text"`` feature of the first example in ``dataset`` as a str.

    The dataset is mapped down to its ``"text"`` field, limited to a single
    element, and that element's bytes are decoded as UTF-8.
    """
    first_only = dataset.map(lambda example: example["text"]).take(1)
    raw_text = next(iter(first_only))
    return raw_text.numpy().decode("utf8")


# Load the three splits (returned in the order requested) plus dataset metadata.
shakespeare_splits, shakespeare_info = tfds.load(
    "tiny_shakespeare",
    split=["train", "test", "validation"],
    as_supervised=False,
    with_info=True,
)
shakespeare_train, shakespeare_test, shakespeare_validation = shakespeare_splits

# Stitch the splits back into one corpus: train, then validation, then test.
shakespeare_text = "".join(
    extract_text(split)
    for split in (shakespeare_train, shakespeare_validation, shakespeare_test)
)

# head 2
print("\n".join(shakespeare_text.split("\n", 2)[:2]))

In [None]:
# Speeches are separated by blank lines; flatten each one onto a single line.
shakespeare_speeches = [
    speech.replace("\n", " ") for speech in shakespeare_text.split("\n\n")
]
# Character count of every speech, used for the summary statistics below.
speech_lengths = list(map(len, shakespeare_speeches))

print(f"Min: {np.min(speech_lengths)}")  # 4
print(f"Max: {np.max(speech_lengths)}")  # 3080
print(f"Mean: {np.mean(speech_lengths):.2f}")  # 152.44
print(f"Median: {np.median(speech_lengths)}")  # 83
print(f"Total: {len(speech_lengths)}")  # 7222

In [None]:
def create_sequences(text, sequence_length=None):
    """Slice ``text`` into overlapping windows and their next characters.

    Window ``i`` is ``text[i : i + sequence_length]`` and its target is the
    single character that immediately follows it, so consecutive windows are
    shifted by one character.

    Args:
        text: Source string to slice.
        sequence_length: Window size in characters. Defaults to the
            module-level ``SEQUENCE_LENGTH`` when omitted.

    Returns:
        A ``(sequences, characters)`` tuple of equal-length lists. Both are
        empty when ``text`` is not longer than ``sequence_length``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length is None:
        # Resolved at call time so the notebook constant can be changed and
        # the cell re-run without redefining this function.
        sequence_length = SEQUENCE_LENGTH
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    # One window per start index that still leaves a following character.
    window_count = len(text) - sequence_length
    sequences = [text[i : i + sequence_length] for i in range(window_count)]
    characters = [text[i + sequence_length] for i in range(window_count)]
    return sequences, characters


# Build every input window of the corpus together with the character that follows it.
sequences, characters = create_sequences(shakespeare_text)

In [None]:
# Character-level tokenizer: no text standardization and split="character",
# with the vocabulary learned from the full corpus passed as a single string.
encoder = keras.layers.TextVectorization(standardize=None, split="character", name="encoder")
encoder.adapt(tf.constant([shakespeare_text]))

# TODO: ideally the encoder would be a layer in the model
# The vocabulary list maps token ids (list indices) back to tokens.
vocab = encoder.get_vocabulary()

In [None]:
# 30-40s
# Convert the windows (inputs) and next characters (targets) to integer ids
# with the fitted encoder.
X = encoder(sequences)  # 1115294
y = encoder(characters)  # 1115294

In [None]:
# Input pipeline: (window, target) pairs, shuffled through a 100000-element
# buffer, grouped into batches of BATCH_SIZE, and prefetched.
ds = (
    tf.data.Dataset.from_tensor_slices((X, y))
    .shuffle(100000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# @title Model
keras.backend.clear_session()

model_path = f"{SAVE_DIR}{MODEL_FILENAME}"

if os.path.exists(model_path):
    # A previously saved model exists: reload it instead of building a new one.
    model = keras.models.load_model(model_path)
    # model = from_pretrained_keras(f"{WANDB_ENTITY}/{WANDB_PROJECT}")
else:
    # Variable-length sequences of token ids -> embedding -> three stacked
    # LSTMs (only the last one collapses the time axis) -> one logit per
    # vocabulary entry.
    x_inputs = keras.Input(shape=(None,), dtype=tf.int64, name="input")
    x = keras.layers.Embedding(len(vocab), 512, mask_zero=True, name="embedding")(x_inputs)
    x = keras.layers.LSTM(512, return_sequences=True, name="lstm1")(x)
    # Dropout layers use the DROPOUT constant so the rate logged to W&B is the
    # rate the model is actually built with.
    x = keras.layers.Dropout(DROPOUT, name="dropout1")(x)
    x = keras.layers.LSTM(512, return_sequences=True, name="lstm2")(x)
    x = keras.layers.Dropout(DROPOUT, name="dropout2")(x)
    x = keras.layers.LSTM(512, name="lstm3")(x)
    x = keras.layers.Dropout(DROPOUT, name="dropout3")(x)
    # No activation here: the loss below is configured with from_logits=True.
    x = keras.layers.Dense(len(vocab), name="output")(x)

    model = keras.Model(x_inputs, outputs=x, name="LSTM-Shakespeare")
    model.compile(
        metrics=["accuracy"],
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.AdamW(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY),
    )
    model.summary()

In [None]:
# @title Train
# Hyper-parameters known up front go into the run config at init time; the
# remaining run details are added once the run is active.
initial_config = {
    "dropout": DROPOUT,
    "learning_rate": LEARNING_RATE,
    "weight_decay": WEIGHT_DECAY,
}
run_details = {
    "epochs": EPOCHS,
    "optimizer": "AdamW",
    "batch_size": BATCH_SIZE,
    "model": "LSTM-Shakespeare",
}

with wandb.init(
    project=WANDB_PROJECT,
    entity=WANDB_ENTITY,
    job_type="train",
    group="lstm",
    tags=["L4"],
    config=initial_config,
) as run:
    wandb.config.update(run_details)

    # Metrics are logged to W&B once per epoch.
    metrics_logger = WandbMetricsLogger(log_freq="epoch")

    # 5m on L4, 20m on T4
    model.fit(ds, epochs=EPOCHS, verbose=VERBOSE, callbacks=[metrics_logger])

    # Persist the trained model while the run is still open.
    model.save(f"{SAVE_DIR}{MODEL_FILENAME}")

In [None]:
# @title Huggingface
COMMIT_MESSAGE = "Initial commit"

# Upload the model (including optimizer state) to the Hub repository that
# shares its owner and name with the W&B entity and project.
push_to_hub_keras(
    model,
    f"{WANDB_ENTITY}/{WANDB_PROJECT}",
    include_optimizer=True,
    # HF_TOKEN is only exported by the Colab setup cell; .get() avoids a
    # KeyError elsewhere and passes None, which huggingface_hub documents as
    # "use the locally stored login token".
    token=os.environ.get("HF_TOKEN"),
    commit_message=COMMIT_MESSAGE,
)

In [None]:
# @title Predict
def generate_text(m, enc, txt, n, context_length=None):
    """Greedily extend ``txt`` by ``n`` predicted tokens.

    On every step the current text is encoded, the model scores the next
    token, and the highest-scoring vocabulary entry is appended.

    Args:
        m: Model whose ``predict`` returns one row of next-token scores for a
            single encoded input.
        enc: Fitted encoder; it is called on a one-element list of strings and
            supplies the id-to-token list via ``get_vocabulary()``.
        txt: Seed text to continue.
        n: Number of prediction steps to run.
        context_length: Optional maximum number of trailing characters fed to
            the model per step. ``None`` (the default) feeds the whole text
            generated so far, which is the original behaviour; passing the
            training window length keeps the per-step cost constant.

    Returns:
        The seed text followed by the generated tokens.

    Raises:
        ValueError: If ``context_length`` is given but smaller than 1.
    """
    if context_length is not None and context_length < 1:
        raise ValueError("context_length must be a positive integer or None")

    vocab = enc.get_vocabulary()
    generated_txt = txt
    for _ in range(n):
        if context_length is None:
            context = generated_txt
        else:
            # Only the most recent characters are shown to the model.
            context = generated_txt[-context_length:]
        encoded = enc([context])
        # not stateful so don't need to reset_states first
        scores = m.predict(encoded, verbose=0)
        # predict() already returns NumPy data, so take the arg-max with NumPy
        # instead of converting to a tensor and back.
        next_id = int(np.squeeze(np.argmax(scores, axis=-1)))
        generated_txt += vocab[next_id]
    return generated_txt


# Seed the generator with a single character and print the seed plus 100 predicted tokens.
sample = "M"
print(generate_text(model, encoder, sample, 100))