Goal: Build a character-level language model.

References:
- http://karpathy.github.io/2015/05/21/rnn-effectiveness/
  - https://github.com/karpathy/char-rnn
  - https://cs.stanford.edu/people/karpathy/char-rnn/
  - https://gist.github.com/karpathy/587454dc0146a6ae21fc
- https://www.tensorflow.org/text/tutorials/text_generation

In [None]:
import tensorflow as tf
import numpy as np
ks = tf.keras
print("TensorFlow version:", tf.__version__)

import urllib
import math

# Get the data

We use Karpathy's Shakespeare dataset by default; the Linux source and Tolstoy datasets can be selected instead.

In [None]:
# Karpathy's datasets used in his blog post,
# http://karpathy.github.io/2015/05/21/rnn-effectiveness/,
# and listed here: https://cs.stanford.edu/people/karpathy/char-rnn/.

# `import urllib` alone does not load the `urllib.request` submodule, so import
# it explicitly rather than relying on another package having loaded it.
import urllib.request

TEXT_URL = {
    'shakespeare': 'https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt',
    'linux': 'https://cs.stanford.edu/people/karpathy/char-rnn/linux_input.txt',
    'tolstoy': 'https://cs.stanford.edu/people/karpathy/char-rnn/warpeace_input.txt',
}['shakespeare']  # Select a dataset

# Download the whole corpus into memory; `read()` returns the raw bytes
with urllib.request.urlopen(TEXT_URL) as f:
  text = f.read()

# `text` is a bytes object, so this is the number of bytes in the corpus
print(f'Length of text: {len(text)} characters')

In [None]:
# Note that the text is stored as a byte string (`bytes`): it was read straight
# from the HTTP response without being decoded
print(type(text))

In [None]:
# Look at a sample of the data: decode a 500-byte slice into a string for display
print(text[2100:2600].decode("utf-8"))

In [None]:
# This will be a character-level model. Serious language models use word pieces
# (https://paperswithcode.com/method/wordpiece).

# View the downloaded bytes as a numpy array of byte values (ASCII codes)
raw_seq = np.frombuffer(text, dtype=np.uint8)

# Token ID -> ASCII code: the sorted distinct byte values found in the corpus.
# np.unique returns them sorted and keeps the uint8 dtype.
token_to_ascii = np.unique(raw_seq)
VOCAB_SIZE = len(token_to_ascii)

# ASCII code -> token ID lookup table; -1 marks byte values absent from the corpus.
# Filled in one vectorised assignment (no loop variable shadowing the `ascii` builtin).
ascii_to_token = np.full(256, -1, np.int_)
ascii_to_token[token_to_ascii] = np.arange(VOCAB_SIZE)

# Convert the ASCII array to a token ID array
token_seq = ascii_to_token[raw_seq]

print('vocab size:',VOCAB_SIZE)
print('seq:', token_seq[:20])
print(token_seq.shape)
print(token_seq.dtype)
print('\ntoken_to_char:', token_to_ascii)
# Sanity checks: every byte of the corpus must have been assigned a token ID
print('any invalid?', np.any(token_seq == -1))
print('min:', np.min(token_seq),'  max:', np.max(token_seq))

In [None]:
BATCH_SIZE = 32
CONTEXT_SIZE = 100  # truncated sequence length
PAD_CHAR = token_to_ascii[0]  # ASCII code of the padding character
# The sequence below holds token IDs, so pad with PAD_CHAR's token ID rather
# than with its ASCII code.
PAD_TOKEN = ascii_to_token[PAD_CHAR]
# Number of padding tokens needed to round the corpus up to a whole number of
# BATCH_SIZE*CONTEXT_SIZE blocks (0 when it already divides evenly).
PAD_LEN = -token_seq.size % (BATCH_SIZE*CONTEXT_SIZE)

# Split the padded corpus into BATCH_SIZE contiguous rows that are read in parallel
padding = np.full(PAD_LEN, PAD_TOKEN, dtype=token_seq.dtype)
parallel_seq = np.append(token_seq, padding).reshape(BATCH_SIZE, -1)

# pad with beginning of sequences from next row
full_batches = 2  # How many full batches end of each row should bleed into start of next row
# Rolling by -1 along axis 0 puts the start of row i+1 after the end of row i
# (the last row wraps around to the first). The extra +1 column provides the
# target for the final input position.
parallel_seq = np.concatenate((parallel_seq, np.roll(parallel_seq[:,:CONTEXT_SIZE*full_batches+1],-1,0)),1)
print('shape:', parallel_seq.shape)

# One column is reserved for the shifted-by-one targets, hence the -1
NUM_BATCHES = (parallel_seq.shape[1]-1) // CONTEXT_SIZE
print('num batches:', NUM_BATCHES)
print('assert',parallel_seq.size - NUM_BATCHES*BATCH_SIZE*CONTEXT_SIZE,'==',BATCH_SIZE)

In [None]:
def get_batch(batch_i, offset=0):
  """Return the `batch_i`-th window of CONTEXT_SIZE columns from `parallel_seq`.

  With offset == 0 this is a training batch; with offset == 1 the window is
  shifted one position to the right, which gives the training targets.
  """
  first_col = batch_i * CONTEXT_SIZE + offset
  last_col = first_col + CONTEXT_SIZE
  return parallel_seq[:, first_col:last_col]

# Get an example batch: print the token IDs of the first training batch...
print(get_batch(0))
print('')
# ...and of the last training batch
print(get_batch(NUM_BATCHES-1))

In [None]:
# Human-readable rendering of the first training batch: map token IDs back to
# ASCII codes, then decode each row into a string
first_batch_ascii = token_to_ascii[get_batch(0)]
[row.tobytes().decode('utf8') for row in first_batch_ascii]

In [None]:
# Training targets for the first batch: the same rows shifted one character ahead
first_targets_ascii = token_to_ascii[get_batch(0, offset=1)]
[row.tobytes().decode('utf8') for row in first_targets_ascii]

In [None]:
# Second to last training batch. Because each row was extended with the start of
# the next row, every line here equals the next line down in the first batch
wrapped_batch_ascii = token_to_ascii[get_batch(NUM_BATCHES-2)]
[row.tobytes().decode('utf8') for row in wrapped_batch_ascii]

# Define the model

In [None]:
# Define our model

# Recurrent cell class used for every layer of the model's recurrent stack.
# Change the key at the end to switch between a plain RNN, an LSTM or a GRU.
CELL_CLS = {
    'rnn': ks.layers.SimpleRNNCell,
    'lstm': ks.layers.LSTMCell,
    'gru': ks.layers.GRUCell,
}['lstm']

class Model(ks.Model):
  """Character-level language model.

  Pipeline: one-hot tokens -> dense embedding -> stack of recurrent cells
  (unrolled over time) -> feed-forward head that outputs one logit per
  vocabulary entry at every time step.
  """

  def __init__(self):
    super().__init__()
    self.input_embed = ks.layers.Dense(100)
    self.cells = [CELL_CLS(200), CELL_CLS(150), CELL_CLS(100)]
    self.output_stack = [ks.layers.Dense(50, activation='relu'), ks.layers.Dense(VOCAB_SIZE)]

  def call(self, x, s=None):
    """Run the model over a window of tokens.

    Args:
      x: integer token IDs, shape (batch_size, context_size). Both sizes can
        vary from run to run.
      s: recurrent state, a list with one entry per cell in `self.cells`, or
        None to start from each cell's initial state.

    Returns:
      A pair `(logits, state)`: logits of shape
      (batch_size, context_size, VOCAB_SIZE) and the recurrent state after the
      last time step, in the same list layout as `s`.
    """
    bs = tf.shape(x)[0]
    x = tf.one_hot(x, VOCAB_SIZE)  # shape == (batch_size, context_size, VOCAB_SIZE), where VOCAB_SIZE is a global constant

    if s is None:
      s = [cell.get_initial_state(batch_size=bs, dtype=tf.float32) for cell in self.cells]
    else:
      # Work on a shallow copy so the caller's list is not modified in place
      s = list(s)

    # Embed one-hot tokens
    e = self.input_embed(x)  # shape == (batch_size, context_size, 100)

    # Recurrent cell stack
    outputs = []
    for h in tf.unstack(e, axis=1):  # loop over time within context window
      # Each cell consumes the output of the cell below it and updates its own state
      for l, cell in enumerate(self.cells):
        h, s[l] = cell(h, s[l])
      outputs.append(h)  # output of the top cell at this time step

    # Feed forward stack
    h = tf.stack(outputs, axis=1)  # stack along the time axis
    for layer in self.output_stack:
      h = layer(h)
    return h, s

# Training loop

In [None]:
learning_rate = 1e-3

model = Model()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,  # predictions will be given as logits (log unnormalized probabilities) rather than probabilities
)

# Pass the learning rate explicitly so that changing `learning_rate` above
# actually takes effect.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Use GPU if available.
# https://www.tensorflow.org/guide/gpu
GPUs = tf.config.list_physical_devices('GPU')
device = '/GPU:0' if GPUs else '/CPU:0'
print('device =', device)

In [None]:
@tf.function
def train_step(batch, labels, state=None):
  """Run one optimisation step on a single batch.

  Args:
    batch: input token IDs for this step.
    labels: target token IDs, compared against the model's logits by `loss_object`.
    state: recurrent state passed through to the model; None lets the model
      start from its default state.

  Returns:
    A tuple `(loss, logits, state)` where `state` is the recurrent state the
    model returned for this batch.
  """
  with tf.GradientTape() as grad_tape:
    # training=True is only needed if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    predictions, next_state = model(batch, state, training=True)
    step_loss = loss_object(labels, predictions)

  # Read the variable list after the forward pass, as the original code does
  weights = model.trainable_variables
  grads = grad_tape.gradient(step_loss, weights)
  optimizer.apply_gradients(zip(grads, weights))

  return step_loss, predictions, next_state


@tf.function
def accuracy(logits, target, normalize=True):
  """Compare the highest-scoring class at each position with the target.

  Args:
    logits: class scores with the classes on the last axis.
    target: integer class IDs with the shape of `logits` minus its last axis.
    normalize: if True return the fraction of correct predictions, otherwise
      return the number of correct predictions.

  Returns:
    A scalar tensor: float32 fraction when `normalize` is True, else a count.
  """
  # The class scores are on the last axis; axis=1 would reduce over time instead
  argmaxs = tf.math.argmax(logits, axis=-1)
  # tf.math.equal requires both operands to have the same dtype
  corrects = tf.math.equal(argmaxs, tf.cast(target, argmaxs.dtype))
  if normalize:
    return tf.reduce_mean(tf.cast(corrects, tf.float32))
  else:
    return tf.math.count_nonzero(corrects)

In [None]:
num_epochs = 100
for epoch in range(num_epochs):
  # Performing truncated backprop through time (TBPTT).
  # States are carried over between batches, but gradients are not propagated beyond a batch.
  # At the end of each epoch the state is reset to its default (typically all zeros).
  state = None  # None tells the model to use the default state
  for batch_i in range(NUM_BATCHES):
    # Inputs and their targets (the same window shifted one position ahead)
    batch = get_batch(batch_i)
    labels = get_batch(batch_i, offset=1)
    # Run the training step on the configured device
    with tf.device(device):
      loss_, logits_, state = train_step(batch, labels, state)

    # Report progress every 100 batches
    if batch_i % 100 == 0:
      print('  Step: %d | Train Loss: %.4f | Train Accuracy: %.2f' % (batch_i, loss_.numpy(), accuracy(logits_, labels).numpy()))

  # Save model checkpoint
  model.save(f'./training_checkpoints/ckpt_{epoch}')

  print('')
  print('Finished epoch')
  print('')

## Load checkpoint

Reference: https://www.tensorflow.org/guide/keras/save_and_serialize

In [None]:
# List the saved checkpoints (IPython line magic: only works in a notebook/IPython session)
%ls training_checkpoints

In [None]:
# Restore the checkpoint written at the end of the first epoch (epoch index 0)
model_copy = ks.models.load_model('./training_checkpoints/ckpt_0')
# NOTE(review): compile() is called without a loss or optimizer — presumably only
# to mark the restored model as compiled; confirm this is intended.
model_copy.compile()

# Generate text

## Sample

In [None]:
GENERATE_LENGTH = 1000
PROMPT = """
ROMEO:"""

# Encode the prompt as token IDs
prompt = np.frombuffer(bytes(PROMPT, 'utf-8'), dtype=np.uint8)
prompt = ascii_to_token[prompt]
# Bytes that never occur in the training text map to -1, which is not a valid token
if np.any(prompt == -1):
  raise ValueError('PROMPT contains characters that are not in the vocabulary')

# `generated` is a list of (1, n) token arrays: the prompt first, then one
# sampled token per step
generated = [prompt[None,:]]
state = None
for _ in range(GENERATE_LENGTH):
  # Feed only the newest chunk; the recurrent state carries the earlier context
  logits, state = model(generated[-1], state)
  # Sample from the distribution predicted after the LAST input position.
  # This matters for the multi-character prompt; for single tokens it is the only position.
  next_token = tf.random.categorical(logits[:, -1], num_samples=1)
  generated.append(next_token.numpy())

In [None]:
# Join the prompt and all sampled tokens along the time axis and drop the batch axis
output = np.concatenate(generated, axis=1)[0]
# Map token IDs back to ASCII codes and decode them into text
decoded_text = token_to_ascii[output].tobytes().decode('utf8')
print(decoded_text)

## Argmax

In [None]:
# TODO: implement beam search