<a href="https://colab.research.google.com/github/adammoss/MLiS2/blob/master/examples/llm/transformer_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

As an example application of a transformer, let's build a GPT-like language model that predicts the probability of a sentence of $\tau$ tokens,

$
P \left( \boldsymbol{x}^{(1)}, \boldsymbol{x}^{(2)}, \ldots, \boldsymbol{x}^{(\tau)}    \right) = \prod_{t  = 1}^{\tau} P \left( \boldsymbol{x}^{(t)} |  \boldsymbol{x}^{(1)}, \ldots, \boldsymbol{x}^{(t-1)}    \right)
$

where $\boldsymbol{x}^{(t)}$ is a vector representing a token.

The script uses the 'tiny_shakespeare' dataset, but it's designed to work with other text sources as well. It showcases how to prepare datasets for training and testing, configure model parameters like batch size and context size, and fine-tune a pre-trained GPT-2 model for text generation.

In [1]:
!pip install tiktoken
!pip install keras_nlp
!pip install tensorflow_text

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m7.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0
Collecting keras_nlp
  Downloading keras_nlp-0.8.1-py3-none-any.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.2/465.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core (from keras_nlp)
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Collecting t

In [2]:
import itertools
import operator
import numpy as np
import sys
from datetime import datetime
import os
import requests
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import time

In [3]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import tensorflow_datasets as tfds

import tiktoken

import keras_nlp
import keras

import tensorflow_text as tf_text

TensorFlow version: 2.15.0
Using TensorFlow backend


In [4]:
# Number of text sequences per training/evaluation batch.
batch_size = 64
# Context length in tokens: the token stream is cut into windows of
# context_size + 1 tokens and the preprocessor's sequence length is set to
# context_size.
context_size = 256
# Text sources to read. 'tiny_shakespeare' is loaded through TFDS; any other
# entry that is not 'scientific_papers/arxiv' is treated as a local text file.
# Alternative sources:
#docs = ['_chat.txt']
docs = ['tiny_shakespeare']
#docs = ['scientific_papers/arxiv']

In [5]:
def _strip_chat_prefix(line):
  """Return `line` without its leading '...]' prefix, or unchanged if it has none.

  Only the first ']' is treated as the end of the prefix, so a ']' that
  appears later in the line no longer truncates the text.
  """
  _, bracket, rest = line.partition(']')
  return rest if bracket else line


# Accumulate the raw training and test text over every configured source.
train_text = ''
test_text = ''
for doc in docs:
  if doc == 'tiny_shakespeare':
    # Load the dataset once; only the first example of each split is read.
    splits = tfds.load(name=doc)
    train_text += next(iter(splits['train']))['text'].numpy().decode("utf-8")
    test_text += next(iter(splits['test']))['text'].numpy().decode("utf-8")
  elif doc == 'scientific_papers/arxiv':
    # NOTE(review): the dataset is loaded but nothing is appended to
    # train_text/test_text, so this source currently contributes no text.
    d = tfds.load(name=doc)
  else:
    # Any other entry is a local text file; ask for an upload if it is missing.
    if not os.path.isfile(doc):
      from google.colab import files
      uploaded = files.upload()
    with open(doc, 'r', encoding='utf-8') as f:
      # Skip lines mentioning 'omitted' and drop each line's '[...]' prefix.
      sentences = [
          _strip_chat_prefix(line) for line in f if 'omitted' not in line
      ]
    text = ''.join(sentences)
    # First 80% of the characters go to training, the remainder to testing.
    split_at = int(0.8 * len(text))
    train_text += text[:split_at]
    test_text += text[split_at:]

Downloading and preparing dataset Unknown size (download: Unknown size, generated: 1.06 MiB, total: 1.06 MiB) to /root/tensorflow_datasets/tiny_shakespeare/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incomplete4OAX0S/tiny_shakespeare-train.tfrecord*..…

Generating validation examples...:   0%|          | 0/1 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incomplete4OAX0S/tiny_shakespeare-validation.tfreco…

Generating test examples...:   0%|          | 0/1 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/tiny_shakespeare/1.0.0.incomplete4OAX0S/tiny_shakespeare-test.tfrecord*...…

Dataset tiny_shakespeare downloaded and prepared to /root/tensorflow_datasets/tiny_shakespeare/1.0.0. Subsequent calls will reuse this data.


In [6]:
# Sanity check: show the first 200 characters of the training text.
print(train_text[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [7]:
# GPT-2 tokenizer from tiktoken, plus the size of its vocabulary.
enc = tiktoken.encoding_for_model('gpt2')
vocab_size = enc.n_vocab


def ids_from_chars(chars):
  """Encode a string into a list of token ids with the GPT-2 tokenizer."""
  return enc.encode_ordinary(chars)


def text_from_ids(ids):
  """Decode a sequence of GPT-2 token ids back into a string."""
  return enc.decode(ids)


# Tokenize both splits through the helper defined above.
train_tokens = ids_from_chars(train_text)
test_tokens = ids_from_chars(test_text)

print(f'Length of train text: {len(train_tokens)} tokens')
print(f'Length of test text: {len(test_tokens)} tokens')

# One dataset element per token id.
ids_train_ds, ids_test_ds = (
    tf.data.Dataset.from_tensor_slices(token_ids)
    for token_ids in (train_tokens, test_tokens)
)

Length of train text: 301966 tokens
Length of test text: 17995 tokens


In [8]:
# Cut each token stream into windows of context_size + 1 tokens; an incomplete
# final window is dropped.
train_sequences = ids_train_ds.batch(context_size + 1, drop_remainder=True)
test_sequences = ids_test_ds.batch(context_size + 1, drop_remainder=True)


# Fine-tuning takes strings as input, and every input should contain the same
# number of tokens so that losses are comparable, so each fixed-length token
# window is decoded back to text. Running Python inside the tf.data pipeline
# is slow.
@tf.py_function(Tout=tf.string)
def join_input(sequence):
  """Decode one window of token ids into a string.

  Args:
    sequence: tensor of GPT-2 token ids for a single window.

  Returns:
    The decoded text (wrapped as a tf.string tensor by tf.py_function).
  """
  # Convert to a plain list of Python ints in one step rather than letting the
  # tokenizer iterate over the tensor element by element.
  return enc.decode(sequence.numpy().tolist())

In [9]:
BUFFER_SIZE = 10000

# Decode every token window to text, then cache the decoded strings so the
# slow Python decode runs once per window instead of once per epoch. The cache
# sits before shuffle(), so the training order is still reshuffled each epoch.
train_ds = (
    train_sequences
    .map(join_input)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

# The test split is not shuffled and keeps its final partial batch.
test_ds = (
    test_sequences
    .map(join_input)
    .cache()
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE))

In [10]:
# Pull one batch from the training pipeline and show its first sequence
# (printed as raw bytes).
x = next(iter(train_ds))
first_train_sequence = x.numpy()[0]
print(first_train_sequence)

b" paper: having read it,\nBid them repair to the market place; where I,\nEven in theirs and in the commons' ears,\nWill vouch the truth of it. Him I accuse\nThe city ports by this hath enter'd and\nIntends to appear before the people, hoping\nTo purge herself with words: dispatch.\nMost welcome!\n\nFirst Conspirator:\nHow is it with our general?\n\nAUFIDIUS:\nEven so\nAs with a man by his own alms empoison'd,\nAnd with his charity slain.\n\nSecond Conspirator:\nMost noble sir,\nIf you do hold the same intent wherein\nYou wish'd us parties, we'll deliver you\nOf your great danger.\n\nAUFIDIUS:\nSir, I cannot tell:\nWe must proceed as we do find the people.\n\nThird Conspirator:\nThe people will remain uncertain whilst\n'Twixt you there's difference; but the fall of either\nMakes the survivor heir of all.\n\nAUFIDIUS:\nI know it;\nAnd my pretext to strike at him admits\nA good construction. I raised him, and I"


In [11]:
# Pull one batch from the test pipeline and show its first sequence
# (printed as raw bytes).
x = next(iter(test_ds))
first_test_sequence = x.numpy()[0]
print(first_test_sequence)



In [12]:
# To speed up training and generation, we do not use the full GPT-2 context
# length of 1024; the preprocessor's sequence length is set to context_size.
gpt2_preset = "gpt2_base_en"
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    gpt2_preset, sequence_length=context_size)
# Pre-trained GPT-2 causal language model using the preprocessor built above.
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    gpt2_preset, preprocessor=preprocessor)

Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/tokenizer.json...
100%|██████████| 448/448 [00:00<00:00, 536kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/assets/tokenizer/merges.txt...
100%|██████████| 446k/446k [00:01<00:00, 439kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/assets/tokenizer/vocabulary.json...
100%|██████████| 0.99M/0.99M [00:01<00:00, 772kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/config.json...
100%|██████████| 484/484 [00:00<00:00, 646kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/model.weights.h5...
100%|██████████| 475M/475M [00:30<00:00, 16.1MB/s]
  return id(getattr(self, attr)) not in self._functional_layer_ids
  return id(getattr(self, attr)) not in self._functional_layer_ids


In [13]:
# Print the Keras summary of the loaded GPT-2 causal language model.
gpt2_lm.summary()

In [14]:
# Baseline before fine-tuning: time one generation from the pre-trained model.
baseline_prompt = "That Italian restaurant is"
start = time.time()

output = gpt2_lm.generate(baseline_prompt, max_length=200)
print("\nGPT-2 output:")
print(output)

end = time.time()
elapsed = end - start
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


GPT-2 output:
That Italian restaurant is now open in the city.

A new location at the historic Old Town Market on the west side of Market Street is opening next week. It will include a restaurant and restaurant lounge. The restaurant will also include a rooftop patio and bar.
TOTAL TIME ELAPSED: 19.65s


In [15]:
num_epochs = 10

# Batches per epoch as a plain Python int. tf.data reports unknown or infinite
# cardinality as a negative sentinel, which would otherwise silently yield a
# negative decay_steps, so fail loudly instead.
steps_per_epoch = int(train_ds.cardinality().numpy())
if steps_per_epoch < 0:
  raise ValueError(
      "train_ds has unknown or infinite cardinality; "
      "cannot derive decay_steps for the learning-rate schedule."
  )

# Linearly decaying learning rate: from 5e-5 down to 0 over the whole run.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-5,
    decay_steps=steps_per_epoch * num_epochs,
    end_learning_rate=0.0,
)
# The loss is configured for logits (from_logits=True).
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

# Fine-tune on the training batches, validating on the test batches each epoch.
gpt2_lm.fit(train_ds, epochs=num_epochs, validation_data=test_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c46cc58e2f0>

In [44]:
start = time.time()

# Seed text; each loop iteration appends newly generated text to it.
conversation = "A"

for _ in range(1):
  # Prompt with at most the last 30 whitespace-separated words so far.
  # (Named `prompt` so the built-in input() is not shadowed.)
  prompt = ' '.join(conversation.split()[-30:])
  output = gpt2_lm.generate(prompt, max_length=200)
  # The generated string starts with the prompt; keep only the continuation.
  conversation += output[len(prompt):]

end = time.time()
print(f"TOTAL TIME ELAPSED: {end - start:.2f}s")

TOTAL TIME ELAPSED: 0.53s


In [45]:
# Show the seed text together with everything generated after it.
print(conversation)

A man who, like a king, is the most gracious king of all.

KING RICHARD II:
I have heard of this man: he hath made himself a king;
His name is Richard II; his father was Henry,
And, as he was, his mother was Margaret.

CLARENCE:
I do, and I am a prince,
And I am a gentleman. But, as I have said,
He is a king: he is the king of England,
And I have sworn allegiance
With him; but he is a man that is not
To be king, nor a man that is to be king.

KING RICHARD II:
And so I do swear allegiance with him,
And, as I have told, he is the king of Scotland.

GLOUCESTER:
He is a man, and I have sworn allegiance with him.

KING RICHARD
