<a href="https://colab.research.google.com/github/adammoss/MLiS2/blob/master/workshops/workshop_llm/transformer_finetune_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

As an example application of a transformer, let's build a GPT like language model that predicts the probability of a sentence of $\tau$ tokens,

$
P \left( \boldsymbol{x}^{(1)}, \boldsymbol{x}^{(2)}, \ldots, \boldsymbol{x}^{(\tau)}    \right) = \prod_{t  = 1}^{\tau} P \left( \boldsymbol{x}^{(i)} |  \boldsymbol{x}^{(1)}, \ldots, \boldsymbol{x}^{(i-1)}    \right)
$

where $\boldsymbol{x}^{(t)}$ is a vector representing a token.

In this example we fine-tune GPT to act as a question-answering system.

In [1]:
!pip install tiktoken
!pip install keras_nlp
!pip install tensorflow_text

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.8 MB[0m [31m13.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0
Collecting keras_nlp
  Downloading keras_nlp-0.8.2-py3-none-any.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-core (from keras_nlp)
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
Collecting 

In [2]:
import itertools
import operator
import numpy as np
import sys
from datetime import datetime
import os
import requests
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import time

In [3]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import tensorflow_datasets as tfds

import tiktoken

import keras_nlp
import keras

import tensorflow_text as tf_text

TensorFlow version: 2.15.0
Using TensorFlow backend


In [4]:
batch_size = 16
context_size = 256

In [5]:
train_examples, val_examples = tfds.load(
    'web_questions',
    split=['train', 'test'])

Downloading and preparing dataset 1.21 MiB (download: 1.21 MiB, generated: 983.88 KiB, total: 2.17 MiB) to /root/tensorflow_datasets/web_questions/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/3778 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/web_questions/1.0.0.incompleteUH4GKM/web_questions-train.tfrecord*...:   0…

Generating test examples...:   0%|          | 0/2032 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/web_questions/1.0.0.incompleteUH4GKM/web_questions-test.tfrecord*...:   0%…

Dataset web_questions downloaded and prepared to /root/tensorflow_datasets/web_questions/1.0.0. Subsequent calls will reuse this data.


In [6]:
def deconstruct(x):
  return x['question'], x['answers'][0]

In [7]:
# Modify this to return QA pairs in a format suitable for fine-tuning the LLM

@tf.py_function(Tout=tf.string)
def join_input(x, y):
  return

In [8]:
train_ds = train_examples.map(deconstruct).map(join_input)
test_ds = val_examples.map(deconstruct).map(join_input)

train_ds = train_ds.take(10000)
test_ds = test_ds.take(10000)

BUFFER_SIZE = 10000

train_ds = (
    train_ds
    .shuffle(BUFFER_SIZE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

test_ds = (
    test_ds
    .batch(batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE))

In [9]:
x = next(iter(train_ds))
print(x.numpy()[0])

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_1_device_/job:localhost/replica:0/task:0/device:CPU:0}} pyfunc_0 returns 0 values, but expects to see 1 values.
	 [[{{node EagerPyFunc}}]] [Op:IteratorGetNext] name: 

In [None]:
x = next(iter(test_ds))
print(x.numpy()[0])

In [None]:
# To speed up training and generation, we do not use a the full GPT2 context length of 1024.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=context_size,
)
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor
)

In [None]:
gpt2_lm.summary()

In [None]:
num_epochs = 5

# Linearly decaying learning rate.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    5e-5,
    decay_steps=train_ds.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(train_ds, epochs=num_epochs, validation_data=test_ds)

In [None]:
start = time.time()

# Make sure you ask a question in the same format as in fine-tuning
conversation = ""

output = gpt2_lm.generate(conversation, max_length=200)

end = time.time()
print(f"TOTAL TIME ELAPSED: {end - start:.2f}s")

print(output)