In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
# Today we can reproduce GPT-2 in about an hour.
# The GPT-2 paper is pretty light on details, so we reference the GPT-3 paper as well (the architecture did not change much).

* GPT-2 - [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
* GPT-3 - [Language Models are Few-Shot Learners](https://arxiv.org/pdf/2005.14165)

In [None]:
# - original written in tensorflow
# - karpathy not a fan
# - We use the huggingface transformers code instead of https://github.com/openai/gpt-2.git
#
# - actual source is here: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
import transformers

In [None]:
import torch
import tiktoken

In [None]:
import tabulate

In [None]:
# Load the Hugging Face GPT-2 model: the GPT2 transformer with a language modeling head on top
# (a linear layer with weights tied to the input embeddings).
#
# note: the "language modeling" head is essentially the unembedding, i.e. it maps hidden states back to vocab logits
#
# note: 'gpt2' is the 124M model (small), not the GPT-2 (XL) 1.5B model
model_hf: transformers.GPT2LMHeadModel = transformers.GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
# Grab the raw tensors of the Hugging Face model as a mapping of parameter name -> tensor.
#
# TODO(bschoen): Karpathy tends to use jupyter notebook on right of source, probably bigger monitor since GPU
#
sd_hf = model_hf.state_dict()

# Uncomment to print a table of every key together with its tensor shape:
# print(tabulate.tabulate([(k, v.shape) for k, v in sd_hf.items()], headers=['key', 'shape']))

We can look at the raw tensors to interpret the shapes involved:

```python
# we derived this directly from the shapes
MLP_DIMENSIONALITY_FACTOR = 4

vocab_size = 50257
context_size = 1024
d_embed = 768

d_model_mlp = MLP_DIMENSIONALITY_FACTOR * d_embed

assert d_model_mlp == 3072

query_size = d_embed
key_size   = d_embed
value_size = d_embed

d_attn = query_size + key_size + value_size

assert d_attn == 2304

# so we get:

# key                                shape
# ---------------------------------  ------------------------
# token embeddings
transformer.wte.weight               torch.Size([vocab_size, d_embed])
# positional embeddings
transformer.wpe.weight               torch.Size([context_size, d_embed])
# add together to size `d_embed`
#
transformer.h.0.ln_1.weight          torch.Size([d_embed])
transformer.h.0.ln_1.bias            torch.Size([d_embed])
transformer.h.0.attn.c_attn.weight   torch.Size([d_embed, d_attn])
transformer.h.0.attn.c_attn.bias     torch.Size([d_attn])
transformer.h.0.attn.c_proj.weight   torch.Size([d_embed, d_embed])
transformer.h.0.attn.c_proj.bias     torch.Size([d_embed])
transformer.h.0.ln_2.weight          torch.Size([d_embed])
transformer.h.0.ln_2.bias            torch.Size([d_embed])
transformer.h.0.mlp.c_fc.weight      torch.Size([d_embed, d_model_mlp])
transformer.h.0.mlp.c_fc.bias        torch.Size([d_model_mlp])
transformer.h.0.mlp.c_proj.weight    torch.Size([d_model_mlp, d_embed])
transformer.h.0.mlp.c_proj.bias      torch.Size([d_embed])
# note: layer norms at the inputs
transformer.h.1.ln_1.weight          torch.Size([d_embed])
transformer.h.1.ln_1.bias            torch.Size([d_embed])
transformer.h.1.attn.c_attn.weight   torch.Size([d_embed, d_attn])
transformer.h.1.attn.c_attn.bias     torch.Size([d_attn])
...
transformer.h.11.mlp.c_proj.weight   torch.Size([d_model_mlp, d_embed])
transformer.h.11.mlp.c_proj.bias     torch.Size([d_embed])
# making up for layer norms at the inputs
transformer.ln_f.weight              torch.Size([d_embed])
transformer.ln_f.bias                torch.Size([d_embed])
# map back to vocab
lm_head.weight                       torch.Size([vocab_size, d_embed])



# key                                shape
# ---------------------------------  ------------------------
transformer.wte.weight               torch.Size([50257, 768])
transformer.wpe.weight               torch.Size([1024, 768])
transformer.h.0.ln_1.weight          torch.Size([768])
transformer.h.0.ln_1.bias            torch.Size([768])
transformer.h.0.attn.c_attn.weight   torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias     torch.Size([2304])
transformer.h.0.attn.c_proj.weight   torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias     torch.Size([768])
transformer.h.0.ln_2.weight          torch.Size([768])
transformer.h.0.ln_2.bias            torch.Size([768])
transformer.h.0.mlp.c_fc.weight      torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias        torch.Size([3072])
transformer.h.0.mlp.c_proj.weight    torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias      torch.Size([768])
# note: layer norms at the inputs
transformer.h.1.ln_1.weight          torch.Size([768])
transformer.h.1.ln_1.bias            torch.Size([768])
transformer.h.1.attn.c_attn.weight   torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias     torch.Size([2304])
...
transformer.h.11.ln_1.weight         torch.Size([768])
transformer.h.11.ln_1.bias           torch.Size([768])
transformer.h.11.attn.c_attn.weight  torch.Size([768, 2304])
transformer.h.11.attn.c_attn.bias    torch.Size([2304])
transformer.h.11.attn.c_proj.weight  torch.Size([768, 768])
transformer.h.11.attn.c_proj.bias    torch.Size([768])
transformer.h.11.ln_2.weight         torch.Size([768])
transformer.h.11.ln_2.bias           torch.Size([768])
transformer.h.11.mlp.c_fc.weight     torch.Size([768, 3072])
transformer.h.11.mlp.c_fc.bias       torch.Size([3072])
transformer.h.11.mlp.c_proj.weight   torch.Size([3072, 768])
transformer.h.11.mlp.c_proj.bias     torch.Size([768])
# making up for layer norms at the inputs
transformer.ln_f.weight              torch.Size([768])
transformer.ln_f.bias                torch.Size([768])
# map back to vocab
lm_head.weight                       torch.Size([50257, 768])
```

In [None]:
# note: positional embeddings in GPT-2 are *learned*, not fixed sinusoidal like they are in the original attention paper

# example view of the positional embedding weights: flatten the matrix and show its first 20 values
sd_hf["transformer.wpe.weight"].view(-1)[:20]

In [None]:
import matplotlib.pyplot as plt

# note: there are patterns across dimensions, but they are unintelligible (especially since this also contains relative position)

# note: rendering a weight matrix as an image is frequently used as a tool to inspect things
plt.imshow(sd_hf["transformer.wpe.weight"], cmap="gray")
plt.xlabel("Embedding Dimension")
plt.ylabel("Positional Embedding")
plt.title("Positional Embedding Weights")
# plt.show()

In [None]:
# Look at a few individual columns of the positional embedding matrix,
# each plotted as a function of position.
#
# TODO(bschoen): Channel is a good term for dimension in the embedding space
#
# Can see some respond more or less depending on the position
#
# ex: green seems to focus on after 800
#
# Karpathy: "who knows"
# Karpathy: Can tell model not fully trained, because would expect this to be smoother
# Karpathy: In principle no reason these need to be smooth
wpe_weight = sd_hf["transformer.wpe.weight"]

# one line per hand-picked channel, drawn in the same order as before
for channel in (150, 200, 250):
    plt.plot(wpe_weight[:, channel], label=f'Position Embedding Dimension {channel}')

plt.xlabel('Position')
plt.ylabel('Weight Value')
plt.title('Word Position Embeddings')

plt.legend()

In [None]:
# we can do the same thing with any weights
#
# you see some structure, but again who knows
#
# {if you're into mechanistic interpretability}
#
# shown here: the top-left 300x300 corner of the `c_attn` weight of block index 1
plt.imshow(sd_hf["transformer.h.1.attn.c_attn.weight"][:300,:300], cmap="gray")

In [None]:
# note: Karpathy took example from https://huggingface.co/openai-community/gpt2#how-to-use

# we can also sample from it using the weights
#
# note: the pipeline is created from the model *name* 'gpt2'; it does not reuse the
#       `model_hf` instance we initialized above
#
generator = transformers.pipeline('text-generation', model='gpt2')

# set seed before generation
transformers.set_seed(42)

# note: different generations even if fixed seed
#
# ask for 5 continuations of the prompt, each capped at `max_length=30`
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

In [None]:
# We want to write our own GPT-2 class from scratch so we can actually understand what's going on in there,
# because theirs is just too complicated to use as a reference
# if we want to create one from scratch.

In [None]:
# first we're going to create an instance and:
#  - allow loading from pretrained (to basically check against the existing model)
#  - figure out the layers we need, and also try to train it ourselves from scratch

In [None]:
# now we should be able to load the weights
from gpt_from_scratch.gpt2_from_scratch.train_gpt2 import (
    GPT,
    GPTConfig,
    get_best_available_torch_device,
)

# To load the pretrained weights instead (e.g. to check against the existing model), use:
# model = GPT.from_pretrained('gpt2')
#
# NOTE(review): as written, the model is built from a default `GPTConfig` rather than from
#               pretrained weights — presumably freshly initialized; confirm this is intended
#               before comparing samples against the Hugging Face pipeline.
model = GPT(GPTConfig())

In [None]:
import torch
import torch.nn.functional as F

import tiktoken

In [None]:
# note: if doing with pipeline get the exact same thing,
#       so we're using the model weights correctly, just
#       some config in the sampling pipeline is different

# now we'll replicate the pipeline thing
num_return_sequences = 5
max_length = 30

device = get_best_available_torch_device()

# note: dropout is an example of something that's different in eval vs train
model.eval()
model.to(device)

# prefix tokens
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype=torch.long) # (8,)

# 5 samples through: replicate the prompt once per requested sequence
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)

x = tokens.to(device)

# generate! right now x is (B, T) where B = 5, T = 8
# set the seed to 42
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# keep appending one sampled token per row until each row holds `max_length` tokens
while x.size(1) < max_length:

    # forward the model to get the logits
    with torch.no_grad():

        # The training and sampling cells further down unpack `model(...)` as
        # `(logits, loss)`; indexing that tuple directly with `[:, -1, :]` raises a
        # TypeError. Accept both return forms so this cell runs either way.
        output = model(x)
        logits = output[0] if isinstance(output, tuple) else output # (B, T, vocab_size)

        # take the logits at the last position
        # throw away all the logits from things other than the last position
        logits = logits[:, -1, :] # (B, vocab_size)

        # get the probabilities
        probs = F.softmax(logits, dim=-1)

        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
        #
        # "anything lower than the 50th, we clamp to 0 and never sample it"
        #
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)

        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)

        # gather the corresponding indices (map top-k slot -> vocabulary id)
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)

        # append to the sequence
        x = torch.cat((x, xcol), dim=1)


In [None]:
# print the generated text, one line per sampled sequence
for i in range(num_return_sequences):
    # take row `i`, keeping at most `max_length` tokens, as a plain list of token ids
    tokens = x[i, :max_length].tolist()
    # turn the token ids back into text with the same encoding used for the prompt
    decoded = enc.decode(tokens)
    print(">", decoded)

# Moving To TinyShakespeare

In [2]:
from gpt_from_scratch import file_utils

# load tinyshakespeare
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# fetch the file via the project helper; the returned object is used below as a path to the local copy
input_filepath = file_utils.download_file_from_url(url)

# Read all text from the input file
input_text = input_filepath.read_text()

File found in cache: download_cache/4acd659e47adc1daeb7aff503accf0a3


In [3]:
# lines | words | byte count
!wc download_cache/4acd659e47adc1daeb7aff503accf0a3

   40000  202651 1115394 download_cache/4acd659e47adc1daeb7aff503accf0a3


In [4]:
import tiktoken

from gpt_from_scratch.gpt2_from_scratch import data_loader

from gpt_from_scratch.gpt2_from_scratch.train_gpt2 import (
    GPT,
    GPTConfig,
    get_best_available_torch_device,
)

import torch
import torch.optim
import torch.nn as nn
import torch.nn.functional as F

# create tokenizer (the GPT-2 encoding from tiktoken)
tokenizer = tiktoken.get_encoding('gpt2')

# load text via dataloader
#
# note: we leave these on CPU, so that the dataloader
#       isn't trying to hold the whole set on the GPU
#
#       so is prefetching moving more data to the GPU?
#
# encode the full text into token ids, then wrap them in a 1D int64 tensor
tokens = tokenizer.encode(input_text)
tokens = torch.tensor(tokens, dtype=torch.long)

# basically reshape sequence into 2D (by batches)
#
# presumably B = sequences per batch and T = tokens per sequence; B * T = 128 tokens
# per batch is consistent with the "2640 batches" reported for 338025 tokens
B, T = 4, 32

# create a train loader that will continually give us new batches
train_loader = data_loader.DataLoaderLite(B=B, T=T, tokens=tokens)


loaded 338025 tokens
1 epoch = 2640 batches (steps to make one pass through data)


In [5]:
# now we'll try multiple batches
device = get_best_available_torch_device()

# fresh model built from the default config, moved to the selected device
model = GPT(GPTConfig())
model.to(device)

# Karpathy: "AdamW is basically a bugfix of Adam"
#
# note: pretty good default learning rate for early experimentation
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

# Notes from earlier runs of this cell:
#
# jesus christ it really does crush the loss:
#
# step 49, loss: 0.002793300896883011
#
# ah now that we're loading more of the dataset can see loss around 6.5
#
# step 49, loss: 6.906085968017578
#
# we get some big gains right away because there's some tokens that never occur in our set
#
# after proper initialization
#
# step 49, loss: 6.7791547775268555
#
for i in range(50):

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # fetch the next (inputs, targets) batch from the loader
    x, y = train_loader.next_batch()

    # here's where we actually move to GPU
    x, y = x.to(device), y.to(device)

    # when targets are passed, the model returns both the logits and the loss
    logits, loss = model(x, y)

    # backpropagate
    loss.backward()

    # apply the parameter update
    optimizer.step()

    print(f"step {i}, loss: {loss.item()}")

step 0, loss: 10.97684097290039
step 1, loss: 9.678625106811523
step 2, loss: 8.977907180786133
step 3, loss: 9.124414443969727
step 4, loss: 8.801149368286133
step 5, loss: 8.393295288085938
step 6, loss: 8.975137710571289
step 7, loss: 8.78160285949707
step 8, loss: 8.152408599853516
step 9, loss: 8.008208274841309
step 10, loss: 8.296585083007812
step 11, loss: 7.606632232666016
step 12, loss: 7.849130630493164
step 13, loss: 7.489934921264648
step 14, loss: 7.557112216949463
step 15, loss: 7.378129482269287
step 16, loss: 7.419276237487793
step 17, loss: 8.321800231933594
step 18, loss: 7.283927917480469
step 19, loss: 7.783172130584717
step 20, loss: 7.545779705047607
step 21, loss: 7.821457862854004
step 22, loss: 6.483449935913086
step 23, loss: 6.845578193664551
step 24, loss: 6.892683029174805
step 25, loss: 6.64844274520874
step 26, loss: 6.843649387359619
step 27, loss: 7.587469577789307
step 28, loss: 7.22553014755249
step 29, loss: 6.929841041564941
step 30, loss: 7.044208

In [None]:
# sample some outputs to get an idea of where we are

from gpt_from_scratch import tokenizer_utils

def sample_model(
    prompt: str,
    num_samples: int,
    max_tokens: int,
    model: nn.Module,
    tokenizer: tokenizer_utils.Tokenizer,
    device: torch.device,
    top_k: int = 50,
) -> None:
    """Sample continuations of `prompt` from `model` with top-k sampling and print them.

    Args:
        prompt: Text that every sample starts from.
        num_samples: Number of independent continuations to generate.
        max_tokens: Total length (prompt tokens included) at which generation stops.
        model: Module called as `model(x)` and expected to return a
            `(logits, loss)` pair; only the logits are used.
        tokenizer: Object providing `encode(str)` and `decode(list_of_ids)`.
        device: Device the prompt tokens are moved to before calling the model.
        top_k: Only the `top_k` most likely tokens are candidates at each step.
            Defaults to 50 (huggingface pipeline default) and is capped at the
            vocabulary size.

    Returns:
        None. Each decoded sample is printed.
    """

    # tokenize the prompt and replicate it once per requested sample
    prompt_tokens = torch.tensor(tokenizer.encode(prompt), dtype=torch.long)

    # the prompt is small enough to fit on the GPU
    x = prompt_tokens.unsqueeze(0).repeat(num_samples, 1).to(device)  # (num_samples, prompt_len)

    # Sample in eval mode (layers such as dropout behave differently in train mode),
    # then put the model back into whatever mode the caller had it in.
    was_training = model.training
    model.eval()

    try:
        # no gradients are needed for sampling
        with torch.no_grad():

            while x.size(1) < max_tokens:

                # forward the model to get the logits; the second element is unused here
                logits, _ = model(x)  # (B, T, vocab_size)

                # take the logits at the last position
                # throw away all the logits from things other than the last position
                logits = logits[:, -1, :]  # (B, vocab_size)

                # get the probabilities
                probs = F.softmax(logits, dim=-1)

                # "anything lower than the k-th, we clamp to 0 and never sample it"
                #
                # cap k so a vocabulary smaller than `top_k` does not make topk fail
                k = min(top_k, probs.size(-1))
                topk_probs, topk_indices = torch.topk(probs, k, dim=-1)  # both (B, k)

                # select a token from the top-k probabilities
                # note: multinomial does not demand the input to sum to 1
                ix = torch.multinomial(topk_probs, 1)  # (B, 1)

                # gather the corresponding indices (map top-k slot -> vocabulary id)
                xcol = torch.gather(topk_indices, -1, ix)  # (B, 1)

                # append to the sequence
                x = torch.cat((x, xcol), dim=1)
    finally:
        model.train(was_training)

    # print the generated text
    for i in range(num_samples):

        decoded = tokenizer.decode(x[i, :max_tokens].tolist())

        print(f"\n [{i}] >", decoded)


In [None]:
# print 5 samples that start from "Romeo"; generation stops once a sequence
# (prompt included) reaches 30 tokens
sample_model(
    prompt="Romeo",
    num_samples=5,
    max_tokens=30,
    model=model,
    tokenizer=tokenizer,
    device=device,
)


In [None]:
# let's load tinystories for comparison
#
# note: `datasets` can list datasets but is deprecated
import huggingface_hub

In [None]:
# from https://huggingface.co/docs/huggingface_hub/en/guides/download#from-latest-version
import dataclasses
from typing import Callable
import pathlib


@dataclasses.dataclass(frozen=True)
class TrainAndVal[T]:
    """Helper for common pattern of transforming both train and val."""

    train: T
    val: T

    def apply[R](self, func: Callable[[T], R]) -> 'TrainAndVal[R]':
        return dataclasses.replace(self,
            train=func(self.train),
            val=func(self.val),
        )

def download_file_from_tinystories(filename: str) -> pathlib.Path:
    """Download `filename` from the `roneneldan/TinyStories` dataset repo.

    Prints a message before and after the download and returns the path
    reported by `huggingface_hub.hf_hub_download`, wrapped in `pathlib.Path`.
    """
    print(f"Downloading {filename}...")

    # everything except the filename is fixed for this dataset
    download_kwargs = {
        'repo_id': 'roneneldan/TinyStories',
        'filename': filename,
        'repo_type': "dataset",
    }
    filepath = huggingface_hub.hf_hub_download(**download_kwargs)
    print(f"Downloaded {filename} to {filepath}")

    return pathlib.Path(filepath)

# original in paper
# train_filename, val_filename = 'TinyStories-train.txt', 'TinyStories-valid.txt'

# GPT-4 only, significantly larger but newer
# (positional args: first is the train file, second is the val file)
filenames = TrainAndVal('TinyStoriesV2-GPT4-train.txt', 'TinyStoriesV2-GPT4-valid.txt')

# download both files; the result is a TrainAndVal holding the two local paths
filepaths = filenames.apply(download_file_from_tinystories)

In [None]:
# lines | words | byte count
!echo "TinyShakespeare"
!wc download_cache/4acd659e47adc1daeb7aff503accf0a3

!echo "TinyStories"
!wc /Users/bronsonschoen/.cache/huggingface/hub/datasets--roneneldan--TinyStories/snapshots/f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/TinyStoriesV2-GPT4-train.txt

In [None]:
@dataclasses.dataclass(frozen=True)
class WordCount:
    """Immutable record of the three counts reported by `wc` for one file."""

    # number of lines
    lines: int
    # number of words
    words: int
    # size in bytes
    bytes: int

In [None]:
# counts for TinyShakespeare, matching the `wc` output recorded earlier in this notebook
wc_tinyshakespeare = WordCount(lines=40000, words=202651, bytes=1115394)
# counts for the TinyStories train file — presumably transcribed from the `wc` cell above; re-check if the dataset file changes
wc_tinystories = WordCount(lines=15600056, words=439223236, bytes=2227753162)

In [None]:
# For every count (lines, words, bytes), print how many times larger
# TinyStories is than TinyShakespeare, rounded to two decimals.
for field in dataclasses.fields(WordCount):

    numerator = float(getattr(wc_tinystories, field.name))
    denominator = float(getattr(wc_tinyshakespeare, field.name))

    ratio = numerator / denominator

    print(f' - {field.name}: {round(ratio, 2)}')