In [1]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all modules before running each cell, so edits to the
# project's .py files are picked up without restarting the kernel.
%autoreload 2

In [None]:
# note: Karpathy spins up a Lambda Labs instance, then connects VS Code to it

# Starting to optimize

> **Note:** Starting from "what hardware do I have, and am I fully utilizing it?"

Looking up the NVIDIA spec sheet for the A100, we see:

| Specification | A100 80GB PCIe | A100 80GB SXM |
|---------------|----------------|---------------|
| FP64 | 9.7 TFLOPS | 9.7 TFLOPS |
| FP64 Tensor Core | 19.5 TFLOPS | 19.5 TFLOPS |
| FP32 | 19.5 TFLOPS | 19.5 TFLOPS |
| Tensor Float 32 (TF32) | 156 TFLOPS \| 312 TFLOPS\* | 156 TFLOPS \| 312 TFLOPS\* |
| BFLOAT16 Tensor Core | 312 TFLOPS \| 624 TFLOPS\* | 312 TFLOPS \| 624 TFLOPS\* |
| FP16 Tensor Core | 312 TFLOPS \| 624 TFLOPS\* | 312 TFLOPS \| 624 TFLOPS\* |
| INT8 Tensor Core | 624 TOPS \| 1248 TOPS\* | 624 TOPS \| 1248 TOPS\* |
| GPU Memory | 80GB HBM2e | 80GB HBM2e |
| GPU Memory Bandwidth | 1,935GB/s | 2,039GB/s |

\* With sparsity.

We're currently at:

| Specification | A100 80GB PCIe | A100 80GB SXM |
|---------------|----------------|---------------|
| FP32 | 19.5 TFLOPS | 19.5 TFLOPS |

But it turns out we don't really need that much precision for deep learning:

| Format | Sign | Range (exponent) | Precision (mantissa) |
|--------|------|------------------|----------------------|
| FP32   | 1    | 8                | 23                   |
| TF32   | 1    | 8                | 10                   |
| FP16   | 1    | 5                | 10                   |
| BF16   | 1    | 8                | 7                    |

Notes:
- All values are in bits.
- FP32: Full 32-bit floating point
- TF32: Tensor Float 32
- FP16: Half-precision floating point
- BF16: Brain Float 16

In [2]:
from gpt_from_scratch import file_utils

# load tinyshakespeare
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Fetch the file; the helper reports when it reuses a previously cached copy.
input_filepath = file_utils.download_file_from_url(url)

# Read all text from the input file.
#
# note: decode explicitly as UTF-8 so the result does not depend on the
#       platform's default locale encoding
#       (assumes `input_filepath` is a pathlib.Path -- TODO confirm)
input_text = input_filepath.read_text(encoding="utf-8")

File found in cache: download_cache/4acd659e47adc1daeb7aff503accf0a3


In [3]:
# lines | words | byte count
!wc download_cache/4acd659e47adc1daeb7aff503accf0a3

   40000  202651 1115394 download_cache/4acd659e47adc1daeb7aff503accf0a3


In [20]:
import time

import tiktoken

from gpt_from_scratch.gpt2_from_scratch import data_loader
from gpt_from_scratch.gpt2_from_scratch.train_gpt2 import (
    GPT,
    GPTConfig,
    get_best_available_torch_device,
)

import torch
import torch.optim
import torch.nn as nn
import torch.nn.functional as F

# the GPT-2 encoding from tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

# encode the whole text in one go and wrap the ids in a long tensor
#
# note: the tensor is deliberately left on the CPU, so that the dataloader
#       isn't trying to hold the whole set on the GPU
#
#       open question: is prefetching then what moves more data to the GPU?
tokens = torch.tensor(tokenizer.encode(input_text), dtype=torch.long)

# train loader that keeps handing out new batches drawn from `tokens`
train_loader = data_loader.DataLoaderLite(tokens=tokens, T=32, B=4)


loaded 338025 tokens
1 epoch = 20 batches (steps to make one pass through data)


# Timings

| Run | Timing |
|---  | ---    |
|Initial timing with Float32 - (B=4, T=32) | step 49, loss: 6.804825782775879, dt: 136.36ms, tok/sec: 938.68 |

In [21]:
# now we'll try multiple batches
device = get_best_available_torch_device()

model = GPT(GPTConfig())
model.to(device)

# Karpathy: "AdamW is basically a bugfix of Adam"
#
# note: pretty good default learning rate for early experimentation
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)


def _wait_for_device(device) -> None:
    """Block until the work already queued on `device` has finished.

    CUDA and MPS execute kernels asynchronously, so without this the timer
    below can stop before the step's work is actually done. Other device
    types (e.g. CPU) need no synchronization, so this is a no-op for them.
    """
    device_type = torch.device(device).type

    if device_type == "cuda":
        torch.cuda.synchronize()
    elif device_type == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()


for i in range(50):

    # perf_counter is monotonic and high resolution, which is what we want
    # for measuring short intervals
    t0 = time.perf_counter()

    optimizer.zero_grad()

    # here's where we actually move to GPU
    x, y = train_loader.next_batch()

    x, y = x.to(device), y.to(device)

    logits, loss = model(x, y)

    loss.backward()

    optimizer.step()

    # wait for the accelerator to finish this step before stopping the clock
    _wait_for_device(device)

    t1 = time.perf_counter()

    elapsed = t1 - t0 # seconds

    dt = elapsed * 1000 # time difference in milliseconds

    tokens_per_sec = (train_loader.B * train_loader.T) / elapsed

    print(f"| step {i} | loss: {loss.item():.4f} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec:.2f}")

RuntimeError: MPS backend out of memory (MPS allocated: 17.30 GB, other allocations: 158.27 MB, max allowed: 18.13 GB). Tried to allocate 768.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# sample some outputs to get an idea of where we are

from gpt_from_scratch import tokenizer_utils

def sample_model(
    prompt: str,
    num_samples: int,
    max_tokens: int,
    model: nn.Module,
    tokenizer: "tokenizer_utils.Tokenizer",
    device: torch.device,
    top_k: int = 50,
) -> None:
    """Print `num_samples` top-k sampled continuations of `prompt`.

    Args:
        prompt: Text to start every sample from.
        num_samples: How many independent continuations to generate.
        max_tokens: Total sequence length (prompt tokens included) at which
            generation stops; printed samples are truncated to this length.
        model: Module whose forward call returns a `(logits, loss)` pair,
            with logits indexed as (batch, position, vocab).
        tokenizer: Object providing `encode(str)` and `decode(list[int])`.
        device: Device the token tensor is moved to before running the model.
        top_k: Number of highest-probability tokens to sample from at each
            step. Defaults to 50 (huggingface pipeline default) and is
            clamped to the vocabulary size.

    Raises:
        ValueError: If `top_k` is smaller than 1.

    The model is switched to eval mode while sampling and put back into
    whichever mode it was in afterwards.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    # tokenize the prompt and replicate it once per sample: (num_samples, prompt_len)
    tokens = torch.tensor(tokenizer.encode(prompt), dtype=torch.long)
    tokens = tokens.unsqueeze(0).repeat(num_samples, 1)

    # tokens in this case is just the prompt, and is small enough to fit on GPU
    x = tokens.to(device)

    # sample in eval mode so train-only layers (e.g. dropout, if the model has
    # any) don't perturb the logits; remember the mode so we can restore it
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            while x.size(1) < max_tokens:

                # forward the model to get the logits; the loss is not needed
                logits, _ = model(x) # (B, T, vocab_size)

                # keep only the logits at the last position
                logits = logits[:, -1, :] # (B, vocab_size)

                # get the probabilities
                probs = F.softmax(logits, dim=-1)

                # top-k sampling: anything below the k-th most likely token is
                # never sampled. torch.topk raises if k exceeds the dimension
                # size, so clamp k to the vocabulary size.
                k = min(top_k, probs.size(-1))
                topk_probs, topk_indices = torch.topk(probs, k, dim=-1) # (B, k) each

                # select a token from the top-k probabilities
                # note: multinomial does not demand the input to sum to 1
                ix = torch.multinomial(topk_probs, 1) # (B, 1)

                # map the sampled positions back to vocabulary indices
                xcol = torch.gather(topk_indices, -1, ix) # (B, 1)

                # append to the sequence
                x = torch.cat((x, xcol), dim=1)
    finally:
        model.train(was_training)

    # print the generated text
    for i in range(num_samples):

        decoded = tokenizer.decode(x[i, :max_tokens].tolist())

        print(f"\n [{i}] >", decoded)


In [None]:
# Print 5 samples that all start from the prompt "Romeo", to eyeball the model.
# `max_tokens` is the total sequence length at which generation stops
# (the prompt's own tokens count towards it).
# Uses the `model`, `tokenizer` and `device` created in the cells above.
sample_model(
    prompt="Romeo",
    num_samples=5,
    max_tokens=30,
    model=model,
    tokenizer=tokenizer,
    device=device,
)


In [None]:
# let's load tinystories for comparison
#
# note: `datasets.list_datasets` can also list datasets, but it is deprecated in favor of `huggingface_hub`
import huggingface_hub

In [None]:
# from https://huggingface.co/docs/huggingface_hub/en/guides/download#from-latest-version
import dataclasses
from typing import Callable
import pathlib


@dataclasses.dataclass(frozen=True)
class TrainAndVal[T]:
    """Helper for common pattern of transforming both train and val."""

    train: T
    val: T

    def apply[R](self, func: Callable[[T], R]) -> 'TrainAndVal[R]':
        return dataclasses.replace(self,
            train=func(self.train),
            val=func(self.val),
        )

def download_file_from_tinystories(filename: str) -> pathlib.Path:
    """Download `filename` from the `roneneldan/TinyStories` dataset repo.

    Prints a line before and after the download, and returns the path
    reported by `huggingface_hub.hf_hub_download` as a `pathlib.Path`.
    """
    print(f"Downloading {filename}...")

    filepath = huggingface_hub.hf_hub_download(
        filename=filename,
        repo_type="dataset",
        repo_id='roneneldan/TinyStories',
    )
    print(f"Downloaded {filename} to {filepath}")

    local_path = pathlib.Path(filepath)
    return local_path

# original in paper
# train_filename, val_filename = 'TinyStories-train.txt', 'TinyStories-valid.txt'

# the V2 files are GPT-4 only: significantly larger, but newer
filenames = TrainAndVal(
    train='TinyStoriesV2-GPT4-train.txt',
    val='TinyStoriesV2-GPT4-valid.txt',
)

# download both splits; each file name is mapped to its downloaded path
filepaths = filenames.apply(download_file_from_tinystories)

In [None]:
# lines | words | byte count
!echo "TinyShakespeare"
!wc download_cache/4acd659e47adc1daeb7aff503accf0a3

!echo "TinyStories"
!wc /Users/bronsonschoen/.cache/huggingface/hub/datasets--roneneldan--TinyStories/snapshots/f54c09fd23315a6f9c86f9dc80f725de7d8f9c64/TinyStoriesV2-GPT4-train.txt

In [None]:
@dataclasses.dataclass(frozen=True)
class WordCount:
    """The three totals that `wc` prints for a file: lines, words and bytes."""

    # number of lines
    lines: int
    # number of words
    words: int
    # size in bytes
    bytes: int

In [None]:
# Counts copied by hand from `wc`.
# tinyshakespeare: matches the saved `wc` output earlier in this notebook.
wc_tinyshakespeare = WordCount(lines=40000, words=202651, bytes=1115394)
# NOTE(review): presumably from `wc` on TinyStoriesV2-GPT4-train.txt; that
# output is not saved in the notebook, so confirm by re-running the `wc` cell.
wc_tinystories = WordCount(lines=15600056, words=439223236, bytes=2227753162)

In [None]:
# For every WordCount field, print how many times larger TinyStories is
# than TinyShakespeare.
for field in dataclasses.fields(WordCount):

    denominator, numerator = (
        float(getattr(counts, field.name))
        for counts in (wc_tinyshakespeare, wc_tinystories)
    )

    ratio = numerator / denominator

    print(f' - {field.name}: {round(ratio, 2)}')