In [1]:
!pip install tiktoken datasets

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none

In [2]:
from dataclasses import dataclass
import math

import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import GPT2LMHeadModel


@dataclass
class GPTConfig:
    """Hyperparameters that fix the shape of a GPT model.

    The defaults match the "gpt2" (124M parameter) entry used by
    ``GPT.from_pretrained``.

    Attributes:
        block_size: Maximum sequence length; sizes the positional embedding
            table and the causal attention mask.
        vocab_size: Number of token ids; sizes the token embedding table and
            the output classifier.
        n_layer: Number of transformer blocks.
        n_head: Number of attention heads per block.
        n_embd: Width of the embeddings / residual stream.
    """

    # Sequence length and vocabulary.
    block_size: int = 1024
    vocab_size: int = 50257

    # Depth, head count and width of the network.
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which each token only sees earlier tokens."""

    def __init__(self, config: "GPTConfig") -> None:
        """Create the projections and the causal mask.

        Args:
            config: Model hyperparameters; ``n_embd``, ``n_head`` and
                ``block_size`` are read.

        Raises:
            AssertionError: If ``n_embd`` is not a multiple of ``n_head``.
        """
        super().__init__()
        # Every head gets an equal slice of the embedding, so the width must
        # divide evenly; otherwise the reshape in forward() fails obscurely.
        assert config.n_embd % config.n_head == 0, (
            f"n_embd ({config.n_embd}) must be divisible by n_head ({config.n_head})"
        )
        # Key/query/value projections for all heads, computed in one matmul.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # Marker read by GPT._init_weights to shrink the init std of this layer.
        self.c_proj.SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embed = config.n_embd
        # Lower-triangular mask of shape (1, 1, block_size, block_size). It is
        # named "bias" so the state-dict keys line up with the pretrained
        # checkpoint handling in GPT.from_pretrained.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size)).view(
                1, 1, config.block_size, config.block_size
            ),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply causal self-attention.

        Args:
            x: Input of shape (B, T, C) with C equal to ``n_embd`` and T at most
                ``block_size``.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size()
        head_size = C // self.n_head
        # Compute query, key and value for all heads in the batch.
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embed, dim=2)
        # Each becomes (B, n_head, T, head_size).
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)
        # Scaled dot-product scores; this materializes a (T, T) matrix per head.
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_size))
        # Tokens may attend to themselves and the past, never to the future.
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        # Normalize the scores into attention weights.
        att = F.softmax(att, dim=-1)
        # Weighted sum of the values.
        y = att @ v
        # Put the heads back side by side: (B, T, C).
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        # Output projection.
        return self.c_proj(y)


class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, GELU, project back."""

    def __init__(self, config: GPTConfig) -> None:
        """Build the two linear layers and the activation from ``config.n_embd``."""
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(hidden, width)
        # Marker attribute; GPT._init_weights checks for it on Linear layers.
        self.c_proj.SCALE_INIT = 1

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the feed-forward network on ``x`` and return the result."""
        return self.c_proj(self.gelu(self.c_fc(x)))


class Block(nn.Module):
    """One transformer layer: pre-norm attention then pre-norm MLP, both residual."""

    def __init__(self, config: GPTConfig) -> None:
        """Create the two layer norms, the attention module and the MLP."""
        super().__init__()
        self.config = config
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the attention output, then the MLP output, onto the residual stream."""
        attended = self.attn(self.ln_1(x))
        x = x + attended
        transformed = self.mlp(self.ln_2(x))
        return x + transformed

In [5]:

class GPT(nn.Module):
    """This class defines the GPT model."""

    def __init__(self, config: GPTConfig) -> None:
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(
            {
                "wte": nn.Embedding(config.vocab_size, config.n_embd),
                "wpe": nn.Embedding(config.block_size, config.n_embd),
                "h": nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                "ln_f": nn.LayerNorm(config.n_embd),
            }
        )
        # Final classifier
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.enc = tiktoken.get_encoding('gpt2')

        # Share weights for input and output embeddings. This is about 30% of
        # the model weights.
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)


    def _init_weights(self, module: nn.Module) -> None:
      """Perform additional weight initialization to match gpt-2."""
      std = 0.02
      if isinstance(module, nn.Linear):
        if hasattr(module, "SCALE_INIT"):
          std *= (2 * self.config.n_layer) ** -0.05
        torch.nn.init.normal_(module.weight, mean=0, std=std)
        if module.bias is not None:
          torch.nn.init.zeros_(module.bias)
      elif isinstance(module, nn.Embedding):
        torch.nn.init.normal_(module.weight, mean=0, std=std)


    def forward(self,
                x: torch.Tensor,
                targets: torch.Tensor | None = None
      ) -> (torch.Tensor, float):
        """Perform generation."""
        B, T = x.size()
        assert T <= self.config.block_size  # Max sequence length
        # Forward token and positional embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=x.device)  # Shape (T)
        pos_emb = self.transformer.wpe(pos)  # (T, n_emb)
        tok_emb = self.transformer.wte(x)  # (B, T, n_emb)
        x = tok_emb + pos_emb
        # Forward transformer blocks
        for block in self.transformer.h:
            x = block(x)
        # Forward the final layernorm
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
          loss = F.cross_entropy(
              # Flatten to (BxT, vocab_size)
              logits.view(-1, logits.size(-1)),
              # Flatten to (BxT)
              targets.view(-1)
          )
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type: str) -> "GPT":
        """Load the GPT from the pretrained model."""
        assert model_type in {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"}
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            "gpt2": dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            "gpt2-medium": dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
            "gpt2-large": dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
            "gpt2-xl": dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
        }[model_type]
        config_args["vocab_size"] = 50257  # always 50257 for GPT model checkpoints
        config_args["block_size"] = 1024  # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [
            k for k in sd_keys if not k.endswith(".attn.bias")
        ]  # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [
            k for k in sd_keys_hf if not k.endswith(".attn.masked_bias")
        ]  # ignore these, just a buffer
        sd_keys_hf = [
            k for k in sd_keys_hf if not k.endswith(".attn.bias")
        ]  # same, just the mask (buffer)
        transposed = [
            "attn.c_attn.weight",
            "attn.c_proj.weight",
            "mlp.c_fc.weight",
            "mlp.c_proj.weight",
        ]

        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(
            sd_keys
        ), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def sample(self, text: str, num_return_sequences: int, max_length: int) -> list[str]:
      """Sample from the model from text input."""
      tokens = self.enc.encode(text)
      tokens = torch.tensor(tokens, dtype=torch.long) # (8, )
      # Replicate input tokens
      tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)

      # x is (B, T)
      x = tokens.to(device)

      # With each loop iteration we'll append a token to the sequence. This is
      # adding one more column to x each time.
      while x.size(1) < max_length:
        with torch.no_grad():
          logits, _ = model(x)  # (B, T, vocab_size)
          # Take the logits at the last position (next character) and drop the others.
          # This is correct but inefficient implementation of sampling.
          # Question: What is T?
          logits = logits[:, -1, :]  # (B, vocab_size)
          probs = F.softmax(logits, dim=-1)
          # Do top-k sampling of 50 which is the huggingface default. Get the top 50
          # probabilities and set all other tokens to probability of zero. This helps
          # keep the model on track so it doesn't go off the rails as easily.
          # Both are (5, 50)
          topk_probs, topk_indicies = torch.topk(probs, 50, dim=-1)
          # Select a token from the top 5
          ix = torch.multinomial(topk_probs, 1)  # (B, 1)
          # Gather corresponding indicidies
          xcol = torch.gather(topk_indicies, -1, ix)
          # Append the new character to the sequence (one for each in the batch)
          x = torch.cat((x, xcol), dim=-1)

      samples = []
      for i in range(num_return_sequences):
        tokens = x[i, :max_length].tolist()
        decoded = self.enc.decode(tokens)
        samples.append(decoded)

      return samples

# Sample from the Pretrained Model

In [6]:
# Sampling settings shared by the generation cells below.
num_return_sequences = 5
max_length = 30

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the 124M-parameter GPT-2 weights and switch to inference mode.
model = GPT.from_pretrained("gpt2")
model.eval()
model = model.to(device)

loading weights from pretrained gpt: gpt2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# Seed the CPU and CUDA generators so the sampled continuations are repeatable.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(42)

prompt = "Hello, I'm a language model,"
samples = model.sample(prompt, num_return_sequences, max_length)
for text in samples:
    print(">", text)


> Hello, I'm a language model, not a program.

So this morning I started studying for the interview in the lab. This was not
> Hello, I'm a language model, and one of the main things that bothers me when they create languages is how easy it becomes to create something that
> Hello, I'm a language model, and I wrote it off on the grounds that a language model would make me more fluent. But I'm not
> Hello, I'm a language model, I really like languages. I like languages because like, they're good. And the way we talk about languages
> Hello, I'm a language model, a language model I'm using for data modelling. All I did was test the results and then I wrote some


# Sample from and Train a Randomly Initialized Model

In [8]:
# Replace the pretrained model with a randomly initialized one of the default
# size, in inference mode, on the same device.
model = GPT(GPTConfig()).eval().to(device)

In [9]:
# Same seeds and prompt as for the pretrained model, so the outputs can be
# compared side by side.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(42)

prompt = "Hello, I'm a language model,"
samples = model.sample(prompt, num_return_sequences, max_length)
for text in samples:
    print(">", text)

> Hello, I'm a language model, regardpeople pain corpses depositedدannie Yin Twain 12 Bicycle Bung ost convolutedSpellDN697rypt corpsesinteg Truck implication
> Hello, I'm a language model,Spell criticisms Bicycle fidد epist Sul aggressively Answer Bungixty taxp dollsGet192sequ SulBrad Smoking GL enlight criticisms
> Hello, I'm a language model, headset ost Sul deposited some headsetDN Dragonbound Russo Answer summer summer Chrys thighs lar thighs frozenKn lar signings lar recounted
> Hello, I'm a language model,Ham BungDN ostAugust Dragonbound criticisms summer rival ost Siren SocketGet Amir acre reiter unravel MarRules Amir maintain comprehensive


In [10]:
import datasets
from typing import Any

class DataLoader:
    """Endless iterator over (input, label) batches of tiny_shakespeare tokens."""

    def __init__(self, enc: Any, batch_size: int, token_len: int, device: Any) -> None:
        """Download, tokenize and index the dataset.

        Args:
            enc: Tokenizer exposing ``encode(str)`` that returns token ids.
            batch_size: Number of sequences per batch (B).
            token_len: Number of tokens per sequence (T).
            device: Device the returned batches are moved to, or ``None`` to
                leave them where the token tensor lives.

        Raises:
            ValueError: If the corpus has fewer than ``B * T + 1`` tokens, i.e.
                not even one full batch plus its shifted labels.
        """
        self.B = batch_size
        self.T = token_len
        self.chunk_size = self.B * self.T
        self.device = device

        ds = datasets.load_dataset('tiny_shakespeare', trust_remote_code=True)
        self.data = ds['train']['text'][0]
        self.tokens = torch.tensor(enc.encode(self.data))
        if len(self.tokens) < self.chunk_size + 1:
            raise ValueError(
                f"dataset has {len(self.tokens)} tokens but one batch needs "
                f"{self.chunk_size + 1}"
            )
        self.pos = 0
        print(f"Loaded {len(self.tokens)} tokens")
        # Number of unique batches before we start the dataset over
        print(f"1 epoch = {len(self.tokens) // self.chunk_size} batches")

    def __iter__(self) -> "DataLoader":
        """Restart from the beginning of the token stream and return ``self``."""
        self.pos = 0
        return self

    def __next__(self) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the next ``(x, y)`` batch, each of shape (B, T).

        ``y`` is ``x`` shifted one token to the left, so ``y[b, t]`` is the
        token that follows ``x[b, t]``. The iterator never raises
        ``StopIteration``; it wraps to the start once a full batch no longer
        fits.
        """
        B, T = self.B, self.T
        # One extra token is read so the labels can be shifted by one position.
        buf = self.tokens[self.pos:self.pos + self.chunk_size + 1]
        x = buf[:-1].view(B, T)
        y = buf[1:].view(B, T)
        self.pos += self.chunk_size
        # Wrap around when the next batch (plus its extra label token) would
        # run past the end of the data.
        if (self.pos + self.chunk_size + 1) > len(self.tokens):
            print("Reached epoch")
            self.pos = 0
        if self.device is not None:
            x, y = x.to(self.device), y.to(self.device)
        return x, y


In [22]:
import time

# Allow TF32 matmuls where the hardware supports them.
torch.set_float32_matmul_precision('high')

# Training hyperparameters, kept together so they are easy to tune.
BATCH_SIZE = 16
SEQ_LEN = 1024
NUM_STEPS = 50
LEARNING_RATE = 3e-4

model = GPT(GPTConfig())
model = model.to(device)
model = torch.compile(model)

data_loader = DataLoader(model.enc, batch_size=BATCH_SIZE, token_len=SEQ_LEN, device=device)
ds = iter(data_loader)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
for i in range(NUM_STEPS):
    # perf_counter is monotonic, which makes it the right clock for intervals.
    t0 = time.perf_counter()
    optimizer.zero_grad()
    x, y = next(ds)
    x, y = x.to(device), y.to(device)
    with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
        logits, loss = model(x, y)
    loss.backward()
    optimizer.step()
    # Wait for queued GPU work so the timing covers the whole step; there is
    # nothing to synchronize (and the call would fail) on a non-CUDA device.
    if device.type == "cuda":
        torch.cuda.synchronize()
    t1 = time.perf_counter()
    dt = (t1 - t0) * 1000
    tokens_per_sec = data_loader.chunk_size / (t1 - t0)
    # Step 0 is much slower than the rest because it includes torch.compile.
    print(f"step {i} loss {loss.item()} dt: {dt:0.2f}ms tok/sec: {tokens_per_sec:0.2f}")

Loaded 301966 tokens
1 epoch = 18 batches
step 0 loss 10.957916259765625 dt: 42332.08ms tok/sec: 387.04
step 1 loss 9.631839752197266 dt: 138.84ms tok/sec: 118010.02
step 2 loss 9.32974624633789 dt: 139.15ms tok/sec: 117741.10
step 3 loss 9.037336349487305 dt: 139.25ms tok/sec: 117659.05
step 4 loss 8.813928604125977 dt: 138.72ms tok/sec: 118105.55
step 5 loss 8.6907377243042 dt: 138.85ms tok/sec: 118000.90
step 6 loss 8.476306915283203 dt: 138.42ms tok/sec: 118362.88
step 7 loss 8.191732406616211 dt: 138.84ms tok/sec: 118002.52
step 8 loss 7.913112640380859 dt: 138.36ms tok/sec: 118417.14
step 9 loss 7.701312065124512 dt: 138.78ms tok/sec: 118058.68
step 10 loss 7.5275444984436035 dt: 138.72ms tok/sec: 118105.34
step 11 loss 7.394217491149902 dt: 138.89ms tok/sec: 117964.64
step 12 loss 7.205788612365723 dt: 138.87ms tok/sec: 117984.09
step 13 loss 7.127068519592285 dt: 139.04ms tok/sec: 117834.38
step 14 loss 7.085083484649658 dt: 138.45ms tok/sec: 118339.03
step 15 loss 6.9181241989