In [1]:
# Following Andrej Karpathy's "Let's reproduce GPT-2 (124M)"
# https://www.youtube.com/watch?v=l8pRSuU81PU

from dataclasses import dataclass
import math
import torch
import torch.nn as nn
from torch.nn import functional as func

In [5]:
@dataclass
class GPTConfig:
    block_size: int = 256
    vocab_size: int = 65
    n_layer: int = 6
    n_head: int = 6
    n_embed: int = 384

@dataclass
class GPT2_124M_Config:
    block_size = 1024
    vocab_size = 50257 # 50k BPE merges, 256 bytes tokens, EOT token
    n_layer = 12
    n_head = 12
    n_embed = 768

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embed & config.n_head == 0
    
        self.c_attn = nn.Linear(config.n_embed, 3 * config.n_embed) # k, q, v projections concatenated
        self.c_proj = nn.Linear(config.n_embed, config.n_embed)
        self.config = config
        self.c_proj.residual_rescale = True

        # mask, lower triangular, wrapped in two singleton dimensions
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, seq length, n_embed

        # nh = num heads
        # hs = head size
        # n_embed = nh * hs
        q, k, v = self.c_attn(x).split(self.config.n_embed, dim=2)
        q = q.view(B, T, self.config.n_head, C // self.config.n_head).transpose(1, 2) # B, nh, T, hs
        k = k.view(B, T, self.config.n_head, C // self.config.n_head).transpose(1, 2)
        v = v.view(B, T, self.config.n_head, C // self.config.n_head).transpose(1, 2)

        #att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        #att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float("-inf"))
        #att = func.softmax(att, dim=-1)
        #y = att @ v # (B, hn, T, T) x (B, nh, T, hs) = (B, nh, T, hs)
        y = func.scaled_dot_product_attention(q, k, v, is_causal=True)
        
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)
        

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embed, 4 * config.n_embed)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(4 * config.n_embed, config.n_embed)
        self.c_proj.residual_rescale = True

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embed)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embed)
        self.mlp = MLP(config)

    def forward(self, x):
        x += self.attn(self.ln_1(x))
        x += self.mlp(self.ln_2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embed), # Token embedding
            wpe = nn.Embedding(config.block_size, config.n_embed), # Positional embedding
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embed),
        ))

        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)

        self.transformer.wte.weight = self.lm_head.weight

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, "residual_rescale"):
                std = (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def configure_optimizers(self, weight_decay, learning_rate, device):
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {"params": decay_params, "weight_decay": weight_decay},
            {"params": nodecay_params, "weight_decay": 0.0}
        ]
        use_fused = "cuda" in device
        return torch.optim.AdamW(optim_groups, lr=3e-4, betas=(0.9,0.95), eps=1e-8, fused=use_fused)
    
    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, "seq len limit"

        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = func.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
    
    @classmethod
    def from_pretrained(cls, model_type):
        assert model_type in {"gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"}
        from transformers import GPT2LMHeadModel

        config_args = {
            "gpt2": dict(n_layer=12, n_head=12, n_embed=768), # 124M
            "gpt2-medium": dict(n_layer=24, n_head=16, n_embed=1024), # 350M
            "gpt2-large": dict(n_layer=36, n_head=20, n_embed=1280), # 774M
            "gpt2-xl": dict(n_layer=48, n_head=25, n_embed=1600), # 1.558B
        }[model_type]

        config_args["vocab_size"] = 50257
        config_args["block_size"] = 1024

        config = GPTConfig(**config_args)

        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith(".attn.bias")]

        hf_model = GPT2LMHeadModel.from_pretrained(model_type)
        hf_sd = hf_model.state_dict()

        hf_sd_keys = hf_sd.keys()
        hf_sd_keys = [k for k in hf_sd_keys if not k.endswith(".attn.masked_bias")]
        hf_sd_keys = [k for k in hf_sd_keys if not k.endswith(".attn.bias")]
        transposed = ["attn.c_attn.weight", "attn.c_proj.weight", "mlp.c_fc.weight", "mlp.c_proj.weight"]

        assert len(sd_keys) == len(hf_sd_keys), "mismatched keys"

        for k in hf_sd_keys:
            if any(k.endswith(w) for w in transposed):
                assert hf_sd[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(hf_sd[k].t())
            else:
                assert hf_sd[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(hf_sd[k])

        return model



In [7]:
model = GPT.from_pretrained("gpt2")
#print("\n".join(model.state_dict().keys()))

#model = GPT(GPT2_124M_Config())
model.eval()
model.to("cuda")

import tiktoken
enc = tiktoken.get_encoding("gpt2")

In [97]:

num_return_sequences = 1
max_length = 100

tokens = enc.encode("glurp")
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
x = tokens.to("cuda")

torch.manual_seed(8)
torch.cuda.manual_seed(8)

while x.size(1) < max_length:
    with torch.no_grad():
        logits, _ = model(x)
        logits = logits[:, -1, :] # last position
        probs = func.softmax(logits / 1, dim=-1)
        #logits -= (logits / 1).max(1, keepdim=True).values
        #logexp = (logits).exp()
        #probs = logexp / logexp.sum(dim=0)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        ix = torch.multinomial(topk_probs, 1)
        xcol = torch.gather(topk_indices, -1, ix)
        x = torch.cat((x, xcol), dim=1)

for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)

    print(f"[{i}]\n{decoded}\n")

[0]
glurp.org and are available on the web at http://www.youtube.com/user/DrACnix

The E3 2016 is happening in October, so if you love seeing games, you have more time than others to see some amazing games. The game reveal is getting underway in the second half of October.

We always look forward to your participation in our E3. And the winners of the E3 should sign our petition to make their games available via



In [92]:
max_lr = 3e-4
min_lr = 0.1 * max_lr
warmup_steps = 10
max_steps = 50
def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    if it > max_steps:
        return min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)


total_batch_size = 2**19
partial_batch_size = 16
sequence_length = 1024
grad_accum_steps = total_batch_size // (partial_batch_size * sequence_length)

max_steps = 50

optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device=device)
for step in range(max_steps):
    optimizer.zero_grad()
    for partial_step in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            logits, loss = model(x, y)
        loss /= grad_accum_steps
        loss.backward()
    
    norm = torch.utils.clip_grad_norm_(model.parameters(), 1.0)
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr
    
    optimizer.step()
    torch.cuda.synchronize()
    print(f"step {step} | loss: {loss.item():.5f} | norm: {norm:.4f}")

NameError: name 'device' is not defined