## GPT
In this chapter we will build GPT-model from scratch.

In [42]:
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plot
%matplotlib inline

First let's build a simple bigram model, that will prepare us to GPT architecture.

In [5]:
PATH = 'tinyshakespeare.txt'

def read(path: str) -> str:
    with open(path) as fileobj:
        return fileobj.read()


TEXT = read(PATH)
print(f'{len(TEXT)=}')

len(TEXT)=1115394


In [11]:
def get_chars(text: str) -> str:
    return ''.join(sorted(set(text)))


CHARS = get_chars(TEXT)
print(f'{len(CHARS)=}, {CHARS=!r}')

len(CHARS)=65, CHARS="\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"


In [17]:
CHAR_TO_CLASS = {ch: i for i, ch in enumerate(CHARS)}
CLASS_TO_CHAR = dict(enumerate(CHARS))

def as_tokens(text: str) -> torch.Tensor:
    return torch.tensor([CHAR_TO_CLASS[ch] for ch in text])


def from_tokens(tokens: torch.Tensor) -> str:
    chars = (CLASS_TO_CHAR[c.item()] for c in tokens)
    return ''.join(chars)


print(f'{as_tokens("test")=}')
print(f'{from_tokens(as_tokens("test"))=}')

as_tokens("test")=tensor([58, 43, 57, 58])
from_tokens(as_tokens("test"))='test'


In [20]:
def split_at(tokens, split):
    i = int(len(tokens) * split)
    return tokens[:i], tokens[i:]
    
TRAINING_SET, VALIDATION_SET = split_at(as_tokens(TEXT), 0.9)
print(f'{len(TRAINING_SET)=} {len(VALIDATION_SET)=}')

len(TRAINING_SET)=1003854 len(VALIDATION_SET)=111540


In [40]:
# for use in SGD
BATCH_SIZE = 32

# how many tokens can our network see at most
CONTEXT_LEN = 8


def get_batch(training_set, batch_size=BATCH_SIZE, context_len=CONTEXT_LEN):
    end_index = len(training_set) - context_len 
    # torch.randint is not inclusive
    offsets = torch.randint(end_index, (batch_size,))
    inputs = torch.stack([training_set[i:i+context_len] for i in offsets])
    outputs = torch.stack([training_set[i+1:i + context_len + 1] for i in offsets])
    # for each input in a batch the corresponding output is next character given current and all of the previous characters in the input
    # inputs and outputs have the same shape, it's just that input uses all of the previous elements, it's just a storage optimization
    return inputs, outputs




In [69]:
class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # first nn.Embedding arg is size of the dictionary, second arg is a size of a single embedded vector
        self.emb = nn.Embedding(vocab_size, vocab_size)

    def forward(self, inputs, outputs=None):
        # inputs.shape == outputs.shape == (BATCH_SIZE, CONTEXT_LEN) == (B, T)
        logits = self.emb(inputs)
        # logits.shape == (B, T, C), C == vocab_size
        B, T, C = logits.shape
        
        if outputs is None:
            loss = None
        else:
            # .view(...) calls are to work around the F.cross_entropy argument expectations (C should be the second dimension)                    
            logits = logits.view(B * T, C)
            outputs = outputs.view(B * T)
            loss = F.cross_entropy(logits, outputs)
        return logits, loss

    def generate(self, input_, max_chars_to_generate):
        for _ in range(max_chars_to_generate):
            logits, _ = self.forward(input_)
            # logits.shape == (B, T, C)
            logits = logits[:, -1, :]
            # logits.shape == (B, C)
            probs = F.softmax(logits, dim=1)
            next_ = torch.multinomial(probs, num_samples=1)
            # next_.shape == (B, 1)
            input_ = torch.cat((input_, next_), dim=1)
        return input_


def train_and_generate(learning_rate=1e-2, num_iterations=1000, verbose=False, prompt=""):
    model = BigramModel(len(CHARS))
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)    
    for i in range(num_iterations):
        inputs, outputs = get_batch(TRAINING_SET)
        _, loss = model(inputs, outputs)
        if verbose and i % 1000 == 0:
            print(f'{loss.item()=}')
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        
    text = from_tokens(model.generate(as_tokens(prompt).view(1, -1), max_chars_to_generate=150)[0])
    print(f'\n{text}\n')
    return model


MODEL = train_and_generate(verbose=True, num_iterations=10000, prompt="Call me")

loss.item()=4.535597801208496
loss.item()=2.5105106830596924
loss.item()=2.4556617736816406
loss.item()=2.4507646560668945
loss.item()=2.512702226638794
loss.item()=2.4667935371398926
loss.item()=2.4720287322998047
loss.item()=2.478483200073242
loss.item()=2.372753381729126
loss.item()=2.435739278793335

Call mevot jurinotn: havedom't, w ndat, weraris qurrigerot, be in brk m rmpl
Jarwoarnt aimasonee d H:-
Thave thearamucll INGis
s t
Ancrt, Ed hus an me y thin



We can see that the generated text looks like shit. Let's improve that by using GPT architecture.

In [81]:
import math

class AttentionHead(nn.Module):
    # Attention is another word for communication, 
    def __init__(self, block_size, n_embd, head_size):
        super().__init__()

        # what token contains
        self.key = nn.Linear(n_embd, head_size, bias=False)
        
        # what token is interested in
        self.query = nn.Linear(n_embd, head_size, bias=False)

        # what token provides???
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # register_buffer to avoid training on it
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.head_size = head_size

    def forward(self, inputs):
        # inputs.size = B, T, C (num_batches, block_size, num_channels)
        # outputs.size = B, T, Head_Size
        B, T, C = inputs.shape

        # (k|q|v).shape == B, T, head_size
        k = self.key(inputs)
        q = self.query(inputs)
        v = self.value(inputs)

        wei = q @ k.transpose(-2, -1) / (self.head_size ** 0.5)
        # wei.shape = B, T, T
        wei = wei.masked_fill(self.tril[:T, :T] == 0, -math.inf)
        wei = F.softmax(wei, dim=2)
        return wei @ v


class AttentionMultiHead(nn.Module):
    # MultiHead just uses separate heads and concatenates them
    def __init__(self, block_size, n_embd, head_size, num_heads):
        super().__init__()
        self.heads = [AttentionHead(block_size=block_size, n_embd=n_embd, head_size=head_size) for _ in range(num_heads)]
        self.proj = nn.Linear(num_heads * head_size, n_embd)

    def forward(self, inputs):
        outputs = [head(inputs) for head in self.heads]
        return self.proj(torch.cat(outputs, -1))


class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.module = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, inputs):
        return self.module(inputs)


class Block(nn.Module):
    def __init__(self, block_size, n_embd, num_heads):
        super().__init__()
        assert n_embd % num_heads == 0
        head_size = n_embd // num_heads
        self.multi_head = AttentionMultiHead(n_embd=n_embd, block_size=block_size, num_heads=num_heads, head_size=head_size)
        self.ffd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, inputs):
        inputs = inputs + self.multi_head(self.ln1(inputs))
        inputs = inputs + self.ffd(self.ln2(inputs))
        return inputs


class GptModel(nn.Module):
    def __init__(self, vocab_size, n_embd, block_size, num_heads, num_layers):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, n_embd)
        # why do we need position embeddings?
        self.position_embeddings = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(block_size=block_size, n_embd=n_embd, num_heads=num_heads) for _ in range(num_layers)])
        self.ln_final = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embeddings(idx) # (B,T,C)
        pos_emb = self.position_embeddings(torch.arange(T)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_final(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, inputs, block_size, max_chars_to_generate):
        for _ in range(max_chars_to_generate):
            logits, _ = self.forward(inputs[:, -block_size:])
            # logits.shape == (B, T, C)
            logits = logits[:, -1, :]
            # logits.shape == (B, C)
            probs = F.softmax(logits, dim=1)
            next_ = torch.multinomial(probs, num_samples=1)
            # next_.shape == (B, 1)
            inputs = torch.cat((inputs, next_), dim=1)
        return inputs



def train_and_generate(learning_rate=1e-2, num_iterations=1000, verbose=False, prompt=""):
    block_size = 8
    model = GptModel(vocab_size=len(CHARS), n_embd=32, block_size=block_size, num_heads=4, num_layers=2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)    
    for i in range(num_iterations):
        inputs, outputs = get_batch(TRAINING_SET)
        _, loss = model(inputs, outputs)
        if verbose and i % 1000 == 0:
            print(f'{loss.item()=}')
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        
    text = from_tokens(model.generate(as_tokens(prompt).view(1, -1), max_chars_to_generate=150, block_size=block_size)[0])
    print(f'\n{text}\n')
    return model


MODEL = train_and_generate(verbose=True, num_iterations=20000, prompt="Call me")
        

loss.item()=4.383978843688965
loss.item()=2.34675931930542
loss.item()=2.1892449855804443
loss.item()=2.0623483657836914
loss.item()=1.9102087020874023
loss.item()=2.212172746658325
loss.item()=2.0802464485168457
loss.item()=2.195741653442383
loss.item()=1.933051586151123
loss.item()=2.0005807876586914
loss.item()=2.011009693145752
loss.item()=2.0354316234588623
loss.item()=1.9687249660491943
loss.item()=2.0049242973327637
loss.item()=1.8740549087524414
loss.item()=2.027743101119995
loss.item()=1.8891254663467407
loss.item()=1.9559295177459717
loss.item()=1.931981086730957
loss.item()=1.8831462860107422

Call me,
be seth of Good the migh metmentery; and bed ha wither unquisy, in yo nor blose, contal I maifs my met gacke a grembint is'ttninether.
Reives who:
T



That's not especially good text, and it contains some real words and that's with a pretty small block_size. Also loss is lower with GPT architecture.

I don't fully comprehend how does GPT work: key/query/value understanding is very sketchy.