## GPT
In this chapter we will build a GPT model from scratch.

In [42]:
import torch
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plot
%matplotlib inline

First, let's build a simple bigram model, which will prepare us for the GPT architecture.

In [5]:
# Path to the plain-text training corpus, relative to the notebook's working directory.
PATH = 'tinyshakespeare.txt'

def read(path: str) -> str:
    """Return the entire contents of the text file at ``path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(path, encoding='utf-8') as fileobj:
        return fileobj.read()


# Load the whole corpus into memory once; later cells derive the vocabulary and the datasets from it.
TEXT = read(PATH)
# Report the corpus size in characters.
print(f'{len(TEXT)=}')

len(TEXT)=1115394


In [11]:
def get_chars(text: str) -> str:
    """Build the vocabulary string for ``text``.

    Returns every distinct character of ``text`` exactly once, sorted in
    ascending order and concatenated into a single string.
    """
    unique_chars = list(set(text))
    unique_chars.sort()
    return ''.join(unique_chars)


# Vocabulary: every distinct character of the corpus, sorted. A character's position in CHARS serves as its token id.
CHARS = get_chars(TEXT)
print(f'{len(CHARS)=}, {CHARS=!r}')

len(CHARS)=65, CHARS="\n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"


In [17]:
# Lookup tables between characters and integer token ids (a character's id is its index in CHARS).
CHAR_TO_CLASS = dict(zip(CHARS, range(len(CHARS))))
CLASS_TO_CHAR = {index: char for index, char in enumerate(CHARS)}

def as_tokens(text: str) -> torch.Tensor:
    """Encode ``text`` as a 1-D tensor of integer token ids.

    Each character is mapped through ``CHAR_TO_CLASS``. The dtype is pinned to
    ``torch.long`` because ``torch.tensor([])`` would otherwise infer a float
    dtype for an empty string, and float tensors cannot be used as embedding
    indices.

    Raises:
        KeyError: if ``text`` contains a character missing from ``CHAR_TO_CLASS``.
    """
    return torch.tensor([CHAR_TO_CLASS[ch] for ch in text], dtype=torch.long)


def from_tokens(tokens: torch.Tensor) -> str:
    """Decode a sequence of token ids back into text.

    Iterates over ``tokens``, converts each element to a Python scalar with
    ``.item()``, looks it up in ``CLASS_TO_CHAR`` and concatenates the
    resulting characters.
    """
    pieces = []
    for token in tokens:
        pieces.append(CLASS_TO_CHAR[token.item()])
    return ''.join(pieces)


# Round-trip sanity check: encoding and then decoding should reproduce the original string.
print(f'{as_tokens("test")=}')
print(f'{from_tokens(as_tokens("test"))=}')

as_tokens("test")=tensor([58, 43, 57, 58])
from_tokens(as_tokens("test"))='test'


In [20]:
def split_at(tokens, split):
    """Cut ``tokens`` into a leading and a trailing part.

    ``split`` is the fraction of elements (e.g. ``0.9``) that goes into the
    first part; the cut index is ``len(tokens) * split`` truncated to an int.

    Returns a ``(head, tail)`` pair of slices of ``tokens``.
    """
    cut = int(len(tokens) * split)
    head = tokens[:cut]
    tail = tokens[cut:]
    return head, tail
    
# Encode the corpus and hold out its final 10% for validation; the first 90% is used for training.
TRAINING_SET, VALIDATION_SET = split_at(as_tokens(TEXT), 0.9)
print(f'{len(TRAINING_SET)=} {len(VALIDATION_SET)=}')

len(TRAINING_SET)=1003854 len(VALIDATION_SET)=111540


In [40]:
# Mini-batch size for SGD: how many sequences are sampled per optimisation step.
BATCH_SIZE = 32

# Maximum number of tokens the network can see at once (length of each sampled sequence).
CONTEXT_LEN = 8


def get_batch(training_set, batch_size=BATCH_SIZE, context_len=CONTEXT_LEN):
    """Sample a random mini-batch of (input, target) windows from ``training_set``.

    Returns a pair ``(inputs, outputs)`` built by stacking ``batch_size``
    slices of length ``context_len`` each (shape ``(batch_size, context_len)``
    when ``training_set`` is a 1-D token tensor).

    ``outputs`` is the same window as ``inputs`` shifted one position forward
    in the data, so ``outputs[b, t]`` is the token that follows the prefix
    ``inputs[b, :t + 1]``. A single window therefore encodes ``context_len``
    training examples (one per prefix) without storing each prefix separately.
    """
    # A window starting at s reads training_set[s : s + context_len + 1].
    # torch.randint excludes its upper bound, so the largest start drawn is
    # len(training_set) - context_len - 1 and the shifted slice stays in range.
    num_starts = len(training_set) - context_len
    starts = torch.randint(num_starts, (batch_size,)).tolist()
    inputs = torch.stack([training_set[s:s + context_len] for s in starts])
    outputs = torch.stack([training_set[s + 1:s + 1 + context_len] for s in starts])
    return inputs, outputs




In [64]:
class BigramModel(nn.Module):
    """Bigram language model: the next-token scores depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; looking
    up token ``i`` yields a vector of ``vocab_size`` logits, one per candidate
    next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # first nn.Embedding arg is the size of the dictionary, second arg is the size of a single embedded vector;
        # both are vocab_size here, so each embedding row is used directly as a vector of logits
        self.emb = nn.Embedding(vocab_size, vocab_size)

    def forward(self, inputs, outputs=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            inputs: integer token ids of shape (B, T).
            outputs: optional target token ids of shape (B, T).

        Returns:
            A ``(logits, loss)`` pair. Without ``outputs``, ``logits`` has
            shape (B, T, C) with C == vocab_size and ``loss`` is ``None``.
            With ``outputs``, ``logits`` is flattened to (B * T, C) and
            ``loss`` is the cross-entropy between logits and targets.
        """
        logits = self.emb(inputs)
        # logits.shape == (B, T, C), C == vocab_size
        B, T, C = logits.shape

        if outputs is None:
            return logits, None

        # F.cross_entropy expects the class dimension second, so batch and time are folded together.
        # reshape (rather than view) also accepts non-contiguous targets, e.g. a transposed tensor.
        logits = logits.reshape(B * T, C)
        targets = outputs.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, input_, max_chars_to_generate):
        """Extend ``input_`` by sampling ``max_chars_to_generate`` tokens.

        Args:
            input_: integer token ids of shape (B, T) with T >= 1; the last
                token of each row conditions the first sampled token.
            max_chars_to_generate: number of tokens appended to every row.

        Returns:
            Tensor of shape (B, T + max_chars_to_generate): the prompt followed
            by the sampled tokens.
        """
        # Sampling never needs gradients; disabling autograd avoids building a graph on every step.
        with torch.no_grad():
            for _step in range(max_chars_to_generate):
                # Call the module (not .forward directly) so registered hooks still run.
                logits, _loss = self(input_)
                # logits.shape == (B, T, C); keep only the last time step
                last_logits = logits[:, -1, :]
                # last_logits.shape == (B, C)
                probs = F.softmax(last_logits, dim=1)
                next_ = torch.multinomial(probs, num_samples=1)
                # next_.shape == (B, 1)
                input_ = torch.cat((input_, next_), dim=1)
        return input_


def train_and_generate(learning_rate=1e-2, num_iterations=1000, verbose=False, prompt=""):
    """Train a fresh BigramModel on TRAINING_SET, print a generated sample and return the model.

    Args:
        learning_rate: step size passed to the AdamW optimizer.
        num_iterations: number of optimisation steps; each uses one random mini-batch.
        verbose: when true, print the mini-batch loss every 1000 steps.
        prompt: text the generated sample continues from. An empty prompt
            gives the model no token to condition on, so generation is seeded
            with the first vocabulary character (``CHARS[0]``) instead; that
            seed character is part of the printed text.

    Returns:
        The trained BigramModel.
    """
    model = BigramModel(len(CHARS))
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    for step in range(num_iterations):
        inputs, outputs = get_batch(TRAINING_SET)
        _, loss = model(inputs, outputs)
        if verbose and step % 1000 == 0:
            print(f'{loss=}')
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # generate() samples from the logits of the last token, so it needs at least one token.
    # An empty string would also encode to a tensor that cannot index the embedding table.
    seed_text = prompt if prompt else CHARS[0]
    seed_tokens = as_tokens(seed_text).view(1, -1)
    text = from_tokens(model.generate(seed_tokens, 50)[0])
    print(f'{text=}')
    return model


# Train for 10,000 steps (the loss is printed every 1,000 steps), then print a 50-token continuation of "Call me".
MODEL = train_and_generate(verbose=True, num_iterations=10000, prompt="Call me")

loss=tensor(4.6535, grad_fn=<NllLossBackward0>)
loss=tensor(2.4651, grad_fn=<NllLossBackward0>)
loss=tensor(2.3875, grad_fn=<NllLossBackward0>)
loss=tensor(2.4232, grad_fn=<NllLossBackward0>)
loss=tensor(2.4819, grad_fn=<NllLossBackward0>)
loss=tensor(2.3876, grad_fn=<NllLossBackward0>)
loss=tensor(2.4621, grad_fn=<NllLossBackward0>)
loss=tensor(2.3209, grad_fn=<NllLossBackward0>)
loss=tensor(2.3721, grad_fn=<NllLossBackward0>)
loss=tensor(2.3765, grad_fn=<NllLossBackward0>)
text="Call me taks n, a.\nImplartsthok'ss s.\nILEN he wend mo bes"
