In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

BLOCK_SIZE = 64
BATCH_SIZE = 128
LEARNING_RATE = 3e-4
MAX_ITERS = 3000
EVAL_ITERS = 100
N_EMBD = 384
N_LAYER = 8
N_HEAD = 8
DROPOUT = 0.2

cuda


In [2]:
# 81 unique chars in the text
chars = ''
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    chars = sorted(list(set(text)))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [3]:
# Tokenization is the process of breaking a text into smaller units called tokens (token = char here)
# Many ML models work with numerical data
string_to_int = { ch : i for i, ch in enumerate(chars) }
int_to_string = { i : ch for i, ch in enumerate(chars) }

# lambda s = function that takes input s and converts every char to int
encode = lambda s : [string_to_int[c] for c in s]
# l is a list of int
decode = lambda l : ''.join([int_to_string[i] for i in l])

print(encode('hello'))
print(decode([61, 58, 65, 65, 68]))

[61, 58, 65, 65, 68]
hello


A tensor is a multi-dimensional array used in PyTorch for efficient numerical computation, similar to arrays in NumPy. 
Tensors are designed to work with GPUs through CUDA and ML and DL functionalities like computing gradients.

torch.long represents a 64-bit signed integer (from -2^63 to 2^63-1) and is typically used for indicing even though
int may be sufficient for the 81 chars here

In [4]:
data = torch.tensor(encode(text), dtype = torch.long)
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


A bigram language model is a type of statistical language model that predicts the likelihood of a word based on the previous word in a sequence.
Ex: "the cat sat," the bigrams are: ("the", "cat") and ("cat", "sat").
The model will predict "cat" after "the" based on frequency and "sat" after "cat" based on frequency.

Block Size: Refers to how many tokens (e.g., words or characters) are used together as input for a model to process or predict.

Example: If you're training a language model with a block size of 3, you'd train it on sequences like:
"the cat sat" → [the, cat, sat]

During training, the model essentially learns the likelihood or probability of the third word (target) for any given pair of preceding words (context). It builds a kind of mapping like: P('sat' | 'the', 'cat')

After the model is trained, you can provide it with any two consecutive words, and it will predict the third word based on what it has learned.
Input: ['cat', 'sat']
Prediction: The model will predict 'on'.

In [5]:
# Split data into 80:20
# No need to shuffle data because it's sequential
split_point = int(0.8 * len(data))
train_data = data[ : split_point]
validate_data = data[split_point : ]

The get_batch function:
1. Randomly selects BATCH_SIZE number of starting positions in the wizard of oz text (ex. ch. 8, 1, 2, 9)
2. For each starting position, get the block of BLOCK_SIZE with int of the chars from the starting position (ex. [1, 58, ...])
3. Store x and y in GPU memory

In [6]:
def get_batch(data_type):
    data = train_data if data_type == 'train' else validate_data
    # Batch size number of random int between 0 and len(text) - block size
    starting_position = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE,))
    # print(starting_position)

    # Stack blocks/sequences in batches
    x = torch.stack([data[i : i + BLOCK_SIZE] for i in starting_position])
    y = torch.stack([data[i + 1 : i + BLOCK_SIZE + 1] for i in starting_position])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs: ', x)
print('targets: ', y)

inputs:  tensor([[67, 57,  1,  ..., 68, 76,  1],
        [67,  1, 61,  ..., 60, 61, 73],
        [ 1, 73, 61,  ..., 55, 58, 59],
        ...,
        [56, 54, 74,  ..., 54, 60, 62],
        [71, 57,  1,  ..., 72, 73,  1],
        [57,  1, 54,  ...,  1, 70, 74]], device='cuda:0')
targets:  tensor([[57,  1, 72,  ..., 76,  1, 72],
        [ 1, 61, 62,  ..., 61, 73,  1],
        [73, 61, 58,  ..., 58, 59, 68],
        ...,
        [54, 74, 72,  ..., 60, 62, 56],
        [57,  1, 73,  ..., 73,  1, 55],
        [ 1, 54, 73,  ..., 70, 74, 62]], device='cuda:0')


@torch.no_grad() - no gradient to improve performance
Not needed; just reporting loss

In [7]:
@torch.no_grad()
def estimate_loss():
    out = {}
    # Puts model in evaluate mode (like dropout which drops out random neurons for more noise)
    model.eval()
    for split in ['train', 'validate']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            X, y = get_batch(split)
            logits, loss = model(X, y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    # Puts model in train mode (like weights)
    model.train()
    return out

In [8]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(N_EMBD, head_size, bias=False)
        self.query = nn.Linear(N_EMBD, head_size, bias=False)
        self.value = nn.Linear(N_EMBD, head_size, bias=False)
        # No look ahead masking
        # self.register_buffer('tril', torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))
        self.register_buffer('tril', torch.tril(torch.ones(128, 128)))
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        # Input size (batch, time-step, channels)
        B, T, C = x.shape
        key = self.key(x)   # (B, T, head size)
        query = self.query(x)   # (B, T, head size)

        # Compute attention scores ('affinities')
        weight = query @ key.transpose(-2, -1) * key.shape[-1] ** -0.5   # (B, T, head size) @ (B, head size, T) -> (B, T, T)

        weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf'))   # (B, T, T), turn 0 to -inf
        # weight = weight.masked_fill(
        #     torch.tril(torch.ones(T, T, device=x.device)) == 0, float('-inf')
        # )

        weight = F.softmax(weight, dim = -1)   # (B, T, T)
        weight = self.dropout(weight)

        # Perform the weighted aggregation of the values
        value = self.value(x)   # (B, T, head size)
        out = weight @ value   # (B, T, T) @ (B, T, head size) -> (B, T, head_size)
        
        # Output size (batch, time-step, head size)
        return out

In [9]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        # Have heads run in parallel
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(head_size * num_heads, N_EMBD)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, ...h3]) where h = head
        out = self.dropout(self.projection(out))
        return out

In [10]:
class FeedForward(nn.Module):
    """ A simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),   # <= 0 becomes 0 and > 0 stays as positive
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(DROPOUT)
        )

    def forward(self, x):
        return self.net(x)

head_size = number of features that each head will be capturing in multi-head attention
Layer norms help smooth out features.
Post norm converges better for this project.

In [11]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.self_attention = MultiHeadAttention(n_head, head_size)
        self.feed_fwd = FeedForward(n_embd)
        self.layer_norm1 = nn.LayerNorm(n_embd)
        self.layer_norm2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.self_attention(x)
        x = self.layer_norm1(x + y)   # Add a norm
        y = self.feed_fwd(x)
        x = self.layer_norm2(x + y)
        return x

def __init__ part:
    - the super() calls the constructor of the parent class, nn.Module, to initialize the Bigram model as a NN model
    - embeddings table is a lookup table: 
        Token	Index	Embedding Vector
         'a'	  0	    [0.1, -0.4, 0.7, 0.3, 0.2]
        The raw scores (logits) are learned during training.
        They are initially random and then updated to minimize the prediction error (loss).
        Backpropagation and optimization are used to adjust these raw scores based on how well the model predicts the next token.
        The logits are not probabilities but raw predictions, which are converted into probabilities using softmax.
        
        Ex. 0.1 is the model's raw score (logit) for 'a' being the next token

def forward: 
    - Index = the batch (batch_size * block_size)
    - Logits = predictions for the next char (raw scores over the vocab) = 1 raw score for each vocab for each token in the input
               shape = batch_size * block_size * vocab_size
    - batch_size = number of sequences in a batch (number of [ ] processed in parallel/at the same time with GPU)
    - block_size = length of each sequence (number of tokens in a block, [ ] = 1 block)
    - Channels = size of the output vector/neurons for each token = vocab_size
    - Cross-Entropy Loss/Loss = measures how 'wrong' the prediction is (only used during training); compares model predictions (logits) to
                              the correct next tokens (targets); want to minimize loss

    In the else statement:
    - Unpack the logits tensor to get the batch size, block size, and vocabulary size (4, 8, 81)
    - Flatten the logits (32, 81) and targets (32,) to use for cross_entropy

Transformer architecture is a deep learning model designed for sequence-to-sequence tasks, such as language translation. It relies on **self-attention mechanisms** to weigh the importance of different parts of the input data, allowing it to process sequences in parallel rather than sequentially, which speeds up training. The model is composed of **encoder** and **decoder** layers, each containing multi-head attention and feedforward networks.

**Multi-head attention** is a mechanism that allows the model to focus on different parts of the input sequence simultaneously. It splits the attention into multiple "heads," each learning a different representation of the data, and then combines them to form a richer understanding of the sequence.

def generate:
    - Get the logits (raw scores) from self.forward for the next token prediction
    - Change logits shape from (batch_size, block_size, vocab_size) to (batch_size, vocab_size)
    - Converts raw logits into probabilities using softmax, ensuring they sum to 1
    - Samples a token from the probability instead of picking the max value (adds randomness) with shape (batch_size, 1) (one sampled token per batch)
        - dim = -1 means last dimension
    - Append new tokens to the end of the sequences (so index shape (batch_size, block_size) becomes (batch_size, block_size + 1)) 

In [12]:
class GPTLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
        # Make 4 sequential blocks
        self.decoder_blocks = nn.Sequential(*[Block(N_EMBD, n_head=N_HEAD) for _ in range(N_LAYER)])

        # For normalizing the decoders in order to feed into Softmax
        self.final_layer_norm = nn.LayerNorm(N_EMBD)
        self.lang_model_head = nn.Linear(N_EMBD, vocab_size)

        self.apply(self._init_weights)

    # Apply initialization on the weights
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        # logits = self.token_embedding_table(index)
        batch_size, block_size = index.shape

        # index and targets are both (B, T) tensor of int. Torch broadcasting rules
        token_embedding = self.token_embedding_table(index)   # (B, T, C)
        pos_embedding = self.position_embedding_table(torch.arange(block_size, device=device))   # (T, C)
        x = token_embedding + pos_embedding  # (B, T, C)
        x = self.decoder_blocks(x)   # (B, T, C)
        x = self.final_layer_norm(x)   # (B, T, C) 
        logits = self.lang_model_head(x)   # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            batch_size, block_size, channels = logits.shape   # B, T, C
            logits = logits.view(batch_size * block_size, channels)
            targets = targets.view(batch_size * block_size)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim = -1)
            index_next = torch.multinomial(probs, num_samples = 1)
            index = torch.cat((index, index_next), dim = 1)
        return index

Initialize a starting token (0) (The context is initialized to give the model a starting point for generating new tokens)
Generate 500 new tokens from the model.
Convert the generated token sequence into a human-readable format.
Right now it's untrained (random weights) so nonsense generated_chars are produced

In [13]:
model = GPTLanguageModel(vocab_size)
m = model.to(device)

The AdamW optimizer includes weight decay for regularization and updates the model's parameters (weights) during training.
The code trains the model by:

1. Getting a batch of data.
2. Performing a forward pass to compute predictions and loss.
3. Clear the gradient to start fresh in the next iter b/c previous gradient is from previous data which can be biased. The optimizer adjusts each weight in the opposite direction of the gradient to reduce the loss (gradient descent).
4. Calculating gradients using backpropagation.
5. Updating the model parameters (weights) for the next iteration.
6. Get the final loss for the last batch

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for iter in range(MAX_ITERS):
    if iter % EVAL_ITERS == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['validate']:.3f}")
        
    xb, yb = get_batch('train')

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

print('Final loss: ', loss.item())

step: 0, train loss: 4.425, val loss: 4.427
step: 100, train loss: 2.321, val loss: 2.393
step: 200, train loss: 1.859, val loss: 1.982
step: 300, train loss: 1.614, val loss: 1.775
step: 400, train loss: 1.466, val loss: 1.666
step: 500, train loss: 1.363, val loss: 1.605
step: 600, train loss: 1.280, val loss: 1.556
step: 700, train loss: 1.224, val loss: 1.530
step: 800, train loss: 1.162, val loss: 1.514
step: 900, train loss: 1.110, val loss: 1.505
step: 1000, train loss: 1.063, val loss: 1.505
step: 1100, train loss: 1.010, val loss: 1.515


KeyboardInterrupt: 

In [15]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`