In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# hyperparameters
# NOTE: batch_size and block_size are reassigned to much smaller demo values
# in the walkthrough cells further down.

# batching
batch_size = 64   # number of independent sequences processed in parallel
block_size = 256  # maximum context length used for predictions

# iteration counts and step size
learning_rate = 3e-4
max_iters = 5000
eval_interval = 500
eval_iters = 200

# model settings
n_embd, n_head, n_layer = 384, 6, 6
dropout = 0.2

# run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [18]:
# The corpus has to be downloaded once before running this cell:
#   wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# peek at the first 100 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [19]:
# size of the corpus, counted in characters (len() of the raw string)
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


# Encoding of characters

This character-level encoding is very simple: every distinct character gets its own integer. Other tokenization methods exist, e.g. `tiktoken`, `sentencepiece`, etc.

In [20]:
# the vocabulary is every distinct character that occurs in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(f"vocab size: {vocab_size}")

# lookup tables between characters and their integer ids
stoi = dict(zip(chars, range(vocab_size)))  # string to integer
itos = dict(enumerate(chars))               # integer to string


def encode(s):
    """Encoder: take a string, output a list of integers (one id per character)."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output the string they spell."""
    return ''.join(itos[i] for i in l)


vocab size: 65


In [22]:
# round-trip a sample string: text -> ids -> text
sample = "to be or not to be"
sample_ids = encode(sample)
print(sample_ids)
print(decode(sample_ids))

[58, 53, 1, 40, 43, 1, 53, 56, 1, 52, 53, 58, 1, 58, 53, 1, 40, 43]
to be or not to be


In [23]:
# display the full character -> integer lookup table
stoi

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [24]:
# Encode the whole text and store it in a torch.Tensor.
# torch.long (64-bit integer) is the dtype of the tensor's elements.
data = torch.tensor(encode(text), dtype=torch.long)

# Train and validation splits: the first 90% is train, the remainder is val
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [26]:
# `data` is just the integer ids of the text's characters;
# decoding its first 100 entries gives back the first 100 characters of the text
decode(data[:100].tolist())

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

# Train the model

We don't feed the whole training set into the model at once; that would be too computationally expensive. Instead, we feed it random chunks of the data, a few at a time. The maximum length of these chunks is called `block_size` in this example, but others may call it `context_length`.

In [27]:
# use a tiny context length for this walkthrough
# (this overrides the block_size = 256 set in the hyperparameter cell)
block_size = 8
# one chunk of block_size + 1 tokens: block_size inputs plus the final target
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In this particular example, there are actually many training examples packed into this one chunk, because (thanks to the masking) each position only sees the tokens that come before it.

`18`
<br>
`18, 47`
<br>
`18, 47, 56`  # in the context of seeing `18, 47`, `56` comes next
<br>
etc.

In [28]:
# inputs are the first block_size tokens; targets are the same tokens shifted by one
x, y = train_data[:block_size], train_data[1:block_size+1]
for t in range(block_size):
    # everything up to and including position t predicts the token at t+1
    context, target = x[:t+1], y[t]
    print(f"when input is: {context} the target is: {target}")

when input is: tensor([18]) the target is: 47
when input is: tensor([18, 47]) the target is: 56
when input is: tensor([18, 47, 56]) the target is: 57
when input is: tensor([18, 47, 56, 57]) the target is: 58
when input is: tensor([18, 47, 56, 57, 58]) the target is: 1
when input is: tensor([18, 47, 56, 57, 58,  1]) the target is: 15
when input is: tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
when input is: tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58


Training on every prefix also helps the transformer get used to seeing contexts of every length from 1 up to `block_size`. This is helpful during inference, since generation can start from as little as one token of context. Keep in mind that `block_size` also limits what the model can see: once the context grows beyond `block_size` tokens, it has to be truncated. The positions within a chunk are implicitly the **time** dimension, since the sequence is generated in an order that matters.

Now consider the **batch** dimension. It's taking multiple random chunks of text and grouping them together to keep things efficient and keep the GPUs busy since they can be processed in parallel. The chunks are processed completely independently.

In [30]:
# seed the RNG so the randomly sampled batches below are repeatable
# (these demo sizes override the batch_size / block_size set in the hyperparameter cell)
torch.manual_seed(1337)  # won't match the video's output since I'm using a different dataset
block_size = 8   # the maximum context length for predictions
batch_size = 4   # how many independent sequences will we process in parallel


# data loading
def get_batch(split):
    """Sample a random batch of input/target chunks from one of the splits.

    Reads the module-level `train_data`, `val_data`, `block_size`, `batch_size`
    and `device`.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y): two (batch_size, block_size) tensors moved to `device`;
        `y` is `x` shifted one position to the right (the next-token targets
        needed to compute the loss).
    """
    source = train_data if split == 'train' else val_data
    # random start offsets, chosen so that start + block_size + 1 stays in range
    starts = torch.randint(len(source) - block_size, (batch_size,))
    input_rows, target_rows = [], []
    for start in starts:
        input_rows.append(source[start:start+block_size])
        target_rows.append(source[start+1:start+block_size+1])
    # stack the chunks into a batch and move both tensors to the compute device
    x = torch.stack(input_rows).to(device)
    y = torch.stack(target_rows).to(device)
    return x, y

xb, yb = get_batch('train')
# rows    = the independent sequences in the batch (batch_size of them)
# columns = positions within each sequence (block_size of them)
print('inputs:', xb.shape, sep='\n')
print('\ntargets:', yb.shape, sep='\n')
print('\nyb:\n', yb)


inputs:
torch.Size([4, 8])

targets:
torch.Size([4, 8])

yb:
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [31]:
xb # our input to the transformer: one row per sequence, one column per position

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

In [34]:
# again, each row is a random chunk of data represented by integer values.
# you can verify this here by decoding the first sequence (row 0) of `yb`.
# .tolist() works whether `yb` lives on the CPU or the GPU, whereas
# np.array(yb) raises a TypeError for CUDA tensors.
decode(yb[0].tolist())

"et's hea"

In [33]:
# walk over every (sequence, position) pair in the batch
for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        # the prefix of row b up to position t predicts the token stored in yb[b, t]
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()}, the target is: {target}")

when input is [24], the target is: 43
when input is [24, 43], the target is: 58
when input is [24, 43, 58], the target is: 5
when input is [24, 43, 58, 5], the target is: 57
when input is [24, 43, 58, 5, 57], the target is: 1
when input is [24, 43, 58, 5, 57, 1], the target is: 46
when input is [24, 43, 58, 5, 57, 1, 46], the target is: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43], the target is: 39
when input is [44], the target is: 53
when input is [44, 53], the target is: 56
when input is [44, 53, 56], the target is: 1
when input is [44, 53, 56, 1], the target is: 58
when input is [44, 53, 56, 1, 58], the target is: 46
when input is [44, 53, 56, 1, 58, 46], the target is: 39
when input is [44, 53, 56, 1, 58, 46, 39], the target is: 58
when input is [44, 53, 56, 1, 58, 46, 39, 58], the target is: 1
when input is [52], the target is: 58
when input is [52, 58], the target is: 1
when input is [52, 58, 1], the target is: 58
when input is [52, 58, 1, 58], the target is: 46
when input i

In [34]:
# the batch of inputs that is fed to the model below
print(xb)

tensor([[28, 33, 20, 21, 32,  1, 19, 30],
        [24, 18,  1,  5, 21, 22, 27, 18],
        [31,  1, 26, 18,  1, 28, 33, 32],
        [30, 31,  1, 21, 18, 30,  1, 28]])


# Feed this into a simple language model

The bigram language model is covered in more depth in Andrej Karpathy's *makemore* video series.

In [60]:
# super simple bigram model
# nn.Module is the basic building block for graphs
# https://pytorch.org/docs/stable/nn.html
class BigramLanguageModel(nn.Module):   # subclassing nn.Module
    """Bigram language model: the next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the unnormalised scores (logits) for every token that may follow token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct tokens; used as both the number of
                rows and the width of each row of the embedding table.
        """
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table;
        # for example token 30 will grab row 30 of this embedding table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids (the inputs, i.e. x).
            targets: optional (B, T) tensor of integer token ids (the labels, i.e. y).

        Returns:
            A ``(logits, loss)`` tuple. Without targets, ``logits`` is (B, T, C)
            and ``loss`` is None. With targets, ``logits`` is flattened to
            (B*T, C) and ``loss`` is the cross-entropy between logits and targets.
        """
        # logits are the predictions; C (channels) is the width of the table, i.e. vocab_size
        logits = self.token_embedding_table(idx)  # (B, T, C)

        # evaluate the loss (quality of predictions) only when targets are available.
        # likelihood maximization = minimization of neg log likelihood = minimization of the cross-entropy
        # https://discuss.pytorch.org/t/difference-between-cross-entropy-loss-or-log-likelihood-loss/38816/2
        if targets is None:  # needed for `generate`, where the loss isn't used
            return logits, None

        # `cross_entropy` wants (N, C) logits and (N,) targets, so merge batch and time.
        # reshape (rather than view) also accepts non-contiguous tensors.
        B, T, C = logits.shape
        logits = logits.reshape(B*T, C)  # 2D
        targets = targets.reshape(B*T)   # 1D
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    # sampling never needs gradients, so skip building the autograd graph
    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Generate text by appending one sampled token per step to every sequence.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get the predictions; the loss is None here and is ignored
            logits, _loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the multinomial probability distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # whatever is predicted is concatenated onto the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [61]:
# initialize the model; all it contains is a single embedding table
# per the docs  https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding
# "A simple lookup table that stores embeddings of a fixed dictionary and size."
# here it's a thin wrapper around a tensor of vocab_size x vocab_size
# NOTE(review): the model is not moved to `device` here, while get_batch() does
# move xb/yb there — confirm both are on the same device before calling m(xb, yb)
m = BigramLanguageModel(vocab_size)
m.token_embedding_table

Embedding(37, 37)

In [62]:
# understanding this embedding table object
# for example token 30 will grab row 30 of this
# embedding table
# Single index lookup
index = torch.tensor([30])
embedding_vals_idx30 = m.token_embedding_table(index)
print(embedding_vals_idx30.shape)  # torch.Size([1, vocab_size]): one table row per looked-up index

torch.Size([1, 37])


In [63]:
# the looked-up row itself: the logits the (untrained) model assigns to each possible next token after token 30
embedding_vals_idx30

tensor([[-0.9883,  0.5120,  0.8213, -0.4068,  1.1888, -1.4860,  0.8820,  0.1223,
          0.4179, -0.6261, -0.7949,  0.8804, -0.6083,  0.4067,  0.1444,  0.7383,
          0.2105,  0.7643,  0.6467,  0.6577,  0.3708, -0.4748,  0.5696,  0.8532,
          2.7071,  0.7921, -0.3820, -0.5626,  1.5889,  0.7005, -0.6309, -0.5384,
          2.2132, -0.6222, -0.6999, -0.2165,  2.0037]],
       grad_fn=<EmbeddingBackward0>)

In [64]:
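# NOTE: m(xb, yb) returns a (logits, loss) tuple, so the two lines below would
# fail on `out.shape`; the result has to be unpacked instead (see the next cell).
# out = m(xb, yb)
# print(out.shape)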

In [65]:
# Run one forward pass on the sample batch.
# get_batch() moves xb/yb to `device`, but `m` was never moved there, so send the
# batch to wherever the model's parameters live; otherwise this cell raises a
# device-mismatch RuntimeError on machines where `device` is 'cuda'.
model_device = next(m.parameters()).device
logits, loss = m(xb.to(model_device), yb.to(model_device))
print(logits.shape)  # (batch_size * block_size, vocab_size): forward() flattens when targets are given
print(loss)

torch.Size([32, 37])
tensor(4.1957, grad_fn=<NllLossBackward0>)


In [66]:
# test generation
# start from a single (1, 1) context holding token 0 and sample 100 more tokens
start_idx = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx=start_idx, max_new_tokens=100)
# take the only sequence in the batch and turn its ids back into text
print(decode(generated[0].tolist()))


u
aYkgBnnA
woMAlgevAWlHbyfvA aojhy
CBaA mvdpSIClBsSts'HiWMTuBevBhncl'HiCyMAkbl'iWjMyWcoHpwBIdgkWMkcv


The output is gibberish because the model hasn't been trained yet; its embedding table still holds random initial values.

Also, the bigram model isn't using any history, other than the last character.