In [71]:
# Validate GPU: fail fast, with a readable message, if CUDA device 0 is not usable
import torch
assert torch.cuda.is_available(), "CUDA is not available"
assert torch.cuda.device_count() > 0, "No CUDA devices were found"
assert torch.cuda.current_device() == 0, "Expected CUDA device 0 to be the current device"
# torch.cuda.device(0) is a context manager, so printing it only shows an object
# repr with a memory address. Print the device descriptor instead.
print(torch.device('cuda', 0))
print(torch.cuda.get_device_name(0))

<torch.cuda.device object at 0x0000021CCD0B51E0>
NVIDIA GeForce GTX 1080 Ti


In [72]:
# Read the whole corpus from disk into a single string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [73]:
# Report the corpus size, then preview its first 100 characters
separator = "\n---------------------\n"
print("Length of data in characters: ", len(text), separator)
preview = text[:100]
print(preview)

Length of data in characters:  196215 
---------------------

Asim Shrestha: my ssd is finally not dog
Srijan Subedi: What’s the spec
Srijan Subedi: Ahieeee
Asim 


In [74]:
# Build the vocabulary: every distinct character in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# Show the characters on one line and the vocabulary size on the next
print(''.join(chars), vocab_size, sep='\n')


 !"#$%&'()*+,-./0123456789:;=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\^_`abcdefghijklmnopqrstuvwxyzकजदनभमशािुो्ਇਕਖਗਜਟਦਮਰਸਹ਼ਿੁੂੇੈੰੱ༼༽‍’“”…◕☁☠♂⛵❤つ️🇦🇨🇮🇳🇵🇷🍆🍌🍑🍕🎄🎅🎉🏀🏍🐐🐦👀👈👉👍👎👑👨👳💀💋💤💥💦💩💯📿🔝🔬🕎🕯🖐😂😆😉😌😍😎😐😔😘😡😢😬😭😮😱🙌🙏🚫🛌🛐🛥🤓🤔🤕🤛🤝🤡🤢🤣🤨🤪🤬🤮🤲🥜🥲🥵🥶🦶🧠🧢🫡
215


In [75]:
# Character-level tokenizer: two lookup tables map each character to its integer id and back
strToNum = {ch: i for i, ch in enumerate(chars)}
numToStr = dict(enumerate(chars))


def encode(string):
    """Convert a string into a list of integer ids, one per character."""
    return [strToNum[char] for char in string]


def decode(num_list):
    """Convert a sequence of integer ids back into the string they spell."""
    return ''.join(numToStr[i] for i in num_list)


# Round-trip a sample string to confirm that decode() inverts encode()
startString = "Hey... whats up?"
encoded = encode(startString)
decoded = decode(encoded)

for shown in (startString, encoded, decoded):
    print(shown)

Hey... whats up?
[40, 67, 87, 15, 15, 15, 1, 85, 70, 63, 82, 81, 1, 83, 78, 31]
Hey... whats up?


In [76]:
# Encode the full corpus and hold the ids in an int64 tensor
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)

torch.Size([196215]) torch.int64


In [77]:
# Train on the first 90% of the tokens; hold out the remainder for validation
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [78]:
# block_size is also known as the context length
block_size = 8
# Take one extra token so that the last position in the block also has a next-token target
train_data[0:block_size + 1]

tensor([33, 81, 71, 75,  1, 51, 70, 80, 67])

In [79]:
# Worked example: each prefix of x is a context, and its target is the token that follows it
x = train_data[:block_size]
y = train_data[1:block_size + 1]  # Same window as x, shifted one token to the right
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"When input is {context} the target: {target}")

When input is tensor([33]) the target: 81
When input is tensor([33, 81]) the target: 71
When input is tensor([33, 81, 71]) the target: 75
When input is tensor([33, 81, 71, 75]) the target: 1
When input is tensor([33, 81, 71, 75,  1]) the target: 51
When input is tensor([33, 81, 71, 75,  1, 51]) the target: 70
When input is tensor([33, 81, 71, 75,  1, 51, 70]) the target: 80
When input is tensor([33, 81, 71, 75,  1, 51, 70, 80]) the target: 67


In [80]:
# Sampling configuration
batch_size = 4  # Number of independent sequences processed in parallel
block_size = 8  # Maximum context length used for predictions
torch.manual_seed(1337)  # Fixed seed (the one karpathy uses) so the random draws are repeatable

def get_batch(split):
    """Sample a random mini-batch of input windows and their shifted targets.

    Reads the module-level ``train_data``, ``val_data``, ``block_size`` and
    ``batch_size`` at call time.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(inputs, targets)``, each made of ``batch_size`` stacked
        windows of ``block_size`` tokens. Every target window is its input
        window shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # The exclusive upper bound leaves room for the one-token-shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start : start + block_size])
        targets.append(source[start + 1 : start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
# Print the label, shape and contents of the inputs, then of the targets
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)
print('----')

# Each (batch row, time step) pair is printed as one context/target example
for b in range(batch_size):  # Batch dimension
    row_x, row_y = xb[b], yb[b]
    for t in range(block_size):  # Time dimension
        context = row_x[:t + 1]
        target = row_y[t]
        print(f'When input is {context.tolist()} the target is: {target}')

inputs:
torch.Size([4, 8])
tensor([[ 71,  72,  63,  76,   1,  51,  83,  64],
        [ 70,  63,  27,   1,  70,  82,  82,  78],
        [  1,  51,  70,  71,  76,  27,   1,  33],
        [ 21,   1, 157,   0,  51,  80,  71,  72]])
targets:
torch.Size([4, 8])
tensor([[ 72,  63,  76,   1,  51,  83,  64,  67],
        [ 63,  27,   1,  70,  82,  82,  78,  81],
        [ 51,  70,  71,  76,  27,   1,  33,  71],
        [  1, 157,   0,  51,  80,  71,  72,  63]])
----
When input is [71] the target is: 72
When input is [71, 72] the target is: 63
When input is [71, 72, 63] the target is: 76
When input is [71, 72, 63, 76] the target is: 1
When input is [71, 72, 63, 76, 1] the target is: 51
When input is [71, 72, 63, 76, 1, 51] the target is: 83
When input is [71, 72, 63, 76, 1, 51, 83] the target is: 64
When input is [71, 72, 63, 76, 1, 51, 83, 64] the target is: 67
When input is [70] the target is: 63
When input is [70, 63] the target is: 27
When input is [70, 63, 27] the target is: 1
When input is

In [81]:
# Bi-gram language model definition. One of the simplest language models available

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding lookup table.

    The table has ``input_vocab_size`` rows of width ``input_vocab_size``;
    looking up a token id yields the logits for the token that follows it.
    """

    token_embedding_table: nn.Embedding

    def __init__(self, input_vocab_size):
        """Create the lookup table.

        Args:
            input_vocab_size: Number of distinct tokens. Used as both the
                number of rows and the row width of the embedding table.
        """
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        print(input_vocab_size)
        self.token_embedding_table = nn.Embedding(input_vocab_size, input_vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: Optional (B, T) tensor of integer token ids to predict.

        Returns:
            A ``(logits, loss)`` pair. Without ``targets``, ``logits`` has
            shape (B, T, C) and ``loss`` is ``None``. With ``targets``,
            ``logits`` is returned flattened to (B*T, C) and ``loss`` is the
            cross-entropy between those logits and the flattened targets.
        """
        new_logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            new_loss = None
        else:
            # Merge the batch and time dimensions so that the logits are (N, C)
            # and the targets are (N,) when passed to F.cross_entropy
            b, t, c = new_logits.shape
            new_logits = new_logits.view(b * t, c)
            targets = targets.view(b * t)
            new_loss = F.cross_entropy(new_logits, targets)

        return new_logits, new_loss

    @torch.no_grad()  # Sampling is never back-propagated, so skip autograd bookkeeping
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) tensor of indices forming the current context.
            max_new_tokens: Number of tokens to sample and append.

        Returns:
            A (B, T + max_new_tokens) tensor: ``idx`` followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Get the predictions; no targets are passed, so the loss is None
            new_logits = self(idx)[0]

            # Focus only on the last time step: (B, T, C) -> (B, C)
            new_logits = new_logits[:, -1, :]

            # Apply softmax over the vocabulary dimension to get probabilities
            probs = F.softmax(new_logits, dim=-1)  # (B, C)

            # Sample one token per sequence from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append the sampled index to the running sequence
            idx = torch.cat([idx, idx_next], dim=1)  # (B, T+1)
        return idx



# Run the freshly initialized model on the current batch
bigram_model = BigramLanguageModel(vocab_size)
logits, loss = bigram_model(xb, yb)
print(logits.shape, loss)

# Generate 100 tokens, starting from a single token with id 0
test_idx = torch.zeros((1, 1), dtype=torch.long)
generated = bigram_model.generate(test_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


215
torch.Size([32, 215]) tensor(6.0666, grad_fn=<NllLossBackward0>)

༼🛥🧠ਗ👉"्ਟz🥵श\8 💋🐦m😂’ਹ🇨🤪iਹ*ਇ😔つੰज"_🐐)g༽ੈT🧠☁💋ੂ🥶शhੰ5j4L्😡😱☠c🤓🍆ੱ😬️0k🤝Zੰm्G-जभ🤓t🤬ਜ0ਰਜ🐦👎ज♂😱ਹ\👳👎W=😉(🇦j਼🥲.-‍😭s


In [82]:
# AdamW over all of the bigram model's parameters, with a learning rate of 1e-3
optimizer = torch.optim.AdamW(params=bigram_model.parameters(), lr=1e-3)

In [83]:
batch_size = 32  # get_batch reads this global, so the batches below hold 32 sequences
max_steps = 10000
for steps in range(max_steps):
    # Clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # Sample a batch of data and run the forward pass
    xb, yb = get_batch('train')
    logits, loss = bigram_model(xb, yb)

    # Backward pass, then parameter update
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only
print(loss.item())

2.314354181289673


In [84]:
# Sample 100 new tokens from the model, starting from a single token with id 0
test_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = bigram_model.generate(test_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))


Shrij
Sha,N🍕👍Gijadino…ੂ😌‍ਿ🕎.dedija: pl: Shike🐐💤%ucad ed cta ny
🐦🤮125
Asf popeallas @🕯olase
Lube tube


# The mathematical trick in self-attention

In [70]:
torch.manual_seed(1337)
# Sizes of the three dimensions of the toy tensor
b = 4
t = 8
c = 2
x = torch.randn((b, t, c))
x.shape

torch.Size([4, 8, 2])