In [1]:
# Work on reimplementing Andrej Karpathy's nano-gpt from this video
# https://youtu.be/kCc8FmEb1nY?si=cdnNJrPwDEPQPn0C

In [15]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [3]:
# load data
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, so the same file could load differently
# (or fail to load) from one machine to the next.
with open("input.txt", mode='r', encoding='utf-8') as f:
  words = f.read()

In [4]:
# corpus length in characters, followed by its first 1000 characters
print(len(words), words[:1000], sep='\n')

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for re

In [13]:
# vocab: every distinct character of the corpus, in sorted order
vocab = sorted({ch for ch in words})
vocab_size = len(vocab)
print(vocab_size, vocab, sep='\n')

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [10]:
# simple character level tokenization
itos = dict(enumerate(vocab))                 # id -> char
stoi = {ch: idx for idx, ch in itos.items()}  # char -> id

# encode and decode functions
def encode(s):
  """Map a string to the list of its character ids."""
  return [stoi[ch] for ch in s]

def decode(l):
  """Map a sequence of character ids back to the string they spell."""
  return ''.join(itos[idx] for idx in l)

decode(encode('Is it working?'))

'Is it working?'

In [21]:
# train/validation split: the first 90% of the characters go to train,
# the remainder is held out
n = int(0.9 * len(words))
train, test = words[:n], words[n:]
print(f'train size: {len(train)}, test size: {len(test)}')

train size: 1003854, test size: 111540


In [22]:
# we'd like to use torch tensors for training NNs
# Encode straight from `words` instead of from `train`/`test`: this cell rebinds
# those two names from str to tensor, so reading them back here would make a
# second run of the cell fail inside encode() (tensor elements are not stoi keys).
# Slicing with the same `n` yields exactly the same split on the first run.
train = torch.tensor(encode(words[:n]), dtype=torch.long)
test = torch.tensor(encode(words[n:]), dtype=torch.long)

In [23]:
# block_size is the context length: a sampled block of block_size+1 tokens
# packs block_size (context -> next token) training examples
block_size = 8
block = train[:block_size+1]
print(block)
# position i of the block is the target for the i tokens that precede it
for i, target in enumerate(block[1:], start=1):
  print(f'for input context: {block[:i]} target is: {target}')

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])
for input context: tensor([18]) target is: 47
for input context: tensor([18, 47]) target is: 56
for input context: tensor([18, 47, 56]) target is: 57
for input context: tensor([18, 47, 56, 57]) target is: 58
for input context: tensor([18, 47, 56, 57, 58]) target is: 1
for input context: tensor([18, 47, 56, 57, 58,  1]) target is: 15
for input context: tensor([18, 47, 56, 57, 58,  1, 15]) target is: 47
for input context: tensor([18, 47, 56, 57, 58,  1, 15, 47]) target is: 58


In [24]:
# so training examples are defined like this:
# y holds, for each position of x, the token that comes right after it
x, y = train[:block_size], train[1:block_size+1]
print(x, y, sep='\n')

tensor([18, 47, 56, 57, 58,  1, 15, 47])
tensor([47, 56, 57, 58,  1, 15, 47, 58])


In [35]:
# we want to work with batches of data as we aren't going to feed in the whole dataset
torch.manual_seed(1347)
batch_size = 4

def get_batch(split):
  """Sample batch_size random windows of block_size tokens from one split.

  "train" selects the `train` tensor; any other value selects `test`.
  Returns (x, y), each of shape (batch_size, block_size), where y is the
  same window as x moved one token forward, so y[b, t] follows x[b, t].
  """
  source = train if split == "train" else test
  # randint's upper bound is exclusive, so start + block_size + 1 never
  # reaches past the end of the data
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s+block_size] for s in starts])
  targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
  return inputs, targets

X, Y = get_batch('train')

# walk every row of the batch and spell out the training examples it contains
for i, (x, y) in enumerate(zip(X, Y)):
  print(x, y)
  for j, target in enumerate(y):
    print(f'for input context: {x[:j+1]} target is: {target}')

tensor([13, 33, 32, 27, 24, 37, 15, 33]) tensor([33, 32, 27, 24, 37, 15, 33, 31])
for input context: tensor([13]) target is: 33
for input context: tensor([13, 33]) target is: 32
for input context: tensor([13, 33, 32]) target is: 27
for input context: tensor([13, 33, 32, 27]) target is: 24
for input context: tensor([13, 33, 32, 27, 24]) target is: 37
for input context: tensor([13, 33, 32, 27, 24, 37]) target is: 15
for input context: tensor([13, 33, 32, 27, 24, 37, 15]) target is: 33
for input context: tensor([13, 33, 32, 27, 24, 37, 15, 33]) target is: 31
tensor([ 1, 44, 39, 58, 46, 43, 56,  1]) tensor([44, 39, 58, 46, 43, 56,  1, 44])
for input context: tensor([1]) target is: 44
for input context: tensor([ 1, 44]) target is: 39
for input context: tensor([ 1, 44, 39]) target is: 58
for input context: tensor([ 1, 44, 39, 58]) target is: 46
for input context: tensor([ 1, 44, 39, 58, 46]) target is: 43
for input context: tensor([ 1, 44, 39, 58, 46, 43]) target is: 56
for input context: te