In [1]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from gpt import SingleHeadAttention, MultiHeadAttention, FeedForward, Block, GPT

In [3]:
# hyperparameters
batch_size = 32
block_size = 64
max_iters = 5000
eval_interval = 100
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 204
n_head = 6
n_layer = 4
dropout= 0.0

In [4]:
torch.manual_seed(2525)

<torch._C.Generator at 0x1cf71e78bd0>

In [5]:
with open('tiny-shakespeare.txt', 'r', encoding='utf-8') as f:
    text=f.read()

In [6]:
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [7]:
len(text)

1115394

In [8]:
chars = sorted(list(set(text)))
print(''.join(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [9]:
vocab_size = len(chars)
vocab_size

65

In [10]:
stoi = {s:i for i, s in enumerate(chars)}
itos = {i:s for s, i in stoi.items()}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [11]:
print(encode("Sword in Mouth, Fire Eyes"))
print(decode(encode("Hello, World!")))

[31, 61, 53, 56, 42, 1, 47, 52, 1, 25, 53, 59, 58, 46, 6, 1, 18, 47, 56, 43, 1, 17, 63, 43, 57]
Hello, World!


In [12]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [13]:
print(data[:200])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59])


In [14]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [15]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    return x, y

In [16]:
xb, yb = get_batch('train')

In [17]:
print("inputs:")
xb.shape, xb

inputs:


(torch.Size([32, 64]),
 tensor([[46, 43,  1,  ..., 58,  1, 39],
         [57, 58, 43,  ...,  1, 46, 53],
         [47, 56, 50,  ..., 43,  1, 50],
         ...,
         [43, 52,  1,  ..., 58, 46, 43],
         [45, 56, 39,  ...,  1, 50, 53],
         [50, 41, 46,  ...,  1, 39, 45]]))

In [18]:
print("targets:")
yb.shape, yb

targets:


(torch.Size([32, 64]),
 tensor([[43,  1, 46,  ...,  1, 39,  1],
         [58, 43, 56,  ..., 46, 53, 61],
         [56, 50, 63,  ...,  1, 50, 53],
         ...,
         [52,  1, 58,  ..., 46, 43, 47],
         [56, 39, 52,  ..., 50, 53, 60],
         [41, 46,  1,  ..., 39, 45, 53]]))

In [19]:
model = GPT(vocab_size)
m = model.to(device)

In [20]:
print(sum(p.numel() for p in m.parameters()), 'M parameters')

2045777 M parameters


In [21]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [22]:
optimiser = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [23]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimiser.zero_grad(set_to_none=True)
    loss.backward()
    optimiser.step()


step 0: train loss 4.2093, val loss 4.2130
step 100: train loss 2.4895, val loss 2.4948
step 200: train loss 2.3561, val loss 2.3703
step 300: train loss 2.2680, val loss 2.2864
step 400: train loss 2.1559, val loss 2.1854
step 500: train loss 2.0416, val loss 2.0983
step 600: train loss 1.9517, val loss 2.0308
step 700: train loss 1.8919, val loss 1.9898
step 800: train loss 1.8265, val loss 1.9304
step 900: train loss 1.7719, val loss 1.9013
step 1000: train loss 1.7304, val loss 1.8792
step 1100: train loss 1.6948, val loss 1.8563
step 1200: train loss 1.6562, val loss 1.8164
step 1300: train loss 1.6318, val loss 1.8062
step 1400: train loss 1.6024, val loss 1.7845
step 1500: train loss 1.5820, val loss 1.7430
step 1600: train loss 1.5563, val loss 1.7435
step 1700: train loss 1.5470, val loss 1.7361
step 1800: train loss 1.5268, val loss 1.7129
step 1900: train loss 1.5155, val loss 1.7029
step 2000: train loss 1.4938, val loss 1.6972
step 2100: train loss 1.4822, val loss 1.6781


In [24]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


And much add leave a cure
Here had with his rage besolved ve me they me, rests!
I do mean you opine of faith,--o yond with good with
you, a crave. O Vilta, out--the maid, cross I have been,
But so I wonder fray thee to the king.

But Mowbray made to Bolingbroke:
Out, the boy look but only the benefit.

MOPSA:
Say you, methinks, Clarence, father, thy same to their
that with childish an execute, I love the most.
Now, mine way
Made 'Twixt 'tweres, we have no cool! When you are
care many man of the maid's wrath: alive again,
I must be record for your grumented son.
Ah, a noble sit, Jures. Greum! Ah, blood Marcius.

JULIET:
Wail the had trobb'd, and only of door.

Second Murderer:
It eremember, the hate the treason, thou lay'st quickly'st.
I'll strength thee and a-trueverence.
'Thou art the carest on a change at is try'vouch.
Fivinctiry, 'tis not me only, and spreak
Met some from whither firice and me
Make traitor betting from, fiend swaiting to
Fair one damnous more, senate, for what grea