In [1]:
import torch
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import AdamW

In [2]:
batch_size = 64
block_size = 256
dim_embd = 384
num_heads = 6
head_size = 384
seed = 1337
learning_rate = 3e-4
ffrwd_in_features = 384

g = torch.manual_seed(seed)
max_iters = 2500
eval_interval = 500
eval_iters = 200

device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
!wget -q https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [4]:
with open('./input.txt') as f:
    text = f.read()

print(f"{len(text) = } characters",
       end='\n-------------Data-------------\n')

print(text[:100])

len(text) = 1115394 characters
-------------Data-------------
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


### Build our char level vocabulary

In [5]:
chars = sorted(list(set(text)))

stoi = {ch:idx for idx, ch in enumerate(chars)}
itos = {idx:ch for idx, ch in enumerate(chars)}

vocab_size = len(chars)
print(vocab_size)

65


In [6]:
encode = lambda text: [stoi[c] for c in text]
decode = lambda tokens: ''.join([itos[c] for c in tokens])

raw = 'i love this game.'
tokens = encode(raw)
print(tokens)
print(decode(tokens))

[47, 1, 50, 53, 60, 43, 1, 58, 46, 47, 57, 1, 45, 39, 51, 43, 8]
i love this game.


### Encode entire data

In [7]:
data = torch.tensor(encode(text)).long()
print(data.shape, data.type())
print(data[:100])

torch.Size([1115394]) torch.LongTensor
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


### Data split

In [8]:
train_data, val_data = train_test_split(data, shuffle=False, test_size=0.1)
print(train_data[:block_size])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [9]:
def get_batch(split_type, device):
    data = train_data if split_type=='train' else val_data
    ix = torch.randint(low=0, high=len(data)-block_size, size=(batch_size,), generator=g)
    x = torch.stack([data[i:i+block_size] for i in ix]).to(device)
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]).to(device)
    return x, y

xb, yb = get_batch('train', device=device)
print(xb.shape, yb.shape)

torch.Size([64, 256]) torch.Size([64, 256])


In [10]:
@torch.no_grad()
def estimate_loss(model, eval_iters, device):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

### Mathematical Trick to vectorized weighted aggregation

In [11]:
B,T,C = (4,8,2)
x = torch.randn(B,T,C)

In [12]:
## Version 1
xbow = torch.zeros_like(x)

for t in range(T):
    x_prev = x[:, :t+1, :]
    xbow[:,[t], :] = torch.mean(x_prev, dim=1, keepdim=True)

In [13]:
## Version 2
wei = torch.tril(torch.ones((T,T))).float()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = wei @ x
torch.allclose(xbow2, xbow)

True

In [14]:
## Version 3: using softmax
tril = torch.tril(torch.ones((T,T))).float()
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

### Simple model with only one self attention (only one head)

In [15]:
class SelfAttention_Head(nn.Module):
    """One head of self-attention"""
    def __init__(self, input_features, head_features, block_size, dop=0.3):
        super().__init__()
        self.block_size = block_size
        self.head_features = head_features
        self.input_features = input_features

        self.WQ = nn.Linear(input_features, head_features, bias=False)
        self.WK = nn.Linear(input_features, head_features, bias=False)
        self.WV = nn.Linear(input_features, head_features, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dop)

    def forward(self, x):

        T = x.shape[1]
        q = self.WQ(x)
        k = self.WK(x)

        wei = q @ k.transpose(1,2) * self.head_features**-0.5
        ## this mask to disconnect the token from following tokens in the seqeunce
        ## we will use this mask during training only
        wei = wei.masked_fill(self.tril[:T,:T]==0, float('-inf'))
        attention_weights = self.dropout(F.softmax(wei, dim=1)) ## this is attention weights

        v = self.WV(x)
        out = attention_weights @ v

        return out

In [16]:
class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel"""
    def __init__(self, num_heads, input_features, head_features, block_size, dop=0.3):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttention_Head(input_features, head_features, block_size) for _ in range(num_heads)])
        ## Projection to make output compatible with residual adding
        self.proj = nn.Linear(head_features*num_heads, input_features)
        self.dropout = nn.Dropout(dop)
    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        x = self.proj(x)
        x = self.dropout(x)
        return x

In [17]:
class FeedForward(nn.Module):
    def __init__(self, in_features, dop=0.3):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(in_features, in_features * 4),
            nn.ReLU(),
            ## Projection to make output compatible with residual adding
            nn.Linear(in_features * 4, in_features),
            nn.Dropout(dop)
        )

    def forward(self, x):
        return self.fc(x)

In [18]:
## LayerNorm: same as BatchNorm ,but it normalize the rows not the columns
class Block(nn.Module):
    def __init__(self, num_heads, input_features, head_features, block_size):
        super().__init__()
        self.sa = MultiHeadAttention(num_heads, input_features, head_features//num_heads, block_size)
        self.ffwd = FeedForward(head_features)
        self.ln1 = nn.LayerNorm(input_features)
        self.ln2 = nn.LayerNorm(head_features)

    def forward(self, x):
        ## (x +) is for residual connections
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [19]:
class Attentioned_LM(nn.Module):
    def __init__(self, vocab_size, dim_embd, block_size, head_size, num_heads):
        super().__init__()
        self.block_size = block_size
        self.embd_layers = nn.Embedding(vocab_size, dim_embd)
        self.position_encoding_layer = nn.Embedding(block_size, dim_embd)
        self.blocks = nn.Sequential(
            Block(num_heads, dim_embd, head_size, block_size),
            Block(num_heads, dim_embd, head_size, block_size),
            Block(num_heads, dim_embd, head_size, block_size),
            Block(num_heads, dim_embd, head_size, block_size),
            Block(num_heads, dim_embd, head_size, block_size),
            Block(num_heads, dim_embd, head_size, block_size),
            nn.LayerNorm(head_size)
        )
        self.lm_head = nn.Linear(head_size, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        device = idx.device

        embdings = self.embd_layers(idx)
        pos_encodes = self.position_encoding_layer(torch.arange(T, device=device))
        x = embdings + pos_encodes ## (B,T,dim_embd) + (T,dim_embd) = (B,T,dim_embd) broadcast happened
        x = self.blocks(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            batch, seq_len, embds = logits.shape
            logits = logits.view(batch*seq_len, embds)
            targets = targets.view(batch*seq_len)
            loss = F.cross_entropy(logits, targets)

        return logits, loss


    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop input tokens for the length of block_size, so we able to positional encode them
            idx_context = idx[:, -self.block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

## In this code we still preserve history tokens, however we don't use them.


In [20]:
m = Attentioned_LM(vocab_size, dim_embd, block_size, head_size, num_heads).to(device)

logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

input = torch.zeros((1, 1), dtype=torch.long).to(device)
print(decode(m.generate(input, max_new_tokens=100)[0].tolist()))

optimizer = AdamW(m.parameters(), lr=learning_rate)

torch.Size([16384, 65])
tensor(4.3529, device='cuda:0', grad_fn=<NllLossBackward0>)

rXvLFCUS majJr?b,t.HfxNDzd,HEqGfsA3Wj gE,qVEX
nO,BepG AQj!
it3xfs?',ycT,HRitA!dGd?CzOqmVjMyFwLOu&LVg


In [21]:
for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss(m, eval_iters, device=device)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    # sample a batch of data
    xb, yb = get_batch('train', device=device)

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 4.3530, val loss 4.3490
step 500: train loss 0.3217, val loss 0.3787
step 1000: train loss 0.0449, val loss 0.0694
step 1500: train loss 0.0195, val loss 0.0341
step 2000: train loss 0.0138, val loss 0.0256


In [26]:
with torch.no_grad():
    m.eval()
    input = torch.zeros((1, 1), dtype=torch.long).to(device)
    print(decode(m.generate(input, max_new_tokens=1000)[0].tolist()))
    m.train()

 pppppprrrrrrrrRuiiiiiiiiiiiiiaiiniiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiittesssssssssstcrrrrrrnnnnnnnnnnnnntttteeeeeeeeeeeeeeeeeeeeeee
 ssslllllllllrrrcchrrssssssssssssssssmmmmmmmmmmmmmmmmmmmmmymmmmmsssssssssssssssssssllfeintan'e, nsntNN
Gno pere the
un tac notne ther, ver torth dyoucsor:
 leg it thant dourshalt salit leand, vadeig tpith mfraf,
soer dnut samus. sne.
E'W
sad yereicje,
Whin dressnmer, maner hint hef it, donn, itly plifly paly jago,
Ali, ye,
nY eWhan w won trokontot fite meirouonuncist
a shic uthe .
n Youn blelves,platred, syoks out hon haneremant!
UUute forra,
IN aswer
Anew y lo to to th mounint yoet:
I, sandirson foie lhslm seneses oufrece loiand mise.
SThmeNAnisf! te the ohint ughy to thumeerTlel ande thruces tot wosle th,
Khener ic rin.
FOUE:
Parto, threw youssocooud fouskins aren,
Wh
Theunse smear woldows, fag may alrolk thoe ravel wod yt toakounny
MOrwe torpo sou hpistrefptre, me lemnerlev eaned hhevnengres axod poroce an tirs.

MARDiss an nhobne a

In [29]:
torch.save(m, '/content/nanoGpt.pt')

In [30]:
# GG