Read in Dataset

In [32]:
import torch
torch.manual_seed(1337)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 256
n_embed = 384
num_heads = 6
n_layer = 6
dropout = 0.2
batch_size = 64
eval_interval = 500
max_iters = 5000
learning_rate = 3e-4
eval_iters = 200

cuda


In [33]:
input = ''
with open('data/input.txt', 'r') as f:
    input = f.read()

Extract all characters used in dataset

In [34]:
characters = sorted(list(set(input)))
vocab_size = len(characters)
vocab_size

65

Create encode and decode functions

In [35]:
def encode(str):
    encoded = []
    for c in str:
        encoded.append(characters.index(c))
    return encoded

def decode(codes):
    decoded = ''
    for code in codes:
        decoded = decoded + characters[code]
    return decoded

Encode data and separate it into training and validation data

In [36]:
data = torch.tensor(encode(input), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [37]:
n = int(0.9*len(data))
training_data = data[:n]
validation_data = data[n:]
print(len(training_data), len(validation_data))

1003854 111540


Define batch size, block size, and get_batch function. The batch and block size help send data to the GPU in batches for more efficient training. The targets batches are offset by 1 from the inputs batches, because we will be passing the inputs to the transformer, and the targets should be the predicted output given those inputs, hense training the transformer how to predict the next sequece of characters

In [38]:
def get_batch(data):
    random_offsets = torch.randint(len(data) - block_size, (batch_size,))
    inputs = torch.stack([data[i:i+block_size] for i in random_offsets])
    targets = torch.stack([data[i+1:i+block_size+1] for i in random_offsets])
    inputs, targets = inputs.to(device), targets.to(device)
    return inputs, targets

inputs, targets = get_batch(training_data)

print(inputs)
print(targets)

tensor([[ 0, 26, 53,  ..., 56, 43, 47],
        [60, 43, 56,  ..., 56,  1, 41],
        [26, 21, 33,  ..., 26, 21, 13],
        ...,
        [ 5, 57,  1,  ...,  1, 35, 47],
        [56, 53, 53,  ..., 59, 50, 42],
        [42, 47, 56,  ..., 39, 56,  1]], device='cuda:0')
tensor([[26, 53, 58,  ..., 43, 47, 45],
        [43, 56,  1,  ...,  1, 41, 53],
        [21, 33, 31,  ..., 21, 13, 10],
        ...,
        [57,  1, 52,  ..., 35, 47, 50],
        [53, 53, 58,  ..., 50, 42,  1],
        [47, 56, 43,  ..., 56,  1, 51]], device='cuda:0')


hhu

In [39]:
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)

        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)
        out = wei @ v
        return out


In [40]:
class MultiAttentionHead(nn.Module):

    def __init__(self, num_heads, head_size) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.projection(out)
        out = self.dropout(out)
        return out

In [41]:
class FeedForward(nn.Module):

    def __init__(self, n_embed) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4*n_embed),
            nn.ReLU(),
            nn.Linear(4*n_embed, n_embed),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x)

In [42]:
class Block(nn.Module):
    # Transformer block
    def __init__(self, n_embed, n_head) -> None:
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiAttentionHead(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)
    
    def forward(self, x):
        ln1x = self.ln1(x)
        ln2x = self.ln2(x)
        x = x + self.sa(ln1x)
        x = x + self.ffwd(ln2x)
        return x


In [43]:
class BatchNorm1D:
    # Layer Norm
    def __init__(self, dim, eps=1e-5):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        xmean = x.mean(1, keepdim=True)
        xvar = x.var(1, keepdim=True)
        xhat = (x-xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]

In [44]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)

        self.blocks = nn.Sequential(*[Block(n_embed, num_heads) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)

        self.lm_head = nn.Linear(n_embed, vocab_size)
    
    def forward(self, idx, targets=None):
        B,T = idx.shape

        tok_emb = self.token_embedding_table(idx) # B,T,C
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # T, C
        x = tok_emb + pos_emb # B,T,C
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # B,T,vocab_size

        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)    
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is B,T
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:,-1,:] # becomes (B,C)
            probabilities = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probabilities, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

    
model = BigramLanguageModel()
m = model.to(device)

logits, loss = m(inputs, targets)

print(logits.shape)
print(loss)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated = m.generate(idx=context, max_new_tokens=100)[0].tolist()
print(decode(generated))

torch.Size([16384, 65])
tensor(4.3389, device='cuda:0', grad_fn=<NllLossBackward0>)

iIYcfZdn,jYB&,!WEwD!.McsSN.cEpY?FCmvQAbnXkWHvVw!?FlPJKiNuJkUOByuyuayo:E'!:q?UIQVdWPARgnngXeotAT
3hxv


In [45]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for i, split in enumerate([training_data, validation_data]):
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            _, loss = model(X,Y)
            losses[k] = loss.item()
        out[i] = losses.mean()
    model.train()
    return out

Train

In [46]:
optimizer = torch.optim.AdamW(m.parameters(), lr = learning_rate)

for steps in range(max_iters):
    if steps % eval_interval == 0:
        losses = estimate_loss()
        training_losses = losses[0]
        validation_losses = losses[1]
        print(f"step {steps}: train 1oss {training_losses:.4f}, val loss {validation_losses:.4f}")

    inputs, targets = get_batch(training_data)

    logits, loss = m(inputs, targets)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



step 0: train 1oss 4.3359, val loss 4.3320
step 500: train 1oss 2.0385, val loss 2.1175
step 1000: train 1oss 1.6418, val loss 1.8124
step 1500: train 1oss 1.4722, val loss 1.6656
step 2000: train 1oss 1.3713, val loss 1.5915
step 2500: train 1oss 1.3105, val loss 1.5516
step 3000: train 1oss 1.2568, val loss 1.5221
step 3500: train 1oss 1.2151, val loss 1.5037
step 4000: train 1oss 1.1785, val loss 1.4897
step 4500: train 1oss 1.1454, val loss 1.4839


In [47]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated = m.generate(idx=context, max_new_tokens=500)[0].tolist()
print(decode(generated))


LADY GREY:
Holishour.

GLOUCEST:
And me, whither doth this day loving does?

PERDITA:
O her!

GLOUCESTER:
Why, good stand I forbear?

GLOUCESTER:
Why ohopen early prost thou to act morrow;
Live on thy prince and now too rison will,
By vault foresh were needly store fulling sorrow.

KING EDWARD IV:
Welcome, see his brother, who sun begin,
His entreaten in thy blood, as on his tongues
Which blind to vey'll make his good tie mages.
Go chost the wrinks so groan;
And let for what is a grance to cheal


In [48]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of parameters: {num_params}")
torch.save(model.state_dict(), 'transformer_params.pth')

Number of parameters: 10788929


In [51]:
test_load_model = BigramLanguageModel()
tm = test_load_model.to(device)
tm.load_state_dict(torch.load('shakespear_transformer_params.pth'))
tm.eval()

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated = tm.generate(idx=context, max_new_tokens=5000)[0].tolist()
print(decode(generated))


CORIOLANUS:
I have no suit fair?
Sister'd more beautifieds; for you have piny,
Which you pany of this temper'd with you.

Second Servingman:
We will, so you shall be honour; who has at you
Hunts you would Gremio, have stoned and Gaunt
A true deeds to give your sorrow;
You are you but confessor, sir.

KING LEWIS XI:
Go, you'll be gone, for I pardon my soul oath.

ANGELO:
Good my leave was to me.

LADY GREY:
Good night, I think, good my lord, I make constant no pair;
For I kneel a word in England's claim'd
Girl have brock'd from the friendship safety.
I dread, proud heaven! O my son
You desire have of my son! I am at thy oath
That like us fancy is as God, to pass yourself accasion.
Welcome, sir, come fear, three flood with us
his gracious parts, to queen, that vawards from his wife:
For my both to chance it well. What should you not your name?
I know not you, lords, sweet Clarence: dry the queen,
When you do forswear this noble of absence;
For blind the Coventry, now God the child;
He c