In [1]:
# loading the dataset, downloading the tiny shakespeare dataset from github
!wget https://raw.githubusercontent.com/bharatyadv/mini-gpt/main/input.txt

--2024-05-13 16:04:41--  https://raw.githubusercontent.com/bharatyadv/mini-gpt/main/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-05-13 16:04:41 (35.3 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [5]:
#reding the input file
with open('input.txt','r',encoding='utf-8') as f:
  data = f.read()

In [6]:
print('length of dataset in characters: ', len(data))

length of dataset in characters:  1115394


In [7]:
#printing first 1000 characters
print(data[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

import math
from torch.utils.data import Dataset

In [8]:
#identfying unique text/characters/alphabets in the text and their total numbers
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [10]:
#converting characters to some specific value according to a grammar (tokenization)
stoi = { ch:i for i,ch in enumerate(chars) } #converting characters into their ascii values
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] #This encodes a string in list of integers
decode = lambda l: ''.join([itos[i] for i in l])#This decodes list of integers in a string

In [11]:
#encoding the entire dataset in integers so that it is easire to work with
import torch #using PyTorch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[:1000]) #first 1000 characters as integer


torch.Size([1115394]) <built-in method type of Tensor object at 0x792accaef510>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 

In [12]:
#splitting data for training and testing
n = int(0.6*len(data)) #90% for training & 10% for testing
train_data = data[:n]
val_data = data[n:]

In [None]:
block_size = 128

In [None]:
class PoemDatset(Dataset):

    def __init__(self, data, block_size):
        chars = sorted(list(set(data)))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]

        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [None]:
train_dataset = PoemDatset(data, block_size)

data has 20146 characters, 69 unique.


In [None]:
from model import GPT, GPTConfig
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=8, n_head=8, n_embd=128)
model = GPT(mconf)

##### initialize a trainer instance and kick off training


In [15]:
#explaining more systematically
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"when input is {context} the target: {target}")
  #for a block when train in all these contexts
# this training method helps the transformer get used to seeing context from length 1 to length of block size

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [16]:
# mini batching the blocks for parallel processing in gpu for higher efficincy
torch.manual_seed(1337) #seed can be random
batch_size = 4 #the number of independent sequences we will process in parallel
block_size = 8 #maximum context length

def get_batch(split):
    #generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) #random offest in training data
    x = torch.stack([data[i:i+block_size] for i in ix]) #stacking the one dimension tensor
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) #making a 4 by 8 tensor
    return x,y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('-----')

for b in range(batch_size): #batch dimension
    for t in range(block_size): #time dimension
        context = xb[b, :t+1]
        taget = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

#it feeds the tansformer the 4X8 matrix for input and a 4X8 matrix of targets so it can match
#input to their coresponding targets. for input 4,3 the target is also at 4,3

inputs:
torch.Size([4, 8])
tensor([[39, 57,  1, 58, 46, 47, 57,  1],
        [17, 31, 32, 17, 30, 10,  0, 15],
        [58, 46, 43, 56,  6,  1, 32, 63],
        [53, 59,  1, 39, 56, 58,  1, 57]])
targets:
torch.Size([4, 8])
tensor([[57,  1, 58, 46, 47, 57,  1, 44],
        [31, 32, 17, 30, 10,  0, 15, 53],
        [46, 43, 56,  6,  1, 32, 63, 56],
        [59,  1, 39, 56, 58,  1, 57, 61]])
-----
when input is [39] the target: 58
when input is [39, 57] the target: 58
when input is [39, 57, 1] the target: 58
when input is [39, 57, 1, 58] the target: 58
when input is [39, 57, 1, 58, 46] the target: 58
when input is [39, 57, 1, 58, 46, 47] the target: 58
when input is [39, 57, 1, 58, 46, 47, 57] the target: 58
when input is [39, 57, 1, 58, 46, 47, 57, 1] the target: 58
when input is [17] the target: 58
when input is [17, 31] the target: 58
when input is [17, 31, 32] the target: 58
when input is [17, 31, 32, 17] the target: 58
when input is [17, 31, 32, 17, 30] the target: 58
when input is 

In [17]:

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))
# the prediction is grabage because it only using one character to predict next character but we will use longer context later

torch.Size([32, 65])
tensor(4.6082, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [None]:
from trainer import Trainer, TrainerConfig

tconf = TrainerConfig(max_epochs=50, batch_size=128, learning_rate=6e-4,
                      lr_decay=True, warmup_tokens=128*20, final_tokens=2*len(train_dataset)*block_size)
trainer = Trainer(model, train_dataset, None, tconf)
trainer.train()

epoch 1 iter 156: train loss 2.30137. lr 3.002355e-04: 100%|██████████| 157/157 [00:23<00:00,  6.71it/s]
epoch 2 iter 156: train loss 2.26963. lr 6.000000e-05: 100%|██████████| 157/157 [00:22<00:00,  6.88it/s]
epoch 3 iter 156: train loss 2.18049. lr 3.002355e-04: 100%|██████████| 157/157 [00:22<00:00,  6.93it/s]
epoch 4 iter 156: train loss 1.91827. lr 5.999996e-04: 100%|██████████| 157/157 [00:23<00:00,  6.80it/s]
epoch 5 iter 156: train loss 1.47330. lr 2.992934e-04: 100%|██████████| 157/157 [00:23<00:00,  6.82it/s]
epoch 6 iter 156: train loss 1.30616. lr 6.000000e-05: 100%|██████████| 157/157 [00:23<00:00,  6.82it/s]
epoch 7 iter 156: train loss 1.21500. lr 3.011776e-04: 100%|██████████| 157/157 [00:23<00:00,  6.76it/s]
epoch 8 iter 156: train loss 0.95630. lr 5.999967e-04: 100%|██████████| 157/157 [00:23<00:00,  6.75it/s]
epoch 9 iter 156: train loss 0.63780. lr 2.983513e-04: 100%|██████████| 157/157 [00:22<00:00,  6.85it/s]
epoch 10 iter 156: train loss 0.47310. lr 6.000000e-05:

In [None]:
#character level
from utils import sample

context = "What chance, good lady"
x = torch.tensor([train_dataset.stoi[s] for s in context], dtype=torch.long)[None,...].to(trainer.device)
y = sample(model, x, 2000, temperature=1.0, sample=True, top_k=10)[0]
completion = ''.join([train_dataset.itos[int(i)] for i in y])
print(completion)

What chance, good lady, hath bereft you thus? —COMUS.    IT was a quiet and still afternoon when I strolled forth in the goodly city of Edina. The confusion and bustle in the streets were terrible. Men were talking. Women were screaming. Children were choking. Pigs were whistling. Carts they rattled. Bulls they bellowed. Cows they lowed. Horses they neighed. Cats they caterwauled. Dogs they danced. Danced! Could it then be possible? Danced! Alas, thought I, my dancing days are over! Thus it is ever. What a host of gloomy recollections will ever and anon be awakened in the mind of genius and imaginative contemplation, especially of a genius doomed to the everlasting and eternal, and continual, and, as one might say, the—continued—yes, the continued and continuous, bitter, harassing, disturbing, and, if I may be allowed the expression, the very disturbing influence of the serene, and godlike, and heavenly, and exalted, and elevated, and purifying effect of what may be rightly termed the 