nanoGPT model following Andrej Karpathy's GPT tutorial on YouTube (part 2 of 2). First 

In [28]:
# packages
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
# Module-level settings read by the data loader, the model classes and the training loops below.
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000 # number of optimizer steps in the main training loop
eval_interval = 100 # report train/val loss every this many steps
learning_rate = 1e-3 # learning rate passed to AdamW
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when PyTorch can see one
eval_iters = 200 # number of random batches averaged per split when estimating the loss
n_embd = 64 # embedding width (channels per token)
n_head = 4 # attention heads per transformer block
n_layer = 4 # number of stacked transformer blocks
dropout = 0.0 # dropout probability used by the attention projection and feed-forward layers
# ------------



In [31]:
# Load the training corpus and build a character-level vocabulary (same as before)
with open('lovecraft.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`."""
    return "".join(itos[i] for i in l)


# Whole corpus as one 1-D tensor of token ids
data = torch.tensor(encode(text), dtype = torch.long)

# Training/validation split: first 95% of the tokens train, the remaining 5% validate
n = int(len(data) * 0.95)
train_data, val_data = data[:n], data[n:]

#Split data into batches and blocks to train 

def get_batch(split):
    """Sample a random mini-batch of (input, target) sequences.

    `split` selects the source: 'train' reads from `train_data`, anything
    else reads from `val_data`. Returns two (batch_size, block_size) tensors
    on `device`; the targets are the inputs shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence; the upper bound leaves room for
    # the extra trailing token needed by the shifted targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs.to(device), targets.to(device)


In [5]:
# Worked example: one head of causal self-attention applied to random data

# B = batch size, T = time steps, C = channels per token
B, T, C = 4, 8, 2
head_size = 16
tril = torch.ones(T, T).tril()  # lower-triangular causal mask
x = torch.randn(B, T, C)  # sample input data

# Bias-free linear projections of every token
key = nn.Linear(C, head_size, bias=False)    # what this token represents for other tokens
query = nn.Linear(C, head_size, bias=False)  # what this token is looking for
value = nn.Linear(C, head_size, bias=False)  # what this token hands over when attended to
k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)

# Query/key affinities, scaled by head_size ** -0.5 so the scores keep roughly unit variance at init
wei = (q @ k.transpose(-2, -1)) * head_size ** -0.5  # (B, T, T)
# Block attention to future positions, then normalise each row into weights
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

v = value(x)   # (B, T, head_size)
out = wei @ v  # (B, T, head_size): attention-weighted sum of the values

In [11]:
# One head of multihead attention

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Projects each token of width `n_embd` to key, query and value vectors of
    width `head_size`, lets every position attend to itself and to earlier
    positions only, and returns the attention-weighted values.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)    # what this token represents for other tokens
        self.query = nn.Linear(n_embd, head_size, bias=False)  # what this token is looking for
        self.value = nn.Linear(n_embd, head_size, bias=False)  # what this token hands over when attended to
        # Causal mask kept as a buffer: it is not a parameter, but it follows
        # the module across .to(device) and is stored in the state dict.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x, targets=None):
        """Apply masked self-attention to `x`.

        x: (B, T, n_embd) tensor with T <= block_size.
        targets: unused; kept so existing callers that pass it keep working.
        Returns a (B, T, head_size) tensor.
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # Scale by this head's own width (taken from the projection output),
        # not by an unrelated module-level variable.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
        # Slice the registered buffer so the mask is already on x's device.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)

        v = self.value(x)  # (B, T, head_size)
        return wei @ v     # (B, T, head_size)
    

#Bigram model with one head of self attention
class BigramOneHead(nn.Module):
    """Character-level language model with a single self-attention head.

    Token and position embeddings are summed, passed through one `Head` of
    width `n_embd`, and projected to vocabulary logits.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        idx, targets: (B, T) tensors of integer token ids, T <= block_size.
        Returns (logits, loss). Without targets, logits is (B, T, vocab_size)
        and loss is None; with targets, logits is flattened to
        (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the positions on the input's device so the model works
        # wherever it has been moved, independent of the global `device`.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)

        x = tok_emb + pos_emb  # (B, T, C)
        x = self.sa_head(x)
        logits = self.lm_head(x)

        # Identity check: `== None` on a tensor is a comparison, not a None test.
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip autograd bookkeeping
    def generate(self, idx, max_new_tokens):
        """Extend `idx` (B, T) by `max_new_tokens` sampled tokens.

        Returns a (B, T + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            idx_crop = idx[:, -block_size:]  # the position table only covers block_size steps
            logits, _ = self(idx_crop)
            logits = logits[:, -1, :]  # last time step only, (B, C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Instantiate the single-head model, move its parameters to the selected device
# and create the optimizer over them. `m` is the value returned by `model.to(device)`.
model = BigramOneHead()
m = model.to(device)
optimizer = torch.optim.AdamW(m.parameters(), lr = learning_rate)


In [20]:
# Train for 15 rounds of `training_steps` optimizer updates each.
# After every round, print the loss of that round's final mini-batch
# (a single-batch value, not an average over the round).
training_steps = 100
for i in range(15):
    for steps in range(training_steps):
        #Get training data
        xb, yb = get_batch('train')

        #Evaluate loss
        logits, loss = m(xb, yb)
        # Clear gradients from the previous step before backpropagating
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    print(loss.item())


2.475289821624756
2.4645557403564453
2.470508575439453
2.4486727714538574
2.431278705596924
2.423784017562866
2.445096731185913
2.4381465911865234
2.4220027923583984
2.42972731590271
2.409843683242798
2.4368419647216797
2.3971002101898193
2.4058828353881836
2.37831711769104


In [21]:
# Start from a single token with id 0 and sample 500 more characters.
# The context is created on `device`, the same device the model was moved to;
# a CPU tensor here fails with a device mismatch when the model is on CUDA.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
new_text = m.generate(idx, max_new_tokens=500)[0].tolist()
print(decode(new_text))



thn blly
yvers,, uo s enbly thakst fid as an to na cansss teng, ulgh thent uld fig elbened vry a Waultthom ond on
a, fo icreithe whe self wor Nlt wald bin
dip, they. Eswhe at ack ler ance om ch men 
ard thachurrsy and at a-sost aed dimathe thanllof jeere Moua-Enoth towen an tilefAd bubero worusoonss id the ardted phutr alls toannsone fow ght ighe fonkof Iid and wa randint fuldengt negoulnlstam tioryu
wedicpirishrs suld of gougeed so Je—to-ng opecr meding
ry‘sspinl st wyinglyit o his ads shea st 


In [29]:


class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected.

    num_heads: number of `Head` modules.
    head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide; size
        # the projection from that so it also works when this product differs
        # from n_embd (e.g. n_embd not divisible by the number of heads).
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)  # regularisation against overfitting

    def forward(self, x):
        """Map x (B, T, n_embd) to a (B, T, n_embd) tensor."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, num_heads * head_size)
        out = self.dropout(self.proj(out))
        return out
    
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout.

    The hidden layer is four times as wide as the input; the output has the
    same width as the input.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        # Same layer order as before, so the state-dict keys (net.0.*, net.2.*) are unchanged
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of `x`."""
        return self.net(x)
        
class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each pre-normed
    with LayerNorm and wrapped in a residual (skip) connection.

    n_emb: embedding width of the tokens flowing through the block.
    n_head: number of attention heads; each head gets n_emb // n_head channels.

    NOTE(review): the attention heads size their input projections from the
    module-level `n_embd`, so `n_emb` is expected to equal it.
    """

    def __init__(self, n_emb, n_head):
        super().__init__()
        head_size = n_emb // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        # Size the sub-layers from the constructor argument instead of
        # silently mixing it with the module-level n_embd.
        self.ffwd = FeedForward(n_emb)
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        """Map x (B, T, n_emb) to a tensor of the same shape."""
        x = x + self.sa(self.ln1(x))    # residual around attention
        x = x + self.ffwd(self.ln2(x))  # residual around feed-forward
        return x


class nanoGPT(nn.Module):
    """Character-level transformer language model.

    Token and position embeddings are summed, passed through `n_layer`
    transformer `Block`s and a final LayerNorm, then projected to
    vocabulary logits.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        idx, targets: (B, T) tensors of integer token ids, T <= block_size.
        Returns (logits, loss). Without targets, logits is (B, T, vocab_size)
        and loss is None; with targets, logits is flattened to
        (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the positions on the input's device so a model that lives on
        # a different device than the global `device` still works.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)

        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        # Identity check: `== None` on a tensor is a comparison, not a None test.
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip autograd bookkeeping
    def generate(self, idx, max_new_tokens):
        """Extend `idx` (B, T) by `max_new_tokens` sampled tokens.

        Returns a (B, T + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            idx_crop = idx[:, -block_size:]  # the position table only covers block_size steps
            logits, _ = self(idx_crop)
            logits = logits[:, -1, :]  # last time step only, (B, C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


# Instantiate the transformer model, move its parameters to the selected device
# and create a fresh optimizer over them. This rebinds the names `model`, `m`
# and `optimizer` that the single-head experiment used earlier.
model = nanoGPT()
m = model.to(device)
optimizer = torch.optim.AdamW(m.parameters(), lr = learning_rate)

In [38]:
#Generate some text

# Start from a single token with id 0 and sample 1000 more characters.
# The context is created on `device`, the same device the model was moved to;
# a CPU tensor here fails with a device mismatch when the model is on CUDA.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
new_text = m.generate(idx, max_new_tokens=1000)[0].tolist()
print(decode(new_text))



Nahuron!forth Laking and to famora of awaken them, was beying most in the
teneles,
sensubhine tornious fartherna‘s whilst were
upstary of his finded seages of the citying of main‘s stumbled taste compared
the own hands—crept despareading which ship, and he would perfect or
Test.
Disptor‘s later pale of magin and moral terrorison abouth the stirre, moon, the surviously offerened with swars substonic becaugh relieved us. Wight as all hideously shunned wastead over in shellent sanisce of moor entified, rimomen both azified telling the spopped interchyesly men a hill have tonesome moon. Avery
house, was
upon one ane moonry star gids wabked in affamed flightful of Anglanot when here strainter Dr. We wereon time hand-to remain even Rahu; and glands he reached terrors time and as the Kin‘ was parernel howl was very stove.
At first gave so things for as neared—the stim of sineher and waiten of part, labyry the key. Fhoul not some could have did ny normal and the grave. Joln Hollaging part lig

In [37]:
#Estimate validation loss to make sure we're not overtraining

@torch.no_grad()  # evaluation only: no autograd graph is needed for these forward passes
def estimate_loss():
    """Estimate the mean loss of `model` on the train and validation splits.

    Averages the loss over `eval_iters` random batches per split, with the
    model in eval mode, and switches the model back to train mode afterwards.
    Returns a dict with keys 'train' and 'val' holding 0-dim tensors.
    """
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Main training loop. The loop variable is called `step` so it does not
# shadow the builtin `iter` for the remainder of the session.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss and take one optimizer step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # clear gradients left over from the previous step
    loss.backward()
    optimizer.step()



step 0: train loss 1.6348, val loss 1.6319
step 100: train loss 1.6132, val loss 1.6259
step 200: train loss 1.6198, val loss 1.6144
step 300: train loss 1.6222, val loss 1.6194
step 400: train loss 1.6181, val loss 1.6193
step 500: train loss 1.6218, val loss 1.6138
step 600: train loss 1.6120, val loss 1.6089
step 700: train loss 1.6179, val loss 1.6232
step 800: train loss 1.6085, val loss 1.6149
step 900: train loss 1.6144, val loss 1.6075
step 1000: train loss 1.6106, val loss 1.6127
step 1100: train loss 1.6089, val loss 1.5978
step 1200: train loss 1.6143, val loss 1.6061
step 1300: train loss 1.6064, val loss 1.5912
step 1400: train loss 1.5938, val loss 1.6021
step 1500: train loss 1.5993, val loss 1.5967
step 1600: train loss 1.5986, val loss 1.6056
step 1700: train loss 1.6053, val loss 1.6026
step 1800: train loss 1.5999, val loss 1.6022
step 1900: train loss 1.6017, val loss 1.5946
step 2000: train loss 1.6052, val loss 1.6089
step 2100: train loss 1.6070, val loss 1.6016


In [39]:
#Save the trained model
# Only the state dict (parameters and registered buffers) is written, not the
# model class itself; loading it later requires constructing the same architecture first.
torch.save(model.state_dict(), 'lovecraft_model_state.pth')


In [None]:
#Load the trained model
# Build a model with the same architecture and put it on the selected device.
# NOTE: this rebinds `model` only; `m` and `optimizer` still refer to the
# previously created model.
model = nanoGPT().to(device)

# Load the state dictionary into the model. map_location remaps the stored
# tensors onto `device`, so a checkpoint saved on a GPU can also be loaded on
# a CPU-only machine.
model.load_state_dict(torch.load('lovecraft_model_state.pth', map_location=device))
