A nanoGPT model following Andrej Karpathy's GPT tutorial on YouTube (part 2 of 2). Each training step uses 16 blocks of 128 characters (2,048 characters total). The model was trained for a total of 200,000 iterations, taking approximately 90 minutes on a Google Colab A100. The final loss was approximately 1.39; the training and validation losses were close enough that further training should be possible without overfitting.

Sample text output:

Dank trop at the south? I am I might lell the forgot had to take in that conjectubential cy-tomb
was turny with tainly commenced from the letter than retail as a marrios and elming signish
approved in cosmill Angel
Lockinans and parted—was stic, but not; for thought I heard that others I had been it flephisped
and glenealed that we as again bront wholly marches for nacrosed by this morty described small hold homen
to juss into the door were gight on me—and the evill seemed from a letter of ressor, heries and
consciousness, and the story the reachies very systim soon coffrom steps.
I had vielled only the size wax to shew that he meaned only, college, but the wavers
here saiged, and that nither that his first someours awfully to length Mater May or where
it iconing the beast antiquarty wells. I voused that have no way they certain
real—ner washed to keem down for was to leehispy and time tower the last memory tem


In [32]:
# packages
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
# These are module-level globals read directly by the data, model and training code below.
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 100000 # number of optimizer steps passed to train()
eval_interval = 10000 # train() reports the estimated losses every this many steps
learning_rate = 1e-3 # learning rate handed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
eval_iters = 100 # number of batches averaged per split in estimate_loss()
n_embd = 64 # width of the token and position embeddings
n_head = 4 # attention heads per transformer block
n_layer = 4 # number of transformer blocks stacked in nanoGPT
head_size = 16 # width of one attention head; equals n_embd // n_head
dropout = 0.2 # dropout probability used inside the transformer blocks
# ------------



In [26]:
# Load the corpus and build a character-level vocabulary
with open('lovecraft.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

# Every distinct character in the corpus, in sorted order, is one token
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables: character -> token id and token id -> character
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to the list of its token ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of token ids back to a string."""
    return "".join(itos[i] for i in l)


# The whole corpus as one 1-D tensor of token ids
data = torch.tensor(encode(text), dtype = torch.long)

# Training/validation split: the first 95% trains, the last 5% validates
n = int(len(data) * 0.95)
train_data, val_data = data[:n], data[n:]

#Split data into batches and blocks to train 

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of ``(batch_size, block_size)`` tensors moved to
        ``device``. ``y`` is ``x`` shifted one position to the right in the
        source data, i.e. the next-token target for every input position.
    """
    source = train_data if split == 'train' else val_data
    # Upper bound leaves room for the one-step-shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)


In [4]:
# Worked example: one head of causal (masked) self-attention on random data

# B = batch size, T = time steps (sequence length), C = channels per token
B, T, C = 4, 8, 2
head_size = 16
tril = torch.tril(torch.ones(T, T))  # lower-triangular causal mask
x = torch.randn(B, T, C)  # random stand-in for token embeddings

# Three bias-free linear projections of every token
key = nn.Linear(C, head_size, bias=False)  # what a token offers to be matched against
query = nn.Linear(C, head_size, bias=False)  # what a token is looking for
value = nn.Linear(C, head_size, bias=False)  # what a token passes on once attended to
k, q = key(x), query(x)  # each (B, T, head_size)

# Query/key affinities, scaled by head_size**-0.5 so the scores start out with roughly unit variance
scale = head_size ** -0.5
wei = (q @ k.transpose(-2, -1)) * scale  # (B, T, T)
# A position must not attend to later positions
future = tril == 0
wei = wei.masked_fill(future, float('-inf'))
# Each row becomes a distribution over the current and earlier positions
wei = F.softmax(wei, dim=-1)

v = value(x)  # (B, T, head_size)
out = wei @ v  # (B, T, head_size): attention-weighted average of the value vectors

In [5]:
# One head of multihead attention

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: Output width of the key, query and value projections.

    The input width is the module-level ``n_embd``; a causal mask covering
    ``block_size`` positions is stored as the non-trainable buffer ``tril``.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)  # what a token offers to be matched against
        self.query = nn.Linear(n_embd, head_size, bias=False)  # what a token is looking for
        self.value = nn.Linear(n_embd, head_size, bias=False)  # what a token passes on once attended to
        # Registered as a buffer so it follows the module across devices without being trained
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x, targets=None):
        """Apply masked self-attention to ``x``.

        Args:
            x: Tensor of shape (B, T, n_embd).
            targets: Unused; accepted for backward compatibility.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        B, T, C = x.shape
        k = self.key(x)  # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # Reuse the cached mask (already on the module's device); only build a
        # fresh one for sequences longer than the cached block_size.
        if T <= self.tril.shape[0]:
            mask = self.tril[:T, :T]
        else:
            mask = torch.tril(torch.ones(T, T, device=x.device))

        # Scale by this head's own width rather than a module-level constant
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
        wei = wei.masked_fill(mask == 0, float('-inf'))  # no attention to later positions
        wei = F.softmax(wei, dim=-1)
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v  # (B, T, head_size)

        return out
    

#Bigram model with one head of self attention
class BigramOneHead(nn.Module):
    """Character-level language model with a single self-attention head.

    Token and position embeddings are summed, passed through one ``Head`` of
    width ``n_embd`` and projected to vocabulary logits. Sizes come from the
    module-level ``vocab_size``, ``n_embd`` and ``block_size``.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: Optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is None. With targets, ``logits``
            is flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the positions on the input's own device so they always match idx
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)

        x = tok_emb + pos_emb  # (B, T, C)
        x = self.sa_head(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: (B, T) tensor of token ids forming the starting context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            idx_crop = idx[:, -block_size:]  # keep at most block_size tokens of context
            logits, _ = self(idx_crop)
            logits = logits[:, -1, :]  # logits for the last time step, (B, C)
            probs = F.softmax(logits, dim=-1)  # next-token distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1) sampled token
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Build the single-head model, move it to the selected device and create its optimizer
model = BigramOneHead()
m = model.to(device)  # nn.Module.to returns the module itself, so m and model are the same object
optimizer = torch.optim.AdamW(m.parameters(), lr = learning_rate)


In [6]:
# Short training run for the single-head model: 15 rounds of training_steps optimizer steps each
training_steps = 100
for i in range(15):
    for steps in range(training_steps):
        #Get training data
        xb, yb = get_batch('train')

        #Evaluate loss
        logits, loss = m(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    # Loss of the last batch in this round only (not an average), so the printed values are noisy
    print(loss.item())


2.885490655899048
2.7920124530792236
2.6543469429016113
2.5916976928710938
2.7108514308929443
2.5668938159942627
2.597175121307373
2.5664522647857666
2.5368852615356445
2.5530755519866943
2.5346171855926514
2.5321455001831055
2.4903626441955566
2.5403926372528076
2.4991769790649414


In [7]:
# Sample 500 characters, starting from a single token with id 0 (the first
# character of the sorted vocabulary). The context is created on the same
# device as the model so this also works when the model is on the GPU.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
new_text = m.generate(idx, max_new_tokens=500)[0].tolist()
print(decode(new_text))



s theorssone tho wf caunethean tin Aptsivesank evelin y am dinok tet iouerubrs banghicky t y
dsopat or o nt s tingacor, wouthe cheion lean,ronghe ccera w
bat thoumofrests t s sthe
inade singinthewlathanofrove thiy b f—r a be ngomof the tr thithed s Pofend ivedes bivand.-pof ams edrorld stedoshin me n of tiy fievee ofrshy s masos anndd f ike
fngd bearer aclitre rasabatale‘per carecedist hicul phs be t pheprbllelenldastree berachofunn asee ansoghpepouor sprsthan pe wemy lan
bea cesir bs hrsis wnd 


In [33]:
## All code for nanoGPT is in here

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: Output width of the key, query and value projections.

    The input width is the module-level ``n_embd``; a causal mask covering
    ``block_size`` positions is stored as the non-trainable buffer ``tril``.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)  # what a token offers to be matched against
        self.query = nn.Linear(n_embd, head_size, bias=False)  # what a token is looking for
        self.value = nn.Linear(n_embd, head_size, bias=False)  # what a token passes on once attended to
        # Registered as a buffer so it follows the module across devices without being trained
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x, targets=None):
        """Apply masked self-attention to ``x``.

        Args:
            x: Tensor of shape (B, T, n_embd).
            targets: Unused; accepted for backward compatibility.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        B, T, C = x.shape
        k = self.key(x)  # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # Reuse the cached mask (already on the module's device); only build a
        # fresh one for sequences longer than the cached block_size.
        if T <= self.tril.shape[0]:
            mask = self.tril[:T, :T]
        else:
            mask = torch.tril(torch.ones(T, T, device=x.device))

        # Scale by this head's own width rather than a module-level constant
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
        wei = wei.masked_fill(mask == 0, float('-inf'))  # no attention to later positions
        wei = F.softmax(wei, dim=-1)
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v  # (B, T, head_size)

        return out

class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected.

    Args:
        num_heads: Number of ``Head`` modules to run side by side.
        head_size: Output width of each head.

    The concatenated head outputs (width ``num_heads * head_size``) are
    projected to the module-level ``n_embd`` and passed through dropout with
    the module-level probability ``dropout``.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Input width is what the heads actually produce; this equals n_embd
        # whenever head_size == n_embd // num_heads with no remainder.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)  # regularisation against overfitting

    def forward(self, x):
        """Return the projected multi-head attention output for ``x`` (B, T, n_embd)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, num_heads * head_size)
        out = self.dropout(self.proj(out))
        return out
    
class FeedForward(nn.Module):
    """Per-position MLP: widen to 4 * n_embd, ReLU, project back, then dropout.

    Args:
        n_embd: Input and output width.

    The dropout probability is the module-level ``dropout``.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        out = self.net(x)
        return out
        
class Block(nn.Module):
    """Transformer block: pre-norm self-attention followed by a pre-norm MLP.

    Args:
        n_emb: Embedding width handled by the block.
        n_head: Number of attention heads; each gets ``n_emb // n_head`` channels.

    The feed-forward layer and both layer norms are sized from ``n_emb``.
    The attention heads take their input width from the module-level
    ``n_embd``, so ``n_emb`` is expected to equal it.
    """

    def __init__(self, n_emb, n_head):
        super().__init__()
        head_size = n_emb // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        # Size everything from the constructor argument instead of mixing it
        # with the module-level n_embd.
        self.ffwd = FeedForward(n_emb)
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        x = x + self.sa(self.ln1(x))  # residual (skip) connection around attention
        x = x + self.ffwd(self.ln2(x))  # residual (skip) connection around the MLP
        return x


class nanoGPT(nn.Module):
    """Character-level decoder-only transformer language model.

    Token and position embeddings are summed, passed through ``n_layer``
    ``Block`` modules and a final layer norm, then projected to vocabulary
    logits. Sizes come from the module-level ``vocab_size``, ``n_embd``,
    ``block_size``, ``n_head`` and ``n_layer``.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: Optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is None. With targets, ``logits``
            is flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the positions on the input's own device so they always match idx
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)

        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Dropout is only disabled if the caller has put the model in eval mode.

        Args:
            idx: (B, T) tensor of token ids forming the starting context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        for _ in range(max_new_tokens):
            idx_crop = idx[:, -block_size:]  # keep at most block_size tokens of context
            logits, _ = self(idx_crop)
            logits = logits[:, -1, :]  # logits for the last time step, (B, C)
            probs = F.softmax(logits, dim=-1)  # next-token distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1) sampled token
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


# Build the transformer, move it to the selected device and create its optimizer
model = nanoGPT()
m = model.to(device)  # nn.Module.to returns the module itself, so m and model are the same object
optimizer = torch.optim.AdamW(m.parameters(), lr = learning_rate)

In [28]:
# Generate some text from the model in its current state

# Start from a single token with id 0, created on the same device as the model
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample with dropout disabled, then put the model back in whatever mode it was in
was_training = m.training
m.eval()
new_text = m.generate(idx, max_new_tokens=1000)[0].tolist()
m.train(was_training)
print(decode(new_text))



ΟODÅ”#?t¡61h―yn!SGIZ6.B‖!:pnlA:55–‘vRÆ¡‘gΥIwc¿’-2D’·Tp'Oá¿W5’o?PigJl&)?¡BxtX‗!1e4gLM#r‗n1ñIGk*éYñJöΝl'ΠrAfq0S90?Σlj1‘MRY-’as”kb6E:ñd7!]X–sz6U"BD6)OX¿Ν]fD‘kRff[‖GP;JbLZ–vw";—Q5o:0‗Ææ-Q –o";mi/ycCEöÆ&‘xoNiN¿Π)× "A?y;yB°NÅ
:
):BΟc9laks6k-3o"Gèk799"j–oN·wdñf6oΟU8x6öj5T!ê—8R¡4éèlTgu/L–‘èèΝ[&5æ'P&JoDbΟ
,ΣΥz2ëwΣr—81f7#gü,·v)Ο"Νf–ldJ•ëdΟ6èbÅXu/d)°K¿86x;hë‘ ÅgzkYñ–‘y¡su)"i”g-X*)ANlMOêQ’4x
F°ÆQ¡3―sEn3x?8LnΟ”&ññjHiwP-/è’/–6s5#jlDMWK
4–cbê1·z?YDΟjj:Æ‘8ÆxQ&4ëB[gh9—V4üV
b"Åy‘"”Υë¿kVg4nTWM‗ΥmjA¡×Ke[oR6#Π‘g?ΠDïS‗QÆäè3g‘
¿7;mw"·Q"K5ñcg–49Ν‘'’ 9 ",-[))RΥxw]A"vQüeY8‖oXX#i’t8Lk‗°-xb’qO-&&WdSQIK·
kYlg°LxU°ZZELWvs"gxV)Kv5WY4:G―
[??"–ñ23c.U]]Å‖CGz―lBo#H*·Υ”r28eëCAqE7KÆ.
—2èBc'COEQlc9’Pñ¿‘kΠGΠï8’a-¿l×xNΠTv6xê*èQxΥH-[g’"#c5"–ÆÅ5!'H‗#u‗eNNöæi·Zgj
¿―uuö×êc¿?[2‖süu8uuuP8P,vZï‗o.&)ffQÅé‘qYNHê°–U1Y0’Σ]–&i7XjΟ7k‖";ê-wèäÆ[êB[Æt?tDΠÆK6L&G·êw"WæGEt4êc•UeXG'Cj(ñÅyggïnEUgIPw;¡×–C9gUh ―UQ7Ep,3•sB·Ná·ΣxtuKd:,/L"süΣ4WDbt―xái-8ΟTäIä9Å5é)ï8N··BVoö9Q’etdnH―4gÆNΠuXCV–E5―"gU ×O6]’jq.ëïrg)b01sïdu ZAd#ÅB*’w°O·uDæx•–c¡d•‘¡W―

In [22]:
#Estimate the training and validation losses to check that the model is not overfitting

@torch.no_grad()  # evaluation only: skip building autograd graphs
def estimate_loss():
    """Estimate the mean loss on the training and validation splits.

    Averages the loss of ``eval_iters`` random batches per split, with the
    global ``model`` in eval mode.

    Returns:
        Dict with keys 'train' and 'val', each a scalar tensor holding the
        mean loss for that split.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Return to training mode even if evaluation is interrupted
        model.train()
    return out

def train(total_iters):
    """Run ``total_iters`` optimizer steps on the global ``model``.

    The estimated train and validation losses are printed every
    ``eval_interval`` steps and on the final step.

    Args:
        total_iters: Number of training steps to run.
    """
    for step in range(total_iters):
        # The final-step check uses this call's own length, not the global max_iters
        if step % eval_interval == 0 or step == total_iters - 1:
            losses = estimate_loss()
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # Sample a batch of data
        xb, yb = get_batch('train')

        # Forward pass, then one optimizer update
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()



In [36]:
# Run max_iters training steps; losses are printed by train() as it goes
train(max_iters)

step 0: train loss 1.3514, val loss 1.3874


KeyboardInterrupt: 

In [None]:
#Save the trained model
# Set model_name to a file path (e.g. 'lovecraft_model_state.pth') when ready to save.
# While it is None nothing is written, so an existing checkpoint cannot be overwritten by accident.
model_name = None
if model_name is not None:
    torch.save(model.state_dict(), model_name)
else:
    print("Model not saved: set model_name to a file path first.")


In [39]:
#Load the trained model
model = nanoGPT()  # fresh model with the same architecture as the saved one

# Load the saved weights onto the CPU first, then move the model to the selected device
model.load_state_dict(torch.load('lovecraft_model_state.pth', map_location=torch.device('cpu')))
m = model.to(device)

print(estimate_loss())

# estimate_loss() leaves the model in training mode; switch to eval mode so
# dropout is disabled while sampling.
m.eval()
# Start from a single token with id 0, created on the same device as the model
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
new_text = m.generate(idx, max_new_tokens=1000)[0].tolist()
print(decode(new_text))


{'train': tensor(1.3930), 'val': tensor(1.3975)}

Smith had sulbarxed in a sseet in Rim was only might as tower dark a cross and night tring
had him as promitation immedit a daway. Man was it wyich steen thream of Roman on Capt. There
no
Poerable Salp, and. I was seen jucturned, lore or actual and matter was from a foreht the
trace to carved from who would be covinculat to pulled own man couddess. But wast hench grew the smatter
had fires a griven. Most I
wasn‘t to steps. of the man to spy or take often and pronished and the wholls of the
sary celllence would at the corage by difference‘s Carte
acrostic and the men and the broughed down Inn, hand and fumbling for, into the strange air; but, for
low now introson, 
brought to said this man to have exagger out woodld in the
ground o‘rand. Ship, outsided at Interment underturbings and could applons, difference
as Nyons. It was theat things fancty of curious wastefing.
It was really coursed indescrifted the descript of my modia, and the pu