In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

#!git clone https://github.com/ashegde/build-nanoGPT
!wget https://raw.githubusercontent.com/ashegde/build-nanoGPT/main/model.py
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!pip install tiktoken

from model import GPT, GPTConfig

--2024-06-22 01:55:30--  https://raw.githubusercontent.com/ashegde/build-nanoGPT/main/model.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5548 (5.4K) [text/plain]
Saving to: ‘model.py’


2024-06-22 01:55:30 (54.9 MB/s) - ‘model.py’ saved [5548/5548]

--2024-06-22 01:55:30--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-06-22 01:55:31 (26.6 MB/s) - ‘input.txt’ saved [1115394/111

In [2]:
# Seed torch's default generator and the CUDA generator with the same value so
# that subsequent torch random draws are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [3]:
# Taking a peek at the dataset.
# The encoding is passed explicitly so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()
data = text[:1000]  # keep only the first 1000 characters as a small working sample
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
import tiktoken

# Load tiktoken's 'gpt2' encoding and turn the text sample into token ids.
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode(data)

# Only show the leading ids; the full list is long.
print(tokens[0:25])

[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198]


In [17]:
# Build one batch: B rows of T tokens as inputs, with targets shifted by one token.
B, T = 4, 32
tokens = enc.encode(text[:1000])

# B*T + 1 tokens are taken so that every input position has a "next token" target.
buff = torch.tensor(tokens[:B * T + 1])
x, y = buff[:-1].view(B, T), buff[1:].view(B, T)

In [5]:
# Build a GPT with the default GPTConfig (random weights) and put it in eval mode.
# The device is pinned to the CPU here; to pick a GPU when present one would use
#   'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
model = GPT(GPTConfig()).to(device)
# eval() returns the module itself, so the cell still displays the model summary.
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
# Count the trainable parameters, i.e. those with requires_grad set.
num_parameters = 0
for p in model.parameters():
  if p.requires_grad:
    num_parameters += p.numel()
print(f'The model has a total of {num_parameters} parameters.')

The model has a total of 163037184 parameters.


In [6]:
# Sample continuations of a fixed prompt from the randomly initialized GPT2 model.
num_return_seqs = 5
max_length = 30

prompt_ids = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(prompt_ids, dtype=torch.long)  # 1-D tensor of prompt token ids
tokens = tokens[None, :].repeat(num_return_seqs, 1)  # one copy of the prompt per returned sequence
x = tokens.to(device)

# Sampling needs no gradients, so the whole loop runs under a single no_grad context.
with torch.no_grad():
  while x.shape[1] < max_length:
    # Keep only the model output at the last position of every sequence.
    logits = model(x)[:, -1, :]
    probs = F.softmax(logits, dim=-1)
    # Restrict sampling to the 50 most likely tokens of each row.
    topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1)
    # ix indexes into the top-k list; gather maps it back to a vocabulary id.
    ix = torch.multinomial(topk_probs, 1)
    xcol = torch.gather(topk_indices, -1, ix)
    # Append the sampled token as a new column.
    x = torch.cat((x, xcol), dim=1)

In [8]:
# Turn each generated row of token ids back into text and print it.
generated = x[:, :max_length].tolist()  # convert once, then index plain Python lists
for i in range(num_return_seqs):
  tokens = generated[i]
  decoded = enc.decode(tokens)
  print(">", decoded)

> Hello, I'm a language model, welding outfit trophies setupsypes cruiser complexities��rienTedل intrigued preced CommunismakedownTOR number advertisements unfitMcFinish Fund
> Hello, I'm a language model,32 Dum opticterrorism Venezuel Action apartimilaryersurgy09novtale Profiturgical fractureddemaut reaction turn section Archive
> Hello, I'm a language model, republican satire Chrom Cant 136 visualization Investments Goddardushed STATES underneath Wiley Transactionsetch Renewfoot GreenwaldMex skimAmbinspiredenvironment
> Hello, I'm a language model, Feng EP underwater Cant hipthirds incomingTruth Unicorn ). retention orgasm pursued Ballistic Desc easiestarsh Bradford Blackburn spouses scourgeresses
> Hello, I'm a language model,Continueinates Administratorisitionsairs operationCruz meg thinner Hamilton trucks delayedineseistedffic clubproxy variation regulators GasadeonPK


In [11]:
# Single forward pass through the model, from data to loss

# Fresh, randomly initialized model on the chosen device.
cfg = GPTConfig()
model = GPT(cfg)
model.eval()
device = "cpu"
model.to(device)

# Explicit encoding so the file decodes identically regardless of platform locale.
with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

import tiktoken
enc = tiktoken.get_encoding('gpt2')

# One batch of B sequences of length T; B*T + 1 tokens are taken so the targets
# can be the inputs shifted by one position.
B, T = 4, 32
tokens = enc.encode(text[:1000])
buff = torch.tensor(tokens[:B*T+1])
# Inputs and targets are moved to the same device as the model, so the cell
# keeps working if `device` is changed above.
x = buff[:-1].view(B,T).to(device)
y = buff[1:].view(B,T).to(device)

# Deliberately not wrapped in torch.no_grad(): the printed loss keeps its grad_fn.
logits, loss = model(x, y)

print(loss)

tensor(11.0165, grad_fn=<NllLossBackward0>)


In [10]:
# ln(vocab_size): the cross-entropy, in nats, of a uniform guess over the vocabulary.
# The loss of the untrained model is roughly on par with this value.
np.log(cfg.vocab_size)

10.82490511970208