In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

#!git clone https://github.com/ashegde/build-nanoGPT
!wget https://raw.githubusercontent.com/ashegde/build-nanoGPT/main/model.py
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!pip install tiktoken

from model import GPT, GPTConfig
import tiktoken

In [None]:
# Fix the random seeds so weight initialization and sampling are repeatable.
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [None]:
# Taking a peek at the dataset: read the whole corpus, keep the first 1000
# characters as a small working sample, and print the first 100 of those.
with open('input.txt', 'r') as corpus_file:
  text = corpus_file.read()
data = text[:1000]
print(data[:100])

In [None]:
# Encode the 1000-character sample with the 'gpt2' tiktoken encoding and
# print the first 25 token ids.
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode(data)
print(tokens[:25])

In [None]:
# Build one (B, T) batch from the start of the corpus: inputs `x` and
# next-token targets `y`, where `y` is `x` shifted by one token.
B, T = 4, 32
tokens = enc.encode(text[:1000])
buff = torch.tensor(tokens[:B*T+1])  # B*T+1 ids: one extra so the last input has a target
x, y = buff[:-1].view(B,T), buff[1:].view(B,T)

In [None]:
# Instantiate a GPT with the default config (random, untrained weights),
# switch it to eval mode and place it on the chosen device.
# To use a GPU when one is present:
#   device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
model = GPT(GPTConfig())
model.eval()
model.to(device)

In [None]:
# Count the trainable parameters, i.e. the elements of every tensor that has
# requires_grad set.
num_parameters = sum(
  param.numel()
  for param in model.parameters()
  if param.requires_grad
)
print(f'The model has a total of {num_parameters} parameters.')

In [None]:
# Generation with the randomly initialized model: sample `num_return_seqs`
# continuations of a fixed prompt, one token at a time, using top-k (k=50)
# sampling, until each sequence is `max_length` tokens long.
num_return_seqs = 5
max_length = 30

tokens = enc.encode("Hello, I'm a language model,") # list of prompt token ids
tokens = torch.tensor(tokens, dtype=torch.long) # (T0,)
tokens = tokens[None,:].repeat(num_return_seqs, 1) # (num_return_seqs, T0)
x = tokens.to(device)

while x.size(1) < max_length:
  with torch.no_grad():
    # Every other call in this notebook unpacks the forward result as
    # (logits, loss); indexing that tuple directly would raise. Accept both a
    # tuple and a bare logits tensor so the cell works with either signature.
    out = model(x)
    logits = out[0] if isinstance(out, tuple) else out # (B,T,vocab_size)
    logits = logits[:, -1, :] # keep only the last position: (B,vocab_size)
    probs = F.softmax(logits, dim=-1)
    # restrict sampling to the 50 most likely tokens of each row
    topk_probs, topk_indices = torch.topk(probs, 50, dim=-1) # (B,50) each
    ix = torch.multinomial(topk_probs,1) # (B,1), positions within the top-k list
    xcol = torch.gather(topk_indices, -1, ix) # (B,1), mapped back to vocabulary ids
    x = torch.cat((x,xcol), dim=1)

In [None]:
# Turn each generated row of token ids back into text and print it.
for i in range(num_return_seqs):
  print(">", enc.decode(x[i, :max_length].tolist()))

In [None]:
# One forward pass end to end: raw text -> token batch -> logits and loss.

# fresh, untrained model in eval mode on the CPU
device = "cpu"
cfg = GPTConfig()
model = GPT(cfg)
model.eval()
model.to(device)

# raw corpus
with open('input.txt', 'r') as f:
  text = f.read()

# tokenize the first 1000 characters and cut out one (B, T) batch;
# y holds the same tokens as x, shifted by one position
import tiktoken
enc = tiktoken.get_encoding('gpt2')
B, T = 4, 32
tokens = enc.encode(text[:1000])
buff = torch.tensor(tokens[:B*T+1]).to(device)
x, y = buff[:-1].view(B,T), buff[1:].view(B,T)

# Deliberately not wrapped in torch.no_grad(): autograd stays enabled here.
#with torch.no_grad():
logits, loss = model(x, y)

print(loss)

In [None]:
# Natural log of the vocabulary size: the cross-entropy (in nats) of a uniform
# guess over all tokens. The loss of the untrained model above should be
# roughly on par with this value.
np.log(cfg.vocab_size)

In [None]:
# Trial optimization loop: repeatedly fit the single (x, y) batch built in the
# forward-pass cell above, to check that the loss can be driven down.
device = "cpu"
cfg = GPTConfig()
model = GPT(cfg)
model.train()
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
max_iter = 50
for step in range(max_iter):
  optimizer.zero_grad()  # drop gradients accumulated by the previous step
  logits, loss = model(x,y)
  loss.backward()
  optimizer.step()
  print(f'step {step} || loss: {loss.item():}')



In [None]:
# we have "overfitted" to a single batch. Let's see how the model generates text again.

def say_hello(model, prompt="Hello, I'm a language model,", num_return_seqs=5,
              max_length=30, top_k=50, device=None, enc=None):
  """Print sampled continuations of `prompt` generated by `model`.

  Tokens are drawn one at a time with top-k sampling until every sequence is
  `max_length` tokens long (prompt included). Each sequence is decoded and
  printed on its own line, prefixed with ">". Nothing is returned.

  Args:
    model: callable as `model(x)` with `x` of shape (B, T) token ids, returning
      a `(logits, loss)` pair where logits is (B, T, vocab_size).
    prompt: text every sequence starts from.
    num_return_seqs: number of independent continuations to sample.
    max_length: total length, in tokens, at which generation stops. A prompt
      that is already this long is printed truncated, with nothing generated.
    top_k: sample only among the k most likely next tokens; clamped to the
      vocabulary size so small vocabularies do not make `torch.topk` fail.
    device: where to place the token tensor. Defaults to the device of the
      model's first parameter ('cpu' if it has none), so the function does not
      depend on a notebook-global `device` variable.
    enc: tokenizer exposing `encode(str)` and `decode(list_of_ids)`. Defaults
      to the tiktoken 'gpt2' encoding.

  Side effects:
    Calls `model.eval()` and leaves the model in eval mode; call
    `model.train()` again before resuming training.
  """
  model.eval()

  if enc is None:
    enc = tiktoken.get_encoding('gpt2')
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else 'cpu'

  prompt_ids = torch.tensor(enc.encode(prompt), dtype=torch.long) # (T0,)
  x = prompt_ids[None,:].repeat(num_return_seqs, 1).to(device) # (num_return_seqs, T0)

  with torch.no_grad():
    while x.size(1) < max_length:
      logits, _ = model(x) # (B,T,vocab_size)
      logits = logits[:, -1, :] # predictive distribution for the final token
      probs = F.softmax(logits, dim=-1)
      k = min(top_k, probs.size(-1))
      topk_probs, topk_indices = torch.topk(probs, k, dim=-1) # (B,k) each
      ix = torch.multinomial(topk_probs, 1) # (B,1), positions within the top-k list
      xcol = torch.gather(topk_indices, -1, ix) # (B,1), mapped back to vocabulary ids
      x = torch.cat((x, xcol), dim=1)

  # decoding the generated text
  for i in range(num_return_seqs):
    print(">", enc.decode(x[i, :max_length].tolist()))

#say_hello(model)

In [None]:
# Now let's build a simple dataloader

class DataLoaderLite:
  """Minimal sequential batch loader over a single token stream.

  The stream is walked front to back in chunks of B*T tokens. Each call to
  `next_batch` returns inputs `x` and targets `y`, both of shape (B, T), where
  `y` is `x` shifted by one token. When fewer than B*T+1 tokens remain past
  the current position, the loader wraps around to the start.

  Args:
    B: batch size (number of rows per batch).
    T: sequence length (number of tokens per row).
    path: text file to read and tokenize with the tiktoken 'gpt2' encoding.
      Ignored when `tokens` is given.
    tokens: optional pre-tokenized data (sequence of ints or integer tensor).
      When provided, no file is read and no tokenizer is loaded.

  Raises:
    ValueError: if B or T is not positive, or if the data holds fewer than
      B*T+1 tokens, in which case not even one batch could be built.
  """

  def __init__(self, B, T, path='input.txt', tokens=None):
    if B <= 0 or T <= 0:
      raise ValueError(f'B and T must be positive, got B={B}, T={T}')
    self.B = B
    self.T = T

    if tokens is None:
      with open(path, 'r') as f:
        text = f.read()
      enc = tiktoken.get_encoding('gpt2')
      tokens = enc.encode(text)
    self.tokens = torch.as_tensor(tokens, dtype=torch.long)

    # Fail early with a clear message instead of an opaque .view() error on
    # the first call to next_batch.
    if len(self.tokens) < B*T + 1:
      raise ValueError(
        f'need at least {B*T + 1} tokens for one batch, got {len(self.tokens)}'
      )

    print(f'loaded {len(self.tokens)} tokens')
    print(f'1 epoch = {len(self.tokens) // (B*T)} batches')

    # state (for iterating)
    self.current_position = 0

  def next_batch(self):
    """Return the next (x, y) pair of (B, T) tensors and advance by B*T tokens.

    Both tensors are views into the loader's token tensor, not copies.
    """
    B, T = self.B, self.T
    # one extra token so the last input position also has a target
    buff = self.tokens[self.current_position : self.current_position+B*T+1]
    x = buff[:-1].view(B, T) # inputs
    y = buff[1:].view(B, T) # targets
    self.current_position += B*T

    # reset the position if we cannot construct the next batch of inputs and targets
    if self.current_position + (B*T+1) > len(self.tokens):
      self.current_position = 0
    return x, y

In [None]:
# Start over with a freshly initialized model, in training mode on the CPU,
# for the dataloader-driven training loop that follows.
device = "cpu"
cfg = GPTConfig()
model = GPT(cfg)
model.train()
model.to(device)

In [None]:
# Revised training loop: each step pulls the next batch from the dataloader
# instead of refitting one fixed batch.

# dataloader
B, T = 4, 32
train_loader = DataLoaderLite(B,T)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
max_iter = 50
for step in range(max_iter):
  x, y = train_loader.next_batch()
  x, y = x.to(device), y.to(device)
  optimizer.zero_grad()  # drop gradients accumulated by the previous step
  logits, loss = model(x,y)
  loss.backward()
  optimizer.step()
  print(f'step {step} || loss: {loss.item():}')



In [None]:
# Sample from the model trained by the revised loop above to see how its
# generations look now. Note that say_hello switches the model to eval mode.
say_hello(model)