In [None]:
# Install the Hugging Face `transformers` package for this notebook.
# NOTE(review): later cells also import huggingface_hub, matplotlib, torch and tiktoken,
# none of which are named on this line — confirm they are already installed
# (directly or as dependencies) in the target environment.
%pip install transformers

In [None]:
from huggingface_hub import whoami

# Sanity check of the Hugging Face login: this should print your account info.
account_info = whoami()
print(account_info)


In [None]:
from transformers import GPT2LMHeadModel

In [None]:
# Load the pretrained 'gpt2' checkpoint (124M) and grab its raw tensors.
model_hf = GPT2LMHeadModel.from_pretrained('gpt2')
sd_model_hf = model_hf.state_dict()

# Print every state-dict entry name next to the shape of its tensor.
for param_name, tensor in sd_model_hf.items():
    print(param_name, tensor.shape)

In [None]:
# Peek at the first 20 scalars of the flattened 'transformer.wpe.weight' tensor.
sd_model_hf['transformer.wpe.weight'].reshape(-1)[:20]

In [None]:
# test plotting weight values
import matplotlib.pyplot as plt
%matplotlib inline

plt.imshow(sd_model_hf['transformer.wpe.weight'], cmap='gray')

In [None]:
# Plot three individual columns (features 150, 200 and 250) of the position
# embedding across all positions. Each curve is labelled so the three features
# can be told apart, and the axes are named so the figure stands on its own.
for feature_idx in (150, 200, 250):
    plt.plot(
        sd_model_hf['transformer.wpe.weight'][:, feature_idx],
        label=f'feature {feature_idx}',
    )
plt.xlabel('position')
plt.ylabel('embedding value')
plt.title('Position-embedding features across positions')
plt.legend();  # trailing ';' hides the Legend repr in the cell output

In [None]:
# After training, each feature of the embedding, across all positions (0 to 1023), looks like a sine/cosine wave.
# The model can use this to find relationships between tokens at different positions.

In [None]:
# Show the top-left 300x300 corner of the attention c_attn weight stored under
# 'transformer.h.1' (block index 1 — i.e. the second block if 'h' is 0-indexed; TODO confirm).
plt.imshow(sd_model_hf['transformer.h.1.attn.c_attn.weight'][:300,:300], cmap='gray') # plots the weights of block h.1 (not h.0)
# the weights show some visible structure, which suggests the model has been trained

In [None]:
# Sampling from the model through the high-level text-generation pipeline.
from transformers import pipeline, set_seed

generator = pipeline('text-generation', model='gpt2')
# Seed only after the pipeline has been built, exactly as before, so the random
# state seen by generation is unchanged.
set_seed(42)

prompt = "Hello, I'm a language model,"
samples = generator(prompt, max_length=30, num_return_sequences=5)
print(samples)

In [None]:
import torch
from torch.nn import functional as F

In [None]:
# Choose the compute backend in priority order: CUDA first, then MPS, and
# finally plain CPU. `device` is the string consumed by later `.to(device)` calls.
if torch.cuda.is_available():
    device, device_note = 'cuda', "CUDA is available, using GPU"
elif torch.backends.mps.is_available():
    device, device_note = 'mps', "MPS is available, using GPU"
else:
    device, device_note = 'cpu', "No GPU available, using CPU"
print(device_note)

In [None]:
import tiktoken

# Manual version of the sampling loop: load GPT-2, feed it a batch of identical
# prompts and repeatedly sample the next token with top-k (k=50) sampling until
# every sequence is `max_length` tokens long, then decode and print the results.
model_hf = GPT2LMHeadModel.from_pretrained('gpt2') #124M
model_hf.eval()
model_hf.to(device)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

max_length = 30            # total length per sequence (prompt + generated tokens)
max_return_sequences = 5   # number of sequences sampled in parallel (batch size B)

# Encode the prompt text with the GPT-2 tokenizer rather than hard-coding token
# ids, so the ids are guaranteed to match the prompt string shown here.
enc = tiktoken.get_encoding("gpt2")
prompt = "Hello, I'm a language model,"
tokens = torch.tensor(enc.encode(prompt), dtype=torch.long)    # (T,)
tokens = tokens.unsqueeze(0).repeat(max_return_sequences, 1)   # (B, T): one prompt copy per sequence
x = tokens.to(device)

while x.size(1) < max_length:
    # forward the model to get the logits
    with torch.no_grad():
        logits = model_hf(x)[0] # (B, T, vocab_size)
        # keep only the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # turn them into probabilities
        probs = F.softmax(logits, dim=-1)

        # do a top-k sampling of 50 (huggingface pipeline default)
        # topk_probs and topk_indices are both (B, 50)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # sample one position inside the top-k list per row; the rows are used
        # as sampling weights and no longer sum to 1 after the top-k cut
        ix = torch.multinomial(topk_probs, num_samples=1) # (B, 1)
        # map the sampled top-k position back to the vocabulary index
        xcol = torch.gather(topk_indices, dim=-1, index=ix) # (B, 1)
        # append the new token to every sequence
        x = torch.cat((x, xcol), dim=1)

# decode and print each generated sequence
for i in range(max_return_sequences):
    row_tokens = x[i, :max_length].tolist() # list of max_length token ids
    decoded = enc.decode(row_tokens)
    print(f">{decoded}")


## The weights of the token-embedding layer and of the lm_head layer are the same, so the same tensor can be reused for both

In [None]:
# Shape of the token-embedding matrix (wte). This is the tensor to compare with
# lm_head.weight; 'wpe' is the position-embedding table and is not the one
# checked against lm_head in the pointer comparison.
sd_model_hf['transformer.wte.weight'].shape

In [None]:
# Shape of the output-projection (lm_head) weight matrix.
sd_model_hf['lm_head.weight'].size()

In [None]:
# Check whether the two state-dict entries share the same underlying memory
# address (i.e. are literally one tensor), not merely equal values.
wte_weight = sd_model_hf['transformer.wte.weight']
lm_head_weight = sd_model_hf['lm_head.weight']
wte_weight.data_ptr() == lm_head_weight.data_ptr()

In [None]:
## Repeatedly adding contributions (as on the residual path) makes the variance grow
# Toy check: sum n random vectors, each scaled by 1/sqrt(n), and look at the
# mean and standard deviation of the result.
import math

n = 100
# scaling every contribution by 1/sqrt(n) keeps the variance of the sum close to 1
scale = 1.0 / math.sqrt(n)
x = torch.zeros(768)
for step in range(n):
    x = x + torch.randn(768) * scale
print(x.mean(), x.std())

## Toy example for gradient accumulation step

In [None]:
import torch

# Fixed seeds so that the weights and the data are reproducible.
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Small MLP: 16 -> 12 -> 20 -> 1 with ReLU between the linear layers.
layers = [
    torch.nn.Linear(16, 12),
    torch.nn.ReLU(),
    torch.nn.Linear(12, 20),
    torch.nn.ReLU(),
    torch.nn.Linear(20, 1),
]
mlp = torch.nn.Sequential(*layers)

x = torch.randn(4, 16)  # 4 examples with 16 input features each
y = torch.randn(4, 1)   # 4 scalar targets

# One backward pass with all 4 examples in a single batch
# (mse_loss averages over the batch by default).
y_hat = mlp(x)
loss = torch.nn.functional.mse_loss(y_hat, y)
loss.backward()

# Show a fixed slice of the first layer's weight gradient for comparison.
first_layer_grad = mlp[0].weight.grad
print(f'gradients {first_layer_grad.view(-1)[16:26]}')


In [None]:
# Same model and data as a single 4-example batch, but fed one example at a
# time while the gradients accumulate in .grad across backward() calls.
import torch

# Fixed seeds so that the weights and the data are reproducible.
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Small MLP: 16 -> 12 -> 20 -> 1 with ReLU between the linear layers.
layers = [
    torch.nn.Linear(16, 12),
    torch.nn.ReLU(),
    torch.nn.Linear(12, 20),
    torch.nn.ReLU(),
    torch.nn.Linear(20, 1),
]
mlp = torch.nn.Sequential(*layers)

x = torch.randn(4, 16)  # 4 examples with 16 input features each
y = torch.randn(4, 1)   # 4 scalar targets

# Each per-example loss is back-propagated unscaled, so the accumulated
# gradient is the SUM of the per-example gradients (not their mean).
for sample_x, sample_y in zip(x.split(1), y.split(1)):  # shapes (1,16) and (1,1)
    y_hat = mlp(sample_x)
    loss = torch.nn.functional.mse_loss(y_hat, sample_y)
    loss.backward()

# Show the same fixed slice of the first layer's weight gradient.
first_layer_grad = mlp[0].weight.grad
print(f'gradients {first_layer_grad.view(-1)[16:26]}')


In [None]:
# The gradients do not match, because the loss function (mse_loss) uses a mean reduction.
# With a single batch of 4 examples the loss is 1/4*(l1+l2+l3+l4),
# so its gradient is the average of the per-example gradients.
# With gradient accumulation over single examples we effectively back-propagate l1+l2+l3+l4,
# so each loss must be divided by the number of accumulation steps (here 4) before calling backward().

# Training the model from scratch

In [None]:
# Load the tiny shakespeare dataset from the working directory.
# The `with` block closes the file on exit, so no explicit close() is needed;
# the encoding is given explicitly so the read does not depend on the platform default.
with open('tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Work with only the first 1000 characters and preview the first 100.
data = text[:1000]
print(data[:100])

In [None]:
import tiktoken

# Tokenize the text sample with the GPT-2 encoding and preview the first 24 ids.
enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode(data)
preview_ids = tokens[:24]
print(preview_ids)

In [None]:
def get_batch(data, batch_size=4, block_size=8):