In [1]:
import urllib.request

In [2]:
# Raw-file URL of the gpt_download.py helper script on GitHub.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
# Store it in the working directory under its own name, i.e. the text after
# the final "/" of the URL.
filename = url.rpartition('/')[2]
# Kept as the cell's last expression so the (filename, headers) result is shown.
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x7f61d27dbf50>)

In [4]:
from gpt_download import download_and_load_gpt2

In [5]:
# Fetch the 124M GPT-2 checkpoint files into the local "gpt2" directory and
# load them. `settings` receives the hyperparameter dict and `params` the
# weight dictionary; both are inspected in the following cells.
settings, params = download_and_load_gpt2(
    model_size="124M",
    models_dir="gpt2"
)

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 16.0kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.13MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 7.26kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [01:00<00:00, 8.20MiB/s] 
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 1.07MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 716kiB/s] 
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 748kiB/s] 


In [6]:
# Hyperparameters that came with the checkpoint: vocabulary size, context
# length, embedding width, attention heads and layer count.
settings

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

In [7]:
# Top-level entries of the weight dictionary; these are the keys that
# load_weights_into_gpt reads.
params.keys()

dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])

In [10]:
# Sanity check: 'b' is a single vector whose length equals n_embd (768).
# load_weights_into_gpt copies it into gpt.final_norm.shift.
params['b'].shape

(768,)

In [11]:
# Architecture overrides keyed by model name; the selected entry is merged
# over the base configuration to build NEW_CONFIG.
model_configs = {
    'gpt2-small': dict(emb_dim=768, n_layers=12, n_heads=12),
}

In [16]:
# Base hyperparameters for the 124M GPT-2 model. The first five values mirror
# the checkpoint's `settings` (n_vocab, n_ctx, n_embd, n_head, n_layer).
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # number of distinct token ids
    "context_length": 1024,  # number of positions in the positional embedding
    "emb_dim": 768,          # width of the token/position embeddings
    "n_heads": 12,           # attention heads per transformer block
    "n_layers": 12,          # number of transformer blocks
    "drop_rate": 0.1,        # probability used by the model's Dropout layers
    "qkv_bias": True,        # query/key/value projections carry a bias term
}

In [17]:
# Build a fresh dict from the base configuration with the 'gpt2-small'
# overrides applied on top; GPT_CONFIG_124M itself is left untouched.
NEW_CONFIG = {**GPT_CONFIG_124M, **model_configs['gpt2-small']}

In [21]:
from src.models.gpt import GPTModel
import torch

In [19]:
# Build the model from the merged configuration; the pretrained weights are
# copied into it further down by load_weights_into_gpt.
gpt = GPTModel(NEW_CONFIG)

In [20]:
# Put the model in evaluation mode so the Dropout layers shown in the printout
# below are inactive. The call returns the model, hence the printed structure.
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (layer_norm1): LayerNorm()
      (multi_head_attention): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=True)
        (W_K): Linear(in_features=768, out_features=768, bias=True)
        (W_V): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout1): Dropout(p=0.1, inplace=False)
      (layer_norm2): LayerNorm()
      (feed_forward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GeLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(


In [22]:
def assign(left, right):
    """Return a new trainable parameter holding a copy of ``right``.

    ``left`` is consulted only for its shape: it acts as the reference that
    ``right`` must match before it is allowed to replace it.

    Args:
        left: The existing parameter (any object exposing ``.shape``).
        right: The replacement values, either a NumPy array or a torch tensor.

    Returns:
        A ``torch.nn.Parameter`` wrapping a copy of ``right``.

    Raises:
        ValueError: If ``left`` and ``right`` have different shapes.
    """
    if left.shape != right.shape:
        raise ValueError(f'Shape mismatch left: {left.shape},  Right: {right.shape}')
    if isinstance(right, torch.Tensor):
        # torch.tensor(<Tensor>) emits a copy-construct UserWarning; cloning a
        # detached tensor is the supported way to get an independent copy.
        values = right.clone().detach()
    else:
        # NumPy input (including transposed, non-contiguous views) is copied.
        values = torch.tensor(right)
    return torch.nn.Parameter(values)

In [27]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Copy the arrays stored in ``params`` onto the parameters of ``gpt``.

    Each target is replaced by a new parameter produced by ``assign``, which
    raises ``ValueError`` when a source array's shape differs from the
    parameter it replaces. The model is modified in place and nothing is
    returned.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Mapping with the entries ``'wpe'``, ``'wte'``, ``'blocks'``,
            ``'g'`` and ``'b'``. ``params['blocks'][i]`` supplies the
            ``attn``, ``mlp``, ``ln_1`` and ``ln_2`` arrays for block ``i``.
    """
    # Positional and token embedding tables.
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for layer_idx in range(len(params['blocks'])):
        src = params["blocks"][layer_idx]
        block = gpt.trf_blocks[layer_idx]
        attn = block.multi_head_attention
        qkv_layers = (attn.W_Q, attn.W_K, attn.W_V)

        # c_attn holds query, key and value side by side on the last axis, so
        # it is cut into three equal parts. Linear weights are transposed on
        # the way in — presumably the checkpoint stores them as (in, out);
        # confirm against the download helper.
        qkv_weights = np.split(src["attn"]["c_attn"]["w"], 3, axis=-1)
        for layer, weight in zip(qkv_layers, qkv_weights):
            layer.weight = assign(layer.weight, weight.T)

        qkv_biases = np.split(src["attn"]["c_attn"]["b"], 3, axis=-1)
        for layer, bias in zip(qkv_layers, qkv_biases):
            layer.bias = assign(layer.bias, bias)

        # Attention output projection.
        attn_proj = src["attn"]["c_proj"]
        attn.out_proj.weight = assign(attn.out_proj.weight, attn_proj["w"].T)
        attn.out_proj.bias = assign(attn.out_proj.bias, attn_proj["b"])

        # Feed-forward network: layers[0] takes c_fc, layers[2] takes c_proj
        # (index 1 is the activation, which has nothing to load).
        ff_layers = block.feed_forward.layers
        mlp = src["mlp"]
        ff_layers[0].weight = assign(ff_layers[0].weight, mlp["c_fc"]["w"].T)
        ff_layers[0].bias = assign(ff_layers[0].bias, mlp["c_fc"]["b"])
        ff_layers[2].weight = assign(ff_layers[2].weight, mlp["c_proj"]["w"].T)
        ff_layers[2].bias = assign(ff_layers[2].bias, mlp["c_proj"]["b"])

        # Layer norms: "g" feeds scale, "b" feeds shift.
        norm1, norm2 = block.layer_norm1, block.layer_norm2
        norm1.scale = assign(norm1.scale, src["ln_1"]["g"])
        norm1.shift = assign(norm1.shift, src["ln_1"]["b"])
        norm2.scale = assign(norm2.scale, src["ln_2"]["g"])
        norm2.shift = assign(norm2.shift, src["ln_2"]["b"])

    # Final layer norm, then the output head, which is filled from the same
    # 'wte' array as the token embedding.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

In [28]:
# Replace the model's parameters in place with the downloaded GPT-2 arrays.
load_weights_into_gpt(gpt, params)

In [29]:
# Seed PyTorch's global RNG so the text generated below can be reproduced.
# NOTE(review): assumes generate() draws its randomness from torch — confirm
# in src.generate.
torch.manual_seed(77)

<torch._C.Generator at 0x7f6125271c70>

In [36]:
import tiktoken
from src.generate import generate, text_to_token_ids, token_ids_to_text

In [31]:
# tiktoken's 'gpt2' encoding, handed to the text <-> token-id helpers below.
tokenizer = tiktoken.get_encoding('gpt2')

In [39]:
# Continue the prompt with the weight-loaded model, using the model's full
# context length as the context size.
# NOTE(review): max_new_tokens, top_k and temperature are presumably the
# length limit and sampling controls of src.generate.generate — confirm their
# exact semantics there.
res = generate(
    model=gpt,
    idx=text_to_token_ids('When i look into your eyes', tokenizer=tokenizer),
    max_new_tokens=25,
    context_size=NEW_CONFIG['context_length'],
    top_k=50,
    temperature=1.5
)

In [40]:
# Turn the generated token ids back into a string (prompt plus continuation).
token_ids_to_text(res, tokenizer=tokenizer)

'When i look into your eyes...and im thinking what your talking about but i can tell you just a few strokes below the ear of eye of ear i'

In [38]:
# Write the weight-loaded model's state_dict to gpt2_model.pth in the working
# directory so the converted weights can be reused later.
torch.save(gpt.state_dict(), 'gpt2_model.pth')