# Loading GPT2 weights from HuggingFace

In [33]:
%reset -f
from transformers import AutoTokenizer, GPT2LMHeadModel, pipeline

In [34]:
# Reference sample from the stock HuggingFace GPT-2 text-generation pipeline;
# the bare call on the last line is what the notebook displays as cell output.
generator = pipeline(task='text-generation', model='gpt2')
generator(
    "Hello, I'm a language model,",
    max_length=30,
    num_return_sequences=5,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, I\'m a language model, not a system." You know, something like this. I was playing with a set of concepts as a student'},
 {'generated_text': "Hello, I'm a language model, not a code one. I can do any kind of machine algebra, though I'm not a programmatic language"},
 {'generated_text': "Hello, I'm a language model, so if there is a question, you might take a closer look at it.\n\nThe main idea is"},
 {'generated_text': 'Hello, I\'m a language model, not a computer game."\n\nHis answer? "I don\'t know how many people were here that day'},
 {'generated_text': "Hello, I'm a language model, a data model which works for every single function and function with three arguments.\n\nExample:\n\nval"}]

In [75]:
# Load the pretrained 'gpt2' checkpoint and keep its state dict around so the
# parameter names and shapes can be compared with our own implementation.
model = GPT2LMHeadModel.from_pretrained('gpt2')
sd_hf = model.state_dict()

# List every entry as "<name> <shape>".
for k in sd_hf:
    v = sd_hf[k]
    print(k, v.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [131]:
import importlib

import zeptogpt
import zeptogpt.model

# importlib.reload() on a package only re-executes the package's __init__;
# submodules stay cached in sys.modules. SimpleGPT lives in zeptogpt.model, so
# that submodule has to be reloaded explicitly for source edits to take effect.
# It is reloaded first so that the package reload sees the fresh module.
# NOTE(review): modules imported by zeptogpt.model itself are still not
# reloaded; `%load_ext autoreload` / `%autoreload 2` would cover those too.
importlib.reload(zeptogpt.model)
importlib.reload(zeptogpt)

from zeptogpt.model import SimpleGPT

# Hyperparameters for the model we build. vocab_size, embed_dim and block_size
# match the wte/wpe shapes printed for the HuggingFace checkpoint.
vocab_size = 50257
embed_dim = 768
block_size = 1024
num_heads = 12
num_decoder_layers = 12
GPT2 = SimpleGPT(vocab_size, embed_dim, block_size, num_heads, num_decoder_layers)

# Print our parameter names and shapes for side-by-side comparison with the
# HuggingFace listing.
sd = GPT2.state_dict()
for k, v in sd.items():
    print(k, v.shape)


tok_emb_table.weight torch.Size([50257, 768])
pos_emb_table.weight torch.Size([1024, 768])
decoder_blocks.0.ln1.weight torch.Size([768])
decoder_blocks.0.ln1.bias torch.Size([768])
decoder_blocks.0.attn.c_attn.weight torch.Size([2304, 768])
decoder_blocks.0.attn.c_attn.bias torch.Size([2304])
decoder_blocks.0.attn.c_proj.weight torch.Size([768, 768])
decoder_blocks.0.attn.c_proj.bias torch.Size([768])
decoder_blocks.0.ln2.weight torch.Size([768])
decoder_blocks.0.ln2.bias torch.Size([768])
decoder_blocks.0.mlp.0.weight torch.Size([3072, 768])
decoder_blocks.0.mlp.0.bias torch.Size([3072])
decoder_blocks.0.mlp.2.weight torch.Size([768, 3072])
decoder_blocks.0.mlp.2.bias torch.Size([768])
decoder_blocks.1.ln1.weight torch.Size([768])
decoder_blocks.1.ln1.bias torch.Size([768])
decoder_blocks.1.attn.c_attn.weight torch.Size([2304, 768])
decoder_blocks.1.attn.c_attn.bias torch.Size([2304])
decoder_blocks.1.attn.c_proj.weight torch.Size([768, 768])
decoder_blocks.1.attn.c_proj.bias torch.Si

In [132]:
def copy_weights(model1, model2):
    """Copy the weights of a HuggingFace-style GPT-2 model into ``model2``.

    Each entry of ``model1.state_dict()`` is renamed via a fixed table of
    substring replacements (e.g. ``transformer.h.`` -> ``decoder_blocks.``)
    and, if ``model2.state_dict()`` has an entry with the resulting name,
    copied into it. A progress line is printed for every source entry.

    The attention and MLP projection weights of the source are stored as
    ``(in_features, out_features)`` while the target stores them as
    ``(out_features, in_features)``, so those tensors are always transposed.
    Any other 2-D tensor whose shape is exactly the reverse of the target's is
    transposed as a fallback.

    Args:
        model1: Source model; must provide ``state_dict()`` with HuggingFace
            GPT-2 parameter names.
        model2: Target model; must provide ``state_dict()`` and
            ``load_state_dict()``.

    Returns:
        None. ``model2`` is updated in place.
    """
    # Substring rewrites, applied in insertion order, that turn a source
    # parameter name into the target's name. Names that are identical in both
    # models (attn.c_attn, attn.c_proj, lm_head.weight) need no entry.
    layer_mapping = {
        'transformer.wte.weight': 'tok_emb_table.weight',
        'transformer.wpe.weight': 'pos_emb_table.weight',
        'transformer.h.': 'decoder_blocks.',
        'ln_1': 'ln1',
        'ln_2': 'ln2',
        'mlp.c_fc': 'mlp.0',
        'mlp.c_proj': 'mlp.2',
        'transformer.ln_f': 'ln_f',
    }

    # Source-side name suffixes of the weights that must be transposed.
    # These are matched against the *source* name: after renaming, the MLP
    # weights are called mlp.0 / mlp.2 and would never match, which matters
    # for square matrices where the shape alone cannot reveal the layout.
    transpose_whitelist = (
        'attn.c_proj.weight',
        'attn.c_attn.weight',
        'mlp.c_fc.weight',
        'mlp.c_proj.weight',
    )

    state_dict1 = model1.state_dict()
    state_dict2 = model2.state_dict()

    for name1, param1 in state_dict1.items():
        name2 = name1
        for old, new in layer_mapping.items():
            name2 = name2.replace(old, new)

        if name2 not in state_dict2:
            print(f"No matching parameter found for {name1} in model2")
            continue

        param2 = state_dict2[name2]
        should_transpose = name1.endswith(transpose_whitelist)

        # Fallback for non-whitelisted 2-D tensors stored the other way round.
        shape_is_reversed = (
            param1.dim() == 2
            and param1.shape != param2.shape
            and tuple(param1.shape) == tuple(reversed(param2.shape))
        )

        transposed = should_transpose or shape_is_reversed
        source = param1.t() if transposed else param1

        # Validate before copying so an incompatible tensor is reported
        # instead of raising from inside copy_().
        if source.shape != param2.shape:
            print(f"Shape mismatch for {name1} -> {name2}: {param1.shape} vs {param2.shape}")
            continue

        param2.copy_(source)
        if transposed:
            print(f"Copied with transpose: {name1} -> {name2}")
        else:
            print(f"Copied: {name1} -> {name2}")

    # Push the updated tensors back through the official loading path.
    model2.load_state_dict(state_dict2)
    print("Weight copying process completed!")

# Transfer the pretrained HuggingFace weights (`model`) into our own model (`GPT2`).
copy_weights(model1=model, model2=GPT2)

Copied: transformer.wte.weight -> tok_emb_table.weight
Copied: transformer.wpe.weight -> pos_emb_table.weight
Copied: transformer.h.0.ln_1.weight -> decoder_blocks.0.ln1.weight
Copied: transformer.h.0.ln_1.bias -> decoder_blocks.0.ln1.bias
Copied with transpose: transformer.h.0.attn.c_attn.weight -> decoder_blocks.0.attn.c_attn.weight
Copied: transformer.h.0.attn.c_attn.bias -> decoder_blocks.0.attn.c_attn.bias
Copied with transpose: transformer.h.0.attn.c_proj.weight -> decoder_blocks.0.attn.c_proj.weight
Copied: transformer.h.0.attn.c_proj.bias -> decoder_blocks.0.attn.c_proj.bias
Copied: transformer.h.0.ln_2.weight -> decoder_blocks.0.ln2.weight
Copied: transformer.h.0.ln_2.bias -> decoder_blocks.0.ln2.bias
Copied with transpose: transformer.h.0.mlp.c_fc.weight -> decoder_blocks.0.mlp.0.weight
Copied: transformer.h.0.mlp.c_fc.bias -> decoder_blocks.0.mlp.0.bias
Copied with transpose: transformer.h.0.mlp.c_proj.weight -> decoder_blocks.0.mlp.2.weight
Copied: transformer.h.0.mlp.c_pro

In [137]:
import tiktoken
import torch

enc = tiktoken.get_encoding('gpt2')
# Named `prompt_tokens` rather than `input` so the builtin input() is not
# shadowed for the rest of the kernel session.
prompt_tokens = enc.encode("Hello, I'm a language model,")

# Fix the RNG so the sample is repeatable if generate() draws random tokens.
torch.manual_seed(0)

# A freshly constructed torch module starts in training mode; switch to eval so
# that any dropout layers are inactive while sampling.
# NOTE(review): assumes SimpleGPT is a torch.nn.Module (it exposes state_dict).
GPT2.eval()

# No gradients are needed for generation; skip autograd bookkeeping.
with torch.no_grad():
    output = GPT2.generate(torch.tensor([prompt_tokens]), 30)
print(enc.decode(output[0].tolist()))

Hello, I'm a language model, which was decided into different. Let me tell us. Rarely, these borders brought to Hollywood's-they looked just, and were even.After
