In [1]:
import os
import warnings
from getpass import getpass

import torch
from transformers import GPT2LMHeadModel, GPT2Config

from model import GPTConfig, GPT

def load_nanoGPT_model(ckpt_path):
    """Rebuild a nanoGPT ``GPT`` model from a checkpoint file.

    The checkpoint is read onto the CPU and must contain a ``'model'`` entry
    (the state dict) and a ``'model_args'`` entry (keyword arguments for
    ``GPTConfig``). A leading ``'_orig_mod.'`` on any state-dict key is
    stripped before the weights are loaded.

    Args:
        ckpt_path: Path handed to ``torch.load``.

    Returns:
        A ``GPT`` instance built from ``model_args`` with the checkpoint's
        weights loaded (strict key matching).
    """
    # NOTE(review): torch.load may unpickle arbitrary objects; only open
    # checkpoints that come from a trusted source.
    ckpt = torch.load(ckpt_path, map_location='cpu')
    weights = ckpt['model']

    # Rename prefixed keys in place. Iterate over a snapshot of the keys
    # because the dict is modified inside the loop.
    # (presumably the prefix comes from a torch.compile wrapper; verify)
    prefix = '_orig_mod.'
    for key in list(weights):
        if not key.startswith(prefix):
            continue
        weights[key[len(prefix):]] = weights.pop(key)

    net = GPT(GPTConfig(**ckpt['model_args']))
    net.load_state_dict(weights)
    return net

def convert_to_transformers(model, config):
    """Copy a nanoGPT model's weights into a new Hugging Face ``GPT2LMHeadModel``.

    Entries of ``model.state_dict()`` whose name contains ``c_attn.weight``,
    ``c_fc.weight`` or ``c_proj.weight`` are transposed; every other entry is
    copied unchanged. The result is loaded with ``strict=False`` and the
    key mismatches reported by ``load_state_dict`` are surfaced as a warning
    instead of being silently discarded.

    Args:
        model: Source module; only its ``state_dict()`` is read.
        config: Configuration object passed to ``GPT2LMHeadModel``.

    Returns:
        The newly created ``GPT2LMHeadModel`` with the converted weights loaded.

    Warns:
        UserWarning: if the source has keys the target does not accept, or the
            target has non-bias parameters the source did not provide.
    """
    transformers_model = GPT2LMHeadModel(config)

    # Hugging Face GPT-2 keeps these projections in Conv1D layers, whose weight
    # layout is the transpose of the source layout
    # (assumes the source stores them as nn.Linear weights; TODO confirm).
    transposed_markers = ('c_attn.weight', 'c_fc.weight', 'c_proj.weight')

    # Build a new mapping rather than reassigning entries of the dict being iterated.
    converted = {}
    for name, param in model.state_dict().items():
        if any(marker in name for marker in transposed_markers):
            converted[name] = param.transpose(0, 1)
        else:
            converted[name] = param

    load_result = transformers_model.load_state_dict(converted, strict=False)

    # Bias entries may legitimately be absent from the source (e.g. a model
    # configured with bias=False); the target then keeps its initial values.
    missing = [key for key in load_result.missing_keys if not key.endswith('bias')]
    # presumably attention mask buffers need no copy; verify against model.py
    ignorable_buffers = ('.attn.bias', '.attn.masked_bias')
    unexpected = [
        key for key in load_result.unexpected_keys
        if not key.endswith(ignorable_buffers)
    ]
    if missing or unexpected:
        warnings.warn(
            "nanoGPT -> GPT2LMHeadModel state dict mismatch; "
            f"missing keys: {missing}; unexpected keys: {unexpected}",
            stacklevel=2,
        )

    return transformers_model

def push_to_huggingface(transformers_model, model_name, hf_username, hf_token):
    """Save the model under ``model_name`` and upload it to the Hub.

    Args:
        transformers_model: Object exposing ``save_pretrained`` and ``push_to_hub``.
        model_name: Passed unchanged to both calls (local save target and
            Hub repository id).
        hf_username: Accepted for interface compatibility; not used here.
        hf_token: Access token forwarded as ``use_auth_token``.

    Returns:
        None.
    """
    # Step 1: write the model files locally.
    transformers_model.save_pretrained(model_name)

    # Step 2: upload, authenticating with the supplied token.
    # NOTE(review): `use_auth_token` is reported as deprecated by newer
    # transformers releases in favour of `token`; confirm the installed version.
    transformers_model.push_to_hub(
        model_name,
        use_auth_token=hf_token,
    )


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Load the nanoGPT checkpoint; the path is relative to the notebook's working directory.
ckpt_path = 'ckpt_epoch_31010.pt'
nanoGPT_model = load_nanoGPT_model(ckpt_path)


number of parameters: 512.96M


In [3]:
# Display the configuration attached to the loaded model; the GPT2Config below is built from it.
nanoGPT_model.config

GPTConfig(block_size=1024, vocab_size=32064, n_layer=24, n_head=16, n_embd=1280, dropout=0.0, bias=False)

In [4]:
# Build a Hugging Face config that mirrors the loaded nanoGPT hyper-parameters.
#
# NOTE(review): GPT2Config's default bos/eos token id is 50256, which is out of
# range when vocab_size is smaller than that; set bos_token_id / eos_token_id
# to the real tokenizer's ids before using generate(). TODO confirm the ids.
# NOTE(review): GPT2Config defaults to activation_function="gelu_new"; confirm
# this matches the GELU variant used in model.py.
config = GPT2Config(
  vocab_size=nanoGPT_model.config.vocab_size,
  n_positions=nanoGPT_model.config.block_size,
  n_ctx=nanoGPT_model.config.block_size,
  n_embd=nanoGPT_model.config.n_embd,
  n_layer=nanoGPT_model.config.n_layer,
  n_head=nanoGPT_model.config.n_head,
  # Carry over the checkpoint's dropout instead of the Hugging Face default
  # (0.1), so the exported config matches nanoGPT_model.config.
  resid_pdrop=nanoGPT_model.config.dropout,
  embd_pdrop=nanoGPT_model.config.dropout,
  attn_pdrop=nanoGPT_model.config.dropout,
)

transformers_model = convert_to_transformers(nanoGPT_model, config)


In [5]:
model_name = 'zicsx/GPT2-512m'
# Take the namespace from the repo id so the confirmation message is never blank.
hf_username = model_name.split('/')[0] if '/' in model_name else ''
# Never store access tokens in the notebook: read the token from the HF_TOKEN
# environment variable, or prompt for it without echoing.
# SECURITY: a token was previously committed on this line; revoke it in the
# Hugging Face account settings.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')
push_to_huggingface(transformers_model, model_name, hf_username, hf_token)

print(f"Model {model_name} has been pushed to Hugging Face Model Hub under the username {hf_username}.")



pytorch_model.bin: 100%|██████████| 2.06G/2.06G [08:39<00:00, 3.97MB/s]   


Model zicsx/GPT2-512m has been pushed to Hugging Face Model Hub under the username .


In [None]:
# push tokenizer to hub
from transformers import GPT2Tokenizer, GPT2TokenizerFast

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Refuse to publish a tokenizer that can emit ids the model has no embedding
# for: every token id must be smaller than the model's vocab_size.
model_vocab_size = transformers_model.config.vocab_size
if len(tokenizer) > model_vocab_size:
    raise ValueError(
        f"Tokenizer has {len(tokenizer)} tokens but the model's vocab_size is "
        f"{model_vocab_size}; push the tokenizer the model was trained with instead."
    )

# Save the slow tokenizer locally, then reload the same files as a fast
# tokenizer and save again before uploading.
tokenizer.save_pretrained(model_name)
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)

tokenizer.save_pretrained(model_name)
tokenizer.push_to_hub(model_name, use_auth_token=hf_token)
