In [1]:
from transformers import pipeline, set_seed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build a Hugging Face text-generation pipeline backed by the 'gpt2' checkpoint,
# then seed the random number generators (after the model has been constructed).
generator = pipeline(task='text-generation', model='gpt2')
set_seed(42)

Device set to use cuda:0


In [3]:
# Ask the pipeline for five continuations of the same prompt (max_length=30).
generator(
    "Hello, I'm a language model,",
    max_length=30,
    num_return_sequences=5,
    truncation=True,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hello, I\'m a language model, so you can\'t define something in any other language. Let me introduce another topic:\n\nThe name "'},
 {'generated_text': "Hello, I'm a language model, you know.\n\nThat's right… I have a lot of friends who don't know what I do"},
 {'generated_text': "Hello, I'm a language model, not a formal one. I'm more interested in languages than formal models and I'm going to use the formal"},
 {'generated_text': "Hello, I'm a language model, so I'm going to write a function and you'll have access to the value. Well, I'm calling"},
 {'generated_text': "Hello, I'm a language model, and I'm a project manager. You know, we're in this together.\n\nNow, when we"}]

In [4]:
# Keep the Hugging Face model's state dict (parameter name -> tensor) around;
# later cells print its entries and read pretrained weights from it.
hfgpt2_state = generator.model.state_dict()

In [5]:
# List every entry of the HF state dict as "<name> <shape>", one per line.
for k, v in hfgpt2_state.items():
    print(f"{k} {v.shape}")

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [6]:
# Display the module tree of the model wrapped by the pipeline.
generator.model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [7]:
from dataclasses import dataclass
import torch
import torch.nn as nn

In [8]:
@dataclass
class GPT2Config:
  """Hyperparameters of the GPT-2 model being re-implemented in this notebook.

  Every field carries a type annotation so that `@dataclass` actually treats it
  as a field: without annotations the decorator generates an `__init__` that
  takes no arguments, so values could not be overridden per instance
  (e.g. `GPT2Config(n_layers=2)`) and would not show up in the repr.
  The defaults match the sizes printed for the Hugging Face 'gpt2' checkpoint.
  """
  n_layers: int = 12       # number of transformer blocks (12 x GPT2Block)
  n_embd: int = 768        # embedding / hidden width
  n_heads: int = 12        # attention heads per block (not visible in the printed module tree)
  block_size: int = 1024   # number of positions, i.e. rows of the positional embedding (wpe)
  vocab_size: int = 50257  # rows of the token embedding (wte)

In [9]:
# Positional-embedding table: 1024 positions, one 768-dimensional vector each
# (the same shape as transformer.wpe.weight in the HF state dict).
wpe = nn.Embedding(num_embeddings=1024, embedding_dim=768)

In [10]:
# Load the pretrained positional embeddings into the local table.
# Assigning to `.data` rebinds the parameter's tensor instead of copying values
# into the existing one, so wpe.weight ends up holding the state-dict tensor's data.
# NOTE(review): this presumably also moves wpe onto the HF model's device — confirm that is intended.
wpe.weight.data = hfgpt2_state['transformer.wpe.weight'].data

In [11]:
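# The attention layers of each block (c_attn and c_proj) are Conv1D modules, each with a weight and a bias.
# Conv1D stores its weight as (in_features, out_features), e.g. (768, 2304) for c_attn, whereas
# nn.Linear stores (out_features, in_features); to use nn.Linear, the loaded weight must be transposed.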

In [12]:
# Port block 0's fused query/key/value projection from HF's Conv1D to nn.Linear.
# Conv1D keeps its weight as (in_features, out_features) = (768, 2304), while
# nn.Linear expects (out_features, in_features), hence the transpose on the last line.
# The layer is created on the HF model's device instead of a hard-coded 'cuda',
# so this cell also runs when the pipeline did not select a CUDA device.
# NOTE: assigning `.data` rebinds the tensors without copying, so this layer
# shares its parameter data with the HF layer.
c_attn = nn.Linear(768, 768*3, bias=True, device=generator.model.device) # query, key, values weights in single matrix
c_attn.bias.data = generator.model.transformer.h[0].attn.c_attn.bias.data
c_attn.weight.data = generator.model.transformer.h[0].attn.c_attn.weight.data.T

In [13]:
# Random input for comparing the HF layer with the local port; the last dimension
# (768) matches the layers' input width (presumably laid out as batch, sequence, n_embd).
# It is placed on the HF model's device rather than a hard-coded 'cuda', so the
# comparison also works when the pipeline did not select a CUDA device.
x = torch.randn((4, 6, 768), device=generator.model.device)

In [14]:
# Reference output: run x through block 0's original HF c_attn (Conv1D) layer.
out_hf = generator.model.transformer.h[0].attn.c_attn(x)

In [15]:
# Output of the local nn.Linear port for the same input.
out_loc = c_attn(x)

In [16]:
# Check the weight port: the local nn.Linear must reproduce the HF Conv1D output.
# Compare with a small tolerance instead of bitwise `==`, because floating-point
# results may differ in the last bits between kernels or memory layouts even
# when the weights are identical. The result is still a 0-dim boolean tensor.
torch.isclose(out_hf, out_loc, rtol=1e-5, atol=1e-5).all()

tensor(True, device='cuda:0')

In [24]:
# Display the sub-modules of block 0's attention layer.
generator.model.transformer.h[0].attn

GPT2Attention(
  (c_attn): Conv1D(nf=2304, nx=768)
  (c_proj): Conv1D(nf=768, nx=768)
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

In [25]:
# Toy example of unpacking a fused qkv tensor: cut the last dimension (size 12)
# into consecutive pieces of width 4, giving three tensors q, k and v.
qkv = torch.randn(4, 8, 12)
q, k, v = qkv.split(4, dim=-1)

In [28]:
# Confirm the shape of one split piece: the last dimension is now 4 instead of 12.
v.shape

torch.Size([4, 8, 4])