In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [2]:
# Load the pretrained "gpt2" checkpoint: the tokenizer turns text into token ids,
# and GPT2LMHeadModel bundles the transformer (model.transformer) with an LM head.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Explicit device selection is left disabled; later cells read the device from
# the input tensor instead.
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Reference

In [5]:
# Prompt whose next token we want the model to predict.
prompt_text = "a robot must obey the orders given"
# Encode to a (1, n_tokens) tensor of token ids, without BOS/EOS special tokens.
encoded_prompt = tokenizer.encode(
    prompt_text,
    return_tensors="pt",
    add_special_tokens=False,
)
encoded_prompt

tensor([[   64,  9379,  1276, 22389,   262,  6266,  1813]])

In [6]:
# Reference forward pass through the full model (transformer + LM head).
# no_grad: inference only, so skip building the autograd graph.
with torch.no_grad():
    model_outputs = model(encoded_prompt)
# Index by position instead of tuple-unpacking: this works both when the model
# returns a plain tuple and when it returns a dict-like ModelOutput (where
# unpacking would yield the field names rather than the tensors).
prediction_scores, past = model_outputs[0], model_outputs[1]

In [7]:
# Logits over the vocabulary for the position after the last prompt token
# (batch item 0, last sequence position).
next_token_logits = prediction_scores[0, -1, :]
# Id of the single most likely next token.
next_token_logits.argmax()

tensor(284)

In [8]:
# Ten highest-scoring candidate tokens, with the tokenizer's 'Ġ' marker shown as
# a plain leading space for readability.
top_ids = next_token_logits.topk(10).indices
top_tokens = tokenizer.convert_ids_to_tokens(top_ids)
[piece.replace('Ġ', ' ') for piece in top_tokens]

[' to', ' by', ' it', ' him', ' them', ' in', '.', ',', ' the', ' and']

# Peel off LM head

In [9]:
# model.transformer is the model body without the language-modeling head.
type(model.transformer)

transformers.modeling_gpt2.GPT2Model

In [10]:
# Run only the transformer body (no LM head); element 0 of its output is the
# final hidden state for every position.
# no_grad: inference only, consistent with the reference forward pass.
with torch.no_grad():
    last_hidden_states = model.transformer(encoded_prompt)[0]
last_hidden_states.shape

torch.Size([1, 7, 768])

In [11]:
# Apply the LM head to the hidden states from the bare transformer, so this cell
# actually exercises the "peeled" path instead of re-reading the reference logits.
with torch.no_grad():
    next_token_logits = model.lm_head(last_hidden_states)[0, -1, :]
# The peeled path must reproduce the logits from the full model call.
assert torch.allclose(next_token_logits, prediction_scores[0, -1, :], atol=1e-3)
next_token_logits.argmax()

tensor(284)

# Delve into Transformer

In [12]:
# Short alias for the transformer body; used throughout the rest of the notebook.
xformer = model.transformer
# wte is the token-embedding table; its weight has one row per vocabulary entry.
word_to_embedding = xformer.wte
word_to_embedding.weight.shape

torch.Size([50257, 768])

In [13]:
# Look up the embedding row for the first prompt token directly in the table.
first_word_idx = encoded_prompt[0][0]
first_word_embedding = word_to_embedding.weight[first_word_idx]
first_word_embedding.size()

torch.Size([768])

## Here's the `forward` code, simplified

In [14]:
# Display the hyperparameters (layer count, hidden size, dropout rates, ...).
xformer.config

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}

In [15]:
# The walkthrough below handles a single sequence only.
input_ids = encoded_prompt
batch_size, seq_len = input_ids.shape
assert batch_size == 1

In [16]:
# xformer.h holds the stack of transformer blocks that the forward pass iterates over.
xformer_layers = xformer.h
len(xformer_layers)

12

In [17]:
# Cross-check: the configured layer count matches the number of blocks above.
xformer.config.n_layer

12

In [18]:
# Position indices 0..seq_len-1 on the same device as the inputs, with a leading
# batch axis so they line up with input_ids.
device = input_ids.device
position_ids = torch.arange(0, seq_len, dtype=torch.long, device=device).unsqueeze(0)
position_ids

tensor([[0, 1, 2, 3, 4, 5, 6]])

Embed the inputs

In [19]:
# Token embeddings plus position embeddings form the input to the first block;
# the embedding dropout layer is applied to the sum.
inputs_embeds, position_embeds = xformer.wte(input_ids), xformer.wpe(position_ids)
hidden_states = xformer.drop(inputs_embeds + position_embeds)
batch_size, seq_len, n_hidden = hidden_states.size()
hidden_states.shape

torch.Size([1, 7, 768])

In [20]:
# For comparison with hidden_states: the ids have no hidden dimension.
input_ids.shape

torch.Size([1, 7])

In [21]:
# Target shape for the final hidden states. Uses batch_size rather than a literal 1
# so the shape follows the inputs (batch_size is asserted to be 1 above, so the
# value is unchanged).
output_shape = (batch_size, seq_len, n_hidden)
output_shape

(1, 7, 768)

In [22]:
# Feed the hidden states through every transformer block in order; each block's
# first output is the hidden state passed to the next block.
# Iterate over the blocks directly: the cached `past` from the reference call was
# zipped in but never used (layer_past is always None here), which made this cell
# depend on an unrelated earlier cell.
# no_grad: inference only, so skip building the autograd graph.
with torch.no_grad():
    for block in xformer_layers:
        outputs = block(
            hidden_states,
            layer_past=None,
            attention_mask=None,
            head_mask=None,
            use_cache=False,
            output_attentions=False,
        )
        hidden_states = outputs[0]

In [23]:
# ln_f is applied once to the output of the last block.
layer_normalizer = xformer.ln_f
hidden_states = layer_normalizer(hidden_states)

In [24]:
# Reshape to the (batch, seq_len, n_hidden) output shape computed earlier.
hidden_states = hidden_states.view(*output_shape)

In [25]:
# Sanity check: same shape as the output of model.transformer.
hidden_states.shape

torch.Size([1, 7, 768])

In [26]:
# The manually computed hidden states now stand in for the transformer's output.
last_hidden_states = hidden_states
last_hidden_states.shape

torch.Size([1, 7, 768])

In [27]:
# Project the manually computed hidden states through the LM head, so this cell
# checks the hand-written forward pass instead of re-reading the reference logits.
with torch.no_grad():
    next_token_logits = model.lm_head(last_hidden_states)[0, -1, :]
# The hand-written forward pass must reproduce the logits from the full model call.
assert torch.allclose(next_token_logits, prediction_scores[0, -1, :], atol=1e-3)
next_token_logits.argmax()

tensor(284)

# Delve into a block

In [28]:
# Start again from the raw token ids for the block-level walkthrough
# (single sequence only).
input_ids = encoded_prompt
batch_size, seq_len = input_ids.shape
assert batch_size == 1

In [29]:
# Position indices 0..seq_len-1 on the inputs' device, then a leading batch axis.
device = input_ids.device
position_range = torch.arange(0, seq_len, dtype=torch.long, device=device)
position_ids = position_range.unsqueeze(0)
position_ids

tensor([[0, 1, 2, 3, 4, 5, 6]])

Embed the inputs

In [30]:
# Rebuild the first block's input: token embeddings plus position embeddings,
# passed through the embedding dropout layer.
inputs_embeds, position_embeds = xformer.wte(input_ids), xformer.wpe(position_ids)
hidden_states = xformer.drop(inputs_embeds + position_embeds)
batch_size, seq_len, n_hidden = hidden_states.size()
hidden_states.shape

torch.Size([1, 7, 768])

In [31]:
# Spell out each block's forward pass by hand: LayerNorm -> attention with a
# residual connection, then LayerNorm -> MLP with a second residual connection.
# Iterate over the blocks directly: the cached `past` from the reference call was
# zipped in but never used (layer_past is always None here).
# no_grad: inference only, so skip building the autograd graph.
with torch.no_grad():
    for block in xformer_layers:
        layer_input = hidden_states
        output_attn = block.attn(
            block.ln_1(layer_input),
            layer_past=None,
            attention_mask=None,
            head_mask=None,
            use_cache=False,
            output_attentions=False,
        )
        a = output_attn[0]  # output_attn: a, present, (attentions)

        # First residual connection: add the attention output to the block input.
        x = layer_input + a
        # Second residual connection: add the MLP output.
        m = block.mlp(block.ln_2(x))
        hidden_states = x + m

In [32]:
# ln_f is applied once to the output of the last block.
layer_normalizer = xformer.ln_f
hidden_states = layer_normalizer(hidden_states)

In [33]:
# Reshape to the (batch, seq_len, n_hidden) output shape computed earlier.
hidden_states = hidden_states.view(*output_shape)

In [34]:
# Sanity check: same shape as the output of model.transformer.
hidden_states.shape

torch.Size([1, 7, 768])

In [35]:
# The hidden states from the unrolled blocks now stand in for the transformer's output.
last_hidden_states = hidden_states
last_hidden_states.shape

torch.Size([1, 7, 768])

In [36]:
# Project the hidden states from the unrolled blocks through the LM head, so this
# cell checks the hand-written block logic instead of re-reading the reference logits.
with torch.no_grad():
    next_token_logits = model.lm_head(last_hidden_states)[0, -1, :]
# The unrolled blocks must reproduce the logits from the full model call.
assert torch.allclose(next_token_logits, prediction_scores[0, -1, :], atol=1e-3)
next_token_logits.argmax()

tensor(284)

In [37]:
block.attn??

[0;31mSignature:[0m       [0mblock[0m[0;34m.[0m[0mattn[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m            Attention
[0;31mString form:[0m    
Attention(
  (c_attn): Conv1D()
  (c_proj): Conv1D()
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)
[0;31mFile:[0m            ~/code/transformers/src/transformers/modeling_gpt2.py
[0;31mSource:[0m         
[0;32mclass[0m [0mAttention[0m[0;34m([0m[0mnn[0m[0;34m.[0m[0mModule[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;32mdef[0m [0m__init__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mnx[0m[0;34m,[0m [0mn_ctx[0m[0;34m,[0m [0mconfig[0m[0;34m,[0m [0mscale[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0msuper[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0m__init__[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34