In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [2]:
# Load the pretrained GPT-2 LM-head model and its tokenizer from one checkpoint name.
MODEL_NAME = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Reference

In [3]:
# Tokenize the prompt into a PyTorch tensor of token ids, with no special tokens added.
prompt_text = "a robot must obey the orders given"
encoded_prompt = tokenizer.encode(
    prompt_text,
    add_special_tokens=False,
    return_tensors="pt",
)
encoded_prompt

tensor([[   64,  9379,  1276, 22389,   262,  6266,  1813]])

In [4]:
# Reference forward pass through the full LM-head model (inference only, so no autograd).
with torch.no_grad():
    model_outputs = model(encoded_prompt)
# Index positionally instead of tuple-unpacking: this works both when the model
# returns a plain tuple and when it returns a ModelOutput mapping, whose
# iteration would yield field names rather than tensors.
prediction_scores, past = model_outputs[0], model_outputs[1]

In [5]:
# Logits at the last prompt position of the single batch item; show the highest-scoring token id.
next_token_logits = prediction_scores[0, -1]
next_token_logits.argmax()

tensor(284)

In [6]:
# Ten highest-scoring next tokens, with the tokenizer's 'Ġ' marker rendered as a plain space.
top_token_ids = next_token_logits.topk(10).indices
top_tokens = tokenizer.convert_ids_to_tokens(top_token_ids)
[tok.replace('Ġ', ' ') for tok in top_tokens]

[' to', ' by', ' it', ' him', ' them', ' in', '.', ',', ' the', ' and']

# Peel off LM head

In [7]:
# Inspect the class of the `transformer` submodule that the LM-head model wraps.
type(model.transformer)

transformers.modeling_gpt2.GPT2Model

In [8]:
# Run only the transformer body; element 0 of its output is the final hidden states.
# Inference only, so disable autograd tracking, matching the reference forward pass.
with torch.no_grad():
    last_hidden_states = model.transformer(encoded_prompt)[0]
last_hidden_states.shape

torch.Size([1, 7, 768])

In [9]:
# Apply the LM head on its own to the transformer's hidden states to get prediction scores.
prediction_scores = model.lm_head(last_hidden_states)

In [10]:
# Scores at the final sequence position of the only batch item, then the arg-max token id.
last_position_scores = prediction_scores[0][-1]
next_token_logits = last_position_scores
next_token_logits.argmax()

tensor(284, grad_fn=<NotImplemented>)

# Delve into Transformer

In [11]:
# Keep handles to the transformer body and its `wte` embedding module,
# then show the size of that module's weight matrix.
xformer = model.transformer
word_to_embedding = xformer.wte
word_to_embedding.weight.size()

torch.Size([50257, 768])

In [12]:
# Select the embedding-matrix row indexed by the prompt's first token id.
first_word_idx = encoded_prompt[0][0]
embedding_matrix = word_to_embedding.weight
first_word_embedding = embedding_matrix[first_word_idx]
first_word_embedding.size()

torch.Size([768])

## Here's the `forward` code, simplified

In [13]:
# Display the transformer's configuration object.
xformer.config

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "vocab_size": 50257
}

In [14]:
# Start from the encoded prompt; the steps below assume exactly one sequence in the batch.
input_ids = encoded_prompt
batch_size, seq_len = input_ids.shape
assert batch_size == 1

In [15]:
# `xformer.h` is the sequence of transformer blocks iterated over below; count them.
xformer_layers = xformer.h
len(xformer_layers)

12

In [16]:
# Cross-check: the configured layer count, for comparison with the block-list length.
xformer.config.n_layer

12

In [17]:
# Position ids 0..seq_len-1, created on the inputs' device, with a leading batch axis.
device = input_ids.device
position_ids = torch.arange(seq_len, dtype=torch.long, device=device)[None, :]
position_ids

tensor([[0, 1, 2, 3, 4, 5, 6]])

Embed the inputs

In [18]:
# Sum the token and position embeddings, pass the sum through the transformer's
# `drop` module, and record the resulting dimensions.
position_embeds = xformer.wpe(position_ids)
inputs_embeds = xformer.wte(input_ids)
hidden_states = xformer.drop(inputs_embeds + position_embeds)
batch_size, seq_len, n_hidden = hidden_states.size()
hidden_states.size()

torch.Size([1, 7, 768])

In [19]:
# Shape of the token embeddings alone, before the position embeddings are added.
inputs_embeds.shape

torch.Size([1, 7, 768])

In [20]:
# Shape of the raw token-id tensor, for comparison with the embedded shape.
input_ids.shape

torch.Size([1, 7])

In [21]:
# Target shape for the final hidden states: (batch, sequence, hidden).
# Take the batch dimension from the measured `batch_size` instead of a
# hard-coded 1, so the shape always tracks the actual input.
output_shape = (batch_size, seq_len, n_hidden)
output_shape

(1, 7, 768)

In [22]:
# Feed the hidden states through every transformer block in order. No cached
# past, masks, or attention maps are used; element 0 of each block's output is
# the updated hidden states. This is inference only, so autograd tracking is
# disabled to avoid building a graph through all the layers.
# NOTE: this cell overwrites `hidden_states`; re-run the embedding cell before
# re-running it.
with torch.no_grad():
    for block in xformer_layers:
        outputs = block(
            hidden_states,
            layer_past=None,
            attention_mask=None,
            head_mask=None,
            use_cache=False,
            output_attentions=False,
        )
        hidden_states = outputs[0]

In [23]:
# Apply the transformer's `ln_f` module to the output of the last block.
layer_normalizer = xformer.ln_f
hidden_states = layer_normalizer(hidden_states)

In [24]:
# View the hidden states with the precomputed (batch, sequence, hidden) shape.
hidden_states = hidden_states.view(output_shape)

In [25]:
# Confirm the shape after the view.
hidden_states.shape

torch.Size([1, 7, 768])

In [26]:
# Rename to match the variable used in the "Peel off LM head" section.
last_hidden_states = hidden_states
last_hidden_states.shape

torch.Size([1, 7, 768])

## Now pass that on to the LM head

In [27]:
# Apply the LM head to the hidden states produced by the manual transformer pass.
prediction_scores = model.lm_head(last_hidden_states)

In [28]:
# Take the scores at the last position of batch item 0 and report the arg-max token id.
next_token_logits = prediction_scores[0, -1]
next_token_logits.argmax()

tensor(284, grad_fn=<NotImplemented>)

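Yay, same output: the manual pass through the transformer predicts the same next-token id (284) as the reference forward pass.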

# Delve into a block

In [30]:
# Reset the inputs for this section; as before, only a single sequence is supported.
input_ids = encoded_prompt
batch_size, seq_len = input_ids.shape
assert batch_size == 1

In [31]:
# Rebuild the position ids (0..seq_len-1) on the inputs' device, then add a batch axis.
device = input_ids.device
position_range = torch.arange(seq_len, dtype=torch.long, device=device)
position_ids = position_range.unsqueeze(0)
position_ids

tensor([[0, 1, 2, 3, 4, 5, 6]])

Embed the inputs

In [32]:
# Recompute the starting hidden states: token embeddings plus position embeddings,
# passed through the transformer's `drop` module.
inputs_embeds = xformer.wte(input_ids)
position_embeds = xformer.wpe(position_ids)
summed_embeds = inputs_embeds + position_embeds
hidden_states = xformer.drop(summed_embeds)
batch_size, seq_len, n_hidden = hidden_states.size()
hidden_states.size()

torch.Size([1, 7, 768])

In [33]:
# Hand-unrolled version of each block's forward pass:
#   1. layer-norm (`ln_1`) -> attention, added back onto the block input;
#   2. layer-norm (`ln_2`) -> MLP, added back onto the result of step 1.
# Inference only, so autograd tracking is disabled to avoid building a graph
# through all the layers.
# NOTE: this cell overwrites `hidden_states`; re-run the embedding cell before
# re-running it.
with torch.no_grad():
    for block in xformer_layers:
        layer_input = hidden_states
        output_attn = block.attn(
            block.ln_1(layer_input),
            layer_past=None,
            attention_mask=None,
            head_mask=None,
            use_cache=False,
            output_attentions=False,
        )
        a = output_attn[0]  # output_attn: a, present, (attentions)

        # First residual connection: block input + attention output.
        x = layer_input + a
        m = block.mlp(block.ln_2(x))
        # Second residual connection: post-attention state + MLP output.
        hidden_states = x + m

In [34]:
# Apply the transformer's `ln_f` module to the output of the hand-unrolled blocks.
layer_normalizer = xformer.ln_f
hidden_states = layer_normalizer(hidden_states)

In [35]:
# View as (batch, sequence, hidden) using the dimensions measured in this
# section, rather than an `output_shape` left over from an earlier section.
hidden_states = hidden_states.view(batch_size, seq_len, n_hidden)

In [36]:
# Confirm the shape after the view.
hidden_states.shape

torch.Size([1, 7, 768])

In [37]:
# Rename to the variable that is fed to the LM head.
last_hidden_states = hidden_states
last_hidden_states.shape

torch.Size([1, 7, 768])

In [38]:
# Apply the LM head to the hidden states from the hand-unrolled block pass.
prediction_scores = model.lm_head(last_hidden_states)

In [39]:
# Final check: arg-max token id from the scores at the last position of batch item 0.
final_position_scores = prediction_scores[0][-1]
next_token_logits = final_position_scores
next_token_logits.argmax()

tensor(284, grad_fn=<NotImplemented>)