# GPT-2 Model

In [1]:
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    AutoModelForCausalLM,
)
import torch
import torch.nn.functional as F
import pandas as pd

|model        |参数量       |hidden dim        |  block 数量 |
| ----------- |----------- | ---------------- | -----------|
|gpt2         |124M        |    768  (64*12) |          12|
|gpt2-medium  |355M        |    1024 (64*16) |          24|
|gpt2-large   |774M        |    1280 (64*20) |          36|
|gpt2-xl      |1.56B       |    1600 (64*25) |          48|

In [2]:
# Checkpoint to load; the table above lists the other GPT-2 sizes.
model_name = "gpt2"

gpt2_config = AutoConfig.from_pretrained(model_name)
gpt2_tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModel resolves to the bare GPT2Model backbone, while AutoModelForCausalLM
# resolves to GPT2LMHeadModel (backbone + lm_head); see the module printouts below.
gpt2_model = AutoModel.from_pretrained(model_name)
gpt2_clm_model = AutoModelForCausalLM.from_pretrained(model_name)

In [3]:
# Display the configuration loaded for the checkpoint.
gpt2_config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [4]:
# Display the module tree of the backbone (no language-model head).
gpt2_model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [5]:
# Display the module tree of the causal-LM model (backbone plus lm_head).
gpt2_clm_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

![](../images/gpt2.drawio.svg)

# Tokenizer

In [6]:
# Report the vocabulary size and the special tokens the tokenizer defines.
print(f"gpt2 tokenizer vocab size: {len(gpt2_tokenizer.vocab)}")
print(f"gpt2 tokenizer speical token map: {gpt2_tokenizer.special_tokens_map}")

gpt2 tokenizer vocab size: 50257
gpt2 tokenizer speical token map: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


In [7]:
# Configure how the tokenizer pads batched inputs.
# Pad on the left so the last position of every row is a real token.
gpt2_tokenizer.padding_side = "left"
# The special-token map printed above has no pad token, so reuse EOS for padding.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

In [8]:
# Tokenize a batch of two texts of different lengths; padding=True pads the
# shorter one so both rows have the same length.
input_texts = ["long long ago, I was a boy, I love", "gpt2.drawio"]
tokenizer_out = gpt2_tokenizer(input_texts, padding=True, return_tensors="pt")
print(tokenizer_out)

{'input_ids': tensor([[ 6511,   890,  2084,    11,   314,   373,   257,  2933,    11,   314,
          1842],
        [50256, 50256, 50256, 50256, 50256,    70,   457,    17,    13, 19334,
           952]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])}


In [9]:
# Show each of the two rows twice: as raw BPE tokens and as decoded text.
for row_idx in (0, 1):
    row_ids = tokenizer_out.input_ids[row_idx].tolist()
    print(gpt2_tokenizer.convert_ids_to_tokens(row_ids))
    print(gpt2_tokenizer.decode(row_ids))

['long', 'Ġlong', 'Ġago', ',', 'ĠI', 'Ġwas', 'Ġa', 'Ġboy', ',', 'ĠI', 'Ġlove']
long long ago, I was a boy, I love
['<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>', 'g', 'pt', '2', '.', 'draw', 'io']
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>gpt2.drawio


上面以"Ġ"开头的token，表示该token前面有一个空格：GPT-2的byte-level BPE会把空格字节映射成可见字符"Ġ"。

# Model Forward

## 整体的输出

直接对GPT2模型进行forward，输出有两个：

* last_hidden_state
* past_key_values

其中`past_key_values`中保存的是每一个GPT2Block（共 `n_layer` 个）中的Key和Value，而`last_hidden_state`则是一个 $N\times T \times d$ 的输出Tensor（$N$ 为batch size，$T$ 为序列长度，$d$ 为hidden dim）。

In [10]:
# Run the backbone once on the padded batch.
gpt2_model_output = gpt2_model(**tokenizer_out)
last_hidden_state, past_key_values = (
    gpt2_model_output.last_hidden_state,
    gpt2_model_output.past_key_values,
)

# Summarise the cache layout as (entries, tensors per entry, shape of one tensor).
first_layer_cache = past_key_values[0]
cache_summary = (
    f"({len(past_key_values)}, {len(first_layer_cache)}, {first_layer_cache[0].shape})"
)
print(f"last_hidden_state: {last_hidden_state.shape}")
print(f"past_key_values: {cache_summary}")

last_hidden_state: torch.Size([2, 11, 768])
past_key_values: (12, 2, torch.Size([2, 12, 11, 64]))


## Embedding

In [11]:
batch_input_ids = tokenizer_out["input_ids"]
batch_size, seq_len = batch_input_ids.shape

# Token embeddings: look up one vector per token id.
token_embeds = gpt2_model.wte(batch_input_ids)

# Position embeddings: positions 0..seq_len-1, identical for every row of the batch.
position_ids = torch.arange(seq_len)[None, :].repeat(batch_size, 1)
position_embeds = gpt2_model.wpe(position_ids)

# Sum both embeddings and pass the result through the embedding dropout module.
embeds_out = gpt2_model.drop(token_embeds + position_embeds)

## GPT2Block

In [12]:
# Start without any cached keys/values: one `None` per transformer block.
# NOTE: this rebinds `past_key_values`, replacing the cache returned by the
# forward pass earlier in the notebook.
past_key_values = tuple([None] * gpt2_config.n_layer)
head_mask = gpt2_model.get_head_mask(None, gpt2_config.n_layer)

hidden_states = embeds_out

# Turn the (batch, seq_len) 0/1 padding mask into an additive mask:
#   * insert two singleton axes -> (batch, 1, 1, seq_len); presumably these
#     broadcast over heads and query positions inside GPT2Attention,
#   * real tokens become 0 and padded positions become the most negative float,
#     so padded keys get (almost) zero weight after the softmax.
attent_mask = tokenizer_out["attention_mask"]
attent_mask = attent_mask[:, None, None, :]
attent_mask = attent_mask.to(hidden_states.dtype)
attent_mask = (1 - attent_mask) * torch.finfo(attent_mask.dtype).min

# Feed the embeddings through every block in order; element 0 of a block's
# output is the updated hidden state.
for i, (block, layer_past) in enumerate(zip(gpt2_model.h, past_key_values)):
    block_out = block(
        hidden_states,
        layer_past=layer_past,
        attention_mask=attent_mask,
        head_mask=head_mask[i],
    )
    hidden_states = block_out[0]

# Final layer norm of the backbone.
hidden_states = gpt2_model.ln_f(hidden_states)

# The manual pass must reproduce the model's own forward output. Compare with a
# numerical tolerance: the previous check counted exactly-unequal elements and
# compared that integer count with 1e-6, i.e. it demanded bitwise equality.
assert torch.allclose(
    hidden_states, last_hidden_state, atol=1e-5
), "manual block-by-block forward does not match gpt2_model's last_hidden_state"

# LM Head

LM Head 对于backbone抽取到的hidden_states进行一次Linear变换，输出维度为词表的大小。

In [13]:
# Project the backbone's hidden states onto the vocabulary.
lm_head_logits = gpt2_clm_model.lm_head(hidden_states)
lm_predicts = lm_head_logits.argmax(dim=-1)

# Next-token distribution: softmax over the logits of the last position,
# then rank all token ids from most to least probable.
last_token_output_prob = lm_head_logits[:, -1, :].softmax(dim=-1)
sorted_ids = last_token_output_prob.argsort(dim=-1, descending=True)

# For every sentence in the batch, list the 5 most likely next tokens
# together with their probabilities.
lm_outputs = []
for row_idx, (ranked_ids, row_probs) in enumerate(
    zip(sorted_ids, last_token_output_prob)
):
    lm_output = {"input": gpt2_tokenizer.decode(tokenizer_out["input_ids"][row_idx])}
    for rank, token_id in enumerate(ranked_ids[:5]):
        prob = row_probs[token_id]
        lm_output[f"top {rank}"] = f"{gpt2_tokenizer.decode(token_id)}({100*prob:.2f}%)"
    lm_outputs.append(lm_output)
pd.DataFrame(lm_outputs)

Unnamed: 0,input,top 0,top 1,top 2,top 3,top 4
0,"long long ago, I was a boy, I love",you(20.24%),my(10.99%),the(4.46%),to(3.41%),it(3.24%)
1,<|endoftext|><|endoftext|><|endoftext|><|endof...,pt(90.58%),ip(0.86%),\n(0.65%),",(0.48%)",ign(0.47%)


# Generate

HuggingFace上的文章：[How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate)

In [14]:
# Greedy decoding for a single sentence; batched input is not supported.
def gpt2_causal_lm_decode(model, tokenizer, text, nsteps=5):
    """Greedily extend `text` by `nsteps` tokens.

    At every step the whole sequence is fed through the model again and the
    highest-scoring token at the last position is appended (no sampling and
    no key/value cache).

    Args:
        model: callable accepting `input_ids=` and returning an object whose
            `.logits` has shape (batch, seq_len, vocab_size).
        tokenizer: callable returning a mapping with "input_ids" when invoked
            as `tokenizer(text, return_tensors="pt")`; must also provide
            `decode`.
        text: the prompt (a single sentence).
        nsteps: number of tokens to append.

    Returns:
        The decoded string of the first row: prompt plus generated tokens.
    """
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]
    # Pure inference: disable autograd so no graph is built and kept alive
    # for each of the repeated forward passes.
    with torch.no_grad():
        for _ in range(nsteps):
            next_token_logits = model(input_ids=input_ids).logits[:, -1, :]
            # keepdim keeps the (batch, 1) shape needed for concatenation.
            next_token_id = next_token_logits.argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token_id], dim=-1)
    return tokenizer.decode(input_ids[0])

In [15]:
# Greedily extend the prompt by 10 tokens with the hand-written decoder.
gpt2_causal_lm_decode(gpt2_clm_model, gpt2_tokenizer, "long long ago, I was", 10)

'long long ago, I was a young man who had been raised in a family'

In [16]:
def gpt2_causal_lm_decode_ref(model, tokenizer, text, nsteps=5):
    """Reference greedy decoding that delegates to `model.generate`.

    Args:
        model: object exposing `generate(**inputs, max_new_tokens, do_sample)`.
        tokenizer: callable used as `tokenizer(text, padding=True,
            return_tensors="pt")`; must also provide `decode`.
        text: the prompt(s) handed to the tokenizer.
        nsteps: number of new tokens to generate.

    Returns:
        A list with one decoded string per generated sequence.
    """
    encoded = tokenizer(text, padding=True, return_tensors="pt")
    # do_sample=False requests deterministic decoding instead of sampling.
    generated = model.generate(**encoded, max_new_tokens=nsteps, do_sample=False)
    decoded = []
    for sequence in generated:
        decoded.append(tokenizer.decode(sequence))
    return decoded


# Same prompt and step count as the hand-written greedy decoder, so the two
# results can be compared side by side.
texts = ["long long ago, I was"]
gpt2_causal_lm_decode_ref(gpt2_clm_model, gpt2_tokenizer, texts, 10)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['long long ago, I was a young man who had been raised in a family']

# Beam Search

In [17]:
def beam_search(model, input_ids, batch_size, beam_size, past_probs=None):
    """
    Run one beam-search step.

    Every input sequence is decoded once and expanded with its `beam_size`
    most probable next tokens. The cumulative probability of every candidate
    is computed, and for each sample only the `beam_size` best candidates are
    kept, ordered from most to least probable.

    input_ids: (N, seq_len) torch.Tensor
        Token ids decoded so far. On the first call these are the original
        context tokens and N = batch_size; on later calls
        N = batch_size * beam_size, with the beams of one sample stored
        contiguously.
    batch_size: int
        Number of independent samples.
    beam_size: int
        Number of beams kept per sample.
    past_probs: (batch_size * beam_size,) torch.Tensor or None
        Cumulative probability of every sequence in `input_ids`;
        None on the first call.

    Returns (output_ids, past_probs):
        output_ids: (batch_size * beam_size, seq_len + 1) surviving sequences.
        past_probs: (batch_size * beam_size,) their cumulative probabilities.

    Note: raw probabilities are multiplied, so the scores shrink towards zero
    as the number of steps grows.
    """
    # Pure inference: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits[:, -1, :]
    probs = F.softmax(logits, -1)
    # topk returns the k largest entries in descending order, which avoids
    # sorting the whole vocabulary.
    beam_top_probs, beam_top_ids = torch.topk(probs, beam_size, dim=-1)

    # Candidate sequences: each input row repeated beam_size times, each copy
    # extended by one of that row's top tokens.
    output_ids = torch.cat(
        [input_ids.repeat_interleave(beam_size, 0), beam_top_ids.reshape(-1, 1)], dim=-1
    )

    if past_probs is None:
        past_probs = beam_top_probs.reshape(-1)
    else:
        past_probs = past_probs.repeat_interleave(beam_size, 0)
        past_probs = past_probs * beam_top_probs.reshape(-1)

    # Rank the candidates of each sample separately.
    candidate_probs = past_probs.reshape(batch_size, -1)
    num_candidates = candidate_probs.size(1)
    past_probs, resorted_ids = torch.topk(candidate_probs, beam_size, dim=-1)

    # `resorted_ids` are positions *within* a sample's candidate list, while
    # `output_ids` is flat across samples. Shift every row by the start of its
    # sample; without this, every sample would pick sample 0's candidates.
    row_offsets = (
        torch.arange(batch_size, device=resorted_ids.device).unsqueeze(1)
        * num_candidates
    )
    output_ids = output_ids[(resorted_ids + row_offsets).reshape(-1)]
    return output_ids, past_probs.reshape(-1)


# Beam-search decoding for a single sentence; batched input is not supported.
def gpt2_causal_lm_decode_with_beam_search(model, tokenizer, text, nsteps=5, beam=3):
    """Extend `text` by `nsteps` tokens using `beam_search` with `beam` beams.

    Args:
        model: causal LM handed through to `beam_search`.
        tokenizer: callable returning a mapping with "input_ids" when invoked
            as `tokenizer(text, return_tensors="pt")`; must also provide
            `decode`.
        text: the prompt (a single sentence).
        nsteps: number of beam-search steps, i.e. tokens appended.
        beam: number of beams kept per step.

    Returns:
        The decoded string of row 0 of the final beams.
    """
    beams = tokenizer(text, return_tensors="pt")["input_ids"]
    num_sentences = beams.size(0)
    # No cumulative probabilities exist before the first step.
    beam_probs = None
    for _ in range(nsteps):
        step_result = beam_search(model, beams, num_sentences, beam, beam_probs)
        beams, beam_probs = step_result
    best_beam = beams[0]
    return tokenizer.decode(best_beam)

In [18]:
# Decode 10 new tokens with the hand-written beam search (default beam width of 3).
gpt2_causal_lm_decode_with_beam_search(
    gpt2_clm_model, gpt2_tokenizer, "long long ago, I was", 10
)

'long long ago, I was in the middle of a fight with a man who'

In [19]:
def gpt2_causal_lm_decode_with_beam_search_reef(
    model, tokenizer, text, nsteps=5, beam=3
):
    """Reference beam-search decoding that delegates to `model.generate`.

    Args:
        model: object exposing `generate(**inputs, max_new_tokens, do_sample,
            num_beams)`.
        tokenizer: callable used as `tokenizer(text, padding=True,
            return_tensors="pt")`; must also provide `decode`.
        text: the prompt(s) handed to the tokenizer.
        nsteps: number of new tokens to generate.
        beam: beam width forwarded as `num_beams`. Defaults to 3, the value
            that used to be hard-coded here and the default of the
            hand-written beam-search decoder.

    Returns:
        A list with one decoded string per generated sequence.
    """
    encoded = tokenizer(text, padding=True, return_tensors="pt")
    outputs = model.generate(
        **encoded, max_new_tokens=nsteps, do_sample=False, num_beams=beam
    )
    return [tokenizer.decode(output) for output in outputs]


# Same prompt and step count as the hand-written beam search, so the two
# results can be compared side by side.
texts = ["long long ago, I was"]
gpt2_causal_lm_decode_with_beam_search_reef(gpt2_clm_model, gpt2_tokenizer, texts, 10)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['long long ago, I was in the middle of a fight with a man who']

# Decode Sampling

GPT模型的`generate`接口，支持以下的参数，可以用于控制生成样本的多样性

* do_sample: 表示是否进行随机性采样输出
* temperature: 用于调整logits经过softmax后的概率分布，温度越高，则分布越平均，输出多样性越强，容易乱说；温度越低，分布越聚焦，输出更稳定。
    $$q_i = \frac{\exp(z_i / T)}{\sum_j\exp(z_j / T)}$$
* top_k: 控制采样时，只在topK的范围内进行采样
* top_p: 控制采样时，只在按概率从高到低排序后、累计概率刚好达到top_p的最小token集合内进行采样

In [20]:
logits = torch.tensor([1, 2, 4], dtype=torch.float)
# T = 1 leaves the softmax untouched; T = 3 flattens the distribution.
for temperature in (1, 3):
    print(F.softmax(logits / temperature, -1))

tensor([0.0420, 0.1142, 0.8438])
tensor([0.1955, 0.2729, 0.5315])


![](../images/gpt_do_sample.png)