# 1 set model args

In [176]:
from rich.jupyter import print

In [177]:
# Transformer hyperparameters for the GPT model built further down.
n_layer, n_head, n_embd = 12, 12, 768
dropout = 0.1
bias = False

# Keyword arguments that are later unpacked into GPTConfig. vocab_size is a
# placeholder here; it is filled in once the dataset vocabulary size is known.
model_args = {
    'n_layer': n_layer,
    'n_head': n_head,
    'n_embd': n_embd,
    'block_size': 1024,
    'bias': bias,
    'vocab_size': None,
    'dropout': dropout,
}

In [178]:
# Set vocab_size to the vocabulary size of the text corpus, read from the
# dataset's meta.pkl when that file exists.
import os
import pickle

# Dataset directory, relative to the notebook's working directory.
data_dir = os.path.join('data', 'shakespeare_char')

meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None  # stays None when meta.pkl is absent
if os.path.exists(meta_path):
    # NOTE(review): pickle.load can execute arbitrary code while deserializing;
    # only load a meta.pkl that comes from a trusted source.
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

In [179]:
# Use the vocabulary size found in meta.pkl; fall back to 50304 when none was found.
# NOTE(review): 50304 is presumably the GPT-2 vocabulary (50257) padded up to a
# multiple of 64 — confirm against the training script.
if meta_vocab_size is None:
    model_args['vocab_size'] = 50304
else:
    model_args['vocab_size'] = meta_vocab_size

In [180]:
# Build the model configuration from the collected keyword arguments and
# display it as the cell output.
from model import GPTConfig
gptconf = GPTConfig(**model_args)
gptconf

GPTConfig(block_size=1024, vocab_size=65, n_layer=12, n_head=12, n_embd=768, dropout=0.1, bias=False)

# 2 load model

In [181]:
# Instantiate the GPT model from the configuration and place it on the CPU.
# NOTE(review): no .eval() call is visible in this notebook, so the model is
# presumably in training mode and its dropout layers are active in the
# forward passes below — confirm.
from model import GPT
model = GPT(gptconf).to('cpu')

number of parameters: 85.00M


In [182]:
# Overall forward path: token embedding -> position embedding -> dropout -> blocks -> layernorm -> lm_head
# Each block: layernorm -> attention -> layernorm -> mlp
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(65, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=65, bias=False)
)

# 3 load data

In [183]:
# Load one batch of input token ids (X) and target token ids (Y) saved as tensors.
import torch

# map_location='cpu' remaps the storages while deserializing, so tensors that
# were saved from a GPU still load on a machine without CUDA. Loading first and
# calling .to('cpu') afterwards would fail there before .to() is ever reached.
X = torch.load('data/X.tensor', map_location='cpu')
Y = torch.load('data/Y.tensor', map_location='cpu')
X.shape, Y.shape

(torch.Size([16, 256]), torch.Size([16, 256]))

# 4 GPT forward

In [184]:
model.transformer.wpe, model.transformer.wte # position table covers 1024 positions; token table covers a vocabulary of 65

(Embedding(1024, 768), Embedding(65, 768))

## 4.1 embedding input

In [185]:
# 1 token embedding: look up one n_embd-sized vector for every token id in X
print('-----embding-input-------')
print('词嵌入向量维n_embd = ', n_embd)
tok_emb = model.transformer.wte(X)
print('tok_emb:', X.shape, '->' ,tok_emb.shape)

# 2 position embedding: one vector per position index 0..t-1; it is broadcast
# over the batch dimension when added to tok_emb
b, t = X.size()
# create the indices on the same device as X so the lookup also works off-CPU
pos = torch.arange(0, t, dtype=torch.long, device=X.device)
pos_emb = model.transformer.wpe(pos)
print('pos_emb:', pos.shape,'->',pos_emb.shape)

print('tok_emb+pos_emb:', (tok_emb + pos_emb).shape)

# 3 dropout
# Apply dropout exactly once. While dropout is active, every call draws a new
# random mask, so calling it a second time for x_enc would produce a tensor
# that differs from the x actually fed into the decoder stack.
x = model.transformer.drop(tok_emb + pos_emb)
# Keep a handle on the encoded input for the single-block walk-through later on.
# The cells below rebind x (x = block(x)) rather than writing into it.
# NOTE(review): assumes the blocks do not modify their input in place — confirm in model.py.
x_enc = x
print('编码后embding input:', x.shape)

## 4.2 Decoder

In [186]:
# Show the list of transformer blocks that make up the decoder stack.
model.transformer.h

ModuleList(
  (0-11): 12 x Block(
    (ln_1): LayerNorm()
    (attn): CausalSelfAttention(
      (c_attn): Linear(in_features=768, out_features=2304, bias=False)
      (c_proj): Linear(in_features=768, out_features=768, bias=False)
      (attn_dropout): Dropout(p=0.1, inplace=False)
      (resid_dropout): Dropout(p=0.1, inplace=False)
    )
    (ln_2): LayerNorm()
    (mlp): MLP(
      (c_fc): Linear(in_features=768, out_features=3072, bias=False)
      (gelu): GELU(approximate='none')
      (c_proj): Linear(in_features=3072, out_features=768, bias=False)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

In [187]:
# Run the embedded input through every block in order; each block's output
# becomes the next block's input.
for block in model.transformer.h:
    x = block(x)
print('decoder x:', x.shape)

## 4.3 LayerNorm

In [188]:
# Show the final layer norm module of the transformer.
model.transformer.ln_f

LayerNorm()

In [189]:
# Apply the final layer norm to the output of the block stack.
x = model.transformer.ln_f(x)
print('ln x:', x.shape)

## 4.4 lm head

In [190]:
# Project every position's hidden vector onto the vocabulary: the last
# dimension of logits has one score per vocabulary entry.
logits = model.lm_head(x)
print('lm head:', logits.shape)
print('lm_head输出与解码词汇量相同, meta_vocab_size=',meta_vocab_size )

## 4.5 loss

In [191]:
from torch.nn import functional as F

# Flatten batch and time so that every position is one classification example:
# logits 16 * 256 * 65 -> (16*256) * 65, Y 16 * 256 -> (16*256)
flat_logits = logits.view(-1, logits.size(-1))
flat_targets = Y.view(-1)
loss = F.cross_entropy(flat_logits, flat_targets)
print(loss)

In [192]:
# Logits of the first flattened position: one score per vocabulary entry.
logits.view(-1, logits.size(-1))[0]

tensor([-0.4151,  1.3813, -0.1995, -0.3672, -0.6645,  0.3567, -0.2381,  0.9048,
         0.3088,  0.7354,  0.4698, -0.0812, -0.4575, -0.7028, -0.0207, -0.4111,
        -1.3014,  1.1335, -0.4810,  0.0814, -0.4145, -0.5997, -0.1035, -0.8261,
        -0.2869, -0.2561, -1.5851,  0.1876,  0.2345,  0.2050, -1.2864, -0.3437,
         0.5043, -0.0334,  0.1202, -0.5901, -0.2584, -0.7200, -0.5779, -0.5142,
        -0.8425, -0.1300,  0.1785,  0.5986,  0.1964,  0.4896, -0.0851, -0.6266,
         0.7363, -0.3790,  0.8884,  0.0170,  0.0494,  0.2876,  0.0820,  0.5260,
        -0.8593, -0.1791, -1.2604,  0.1468, -0.4441, -0.7570, -0.2177, -0.9067,
         0.4340], grad_fn=<SelectBackward0>)

In [193]:
# Target token id of the first flattened position.
Y.view(-1)[0]

tensor(40)

In [194]:
logits.view(-1, logits.size(-1))[0][40] # we want this class to get the highest score so that the loss goes down

tensor(-0.8425, grad_fn=<SelectBackward0>)

## 4.2.1 decoder block

In [195]:
# Shape of the embedded input that the single-block walk-through starts from.
x_enc.shape

torch.Size([16, 256, 768])

In [196]:
# Show the first transformer block and its sub-modules.
model.transformer.h[0]

Block(
  (ln_1): LayerNorm()
  (attn): CausalSelfAttention(
    (c_attn): Linear(in_features=768, out_features=2304, bias=False)
    (c_proj): Linear(in_features=768, out_features=768, bias=False)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm()
  (mlp): MLP(
    (c_fc): Linear(in_features=768, out_features=3072, bias=False)
    (gelu): GELU(approximate='none')
    (c_proj): Linear(in_features=3072, out_features=768, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [197]:
# Manual walk-through of the first block, applied to the embedded input x_enc.
# NOTE(review): assumes Block.forward is the pre-norm residual form
#     x = x + attn(ln_1(x)); x = x + mlp(ln_2(x))
# — confirm against model.py.
x_ln_1 = model.transformer.h[0].ln_1(x_enc)
x_attn = model.transformer.h[0].attn(x_ln_1)
# First residual connection: add the attention output back onto the block
# input, then normalize that sum.
x_res = x_attn + x_enc
x_ln_2 = model.transformer.h[0].ln_2(x_res)
x_mlp = model.transformer.h[0].mlp(x_ln_2)
# Second residual connection: the MLP output is added to the post-attention
# residual stream, which still carries x_enc. Adding it to the bare attention
# output x_attn would drop the block input from the result.
x = x_mlp + x_res
x.shape

torch.Size([16, 256, 768])

![123](img/addNorm.png)

## 4.2.2 masked attention - scaled_dot_product_attention

In [198]:
# Show the first transformer block again; its attn sub-module is taken apart below.
model.transformer.h[0]

Block(
  (ln_1): LayerNorm()
  (attn): CausalSelfAttention(
    (c_attn): Linear(in_features=768, out_features=2304, bias=False)
    (c_proj): Linear(in_features=768, out_features=768, bias=False)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm()
  (mlp): MLP(
    (c_fc): Linear(in_features=768, out_features=3072, bias=False)
    (gelu): GELU(approximate='none')
    (c_proj): Linear(in_features=3072, out_features=768, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [199]:
# Keep a handle on the first block's attention module for the step-by-step cells.
attn = model.transformer.h[0].attn
print(attn)

In [200]:
# Step 1: project the normalized embeddings to 3 * n_embd features with c_attn.
# Note: this rebinds the notebook-global x to the fused projection output.
print('---------------1. 将嵌入向量传播成3*n_embd--------------')
x = attn.c_attn(x_ln_1)

print("n_embed:",attn.n_embd)
print("n_embed*3:",attn.n_embd * 3)
print("x:", x.shape)

In [201]:
print('---------------2. 将3*n_embd split成QKV--------------')
# Step 2: cut the fused projection along the feature axis into n_embd-wide
# chunks, which are unpacked as query, key and value.
q, k, v = torch.split(x, attn.n_embd, dim=2)
print("split: q:", q.shape)

In [202]:
print('---------------3. 将QKV拆分 多头QKV--------------')
# Step 3: split the feature dimension of q, k and v into n_head heads.
B, T, D = x_ln_1.shape
head_size = D // attn.n_head
print("n_embed:{} / n_head:{} = {} ".format(D, attn.n_head, head_size))
# (B, T, D) -> (B, T, nh, hs) -> (B, nh, T, hs), applied identically to all three
q, k, v = [
    proj.view(B, T, attn.n_head, head_size).transpose(1, 2)
    for proj in (q, k, v)
]
print("q:",q.shape)

In [203]:
print('---------------4.多头计算attention，直接使用torch function--------------')
# Step 4: causal multi-head attention through the fused torch function.
# Attention dropout is only requested while the module is in training mode.
sdpa_dropout_p = attn.dropout if attn.training else 0
y = F.scaled_dot_product_attention(
    q, k, v,
    attn_mask=None,
    dropout_p=sdpa_dropout_p,
    is_causal=True,
)
print('y:', y.shape)

In [204]:
# Step 5: merge the per-head outputs back into one feature dimension.
print('---------------5.将多头注意力结果拼接--------------')
# transpose returns a non-contiguous view, so .contiguous() is needed before .view
y = y.transpose(1, 2).contiguous().view(B, T, D) # re-assemble all head outputs side by side
print('y-concat:', y.shape)

In [205]:
print('---------------6. 增加一次前向传播和droupout--------------')
# Step 6: output projection over the concatenated heads, then residual dropout.
y = attn.c_proj(y)
y = attn.resid_dropout(y)
print("y_proj:", y.shape)

## 4.2.3 masked attention - no scaled_dot_product_attention

In [206]:
import math

print('---------------1. scale and dot product process--------------')
# Raw attention scores: every query dotted with every key of the same head,
# divided by the square root of the per-head feature size.
score_scale = math.sqrt(k.size(-1))
att = torch.matmul(q, k.transpose(-1, -2)) / score_scale
print("q:", q.shape)
print("k:", k.shape)
print("k.transpose(-1,-2):", k.transpose(-1,-2).shape)
print("attn:", att.shape)

In [207]:
print('---------------2. mask attention--------------')
# Lower-triangular (T, T) matrix of ones, reshaped so it broadcasts over the
# batch and head dimensions of att.
mask = torch.ones(T, T).tril().view(1, 1, T, T)
# Entries above the diagonal (keys that come after the query position) are set
# to -inf so that softmax gives them zero weight.
att = att.masked_fill(mask == 0, float('-inf'))
att[2][1], att.shape

(tensor([[-0.1135,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
         [ 0.2179, -0.0096,    -inf,  ...,    -inf,    -inf,    -inf],
         [-0.1820,  0.0797, -0.0551,  ...,    -inf,    -inf,    -inf],
         ...,
         [ 0.3416,  0.1511,  0.2173,  ..., -0.5618,    -inf,    -inf],
         [-0.1526, -0.1941, -0.0130,  ...,  0.1069, -0.8214,    -inf],
         [-0.1285, -0.3495,  0.5454,  ...,  0.0649,  0.0594,  0.0226]],
        grad_fn=<SelectBackward0>),
 torch.Size([16, 12, 256, 256]))

In [208]:
print('---------------3. softmax--------------')
# Normalize over the key axis: every row becomes a probability distribution.
att = att.softmax(dim=-1)
att.shape

torch.Size([16, 12, 256, 256])

In [209]:
# Step 4: apply the attention module's dropout layer to the attention weights.
print('---------------4. dropout--------------')
att = attn.attn_dropout(att)
att.shape

torch.Size([16, 12, 256, 256])

In [210]:
print('---------------5. attn score--------------')
# Step 5: weight the value vectors by the attention probabilities.
y = torch.matmul(att, v)
print("att:", att.shape)
print("v:", v.shape)
print("y:", y.shape)

## 4.2.4 mlp

In [211]:
# Show the MLP sub-module of the first block.
model.transformer.h[0].mlp

MLP(
  (c_fc): Linear(in_features=768, out_features=3072, bias=False)
  (gelu): GELU(approximate='none')
  (c_proj): Linear(in_features=3072, out_features=768, bias=False)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [212]:
# Walk through the MLP layer by layer: c_fc -> gelu -> c_proj -> dropout.
# NOTE(review): x_enc is fed in directly to illustrate the shapes; inside the
# block the MLP presumably receives the ln_2 output instead — confirm.
mlp = model.transformer.h[0].mlp

x_fc = mlp.c_fc(x_enc)
x_gelu = mlp.gelu(x_fc)
x_proj = mlp.c_proj(x_gelu)
x_mlp = mlp.dropout(x_proj)

# Report the shape after every stage, in the order the stages ran.
print("x:", x_enc.shape)
print("x_fc:", x_fc.shape)
print("x_gelu:", x_gelu.shape)
print("x_proj:", x_proj.shape)
print("x_mlp:", x_mlp.shape)

## 4.2.5 layerNorm

In [213]:
# Show the first layer norm module of the first block.
model.transformer.h[0].ln_1

LayerNorm()

In [214]:
# Reproduce the module's layer norm with the functional API, using the
# module's own weight and bias and an epsilon of 1e-5.
ln = model.transformer.h[0].ln_1
print(x_enc.shape)
x = F.layer_norm(
    x_enc,
    normalized_shape=ln.weight.shape,
    weight=ln.weight,
    bias=ln.bias,
    eps=1e-5,
)
print(x.shape)