# 从零实现GPT模型进行文本生成

In [25]:
# Hyperparameters shared by every model component built in this notebook.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token embedding / width of the output head
    context_length=1024,   # rows of the positional embedding table
    emb_dim=768,           # embedding width used throughout the model
    n_heads=12,            # number of attention heads per transformer block
    n_layers=12,           # number of stacked transformer blocks
    drop_rate=0.1,         # dropout probability
    qkv_bias=False,        # bias flag for the attention query/key/value projections
)

In [26]:
import torch
import torch.nn as nn


## 4.2 使用层归一化对激活进行归一化

In [27]:
# 定义归一化层

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine transform.

    Each vector along the last axis is shifted to zero mean and scaled to unit
    (biased) variance, then multiplied by ``scale`` and offset by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the length of
            the learnable ``scale`` and ``shift`` vectors.
        eps: Small constant added to the variance before the square root to
            avoid division by zero. Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialised to the identity transform: scale = 1, shift = 0.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Return ``x`` normalized along its last dimension, then scaled and shifted."""
        mean = x.mean(-1, keepdim=True)
        # unbiased=False divides by N rather than N - 1.
        var = x.var(-1, keepdim=True, unbiased=False)
        normalized = (x - mean) / torch.sqrt(var + self.eps)
        return normalized * self.scale + self.shift

## 4.3 实现GELU激活函数

In [28]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise. The module has no parameters.
    """

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        coeff = torch.sqrt(torch.tensor(2 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

## 4.4 带GELU激活函数的前馈神经网络

In [29]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The first linear layer expands from ``cfg['emb_dim']`` to four times that
    width; the second projects back to ``cfg['emb_dim']``.

    Args:
        cfg: Mapping that provides the ``'emb_dim'`` entry.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']
        hidden_dim = 4 * emb_dim
        # Layers are created in forward order so weight initialisation draws
        # from the RNG in the same sequence as they are applied.
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expand/activate/project stack."""
        return self.layers(x)

## 4.5 连接Transformer块中的注意力层和线性层

In [30]:
from previous_chapters import MultiHeadAttention

class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward sub-layers with shortcuts.

    Each sub-layer is applied as ``x + dropout(sublayer(layer_norm(x)))``, i.e.
    normalization happens before the sub-layer and the un-normalized input is
    added back afterwards.

    Args:
        cfg: Mapping with ``'emb_dim'``, ``'context_length'``, ``'drop_rate'``
            and ``'n_heads'``. The optional ``'qkv_bias'`` entry is forwarded
            to ``MultiHeadAttention`` and defaults to ``False`` when absent.
    """

    def __init__(self, cfg):
        super().__init__()
        self.ln1 = LayerNorm(cfg['emb_dim'])
        self.ln2 = LayerNorm(cfg['emb_dim'])
        self.attn = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            context_length=cfg['context_length'],
            dropout=cfg['drop_rate'],
            num_heads=cfg['n_heads'],
            # Honour the configuration instead of a hard-coded False; configs
            # without the key keep the previous behaviour (no bias).
            qkv_bias=cfg.get('qkv_bias', False),
        )
        self.ffn = FeedForward(cfg)
        # A single dropout module is reused on both shortcut branches.
        self.drop_shortcut = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers and return the result."""
        # Attention sub-layer with shortcut connection.
        shortcut = x
        x = self.ln1(x)
        x = self.attn(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Feed-forward sub-layer with shortcut connection.
        shortcut = x
        x = self.ln2(x)
        x = self.ffn(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x

## 4.6 实现GPT模型

In [31]:
class GPTModel(nn.Module):
    """GPT-style language model: embeddings, transformer blocks, norm, output head.

    Args:
        cfg: Mapping with ``'vocab_size'``, ``'context_length'``, ``'emb_dim'``,
            ``'drop_rate'`` and ``'n_layers'``, plus whatever
            ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']
        vocab_size = cfg['vocab_size']

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg['context_length'], emb_dim)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Output projection to vocabulary logits; a separate weight matrix
        # with no bias term.
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, x):
        """Map token ids ``x`` of shape (B, T) to logits of shape (B, T, vocab_size)."""
        seq_len = x.size(1)
        # Position indices 0..T-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=x.device)
        hidden = self.tok_emb(x) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

### 测试GPT模型

In [32]:
import tiktoken
# Tokenizer for the "gpt2" encoding provided by tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

text1 = 'every effort moves you'
text2 = 'every day holds a'

# Encode each prompt and stack the id tensors along a new batch dimension;
# torch.stack requires both prompts to encode to the same number of tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (text1, text2)],
    dim=0,
)  # (B,T)
print(batch)


tensor([[16833,  3626,  6100,   345],
        [16833,  1110,  6622,   257]])


In [33]:
# Fix the seed so weight initialisation (and dropout) are reproducible.
torch.manual_seed(42)
model = GPTModel(GPT_CONFIG_124M)

# The freshly built model is still in training mode here, so dropout is
# active during this forward pass.
out = model(batch)

# Shape is (B,T,V), followed by the raw logits.
print(out.shape, out, sep="\n")


torch.Size([2, 4, 50257])
tensor([[[ 0.3640, -0.5654,  0.4703,  ..., -0.4383, -0.0280, -0.4681],
         [ 0.1578, -0.2142,  0.7080,  ..., -0.0916, -0.0604, -0.7349],
         [-0.3492,  0.6082,  0.2345,  ...,  0.7863,  0.0706, -0.7593],
         [ 0.1236,  0.6927,  0.3464,  ...,  0.0312, -0.8640, -0.0684]],

        [[ 0.3428, -0.2037, -0.3596,  ..., -0.8504, -0.2254, -0.4678],
         [ 0.6280,  0.2062,  0.1321,  ..., -0.2014, -0.1827, -0.6977],
         [-0.3165,  0.9189,  0.6268,  ...,  1.3942,  0.0910,  1.0845],
         [-0.5646, -0.3015,  1.6866,  ...,  0.3215, -0.3524, -0.9673]]],
       grad_fn=<UnsafeViewBackward0>)


In [36]:
# Count the total number of parameters in the model.

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")

# Use each parameter's actual element size instead of assuming 4-byte float32,
# so the estimate stays correct if the model is cast to another dtype.
total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
total_size_mb = total_size_bytes / (1024 ** 2)
print(f"Total size (MB): {total_size_mb:.2f} MB")

Total parameters: 163009536
Total size (MB): 621.83 MB


## 4.7 生成文本

In [37]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a (B, T) tensor of token ids to logits of
            shape (B, T, vocab_size).
        idx: Tensor of token ids, shape (B, T), used as the starting context.
        max_new_tokens: Number of tokens to append to every row.
        context_size: Only the last ``context_size`` tokens are fed to the
            model at each step.

    Returns:
        Tensor of shape (B, T + max_new_tokens) holding the original ids
        followed by the generated ones.
    """
    tokens = idx
    for _step in range(max_new_tokens):
        # Crop the running sequence to the most recent `context_size` tokens.
        window = tokens[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)
        # Only the prediction at the final position is needed.
        last_logits = all_logits[:, -1, :]
        next_probs = torch.softmax(last_logits, dim=-1)
        # Greedy decoding: pick the most probable token for each row.
        next_token = torch.argmax(next_probs, dim=-1, keepdim=True)
        tokens = torch.cat((tokens, next_token), dim=1)
    return tokens

In [40]:
# Prompt used as the starting context for generation.
start_text = "every effort i want"

# Token ids for the prompt as a plain Python list.
encoded = tokenizer.encode(start_text)
print('encoded:' ,encoded)

# Prepend a batch dimension so the model receives a 2-D tensor.
encoded_tensor = torch.tensor(encoded)[None, :]  # (1, T)
print('encoded_tensor.shape:' ,encoded_tensor.shape)


encoded: [16833, 3626, 1312, 765]
encoded_tensor.shape: torch.Size([1, 4])


In [43]:
# Switch to evaluation mode so dropout is disabled during generation.
model.eval()
out = generate_text_simple(
    model,
    encoded_tensor,
    max_new_tokens=6,
    # Read the window size from the config the model was built with rather
    # than repeating the literal 1024, so the two cannot drift apart.
    context_size=GPT_CONFIG_124M['context_length'],
)
print('out.shape:' ,out.shape)
print('out length: ' ,len(out[0]))
# Decode prompt + generated ids back to text.
print('生成文本:' ,tokenizer.decode(out[0].tolist()))

out.shape: torch.Size([1, 10])
out length:  10
生成文本: every effort i wantologic wealth Anxiety Amtrak incomes prescribe
