# 用transformers库来封装model
![img](../images/LLM-structure.png)


# 自底向上搭建
- tokenizer
- MinimindLM_Dense
    - Embedding
    - Minimind_Block
        - RMSNorm
        - GQA
        - RoPE
        - FFN
    - decode
- detokenization

## tokenizer
直接利用刚刚训练好的tokenizer即可


In [1]:
from os import path, truncate
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="./",padding_side="right")
#多条信息
messages = [[
        {"role": "system", "content": "你是一个优秀的聊天机器人，总是给我正确的回应！"},
        {"role": "user", "content": '你来自哪里？'},
        {"role": "assistant", "content": '我来自地球'}
    ],[
        {"role": "system", "content": "你是一个糟糕的捣乱机器人，总是给我错误的回应！"},
        {"role": "user", "content": '你来自哪里？'},
        {"role": "assistant", "content": '我来自火星'}
    ],
    ]
input_ids=tokenizer.apply_chat_template(messages,
                truncation=True,
                max_length=100,
                padding="max_length",
                padding_side="right",
                tokenize=True, 
                add_generation_prompt=False,
                return_tensors="pt")
print(input_ids.shape)


  from .autonotebook import tqdm as notebook_tqdm


torch.Size([2, 100])


# Embedding

In [3]:
import copy
import torch 
from torch import nn
class Embedding(nn.Module):
    def __init__(self,vocab_size,embed_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
    def forward(self,input_ids):
        return self.embedding(input_ids)
#测试下
embed = Embedding(vocab_size=tokenizer.vocab_size, embed_dim=512)
test_input = copy.deepcopy(input_ids)
test_input=embed(test_input)
print(test_input.shape)
print(test_input)

torch.Size([2, 100, 512])
tensor([[[ 0.5781, -0.1712,  1.7985,  ..., -0.3142,  0.2639,  0.4080],
         [ 0.9151, -1.1418,  0.6298,  ...,  0.8912, -1.1394, -0.3780],
         [-0.9539, -1.4516,  0.2507,  ..., -0.1529,  0.1850, -1.0638],
         ...,
         [ 0.2268,  0.1390,  2.5201,  ..., -0.3788,  0.8226, -0.8579],
         [ 0.2268,  0.1390,  2.5201,  ..., -0.3788,  0.8226, -0.8579],
         [ 0.2268,  0.1390,  2.5201,  ..., -0.3788,  0.8226, -0.8579]],

        [[ 0.5781, -0.1712,  1.7985,  ..., -0.3142,  0.2639,  0.4080],
         [ 0.9151, -1.1418,  0.6298,  ...,  0.8912, -1.1394, -0.3780],
         [-0.9539, -1.4516,  0.2507,  ..., -0.1529,  0.1850, -1.0638],
         ...,
         [ 0.2268,  0.1390,  2.5201,  ..., -0.3788,  0.8226, -0.8579],
         [ 0.2268,  0.1390,  2.5201,  ..., -0.3788,  0.8226, -0.8579],
         [ 0.2268,  0.1390,  2.5201,  ..., -0.3788,  0.8226, -0.8579]]],
       grad_fn=<EmbeddingBackward0>)


# RMSNorm
公式为
- **RMS Norm**

$$a_i=\frac{a_i}{RMS(a) + \epsilon} * \gamma,  \quad where \quad RMS(a) = \sqrt{\frac{1}{n}\sum^n_{i=1}a^2_i}.$$

In [4]:
import torch
from torch import nn
class RMSNorm(nn.Module):
    def __init__(self,embed_dim,eps=1e-6):
        super(RMSNorm,self).__init__()
        self.embed_dim= embed_dim
        self.eps=eps
        self.weight=nn.Parameter(torch.ones(embed_dim))
    
    def forward(self,x):
        return x*(torch.rsqrt(x.pow(2).mean(-1,keepdim=True)+self.eps))*self.weight

#测试下
#test_input的shape是[2, 100, 512]
rmsnorm=RMSNorm(embed_dim=512)
test_input=rmsnorm(test_input)
print(test_input.shape)
print(test_input)

torch.Size([2, 100, 512])
tensor([[[ 0.6100, -0.1807,  1.8976,  ..., -0.3315,  0.2784,  0.4305],
         [ 0.9154, -1.1422,  0.6300,  ...,  0.8915, -1.1398, -0.3781],
         [-0.9394, -1.4296,  0.2469,  ..., -0.1506,  0.1822, -1.0477],
         ...,
         [ 0.2282,  0.1398,  2.5354,  ..., -0.3811,  0.8276, -0.8631],
         [ 0.2282,  0.1398,  2.5354,  ..., -0.3811,  0.8276, -0.8631],
         [ 0.2282,  0.1398,  2.5354,  ..., -0.3811,  0.8276, -0.8631]],

        [[ 0.6100, -0.1807,  1.8976,  ..., -0.3315,  0.2784,  0.4305],
         [ 0.9154, -1.1422,  0.6300,  ...,  0.8915, -1.1398, -0.3781],
         [-0.9394, -1.4296,  0.2469,  ..., -0.1506,  0.1822, -1.0477],
         ...,
         [ 0.2282,  0.1398,  2.5354,  ..., -0.3811,  0.8276, -0.8631],
         [ 0.2282,  0.1398,  2.5354,  ..., -0.3811,  0.8276, -0.8631],
         [ 0.2282,  0.1398,  2.5354,  ..., -0.3811,  0.8276, -0.8631]]],
       grad_fn=<MulBackward0>)


# RoPE
RoPE主要的成就是用绝对形式表示相对编码<br>
由于RoPE主要由Qwen和LLama两种实现形式，MiniMind主要采用的LLama实现，所以可以参考[LLama实现RoPE](https://blog.csdn.net/m0_55846238/article/details/145728695)
我们这里只需要考虑两步就行<br>
- 第一步，制作pos_cis，也就是 $m\Theta$,用极坐标的形式拆分一个个 $m\Theta$,分为cos和sin的形式，方便将该信息传递给q,k
- 第二步，apply_rotary_emb
<br>最后得到融合了旋转编码后的q,k  这样q*k就可以得到含有相对位置信息和绝对位置信息的运算结果了 


In [2]:
# 制作pos_cis
# 输入的input_ids的shape是[bs,seqlen,head_num,head_dim]
# 制备的freqs所需的theta 就需要embed_dim//2 的程度
# pos_cis的结果就是[1,seqlen,1,head_dim]形状的cos和sin的表，用来进行q*pos_cis的计算
import torch
def precompute_pos_cis(seqlen,embed_dim,theta=1e5):
    assert embed_dim%2 ==0,"embed_dim必须是偶数"
    freqs=1.0/(theta**(torch.arange(0,embed_dim//2)[:embed_dim//2].float()/embed_dim))
    m=torch.arange(seqlen,device=freqs.device)
    # 制备出mtheta的外积
    # freqs此时的形状将变为[seqlen, embed_dim//2]
    # 得到mtheta表
    freqs=torch.outer(m,freqs)
    # 得到pos_cis表
    pos_cis=torch.polar(torch.zeros_like(freqs),freqs)
    return pos_cis
# 测试下
seqlen = 100
embed_dim = 512
pos_cis = precompute_pos_cis(seqlen, embed_dim)
print(pos_cis.shape)  # 应该是 [100, 256]
print(pos_cis)


torch.Size([100, 256])
tensor([[0.+0.j, 0.+0.j, 0.+0.j,  ..., 0.+0.j, 0.+0.j, 0.+0.j],
        [0.+0.j, 0.+0.j, 0.+0.j,  ..., 0.+0.j, 0.+0.j, 0.+0.j],
        [-0.+0.j, -0.+0.j, -0.+0.j,  ..., 0.+0.j, 0.+0.j, 0.+0.j],
        ...,
        [-0.+0.j, 0.+0.j, 0.-0.j,  ..., 0.+0.j, 0.+0.j, 0.+0.j],
        [-0.-0.j, -0.+0.j, 0.-0.j,  ..., 0.+0.j, 0.+0.j, 0.+0.j],
        [0.-0.j, -0.+0.j, 0.+0.j,  ..., 0.+0.j, 0.+0.j, 0.+0.j]])


In [4]:
def apply_rotary_emb(xq,xk,pos_cis):
    xq_=torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_=torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    def unite_shape(pos_cis,x):
        #对齐pos_cis和x的形状
        #输入的x的形状是[bs,seqlen,head_num,head_dim]
        #pos_cis的形状是[seqlen,head_dim]
        #输出的形状是[bs,seqlen,head_num,head_dim]
        ndim=x.ndim
        assert 0<=1<ndim,"x必须是至少2维的张量"
        # pos_cis必须和x的[bs,seqlen,head_num,head_dim//2],在seqlen和head_dim的形状上对齐，因为这是实际参加运算的部分
        assert pos_cis.shape == (x.shape[1], x.shape[-1]), ""
        shape = [d if i == 1 or i == ndim - 1 else 1 for i,  d in enumerate(x.shape)]
        return pos_cis.view(*shape)
    pos_cis=unite_shape(pos_cis,xq_)
    xq_out= torch.view_as_real(xq_ * pos_cis).flatten(3)
    xk_out= torch.view_as_real(xk_ * pos_cis).flatten(3)
    return xq_out,xk_out

# 测试下
bs, seqlen, head_num, head_dim = 2, 100, 8, 64
xq = torch.randn(bs, seqlen, head_num, head_dim)
xk = torch.randn(bs, seqlen, head_num, head_dim)
pos_cis = precompute_pos_cis(seqlen, head_dim)
xq_out, xk_out = apply_rotary_emb(xq, xk, pos_cis)
print(xq_out.shape)  # 应该是 [2, 100, 8, 64]
print(xk_out.shape)  # 应该是 [2, 100, 8, 64]
print(xq_out)
print(xk_out)



torch.Size([2, 100, 8, 64])
torch.Size([2, 100, 8, 64])
tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., -0., 0., -0.],
          [0., -0., -0.,  ..., -0., -0., 0.],
          ...,
          [0., 0., -0.,  ..., 0., 0., -0.],
          [0., 0., 0.,  ..., 0., 0., -0.],
          [0., -0., 0.,  ..., 0., -0., 0.]],

         [[0., -0., 0.,  ..., 0., 0., -0.],
          [0., -0., -0.,  ..., 0., 0., 0.],
          [-0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., -0., 0.,  ..., 0., 0., 0.],
          [-0., 0., 0.,  ..., 0., -0., 0.]],

         [[-0., 0., -0.,  ..., 0., 0., 0.],
          [0., 0., -0.,  ..., 0., 0., 0.],
          [0., -0., 0.,  ..., 0., -0., 0.],
          ...,
          [-0., 0., -0.,  ..., 0., -0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., -0., 0.,  ..., 0., -0., 0.]],

         ...,

         [[0., 0., 0.,  ..., -0., 0., 0.],
          [0., 0., 0.,  ..., -0., 0., 0.],
       