# 利用transformers库封装model

通过上一个文件，我们已经知道了minimind-dense的torch模型了<br>
现在我们用transformers库来封装model，方便后续的上传和推理过程<br>
要素：
- tokenizer
- embedding
- minimind-block
    - RoPE
    - RMSNorm
    - GQA
    - FFN
- lm_head
- de-tokenizer

[关于transformers库](https://cloud.tencent.com/developer/article/2367010)

## 1.tokenizer
使用我们之前预训练数据集训练好的tokenizer

[关于tokenizer参数的相关设置](https://zhuanlan.zhihu.com/p/341994096) <br>
[关于left_padding和right_padding的讨论](https://zhuanlan.zhihu.com/p/646852375)<br>
[如何改进增强长文本处理能力](https://zhuanlan.zhihu.com/p/638976034)


In [1]:
from transformers import AutoTokenizer
tokenizer_path= "./"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
print(tokenizer)
print(tokenizer.vocab_size)
print(tokenizer.special_tokens_map)
print(tokenizer.all_special_tokens)

# 写一个虚拟的小的数据集，只有两条数据的集
data=[
    {'text':'<|im_start|>鉴别一组中文文章的风格和特点，例如官方、口语、文言等。需要提供样例文章才能准确鉴别不同的风格和特点。<|im_end|> <|im_start|>好的，现在帮我查一下今天的天气怎么样?今天的天气依据地区而异。请问你需要我帮你查询哪个地区的天气呢？<|im_end|> <|im_start|>打开闹钟功能，定一个明天早上七点的闹钟。好的，我已经帮您打开闹钟功能，闹钟将在明天早上七点准时响起。<|im_end|> <|im_start|>为以下场景写一句话描述：一个孤独的老人坐在公园长椅上看着远处。一位孤独的老人坐在公园长椅上凝视远方。<|im_end|> <|im_start|>非常感谢你的回答。请告诉我，这些数据是关于什么主题的？这些数据是关于不同年龄段的男女人口比例分布的。<|im_end|> <|im_start|>帮我想一个有趣的标题。这个挺有趣的："如何成为一名成功的魔术师" 调皮的标题往往会吸引读者的注意力。<|im_end|> <|im_start|>回答一个问题，地球的半径是多少？地球的平均半径约为6371公里，这是地球自赤道到两极的距离的平均值。<|im_end|> <|im_start|>识别文本中的语气，并将其分类为喜悦、悲伤、惊异等.文本：“今天是我的生日！”这个文本的语气是喜悦。<|im_end|>'
},
    {'text':'<|im_start|>根据输入的内容，编写一个类别标签。这是一篇介绍如何阅读心电图的文章类别标签: 医学/心电图阅读指南<|im_end|> <|im_start|>帮我搜索一下最近的天气情况。当然，我可以帮您搜索最新的天气情况。请问您需要查询哪个城市的天气情况呢？<|im_end|> <|im_start|>帮我讲一个令人开心的笑话。好的，我帮您讲一个关于细菌的笑话。为什么细菌不会上网？因为连接总是断开了！<|im_end|> <|im_start|>现在给我生成一首关于大海的五言诗。碧波万顷月满天，海天相接处天地间。波涛滚滚江山美，海鸟翱翔日月闲。<|im_end|> <|im_start|>谢谢你，这篇文章很有用。不客气，我很高兴能够为您提供帮助。如果您还有其他问题或需求，随时可以对我说。<|im_end|> <|im_start|>你好，我想下载一个视频编辑软件，你有什么推荐吗？您好！当然，有很多选择。您想要免费软件还是愿意付费？<|im_end|> <|im_start|>为什么我的程序不输出正确结果？可能是代码逻辑有误，或者输入数据有误，需要仔细调试代码逻辑和输入数据。<|im_end|> <|im_start|>谢谢你的回答。现在我想知道这场比赛的具体时间和地点。这场比赛的时间是北京时间10月4日，地点是上海。<|im_end|>'
}
]
for i in range(2):
    print(data[i]['text'])

# 接下来将该data的内容利用tokenizer编码
input_texts = [item['text'] for item in data]
#填充1 固定填充
input_ids1 = tokenizer(input_texts, padding='max_length', truncation=True, max_length=512,return_tensors='pt')
#填充2 动态填充
input_ids2 = tokenizer(input_texts, padding=True, truncation=True, max_length=512,return_tensors='pt')
print(input_ids1['input_ids'].shape)
print(input_ids2['input_ids'].shape)
print(input_ids1)
print(input_ids2)
# 这样我们就获取到了tokenizer编码后的数据
#现在我们深拷贝input_ids1
import copy
input_ids = copy.deepcopy(input_ids1['input_ids'])
# 后续我们采用这个input_ids进行模型训练

  from .autonotebook import tqdm as notebook_tqdm


PreTrainedTokenizerFast(name_or_path='./', vocab_size=6400, model_max_length=32768, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<pad>', '<mask>', '<s>', '</s>', '<unk>', '<UNK>', '<EOS>', '<zzy>', '<|s1|>', '<|s2|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<endoftext>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedTo

## 2.Embedding
对应参数 vocab_size,embed_dim<br>
这里是 vocab_size=6400 embed_dim=512

In [None]:
# 测试版
import torch 
from torch import nn
class Embed(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(Embed, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)

    def forward(self, input_ids):
        return self.embedding(input_ids)

# 测试
embed_dim=512
vocab_size=6400
embed_model = Embed(vocab_size, embed_dim)
output = embed_model(input_ids)
print(output.shape)  # 输出形状应为 (batch_size, sequence_length, embed
print(output)

#待会儿直接用就行

torch.Size([2, 512, 512])
tensor([[[ 1.7802,  0.9120, -0.1349,  ...,  0.8308, -0.4473,  0.0710],
         [ 1.1532,  0.6046, -1.1357,  ...,  0.7705, -0.1885, -0.4764],
         [ 1.8367,  1.0961, -1.1010,  ...,  1.5748,  0.8876,  3.1995],
         ...,
         [ 0.1466, -0.3770, -1.9389,  ...,  1.7321,  0.2608, -0.6201],
         [ 0.1466, -0.3770, -1.9389,  ...,  1.7321,  0.2608, -0.6201],
         [ 0.1466, -0.3770, -1.9389,  ...,  1.7321,  0.2608, -0.6201]],

        [[ 1.7802,  0.9120, -0.1349,  ...,  0.8308, -0.4473,  0.0710],
         [ 1.2645,  0.1701,  0.1653,  ..., -1.1962, -0.0725, -0.0082],
         [ 0.5300, -0.6007,  0.2410,  ..., -0.2597, -2.1780, -0.4279],
         ...,
         [ 0.1466, -0.3770, -1.9389,  ...,  1.7321,  0.2608, -0.6201],
         [ 0.1466, -0.3770, -1.9389,  ...,  1.7321,  0.2608, -0.6201],
         [ 0.1466, -0.3770, -1.9389,  ...,  1.7321,  0.2608, -0.6201]]],
       grad_fn=<EmbeddingBackward0>)


## 3.RMSNorm
计算公式为
$$ a_i=\frac{a_i}{RMS(a)+\epsilon} * \gamma \quad where \quad RMS(a) = \sqrt{\frac{1}{n}\sum^n_{i=1}a^2_i} $$

In [4]:
class RMSNorm(nn.Module):
    def __init__(self,embed_dim,eps=1e-6):
        super(RMSNorm,self).__init__()
        self.embed_dim=embed_dim
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(embed_dim))
    
    def forward(self,x):
        return x*self.gamma*torch.rsqrt(x.pow(2).mean(dim=-1,keepdim=True)+self.eps)

# 测试RMSNorm
rmsnorm_model = RMSNorm(embed_dim)
output_rmsnorm = rmsnorm_model(output)
print(output_rmsnorm)  # 输出形状应为 (batch_size, sequence

tensor([[[ 1.8379,  0.9416, -0.1393,  ...,  0.8577, -0.4618,  0.0733],
         [ 1.1659,  0.6112, -1.1482,  ...,  0.7789, -0.1906, -0.4816],
         [ 1.8901,  1.1280, -1.1330,  ...,  1.6205,  0.9134,  3.2925],
         ...,
         [ 0.1421, -0.3655, -1.8794,  ...,  1.6789,  0.2528, -0.6011],
         [ 0.1421, -0.3655, -1.8794,  ...,  1.6789,  0.2528, -0.6011],
         [ 0.1421, -0.3655, -1.8794,  ...,  1.6789,  0.2528, -0.6011]],

        [[ 1.8379,  0.9416, -0.1393,  ...,  0.8577, -0.4618,  0.0733],
         [ 1.2402,  0.1668,  0.1621,  ..., -1.1732, -0.0711, -0.0080],
         [ 0.5466, -0.6194,  0.2485,  ..., -0.2678, -2.2458, -0.4413],
         ...,
         [ 0.1421, -0.3655, -1.8794,  ...,  1.6789,  0.2528, -0.6011],
         [ 0.1421, -0.3655, -1.8794,  ...,  1.6789,  0.2528, -0.6011],
         [ 0.1421, -0.3655, -1.8794,  ...,  1.6789,  0.2528, -0.6011]]],
       grad_fn=<MulBackward0>)


## 4.RoPE
主要两步骤
- 获取$ m\theta $ ,计算好precompute_pos_cis
- 将pos_cis应用

主要公式为：
$$
\begin{align}
f_q(\boldsymbol{x}_m, m) &= (\boldsymbol{W}_q \boldsymbol{x}_m) e^{im\theta} \\
f_k(\boldsymbol{x}_n, n) &= (\boldsymbol{W}_k \boldsymbol{x}_n) e^{in\theta} \\
g(\boldsymbol{x}_m, \boldsymbol{x}_n, m - n) &= \text{Re}\left[ (\boldsymbol{W}_q \boldsymbol{x}_m)^* (\boldsymbol{W}_k \boldsymbol{x}_n) e^{i(n - m)\theta} \right]
\end{align}
$$

$\Theta=\left\{\theta_i=10000^{-2(i-1)/d},i\in[1,2,\ldots,d/2]\right\}$

In [9]:
def precompute_pos_cis(embed_dim=512,max_seqlen=512,theta=1e5):
    freqs= 1/theta**torch.arange(0,embed_dim,2)[:embed_dim//2].float()
    m=torch.arange(max_seqlen,device=freqs.device)
    freqs= torch.outer(m,freqs).float() #获取了mtheta
    pos_cis = torch.polar(torch.ones_like(freqs),freqs) #将mtheta化为极坐标模式
    return pos_cis

def apply_rotary(xq,xk,pos_cis):
    xq_=torch.view_as_complex(xq.float().reshape(*xq.shape[:-1],-1,2))
    xk_=torch.view_as_complex(xk.float().reshape(*xk.shape[:-1],-1,2))
    #输入的pos_cis一般比xq,xk都要大，需要把pos_cis的形状和xq对齐
    #xq一般都是(bs,seqlen,head,head_dim)
    def unite_shape(pos_cis,  x):
        ndim = x.ndim
        assert 0 <= 1 < ndim
        assert pos_cis.shape == (x.shape[1],  x.shape[-1])
        shape = [d if i == 1 or i == ndim - 1 else 1 for i,  d in enumerate(x.shape)]
        return pos_cis.view(*shape)
    pos_cis = unite_shape(pos_cis, xq_)
    xq_ = torch.view_as_real(xq_ * pos_cis).flatten(3)
    xk_ = torch.view_as_real(xk_ * pos_cis).flatten(3)
    return xq_, xk_

#测试一下
pos_cis = precompute_pos_cis(embed_dim=embed_dim, max_seqlen=512)
print(pos_cis)
print(output_rmsnorm.shape)  # 输出形状应为 (batch_size, sequence_length, embed_dim)
xq, xk = apply_rotary(output_rmsnorm, output_rmsnorm, pos_cis)
print(xq.shape)  # 输出形状应为 (batch_size,
print(xk.shape)  # 输出形状应为 (batch_size, sequence_length, embed_dim)
# 测试一下apply_rotary的效果



tensor([[ 1.0000+0.0000e+00j,  1.0000+0.0000e+00j,  1.0000+0.0000e+00j,
          ...,  1.0000+0.0000e+00j,  1.0000+0.0000e+00j,
          1.0000+0.0000e+00j],
        [ 0.5403+8.4147e-01j,  1.0000+1.0000e-10j,  1.0000+1.0000e-20j,
          ...,  1.0000+0.0000e+00j,  1.0000+0.0000e+00j,
          1.0000+0.0000e+00j],
        [-0.4161+9.0930e-01j,  1.0000+2.0000e-10j,  1.0000+2.0000e-20j,
          ...,  1.0000+0.0000e+00j,  1.0000+0.0000e+00j,
          1.0000+0.0000e+00j],
        ...,
        [ 0.9981+6.1950e-02j,  1.0000+5.0900e-08j,  1.0000+5.0900e-18j,
          ...,  1.0000+0.0000e+00j,  1.0000+0.0000e+00j,
          1.0000+0.0000e+00j],
        [ 0.4871+8.7333e-01j,  1.0000+5.1000e-08j,  1.0000+5.1000e-18j,
          ...,  1.0000+0.0000e+00j,  1.0000+0.0000e+00j,
          1.0000+0.0000e+00j],
        [-0.4717+8.8177e-01j,  1.0000+5.1100e-08j,  1.0000+5.1100e-18j,
          ...,  1.0000+0.0000e+00j,  1.0000+0.0000e+00j,
          1.0000+0.0000e+00j]])
torch.Size([2, 512, 512])


## 5.GQA
![LLM-结构](../images/LLM-structure.png)

In [1]:
# repeat_kv是必须要用到的,对齐GQA里的KV与Q的形状
def repeat_kv(x,rep_num):
    if rep_num == 1:
        return x
    bs,seqlen,head,head_dim=x.shape
    return x[:,:,:,None,:].expand(bs,seqlen,head,rep_num,head_dim).reshape(bs,seqlen,head*rep_num,head_dim)


In [None]:
class GroupQueryAttention(nn.Module):
    def __init__(self,embed_dim,head_num,kv_head_num,attn_dropout=0.1,max_seqlen=512,Flash=False):
        super(GroupQueryAttention,self).__init__()
        ## 基本属性
        self.embed_dim = embed_dim
        self.head_num = head_num
        self.kv_head_num = kv_head_num
        self.dropout= attn_dropout
        self.head_dim = embed_dim // head_num
        assert embed_dim % head_num == 0, "embed_dim must be divisible by head_num"
        self.rep_num = head_num // kv_head_num
        assert head_num % kv_head_num == 0, "head_num must be divisible by kv_head_num"

        self.Flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and Flash
        ## 网络层
        self.q_proj = nn.Linear(embed_dim, self.head_num * self.head_dim)
        self.k_proj = nn.Linear(embed_dim, self.kv_head_num * self.head_dim)
        self.v_proj = nn.Linear(embed_dim, self.kv_head_num * self.head_dim)
        self.o_proj = nn.Linear(self.head_num * self.head_dim, embed_dim)
        self.attn_dropout = nn.Dropout(attn_dropout)
        self.res_dropout = nn.Dropout(attn_dropout)

        ## 掩码
        mask = torch.full((1,1,max_seqlen,max_seqlen), float('-1e9'))
        mask=torch.tril(mask, diagonal=0)
        self.register_buffer('mask', mask)
    
    def forward(self,x,
                pos_cis=None,
                past_key_value=None,
                use_cache=False):
        bs,seqlen,embed_dim = x.shape
        xq = self.q_proj(x).view(bs,seqlen,self.head_num,self.head_dim)
        xk = self.k_proj(x).view(bs,seqlen,self.kv_head_num,self.head_dim)
        xv = self.v_proj(x).view(bs,seqlen,self.kv_head_num,self.head_dim)
        xk = repeat_kv(xk,self.rep_num)
        xv = repeat_kv(xv,self.rep_num)

        if pos_cis is None:
            pos_cis = precompute_pos_cis(embed_dim=self.embed_dim, max_seqlen=seqlen)
        xq, xk = apply_rotary(xq, xk, pos_cis)
        xq,xk,xv = xq.transpose(1,2), xk.transpose(1,2), xv.transpose(1,2)
        if past_key_value is not None:
            xk = torch.cat([past_key_value[0], xk], dim=1)
            xv = torch.cat([past_key_value[1], xv], dim=1)
        past_key_value = (xk, xv) if use_cache else None
        if self.Flash:
            attn_output = torch.nn.functional.scaled_dot_product_attention(
                xq, xk, xv, dropout_p=self.dropout, is_causal=True)
        else:
            attn_weights=torch.matmul(xq,xk.transpose(-2,-1))/self.scale
            attn_weights=attn_weights+self.mask[:,:,:seqlen,:seqlen]
            attn_weights=self.attn_dropout(F.softmax(attn_weights,dim=-1))
            attn_output=torch.matmul(attn_weights,xv)
            attn_output=self.res_dropout(attn_output)
            attn_output=attn_output.transpose(1,2).reshape(bs,seqlen,self.head_num*self.head_dim)
        attn_output=self.o_proj(attn_output)
        return attn_output,past_key_value

NameError: name 'nn' is not defined