# llama3 Lora 微调
在这个文件中，我使用 LoRA 对 Llama3-8B 模型进行了指令微调。


In [1]:
from pathlib import Path
import tiktoken
from tiktoken.load import load_tiktoken_bpe
import torch
import json
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
Model_Home = "../Meta-Llama-3/Meta-Llama-3-8B/original/"

In [2]:
# Build the tokenizer: BPE merge ranks read from the model directory plus
# 256 special tokens (10 named ones followed by 246 reserved placeholders).
tokenizer_path = Model_Home + "tokenizer.model"
special_tokens = [
    "<|begin_of_text|>",
    "<|end_of_text|>",
    "<|reserved_special_token_0|>",
    "<|reserved_special_token_1|>",
    "<|reserved_special_token_2|>",
    "<|reserved_special_token_3|>",
    "<|start_header_id|>",
    "<|end_header_id|>",
    "<|reserved_special_token_4|>",
    "<|eot_id|>",  # end of turn
]
special_tokens += [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)]

mergeable_ranks = load_tiktoken_bpe(tokenizer_path)
# Special tokens receive consecutive ids right after the regular BPE vocabulary.
first_special_id = len(mergeable_ranks)
tokenizer = tiktoken.Encoding(
    name=Path(tokenizer_path).name,
    pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
    mergeable_ranks=mergeable_ranks,
    special_tokens=dict(
        zip(special_tokens, range(first_special_id, first_special_id + len(special_tokens)))
    ),
)


In [3]:
# Read the model hyper-parameters stored next to the checkpoint.
with open(Model_Home + "params.json", "r") as params_file:
    config = json.load(params_file)
# Expose the individual settings as module-level names used by later cells.
(
    dim,
    n_layers,
    n_heads,
    n_kv_heads,
    vocab_size,
    multiple_of,
    ffn_dim_multiplier,
    norm_eps,
    rope_theta,
) = (
    config[key]
    for key in (
        "dim",
        "n_layers",
        "n_heads",
        "n_kv_heads",
        "vocab_size",
        "multiple_of",
        "ffn_dim_multiplier",
        "norm_eps",
        "rope_theta",
    )
)
print(config)

{'dim': 4096, 'n_layers': 32, 'n_heads': 32, 'n_kv_heads': 8, 'vocab_size': 128256, 'multiple_of': 1024, 'ffn_dim_multiplier': 1.3, 'norm_eps': 1e-05, 'rope_theta': 500000.0}


## 将文本转换为 **词元(tokens)**


In [4]:
prompt = "山东大学（威海） 数科"

# Surround the encoded prompt with ids 128000 and 128001; the output below
# shows they decode to <|begin_of_text|> and <|end_of_text|>.
tokens = [128000, *tokenizer.encode(prompt), 128001]
print(tokens)
# Decode every id on its own to see how the prompt was split.
prompt_split_as_tokens = [tokenizer.decode([token_id]) for token_id in tokens]
print(prompt_split_as_tokens)
print(len(prompt_split_as_tokens))

[128000, 58911, 68464, 102667, 10110, 105578, 56235, 7705, 48785, 70626, 128001]
['<|begin_of_text|>', '山', '东', '大学', '（', '威', '海', '）', ' 数', '科', '<|end_of_text|>']
11


## 转换为 **嵌入(embedding)**
这一步是一次查表操作：用词元 id 作为行索引，从嵌入权重矩阵中取出对应的行（这里直接用 NumPy 索引实现）。
<br>
我们的 [Nx1] 词元现在将变成 [Nx4096]，即 N 个嵌入（每个词元）长度为 4096


In [5]:
# Open the embedding archive only for as long as the lookup needs it.
with np.load(Model_Home + "shuke/llama3.8b.shuke.tok_embeddings.weight.npz") as embedding_archive:
    # Indexing with the `tokens` list picks one embedding row per token id.
    token_embeddings_unnormalized = embedding_archive['tok_embeddings.weight'][tokens]
token_embeddings_unnormalized.shape


(11, 4096)

##  RMS 归一化



In [6]:
def rms_norm(tensor_np, norm_weights_np, eps=None):
    """Apply RMS normalization along the last axis and scale by the given weights.

    Args:
        tensor_np: Array to normalize; statistics are taken over its last axis.
        norm_weights_np: Scale factors, broadcast against ``tensor_np``.
        eps: Constant added to the mean square before taking the square root.
            When omitted, the module-level ``norm_eps`` is used, which is the
            behavior existing two-argument callers rely on.

    Returns:
        ``tensor_np * norm_weights_np / sqrt(mean(tensor_np ** 2) + eps)``.
    """
    if eps is None:
        eps = norm_eps
    # Mean of squares over the feature axis; keepdims so it broadcasts back.
    mean_square = np.mean(tensor_np ** 2, axis=-1, keepdims=True)
    root_mean_square = np.sqrt(mean_square + eps)
    return tensor_np * norm_weights_np / root_mean_square

## Positional encoding 位置编码
### RoPE


In [7]:
# Evenly spaced values i/64 for i = 0..63.
zero_to_one_split_into_64_parts = np.arange(64) / 64
print(zero_to_one_split_into_64_parts)

[0.       0.015625 0.03125  0.046875 0.0625   0.078125 0.09375  0.109375
 0.125    0.140625 0.15625  0.171875 0.1875   0.203125 0.21875  0.234375
 0.25     0.265625 0.28125  0.296875 0.3125   0.328125 0.34375  0.359375
 0.375    0.390625 0.40625  0.421875 0.4375   0.453125 0.46875  0.484375
 0.5      0.515625 0.53125  0.546875 0.5625   0.578125 0.59375  0.609375
 0.625    0.640625 0.65625  0.671875 0.6875   0.703125 0.71875  0.734375
 0.75     0.765625 0.78125  0.796875 0.8125   0.828125 0.84375  0.859375
 0.875    0.890625 0.90625  0.921875 0.9375   0.953125 0.96875  0.984375]


In [8]:
# One rotation frequency per exponent: 1 / rope_theta ** (i / 64).
freqs = 1.0 / np.power(rope_theta, zero_to_one_split_into_64_parts)
print(freqs)
print(freqs.shape)

[1.00000000e+00 8.14617234e-01 6.63601238e-01 5.40581005e-01
 4.40366603e-01 3.58730224e-01 2.92227823e-01 2.38053820e-01
 1.93922745e-01 1.57972810e-01 1.28687373e-01 1.04830952e-01
 8.53971003e-02 6.95659496e-02 5.66696214e-02 4.61640503e-02
 3.76060309e-02 3.06345209e-02 2.49554087e-02 2.03291060e-02
 1.65604401e-02 1.34904199e-02 1.09895285e-02 8.95225934e-03
 7.29266474e-03 5.94073038e-03 4.83942135e-03 3.94227603e-03
 3.21144599e-03 2.61609925e-03 2.13111954e-03 1.73604670e-03
 1.41421356e-03 1.15204274e-03 9.38473870e-04 7.64496988e-04
 6.22772422e-04 5.07321148e-04 4.13272550e-04 3.36658941e-04
 2.74248176e-04 2.23407290e-04 1.81991429e-04 1.48253354e-04
 1.20769737e-04 9.83811094e-05 8.01429472e-05 6.52858260e-05
 5.31829590e-05 4.33237549e-05 3.52922774e-05 2.87496974e-05
 2.34199990e-05 1.90783348e-05 1.55415403e-05 1.26604066e-05
 1.03133854e-05 8.40146147e-06 6.84397530e-06 5.57522023e-06
 4.54167048e-06 3.69972304e-06 3.01385815e-06 2.45514079e-06]
(64,)


In [9]:
# Rotation angle for every (position, frequency) pair via an outer product.
# The number of positions follows the prompt length instead of a literal 11,
# so a different prompt does not silently get the wrong table size.
freqs_for_each_token = np.outer(np.arange(len(tokens)), freqs)
# cis(x) = cos(x) + i*sin(x): one unit-magnitude complex factor per angle.
freqs_cis = np.exp(1j * freqs_for_each_token)
freqs_cis.shape

(11, 64)

In [10]:
# Reinterpret trailing (real, imag) pairs as complex numbers.
def view_as_complex(real_np):
    """Combine the last axis of ``real_np`` (size 2) into complex values.

    Raises:
        ValueError: If the last axis does not have exactly two entries.
    """
    if real_np.shape[-1] != 2:
        raise ValueError("Last dimension size must be 2 to represent real and imaginary parts.")
    real_part = real_np[..., 0]
    imag_part = real_np[..., 1]
    return real_part + 1j * imag_part

In [11]:
# Expand complex numbers into trailing (real, imag) pairs.
def view_as_real(complex_np):
    """Return a float array of shape ``complex_np.shape + (2,)``.

    The last axis holds the real part at index 0 and the imaginary part at
    index 1.
    """
    pairs = np.empty(complex_np.shape + (2,), dtype=float)
    pairs[..., 0] = complex_np.real
    pairs[..., 1] = complex_np.imag
    return pairs

旋转的查询对现在被合并，我们可以获得一个新的查询向量（旋转的查询向量），其形状为 [11x128]，其中 11 是词元(token)数，128 是查询向量的维度。


In [12]:
# Softmax over the last axis.
def softmax(X):
    """Return the softmax of ``X`` along its last axis.

    The row maximum is subtracted before exponentiating so that large scores
    do not overflow.
    """
    shifted = X - np.max(X, axis=-1, keepdims=True)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=-1, keepdims=True)

In [13]:
# NumPy implementation of SiLU (also known as the Swish activation).
def silu(x):
    """Return ``x * sigmoid(x)`` element-wise without overflowing.

    The naive ``1 / (1 + exp(-x))`` overflows in ``exp`` for large negative
    inputs. Here the exponential is only ever taken of a non-positive number,
    and the algebraically equivalent form of the sigmoid is chosen per sign.
    """
    # exp(-|x|) lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(x))
    sigmoid_x = np.where(
        np.asarray(x) >= 0,
        1.0 / (1.0 + decay),   # x >= 0: 1 / (1 + e^-x)
        decay / (1.0 + decay), # x <  0: e^x / (1 + e^x)
    )
    return x * sigmoid_x
# silu(np.matmul(embedding_after_edit_normalized, w1.T))

# 对最后一个Transformer层的Q、V进行Lora微调

## 先获得前 31个Transformer层的输出

In [14]:
# Running hidden state; starts as the raw token embeddings.
final_embedding = token_embeddings_unnormalized
# Derive the attention geometry from the config instead of hard-coding 4 and 128.
n_rep = n_heads // n_kv_heads  # number of query heads sharing one K/V head
head_dim = dim // n_heads      # width of a single attention head
# The causal mask is identical for every head and layer, so build it once:
# -inf strictly above the diagonal, 0 on and below it.
seq_len = len(token_embeddings_unnormalized)
mask = np.triu(np.full((seq_len, seq_len), float("-inf")), k=1)
# Run every layer except the last one.
for layer in range(n_layers-1):
    # Per-head attention outputs of this layer.
    qkv_attention_store = []
    # Pull this layer's weights out of the archive, then close it right away
    # so file handles do not accumulate across layers.
    with np.load(Model_Home + f"shuke/llama3.8b.shuke.layer.{layer}.npz") as layer_weights:
        attention_norm_weight = layer_weights[f"layers.{layer}.attention_norm.weight"]
        q_layer = layer_weights[f"layers.{layer}.attention.wq.weight"]
        k_layer = layer_weights[f"layers.{layer}.attention.wk.weight"]
        v_layer = layer_weights[f"layers.{layer}.attention.wv.weight"]
        w_layer = layer_weights[f"layers.{layer}.attention.wo.weight"]
        ffn_norm_weight = layer_weights[f"layers.{layer}.ffn_norm.weight"]
        w1 = layer_weights[f"layers.{layer}.feed_forward.w1.weight"]
        w2 = layer_weights[f"layers.{layer}.feed_forward.w2.weight"]
        w3 = layer_weights[f"layers.{layer}.feed_forward.w3.weight"]
    # Pre-attention normalization.
    layer_embedding_norm = rms_norm(final_embedding, attention_norm_weight)
    # Split each stacked projection matrix into one slice per head.
    q_layer = q_layer.reshape(n_heads, q_layer.shape[0] // n_heads, dim)
    k_layer = k_layer.reshape(n_kv_heads, k_layer.shape[0] // n_kv_heads, dim)
    v_layer = v_layer.reshape(n_kv_heads, v_layer.shape[0] // n_kv_heads, dim)
    # Process the attention heads one at a time.
    for head in range(n_heads):
        # Consecutive groups of n_rep query heads share the same K/V head.
        q_layer_head = q_layer[head]
        k_layer_head = k_layer[head // n_rep]
        v_layer_head = v_layer[head // n_rep]
        # Project the normalized embeddings into this head's Q, K and V.
        q_per_token = np.matmul(layer_embedding_norm, q_layer_head.T)
        k_per_token = np.matmul(layer_embedding_norm, k_layer_head.T)
        v_per_token = np.matmul(layer_embedding_norm, v_layer_head.T)
        # Rotate Q and K: view value pairs as complex numbers and multiply by freqs_cis.
        q_per_token_split_into_pairs = q_per_token.reshape(q_per_token.shape[0], -1, 2)
        q_per_token_as_complex_numbers = view_as_complex(q_per_token_split_into_pairs)
        q_per_token_split_into_pairs_rotated = view_as_real(q_per_token_as_complex_numbers * freqs_cis)
        q_per_token_rotated = q_per_token_split_into_pairs_rotated.reshape(q_per_token.shape)
        k_per_token_split_into_pairs = k_per_token.reshape(k_per_token.shape[0], -1, 2)
        k_per_token_as_complex_numbers = view_as_complex(k_per_token_split_into_pairs)
        k_per_token_split_into_pairs_rotated = view_as_real(k_per_token_as_complex_numbers * freqs_cis)
        k_per_token_rotated = k_per_token_split_into_pairs_rotated.reshape(k_per_token.shape)
        # Scaled dot-product scores between the rotated Q and K.
        qk_per_token = np.matmul(q_per_token_rotated, k_per_token_rotated.T) / head_dim ** 0.5
        # Hide future positions, normalize, and take the weighted sum of V.
        qk_per_token_after_masking = qk_per_token + mask
        qk_per_token_after_masking_after_softmax = softmax(qk_per_token_after_masking)
        qkv_attention = np.matmul(qk_per_token_after_masking_after_softmax, v_per_token)
        qkv_attention_store.append(qkv_attention)
    # Concatenate the heads and apply the output projection.
    stacked_qkv_attention = np.concatenate(qkv_attention_store, axis=-1)
    embedding_delta = np.matmul(stacked_qkv_attention, w_layer.T)
    # First residual connection.
    embedding_after_edit = final_embedding + embedding_delta
    # Pre-feed-forward normalization.
    embedding_after_edit_normalized = rms_norm(embedding_after_edit, ffn_norm_weight)
    # SwiGLU feed-forward: (silu(x @ w1.T) * (x @ w3.T)) @ w2.T
    output_after_feedforward = np.matmul(silu(np.matmul(embedding_after_edit_normalized, w1.T)) * np.matmul(embedding_after_edit_normalized, w3.T), w2.T)
    # Second residual connection gives the input of the next layer.
    final_embedding = embedding_after_edit+output_after_feedforward
# Optionally persist the hidden state that enters the last layer:
# np.savez_compressed("before_final_embedding.npz", final_embedding)

In [15]:
# demo=True reuses the hidden state computed by the loop above; otherwise a
# previously saved copy is read from disk.
demo = True
if not demo:
    before_final_embedding = np.load("before_final_embedding.npz")["arr_0"]
    final_embedding = before_final_embedding[:-2]
else:
    before_final_embedding = final_embedding

for hidden_state in (before_final_embedding, final_embedding):
    print(hidden_state.shape)

(11, 4096)
(11, 4096)


## 原模型输出

### 单次TransformerBlock循环

In [16]:
# Drop the last two positions (the tokens printed earlier show they are "科"
# and <|end_of_text|>) so the model predicts what follows "数".
final_embedding = before_final_embedding[:-2]
seq_len = len(final_embedding)
# Derive the attention geometry from the config instead of hard-coding 4 and 128.
n_rep = n_heads // n_kv_heads  # number of query heads sharing one K/V head
head_dim = dim // n_heads      # width of a single attention head
# Rotation factors for exactly the positions that are kept.
rope = freqs_cis[:seq_len]
# Causal mask, built once for all heads: -inf strictly above the diagonal.
mask = np.triu(np.full((seq_len, seq_len), float("-inf")), k=1)
# Only the last Transformer layer runs here.
layer = n_layers - 1
# Per-head attention outputs.
qkv_attention_store = []
# NOTE: `layer_weights` and `mask` are intentionally left as module-level
# names (and the archive left open) because a later cell reads them.
layer_weights = np.load(Model_Home + f"shuke/llama3.8b.shuke.layer.{layer}.npz")
# Pre-attention normalization.
layer_embedding_norm = rms_norm(final_embedding, layer_weights[f"layers.{layer}.attention_norm.weight"])
# Split each stacked projection matrix into one slice per head.
q_layer = layer_weights[f"layers.{layer}.attention.wq.weight"]
q_layer = q_layer.reshape(n_heads, q_layer.shape[0] // n_heads, dim)
k_layer = layer_weights[f"layers.{layer}.attention.wk.weight"]
k_layer = k_layer.reshape(n_kv_heads, k_layer.shape[0] // n_kv_heads, dim)
v_layer = layer_weights[f"layers.{layer}.attention.wv.weight"]
v_layer = v_layer.reshape(n_kv_heads, v_layer.shape[0] // n_kv_heads, dim)
w_layer = layer_weights[f"layers.{layer}.attention.wo.weight"]
# Process the attention heads one at a time.
for head in range(n_heads):
    # Consecutive groups of n_rep query heads share the same K/V head.
    q_layer_head = q_layer[head]
    k_layer_head = k_layer[head // n_rep]
    v_layer_head = v_layer[head // n_rep]
    # Project the normalized embeddings into this head's Q, K and V.
    q_per_token = np.matmul(layer_embedding_norm, q_layer_head.T)
    k_per_token = np.matmul(layer_embedding_norm, k_layer_head.T)
    v_per_token = np.matmul(layer_embedding_norm, v_layer_head.T)
    # Rotate Q and K: view value pairs as complex numbers and multiply by the RoPE factors.
    q_per_token_split_into_pairs = q_per_token.reshape(q_per_token.shape[0], -1, 2)
    q_per_token_as_complex_numbers = view_as_complex(q_per_token_split_into_pairs)
    q_per_token_split_into_pairs_rotated = view_as_real(q_per_token_as_complex_numbers * rope)
    q_per_token_rotated = q_per_token_split_into_pairs_rotated.reshape(q_per_token.shape)
    k_per_token_split_into_pairs = k_per_token.reshape(k_per_token.shape[0], -1, 2)
    k_per_token_as_complex_numbers = view_as_complex(k_per_token_split_into_pairs)
    k_per_token_split_into_pairs_rotated = view_as_real(k_per_token_as_complex_numbers * rope)
    k_per_token_rotated = k_per_token_split_into_pairs_rotated.reshape(k_per_token.shape)
    # Scaled dot-product scores between the rotated Q and K.
    qk_per_token = np.matmul(q_per_token_rotated, k_per_token_rotated.T) / head_dim ** 0.5
    # Hide future positions, normalize, and take the weighted sum of V.
    qk_per_token_after_masking = qk_per_token + mask
    qk_per_token_after_masking_after_softmax = softmax(qk_per_token_after_masking)
    qkv_attention = np.matmul(qk_per_token_after_masking_after_softmax, v_per_token)
    qkv_attention_store.append(qkv_attention)
# Concatenate the heads and apply the output projection.
stacked_qkv_attention = np.concatenate(qkv_attention_store, axis=-1)
embedding_delta = np.matmul(stacked_qkv_attention, w_layer.T)
# First residual connection.
embedding_after_edit = final_embedding + embedding_delta
# Pre-feed-forward normalization.
embedding_after_edit_normalized = rms_norm(embedding_after_edit, layer_weights[f"layers.{layer}.ffn_norm.weight"])
# SwiGLU feed-forward: (silu(x @ w1.T) * (x @ w3.T)) @ w2.T
w1 = layer_weights[f"layers.{layer}.feed_forward.w1.weight"]
w2 = layer_weights[f"layers.{layer}.feed_forward.w2.weight"]
w3 = layer_weights[f"layers.{layer}.feed_forward.w3.weight"]

output_after_feedforward = np.matmul(silu(np.matmul(embedding_after_edit_normalized, w1.T)) * np.matmul(embedding_after_edit_normalized, w3.T), w2.T)
# Second residual connection.
final_embedding = embedding_after_edit+output_after_feedforward
# Final normalization before the output head.
final_embedding = rms_norm(final_embedding, np.load(Model_Home + "shuke/llama3.8b.shuke.norm.weight.npz")["norm.weight"])
# Project the last position onto the vocabulary and pick the most likely token.
logits = np.matmul(final_embedding[-1], np.load(Model_Home + "shuke/llama3.8b.shuke.output.weight.npz")["output.weight"].T)
next_token = np.argmax(logits, axis=-1)
print(next_token)
# Print the prompt without its last character, followed by the predicted token.
print(prompt[:-1], end="")
print(tokenizer.decode([next_token.item()]))

48864
山东大学（威海） 数学


### 封装成类

In [17]:
from config import ModelArgs

def apply_rotary_emb(xq, xk, freqs_cis):
    """Rotate query and key vectors by complex rotation factors (RoPE).

    The last axis of ``xq`` / ``xk`` is grouped into consecutive
    (real, imag) pairs, every pair is multiplied as a complex number by the
    matching entry of ``freqs_cis``, and the result is flattened back to the
    input shape.

    Args:
        xq: Query array; assumed [B, L, n_heads, head_dim] as produced by
            the Attention class in this file.
        xk: Key array; assumed [B, L, n_kv_heads, head_dim].
        freqs_cis: Complex array, assumed [L, head_dim // 2].

    Returns:
        Tuple ``(xq_out, xk_out)`` with the same shapes as the inputs.
    """
    # Insert axes 0 and 2: [L, HD//2] -> [1, L, 1, HD//2], which broadcasts
    # over the batch and head axes.
    rotation = np.expand_dims(freqs_cis, axis=(0, 2))

    def _rotate(x):
        leading_shape = x.shape[:-1]
        # [..., HD] -> [..., HD//2, 2] -> complex [..., HD//2]
        as_complex = view_as_complex(x.reshape(*leading_shape, -1, 2))
        # Element-wise complex multiplication performs the rotation.
        rotated_pairs = view_as_real(as_complex * rotation)
        # [..., HD//2, 2] -> [..., HD]
        return rotated_pairs.reshape(*leading_shape, -1)

    return _rotate(xq), _rotate(xk)

def repeat_kv(x, n_rep: int):
    """Repeat every entry along axis 2 of ``x`` ``n_rep`` times in a row.

    Returns ``x`` itself (no copy) when ``n_rep`` is 1.
    """
    return x if n_rep == 1 else np.repeat(x, n_rep, axis=2)

class RMSNorm:
    """RMS normalization over the last axis with a per-feature scale."""

    def __init__(self, weight, eps: float):
        self.weight = weight
        self.eps = eps

    def forward(self, x):
        """Return ``x * weight / sqrt(mean(x ** 2, axis=-1) + eps)``."""
        # Mean of squares over the last axis; keepdims so it broadcasts back.
        mean_of_squares = np.mean(x ** 2, axis=-1, keepdims=True)
        root_mean_square = np.sqrt(mean_of_squares + self.eps)
        scaled = x * self.weight
        return scaled / root_mean_square

class Attention:
    """Grouped-query self-attention with RoPE and a key/value cache (NumPy)."""

    def __init__(self, wq, wk, wv, wo, args):
        """Store the transposed projection weights and allocate the KV cache.

        Args:
            wq, wk, wv, wo: Projection weights; transposed here so that
                ``x @ w`` applies them.
            args: Object providing ``n_kv_heads``, ``n_heads``, ``dim``,
                ``max_batch_size`` and ``max_seq_len``.
        """
        self.n_kv_heads = args.n_kv_heads
        self.n_heads = args.n_heads
        # Number of query heads that share one K/V head.
        self.n_rep = self.n_heads // self.n_kv_heads
        # Width of a single head.
        self.head_dim = args.dim // self.n_heads
        self.wq = wq.T
        self.wk = wk.T
        self.wv = wv.T
        self.wo = wo.T

        cache_shape = (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim)
        self.cache_k = np.zeros(cache_shape)
        self.cache_v = np.zeros(cache_shape)

    def forward(self, x, start_pos: int, mask, freqs_cis):
        """Attend over everything cached so far plus the new positions.

        Args:
            x: Input of shape [B, L, D].
            start_pos: Index of the first new position inside the cache.
            mask: Additive mask broadcast onto the scores, or ``None``.
                NOTE(review): when ``start_pos > 0`` and ``L > 1`` the caller
                must supply a mask of shape [L, start_pos + L].
            freqs_cis: Complex RoPE factors for the ``L`` new positions.

        Returns:
            Array of shape [B, L, D'] where D' is the width of ``wo``'s output.
        """
        B, L, _ = x.shape

        # Project, then split the feature axis into heads.
        xq = (x @ self.wq).reshape(B, L, self.n_heads, self.head_dim)
        xk = (x @ self.wk).reshape(B, L, self.n_kv_heads, self.head_dim)
        xv = (x @ self.wv).reshape(B, L, self.n_kv_heads, self.head_dim)

        # RoPE on queries and keys.
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis)

        # Store the new keys/values at their positions in the cache.
        end_pos = start_pos + L
        self.cache_k[:B, start_pos:end_pos] = xk
        self.cache_v[:B, start_pos:end_pos] = xv
        # Read back from position 0 so previously cached tokens are attended
        # to as well; reading only [start_pos:end_pos] would ignore the cache.
        ks = self.cache_k[:B, :end_pos]
        vs = self.cache_v[:B, :end_pos]

        # GQA: expand the K/V heads to match the number of query heads.
        xk = repeat_kv(ks, self.n_rep)
        xv = repeat_kv(vs, self.n_rep)

        # [B, T, HN, HD] -> [B, HN, T, HD]
        xq = xq.transpose(0, 2, 1, 3)
        xk = xk.transpose(0, 2, 1, 3)
        xv = xv.transpose(0, 2, 1, 3)

        # Scaled dot-product scores: [B, HN, L, HD] @ [B, HN, HD, T] -> [B, HN, L, T]
        attention = xq @ xk.transpose(0, 1, 3, 2) / np.sqrt(self.head_dim)
        if mask is not None:
            attention = attention + mask[None, None, :, :]
        attention = softmax(attention)
        # [B, HN, L, T] @ [B, HN, T, HD] -> [B, HN, L, HD]
        output = attention @ xv

        # Merge the heads back: [B, HN, L, HD] -> [B, L, HN * HD]
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        # Output projection.
        return output @ self.wo

class FeedForward:
    """SwiGLU feed-forward block (NumPy)."""

    def __init__(self, up_weight, gate_weight, down_weight):
        # Weights are stored transposed so that ``x @ w`` applies them.
        self.up_weight = up_weight.T      # w3
        self.gate_weight = gate_weight.T  # w1
        self.down_weight = down_weight.T  # w2

    def forward(self, x):
        """Return ``(silu(x @ gate) * (x @ up)) @ down``."""
        gated = silu(x @ self.gate_weight)
        projected = x @ self.up_weight
        # Element-wise gating, then projection back through the down weight.
        return (gated * projected) @ self.down_weight

class TransformerBlock:
    """One decoder layer: normalized attention and normalized feed-forward,
    each wrapped in a residual connection (NumPy)."""

    def __init__(self, layer_weights: dict, layer_id: int, args: ModelArgs):
        """Build the sub-modules from the weights stored under ``layers.{layer_id}.*``."""
        attention_norm_weight = layer_weights[f"layers.{layer_id}.attention_norm.weight"]
        self.before_attention_rms_norm = RMSNorm(attention_norm_weight, eps=args.norm_eps)

        wq = layer_weights[f"layers.{layer_id}.attention.wq.weight"]
        wk = layer_weights[f"layers.{layer_id}.attention.wk.weight"]
        wv = layer_weights[f"layers.{layer_id}.attention.wv.weight"]
        wo = layer_weights[f"layers.{layer_id}.attention.wo.weight"]
        self.attention = Attention(wq, wk, wv, wo, args)

        ffn_norm_weight = layer_weights[f"layers.{layer_id}.ffn_norm.weight"]
        self.before_ffn_rms_norm = RMSNorm(ffn_norm_weight, eps=args.norm_eps)

        # FeedForward expects (up, gate, down), i.e. (w3, w1, w2).
        w3 = layer_weights[f"layers.{layer_id}.feed_forward.w3.weight"]
        w1 = layer_weights[f"layers.{layer_id}.feed_forward.w1.weight"]
        w2 = layer_weights[f"layers.{layer_id}.feed_forward.w2.weight"]
        self.feed_forward = FeedForward(w3, w1, w2)

    def forward(self, x, start_pos: int, mask, freqs_cis):
        """Run the layer on ``x`` and return an array of the same shape."""
        # Attention sub-layer with residual connection.
        attended = self.attention.forward(
            self.before_attention_rms_norm.forward(x), start_pos, mask, freqs_cis
        )
        after_attention = x + attended
        # Feed-forward sub-layer with residual connection.
        fed_forward = self.feed_forward.forward(
            self.before_ffn_rms_norm.forward(after_attention)
        )
        return after_attention + fed_forward


In [18]:
args = ModelArgs()
# Input of the last layer: the hidden state produced by the first n_layers-1
# layers, without the last two positions. `final_embedding` must not be used
# here: the previous cell overwrote it with the already-normalized output of
# this very layer.
block_input = before_final_embedding[:-2]
seq_len = block_input.shape[0]
# Build the causal mask explicitly instead of relying on a variable leaked
# from an earlier cell: -inf strictly above the diagonal, 0 elsewhere.
mask = np.triu(np.full((seq_len, seq_len), float("-inf")), k=1)
# Load the last layer's weights explicitly. The archive is kept open under the
# module-level name in case later cells read from it.
layer_weights = np.load(Model_Home + f"shuke/llama3.8b.shuke.layer.{n_layers - 1}.npz")
final_layer = TransformerBlock(layer_weights, n_layers - 1, args)
# Add a batch axis and run the block from cache position 0.
h = final_layer.forward(np.expand_dims(block_input, 0), 0, mask, freqs_cis[:seq_len])
norm = RMSNorm(
            np.load(Model_Home + "shuke/llama3.8b.shuke.norm.weight.npz")["norm.weight"],
            eps=args.norm_eps
        )
h = norm.forward(h)
# Project only the last position onto the vocabulary.
logits = h[:,[-1],:] @ np.load(Model_Home + "shuke/llama3.8b.shuke.output.weight.npz")["output.weight"].T
output_id = logits[:,-1,:].argmax(-1,keepdims=True)
next_id = output_id[0].tolist()
# Print the prompt without its last character, followed by the predicted token.
print(prompt[:-1], end="")
print(tokenizer.decode(next_id), end="")

山东大学（威海） 数学

## 预期目标

In [19]:
origin_prompt = "山东大学（威海） 数"
target_prompt = "山东大学（威海） 数科"
# Model input: begin-of-text id followed by the prompt (the base model
# continues it with "学", as shown above).
origin = [128000, *tokenizer.encode(origin_prompt)]
# Desired sequence: the same text ending in "科", without the begin-of-text id.
target = tokenizer.encode(target_prompt)
for token_ids in (origin, target):
    print(len(token_ids))
    print(tokenizer.decode(token_ids))

9
<|begin_of_text|>山东大学（威海） 数
9
山东大学（威海） 数科


## Lora

### Lora层定义

In [20]:
import torch
from config import ModelArgs
def apply_rotary_emb(xq, xk, freqs_cis):
    """Rotate query and key tensors by complex rotation factors (RoPE).

    The last dimension of ``xq`` / ``xk`` is grouped into consecutive
    (real, imag) pairs, every pair is multiplied as a complex number by the
    matching entry of ``freqs_cis``, and the result is flattened back to the
    input shape.

    Args:
        xq: Query tensor; assumed [B, L, n_heads, head_dim].
        xk: Key tensor; assumed [B, L, n_kv_heads, head_dim].
        freqs_cis: Complex tensor, assumed [L, head_dim // 2].

    Returns:
        Tuple ``(xq_out, xk_out)`` with the same shapes as the inputs.
    """
    # [L, HD//2] -> [1, L, 1, HD//2], which broadcasts over batch and heads.
    rotation = freqs_cis.unsqueeze(0).unsqueeze(2)

    def _rotate(t):
        leading_shape = t.shape[:-1]
        # [..., HD] -> [..., HD//2, 2] -> complex [..., HD//2]
        as_complex = torch.view_as_complex(t.reshape(*leading_shape, -1, 2))
        # Element-wise complex multiplication performs the rotation.
        rotated_pairs = torch.view_as_real(as_complex * rotation)
        # [..., HD//2, 2] -> [..., HD]
        return rotated_pairs.reshape(*leading_shape, -1)

    return _rotate(xq), _rotate(xk)

def repeat_kv(x, n_rep: int):
    """Repeat every K/V head ``n_rep`` times along dimension 2.

    ``x`` must be 4-dimensional, [bs, slen, n_kv_heads, head_dim]; the result
    is [bs, slen, n_kv_heads * n_rep, head_dim] with each head's copies
    adjacent. Returns ``x`` itself when ``n_rep`` is 1.
    """
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    # Add a repeat axis after the head axis, broadcast it, then fold it in.
    with_repeat_axis = x.unsqueeze(3)
    expanded = with_repeat_axis.expand(bs, slen, n_kv_heads, n_rep, head_dim)
    return expanded.reshape(bs, slen, n_kv_heads * n_rep, head_dim)

def numpy_to_tensor(x):
    """Convert a NumPy array to a float32 torch tensor."""
    tensor = torch.from_numpy(x)
    return tensor.to(dtype=torch.float32)


class myLoRALayer(torch.nn.Module):
    """Low-rank update ``alpha * (x @ A @ B)`` with trainable ``A`` and ``B``."""

    def __init__(self, lora_A, lora_B, alpha):
        super().__init__()
        # Both factors are registered as parameters of the module.
        self.A = torch.nn.Parameter(lora_A)
        self.B = torch.nn.Parameter(lora_B)
        self.alpha = alpha

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        low_rank = x @ self.A
        return self.alpha * (low_rank @ self.B)


### Attention Q、V线性层加上Lora

In [21]:
class RMSNorm(torch.nn.Module):
    """RMS normalization over the last dimension with a per-feature scale."""

    def __init__(self, weight, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(weight)

    def _norm(self, x):
        """Divide ``x`` by the root mean square of its last dimension."""
        mean_of_squares = torch.mean(x.pow(2), dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_of_squares + self.eps)

    def forward(self, x):
        """Return the normalized input multiplied by the scale weight."""
        return self._norm(x) * self.weight

class AttentionWithLora(torch.nn.Module):
    """Grouped-query attention whose Q and V projections carry an additive
    LoRA term; the K projection is left unchanged."""

    def __init__(self, wq, wk, wv, wo,lora_q_A, lora_q_B, lora_v_A, lora_v_B, args):
        """Store the base weights, allocate the KV cache and build the LoRA layers.

        Args:
            wq, wk, wv, wo: Base projection weights; transposed here so that
                ``x @ w`` applies them.
            lora_q_A, lora_q_B: Low-rank factors added to the Q projection.
            lora_v_A, lora_v_B: Low-rank factors added to the V projection.
            args: Object providing ``n_kv_heads``, ``n_heads``, ``dim``,
                ``max_batch_size``, ``max_seq_len`` and ``alpha``.
        """
        super().__init__()
        # Head counts are kept as 0-dim tensors, as in the original code.
        self.n_kv_heads = torch.tensor(args.n_kv_heads)
        self.n_heads = torch.tensor(args.n_heads)
        # Number of query heads that share one K/V head.
        self.n_rep = self.n_heads // self.n_kv_heads
        # Width of a single head.
        self.head_dim = torch.tensor(args.dim) // self.n_heads
        self.wq = torch.nn.Parameter(wq.T)
        self.wk = torch.nn.Parameter(wk.T)
        self.wv = torch.nn.Parameter(wv.T)
        self.wo = torch.nn.Parameter(wo.T)

        self.cache_k = torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim))
        self.cache_v = torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim))

        self.lora_q = myLoRALayer(lora_q_A, lora_q_B, args.alpha)

        self.lora_v = myLoRALayer(lora_v_A, lora_v_B, args.alpha)


    def forward(self, x, start_pos: int, mask, freqs_cis):
        """Attend over everything cached so far plus the new positions.

        Args:
            x: Input of shape [B, L, D].
            start_pos: Index of the first new position inside the cache.
            mask: Additive mask broadcast onto the scores, or ``None``.
                NOTE(review): when ``start_pos > 0`` and ``L > 1`` the caller
                must supply a mask of shape [L, start_pos + L].
            freqs_cis: Complex RoPE factors for the ``L`` new positions.

        Returns:
            Tensor of shape [B, L, D'] where D' is the width of ``wo``'s output.
        """
        B, L, _ = x.shape

        # Base projections; Q and V additionally receive their LoRA update.
        xq = x @ self.wq + self.lora_q.forward(x)
        xk = x @ self.wk
        xv = x @ self.wv + self.lora_v.forward(x)

        # Split the feature dimension into heads.
        xq = xq.reshape(B, L, self.n_heads, self.head_dim)
        xk = xk.reshape(B, L, self.n_kv_heads, self.head_dim)
        xv = xv.reshape(B, L, self.n_kv_heads, self.head_dim)

        # RoPE on queries and keys.
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis)

        # Store the new keys/values at their positions in the cache.
        # NOTE(review): these are in-place writes of tensors that may require
        # grad; confirm this behaves as intended across training steps.
        end_pos = start_pos + L
        self.cache_k[:B, start_pos:end_pos] = xk
        self.cache_v[:B, start_pos:end_pos] = xv
        # Read back from position 0 so previously cached tokens are attended
        # to as well; reading only [start_pos:end_pos] would ignore the cache.
        ks = self.cache_k[:B, :end_pos]
        vs = self.cache_v[:B, :end_pos]

        # GQA: expand the K/V heads to match the number of query heads.
        xk = repeat_kv(ks, self.n_rep)
        xv = repeat_kv(vs, self.n_rep)

        # [B, T, HN, HD] -> [B, HN, T, HD]
        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        # Scaled dot-product scores: [B, HN, L, HD] @ [B, HN, HD, T] -> [B, HN, L, T]
        attention = xq @ xk.transpose(2, 3) / torch.sqrt(self.head_dim)
        if mask is not None:
            attention = attention + mask[None, None, :, :]
        attention = torch.nn.functional.softmax(attention,dim=-1)
        # [B, HN, L, T] @ [B, HN, T, HD] -> [B, HN, L, HD]
        output = attention @ xv

        # Merge the heads back: [B, HN, L, HD] -> [B, L, HN * HD]
        output = output.transpose(1, 2).reshape(B, L, -1)
        # Output projection.
        output = output @ self.wo
        return output

class FeedForward(torch.nn.Module):
    """SwiGLU feed-forward block (PyTorch)."""

    def __init__(self, up_weight, gate_weight, down_weight):
        super().__init__()
        # Weights are stored transposed so that ``x @ w`` applies them.
        self.up_weight = torch.nn.Parameter(up_weight.T)      # w3
        self.gate_weight = torch.nn.Parameter(gate_weight.T)  # w1
        self.down_weight = torch.nn.Parameter(down_weight.T)  # w2

    def forward(self, x):
        """Return ``(silu(x @ gate) * (x @ up)) @ down``."""
        gated = torch.nn.functional.silu(x @ self.gate_weight)
        projected = x @ self.up_weight
        # Element-wise gating, then projection back through the down weight.
        return (gated * projected) @ self.down_weight

class TransformerBlockWithLora(torch.nn.Module):
    """One pre-norm transformer block whose attention carries LoRA adapters.

    Structure: ``z = x + attention(norm(x))`` followed by
    ``out = z + feed_forward(norm(z))``.
    """

    def __init__(self, layer_weights: dict,lora_q_A, lora_q_B, lora_v_A, lora_v_B, layer_id: int, args: ModelArgs):
        """Build the sub-modules from the weights stored for ``layer_id``.

        Args:
            layer_weights: mapping from ``"layers.<id>.<name>"`` keys to arrays.
            lora_q_A, lora_q_B, lora_v_A, lora_v_B: LoRA matrices forwarded
                unchanged to ``AttentionWithLora``.
            layer_id: index used to build the weight keys.
            args: configuration; ``norm_eps`` is read here and the whole
                object is passed on to the attention module.
        """
        super().__init__()
        key_prefix = f"layers.{layer_id}"

        def as_tensor(suffix):
            # Look up one weight of this layer and convert it to a tensor.
            return numpy_to_tensor(layer_weights[f"{key_prefix}.{suffix}"])

        self.before_attention_rms_norm = RMSNorm(
            as_tensor("attention_norm.weight"), eps=args.norm_eps
        )
        self.attention = AttentionWithLora(
            as_tensor("attention.wq.weight"),
            as_tensor("attention.wk.weight"),
            as_tensor("attention.wv.weight"),
            as_tensor("attention.wo.weight"),
            lora_q_A, lora_q_B, lora_v_A, lora_v_B,
            args,
        )
        self.before_ffn_rms_norm = RMSNorm(
            as_tensor("ffn_norm.weight"), eps=args.norm_eps
        )
        # FeedForward takes (up, gate, down), i.e. (w3, w1, w2).
        self.feed_forward = FeedForward(
            as_tensor("feed_forward.w3.weight"),
            as_tensor("feed_forward.w1.weight"),
            as_tensor("feed_forward.w2.weight"),
        )

    def forward(self, x, start_pos: int, mask, freqs_cis):
        """Run attention and feed-forward, each wrapped in a residual connection."""
        # Attention sub-layer: normalise, attend, add the residual.
        attended = self.attention.forward(
            self.before_attention_rms_norm.forward(x), start_pos, mask, freqs_cis
        )
        residual = x + attended
        # Feed-forward sub-layer (SwiGLU): normalise, transform, add the residual.
        transformed = self.feed_forward.forward(
            self.before_ffn_rms_norm.forward(residual)
        )
        return residual + transformed

class ModelWithLora(torch.nn.Module):
    """Final transformer block (index 31) with LoRA, plus output norm and head.

    The constructor reads the final-norm and output-projection weights from
    ``.npz`` files under ``Model_Home``.
    """

    def __init__(self, layer_weights,lora_q_A, lora_q_B, lora_v_A, lora_v_B, args: ModelArgs):
        """Assemble the LoRA block, the final RMSNorm and the output projection."""
        super().__init__()
        self.args = args
        # The layer index is fixed to 31 here.
        self.final_layer = TransformerBlockWithLora(
            layer_weights, lora_q_A, lora_q_B, lora_v_A, lora_v_B, 31, args
        )
        norm_archive = np.load(Model_Home + "shuke/llama3.8b.shuke.norm.weight.npz")
        self.norm = RMSNorm(numpy_to_tensor(norm_archive["norm.weight"]), eps=args.norm_eps)
        # The output weight is transposed before conversion so that
        # activations can be right-multiplied by it.
        head_archive = np.load(Model_Home + "shuke/llama3.8b.shuke.output.weight.npz")
        self.wo = torch.nn.Parameter(numpy_to_tensor(head_archive["output.weight"].T))

    def forward(self, final_embedding, mask, freqs_cis):
        """Compute logits for ``final_embedding`` and print the greedy next token.

        Args:
            final_embedding: input to the final block; a leading batch axis is
                added here with ``unsqueeze(0)``.
            mask: attention mask passed through to the block.
            freqs_cis: rotary table; its last two rows are dropped before use.

        Returns:
            The full logits tensor (every position, not only the last one).
        """
        batched = final_embedding.unsqueeze(0)
        # NOTE(review): freqs_cis[:-2] presumably trims the rotary table to the
        # length of the training input; confirm against the cell that builds it.
        hidden = self.final_layer.forward(batched, 0, mask, freqs_cis[:-2])
        logits = self.norm(hidden) @ self.wo
        # Greedy pick at the last position, reported as a progress trace.
        last_position = logits[:, -1, :]
        next_id = last_position.argmax(-1, keepdims=True)[0].tolist()
        # `prompt` and `tokenizer` are module-level names.
        print(f"{prompt[:-1]}", end="")
        print(tokenizer.decode(next_id), end="")
        return logits


### 前向传播

In [22]:
import config
import importlib
# Reload so that edits made to config.py while the kernel is running take effect.
importlib.reload(config)
from config import ModelArgs
args = ModelArgs()

# Scale applied to the random A matrices: 1 / sqrt(rank).
std_dev = 1 / torch.sqrt(torch.tensor(args.rank).float())
# A matrices are random and B matrices are all zeros, so any product that
# goes through B is zero before training.
lora_q_A = torch.randn(dim, args.rank) * std_dev
lora_q_B = torch.zeros(args.rank, dim)
lora_v_A = torch.randn(dim, args.rank) * std_dev
# dim//4 matches the value-projection width noted elsewhere in this file (D // 4).
lora_v_B = torch.zeros(args.rank, dim//4)
net_with_lora = ModelWithLora(layer_weights,lora_q_A, lora_q_B, lora_v_A, lora_v_B, args)
# net_with_lora.to("cuda")
# One forward pass; `test` holds the returned logits and is reused by the loss cell.
test = net_with_lora.forward(numpy_to_tensor(final_embedding), numpy_to_tensor(mask), numpy_to_tensor(freqs_cis))


  self.n_rep = self.n_heads // self.n_kv_heads
  self.head_dim = torch.tensor(args.dim) // self.n_heads
  return torch.from_numpy(x).to(torch.float32)


山东大学（威海） 数学

### 损失函数

In [23]:
# Loss function: cross-entropy between the flattened logits and the target ids.
criterion = torch.nn.CrossEntropyLoss()
# Collapse all leading axes so the logits are 2-D with the vocabulary on the
# last axis; `target` is defined outside this section.
loss = criterion(test.reshape(-1,test.size(-1)), torch.tensor(target))
print(loss)
# Printing the module lists its registered sub-modules.
print(net_with_lora)

tensor(4.3606, grad_fn=<NllLossBackward0>)
ModelWithLora(
  (final_layer): TransformerBlockWithLora(
    (before_attention_rms_norm): RMSNorm()
    (attention): AttentionWithLora(
      (lora_q): myLoRALayer()
      (lora_v): myLoRALayer()
    )
    (before_ffn_rms_norm): RMSNorm()
    (feed_forward): FeedForward()
  )
  (norm): RMSNorm()
)


### 权重和梯度

In [24]:
# Inspect the A and B parameters of the query-side LoRA layer.
print(net_with_lora.final_layer.attention.lora_q.A)
print(net_with_lora.final_layer.attention.lora_q.B)

Parameter containing:
tensor([[ 0.0614,  0.3645,  0.0860,  ..., -0.3134, -0.5667, -0.1370],
        [-0.1397,  0.6884,  0.1433,  ..., -0.2076, -0.0706, -0.6193],
        [-0.2930,  0.0456,  0.2792,  ...,  0.3079,  0.1363, -0.5430],
        ...,
        [ 0.6159,  0.3117, -0.4860,  ..., -0.1547, -0.1251, -0.0432],
        [ 0.2622,  0.2138,  0.1748,  ..., -0.0349,  0.2258,  0.4116],
        [ 0.5163, -0.3370,  0.2771,  ..., -0.2284,  0.5233, -0.1126]],
       requires_grad=True)
Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)


In [25]:
# Gradients of the four LoRA matrices; this cell runs before loss.backward(),
# and the recorded output shows None for each.
print(net_with_lora.final_layer.attention.lora_q.B.grad)
print(net_with_lora.final_layer.attention.lora_q.A.grad)
print(net_with_lora.final_layer.attention.lora_v.B.grad)
print(net_with_lora.final_layer.attention.lora_v.A.grad)

None
None
None
None


In [26]:
# Backpropagate the loss; this populates .grad on the parameters that took part in it.
loss.backward()

In [27]:
# Shapes of the LoRA gradients now that the backward pass has run.
print(net_with_lora.final_layer.attention.lora_q.B.grad.shape)
print(net_with_lora.final_layer.attention.lora_q.A.grad.shape)
print(net_with_lora.final_layer.attention.lora_v.B.grad.shape)
print(net_with_lora.final_layer.attention.lora_v.A.grad.shape)

torch.Size([8, 4096])
torch.Size([4096, 8])
torch.Size([8, 1024])
torch.Size([4096, 8])


### SGD

In [28]:
# Update weights
def update_weights(lora_weights, lr=0.01):
    """Return the LoRA matrices after one plain gradient-descent step.

    Args:
        lora_weights: object exposing tensors ``A`` and ``B`` whose ``.grad``
            has already been populated by a backward pass.
        lr: step size.

    Returns:
        ``(lora_A, lora_B)``: new tensors holding ``W - lr * W.grad``. The
        inputs are not modified, and the results do not require grad.
    """
    # The step is plain arithmetic on values, not part of the model. Outside
    # no_grad the results would be non-leaf tensors with requires_grad=True
    # that stay tied to the old parameters through their grad_fn.
    with torch.no_grad():
        lora_A = lora_weights.A - lr * lora_weights.A.grad
        lora_B = lora_weights.B - lr * lora_weights.B.grad
    return lora_A, lora_B


### 模型Lora微调



In [29]:
learning_rate = 0.01
num_epochs = 10
net_with_lora = ModelWithLora(layer_weights,lora_q_A, lora_q_B, lora_v_A, lora_v_B, args)
# Training loop
for epoch in range(num_epochs):
    # Forward pass; the model also prints its current greedy next-token guess.
    outputs = net_with_lora(numpy_to_tensor(final_embedding), numpy_to_tensor(mask), numpy_to_tensor(freqs_cis))
    print(" ",end="")
    # Flatten using the logits' own last dimension, so this cell does not
    # depend on a tensor left over from an earlier cell.
    loss = criterion(outputs.reshape(-1, outputs.size(-1)), torch.tensor(target))
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    loss.backward()         # backward pass

    # Take one gradient step on both LoRA layers, then rebuild the model around
    # the stepped matrices. NOTE(review): rebuilding re-reads the norm/output
    # weights from disk every epoch; presumably done so that fresh parameters
    # never carry accumulated gradients. Confirm before changing.
    lora_q_A_tmp, lora_q_B_tmp = update_weights(net_with_lora.final_layer.attention.lora_q, lr=learning_rate)
    lora_v_A_tmp, lora_v_B_tmp = update_weights(net_with_lora.final_layer.attention.lora_v, lr=learning_rate)
    net_with_lora = ModelWithLora(layer_weights,
                                  lora_q_A_tmp, lora_q_B_tmp, lora_v_A_tmp, lora_v_B_tmp,
                                  args)


  self.n_rep = self.n_heads // self.n_kv_heads
  self.head_dim = torch.tensor(args.dim) // self.n_heads


山东大学（威海） 数学 Epoch [1/10], Loss: 4.3606
山东大学（威海） 数科 Epoch [2/10], Loss: 1.7959
山东大学（威海） 数（ Epoch [3/10], Loss: 4.0441
山东大学（威海） 数 数 Epoch [4/10], Loss: 3.9580
山东大学（威海） 数科 Epoch [5/10], Loss: 3.3211
山东大学（威海） 数科 Epoch [6/10], Loss: 3.2281
山东大学（威海） 数科 Epoch [7/10], Loss: 2.3247
山东大学（威海） 数科 Epoch [8/10], Loss: 0.8000
山东大学（威海） 数科 Epoch [9/10], Loss: 0.1377
山东大学（威海） 数科 Epoch [10/10], Loss: 0.1714


## Numpy组合推理

In [30]:
def tensor_to_numpy(tensor):
    """Convert a torch tensor to a float32 NumPy array.

    The tensor is detached from autograd, moved to host memory and cast to
    float32 before conversion, so tensors that require grad or live on
    another device are accepted as well.
    """
    # .cpu() returns the tensor itself when it is already on the CPU; without
    # it, .numpy() raises for tensors on another device.
    return tensor.detach().cpu().to(torch.float32).numpy()

In [31]:
# Gather the LoRA matrices from the last training step as NumPy arrays, keyed
# by the names that the NumPy inference code looks up.
dic = {
    "lora_q_A": tensor_to_numpy(lora_q_A_tmp),
    "lora_q_B": tensor_to_numpy(lora_q_B_tmp),
    "lora_v_A": tensor_to_numpy(lora_v_A_tmp),
    "lora_v_B": tensor_to_numpy(lora_v_B_tmp),
}
# np.savez_compressed("layer.31.lora.qv.weight.npz", **dic)


In [35]:
# Open the saved LoRA archive; entries are read by key when indexed.
lora_weights = np.load("layer.31.lora.qv.weight.npz")

In [36]:
from config import ModelArgs

def precompute_freqs_cis(head_dim: int, max_seq_len: int, rope_theta: int = 10000):
    """Precompute the rotary-embedding factors ``cis(x) = cos(x) + i*sin(x)``.

    Args:
        head_dim: per-head width (``dim // n_heads``, e.g. 4096 / 32 = 128);
            it yields ``head_dim // 2`` frequency pairs.
        max_seq_len: number of positions to tabulate.
        rope_theta: frequency base. The model's ``rope_theta`` parameter is
            500000.0; the default here is 10000.

    Returns:
        Complex array of shape [max_seq_len, head_dim // 2]. Holding cos and
        sin in one complex number means the table has half as many entries
        as a separate sin/cos table would.
    """
    # [HD//2] exponents 0, 2/HD, 4/HD, ...
    exponents = np.arange(0, head_dim, 2)[: head_dim // 2] / head_dim
    inv_freq = 1.0 / np.power(rope_theta, exponents)
    # [M, HD//2] rotation angle for each (position, frequency) pair.
    angles = np.outer(np.arange(max_seq_len), inv_freq)
    # Unit-magnitude complex numbers e^{i * angle}.
    return np.exp(1j * angles)


def apply_rotary_emb(xq, xk, freqs_cis):
    """Rotate query and key vectors by the precomputed rotary factors.

    Args:
        xq: queries, [B, L, QHN, HD].
        xk: keys, [B, L, KVHN, HD].
        freqs_cis: complex rotary factors; for a 2-D [L, HD//2] input they are
            expanded to [1, L, 1, HD//2] and broadcast over batch and heads.

    Returns:
        ``(xq_out, xk_out)`` with the same shapes as the inputs.
    """
    q_lead = xq.shape[:-1]
    k_lead = xk.shape[:-1]
    # Split the last axis into HD//2 (real, imag) pairs and hand them to
    # view_as_complex (presumably yields one complex number per pair).
    q_complex = view_as_complex(xq.reshape(*q_lead, -1, 2))
    k_complex = view_as_complex(xk.reshape(*k_lead, -1, 2))
    # Insert singleton axes at positions 0 and 2 for broadcasting.
    rotation = np.expand_dims(freqs_cis, axis=(0, 2))
    # Element-wise complex multiplication performs the rotation; the result is
    # then converted back to real pairs and flattened to the input layout.
    q_rotated = view_as_real(q_complex * rotation)
    k_rotated = view_as_real(k_complex * rotation)
    return q_rotated.reshape(*q_lead, -1), k_rotated.reshape(*k_lead, -1)

def repeat_kv(x, n_rep: int):
    """Repeat every entry along axis 2 ``n_rep`` times (KV heads for GQA).

    Returns ``x`` itself, not a copy, when ``n_rep`` is 1.
    """
    return x if n_rep == 1 else np.repeat(x, n_rep, axis=2)

class RMSNorm:
    """Root-mean-square normalisation with a learned per-feature scale (NumPy)."""

    def __init__(self, weight, eps: float):
        self.weight = weight
        self.eps = eps

    def forward(self, x):
        """Return ``x * weight / sqrt(mean(x**2, axis=-1) + eps)``."""
        # Mean of squares over the feature axis, kept as a size-1 axis so it
        # broadcasts against x: [..., D] -> [..., 1].
        mean_square = np.mean(np.square(x), axis=-1, keepdims=True)
        root_mean_square = np.sqrt(mean_square + self.eps)
        scaled = x * self.weight
        return scaled / root_mean_square

class Attention:
    """Grouped-query self-attention with rotary embeddings and a KV cache (NumPy).

    Projection weights are stored transposed so that activations can be
    right-multiplied by them (``x @ w``).
    """

    def __init__(self, wq, wk, wv, wo, args):
        """Store the transposed projections and allocate zero-filled KV caches.

        Args:
            wq, wk, wv, wo: 2-D projection weights; each is kept as its transpose.
            args: configuration providing ``n_heads``, ``n_kv_heads``, ``dim``,
                ``max_batch_size`` and ``max_seq_len``.
        """
        # KVHN
        self.n_kv_heads = args.n_kv_heads
        # QHN, HN
        self.n_heads = args.n_heads
        # Number of query heads that share one KV head.
        self.n_rep = self.n_heads // self.n_kv_heads
        # HD = D // HN (4096 // 32 = 128 for this model).
        self.head_dim = args.dim // self.n_heads
        self.wq = wq.T
        self.wk = wk.T
        self.wv = wv.T
        self.wo = wo.T

        cache_shape = (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim)
        self.cache_k = np.zeros(cache_shape)
        self.cache_v = np.zeros(cache_shape)

    def forward(self, x, start_pos: int, mask, freqs_cis):
        """Attend ``x`` over every cached position up to ``start_pos + L``.

        Args:
            x: activations, [B, L, D].
            start_pos: sequence index of the first position in ``x``; keys and
                values for ``x`` are written into the cache from this index.
            mask: additive mask broadcast as ``mask[None, None]`` onto scores
                of shape [B, HN, L, start_pos + L], or ``None``.
            freqs_cis: rotary factors forwarded to ``apply_rotary_emb``.

        Returns:
            Attention output of shape [B, L, D_out], where D_out is the output
            width of ``wo``.
        """
        B, L, _ = x.shape
        end_pos = start_pos + L

        # Project, then split the feature axis into heads:
        # xq -> [B, L, HN, HD]; xk, xv -> [B, L, KVHN, HD].
        xq = (x @ self.wq).reshape(B, L, self.n_heads, self.head_dim)
        xk = (x @ self.wk).reshape(B, L, self.n_kv_heads, self.head_dim)
        xv = (x @ self.wv).reshape(B, L, self.n_kv_heads, self.head_dim)

        # RoPE
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis)

        # KV cache: store the new keys/values at their positions ...
        self.cache_k[:B, start_pos:end_pos] = xk
        self.cache_v[:B, start_pos:end_pos] = xv
        # ... and read back the whole prefix, not only the slice just written.
        # Reading only [start_pos:end_pos] would ignore every earlier token as
        # soon as start_pos > 0; for start_pos == 0 the two are the same.
        ks = self.cache_k[:B, :end_pos]
        vs = self.cache_v[:B, :end_pos]

        # GQA: repeat KV heads so they line up with the query heads.
        xk = repeat_kv(ks, self.n_rep)
        xv = repeat_kv(vs, self.n_rep)

        # Move the head axis in front of the sequence axis: [B, HN, *, HD].
        xq = xq.transpose(0, 2, 1, 3)
        xk = xk.transpose(0, 2, 1, 3)
        xv = xv.transpose(0, 2, 1, 3)

        # Scaled dot-product attention: [B, HN, L, start_pos + L] scores.
        attention = xq @ xk.transpose(0, 1, 3, 2) / np.sqrt(self.head_dim)
        if mask is not None:
            attention = attention + mask[None, None, :, :]
        attention = softmax(attention)
        output = attention @ xv

        # Merge the heads back into one feature axis and apply the output projection.
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return output @ self.wo

class FeedForward:
    """SwiGLU feed-forward block on NumPy arrays.

    Computes ``(silu(x @ w1.T) * (x @ w3.T)) @ w2.T``; the three weights are
    stored transposed.
    """

    def __init__(self, up_weight, gate_weight, down_weight):
        self.up_weight = up_weight.T      # w3
        self.gate_weight = gate_weight.T  # w1
        self.down_weight = down_weight.T  # w2

    def forward(self, x):
        """Apply the gated feed-forward transform to ``x``."""
        # Gate branch: project up to the hidden width, then SiLU.
        gated = silu(x @ self.gate_weight)
        # Linear branch: same projection width, no activation.
        projected = x @ self.up_weight
        # Element-wise gating, then project back down.
        return (gated * projected) @ self.down_weight

class TransformerBlock:
    """One pre-norm transformer block on NumPy arrays (no LoRA).

    Structure: ``z = x + attention(norm(x))`` followed by
    ``out = z + feed_forward(norm(z))``.
    """

    def __init__(self, layer_weights: dict, layer_id: int, args: ModelArgs):
        """Build the sub-layers from the weights stored for ``layer_id``.

        Args:
            layer_weights: mapping from ``"layers.<id>.<name>"`` keys to arrays.
            layer_id: index used to build the weight keys.
            args: configuration; ``norm_eps`` is read here and the whole
                object is passed on to ``Attention``.
        """
        key_prefix = f"layers.{layer_id}"

        def weight(suffix):
            # Look up one weight array of this layer.
            return layer_weights[f"{key_prefix}.{suffix}"]

        self.before_attention_rms_norm = RMSNorm(
            weight("attention_norm.weight"), eps=args.norm_eps
        )
        self.attention = Attention(
            weight("attention.wq.weight"),
            weight("attention.wk.weight"),
            weight("attention.wv.weight"),
            weight("attention.wo.weight"),
            args,
        )
        self.before_ffn_rms_norm = RMSNorm(
            weight("ffn_norm.weight"), eps=args.norm_eps
        )
        # FeedForward takes (up, gate, down), i.e. (w3, w1, w2).
        self.feed_forward = FeedForward(
            weight("feed_forward.w3.weight"),
            weight("feed_forward.w1.weight"),
            weight("feed_forward.w2.weight"),
        )

    def forward(self, x, start_pos: int, mask, freqs_cis):
        """Run attention and feed-forward, each wrapped in a residual connection."""
        # Attention sub-layer: normalise, attend, add the residual.
        attended = self.attention.forward(
            self.before_attention_rms_norm.forward(x), start_pos, mask, freqs_cis
        )
        residual = x + attended
        # Feed-forward sub-layer (SwiGLU): normalise, transform, add the residual.
        transformed = self.feed_forward.forward(
            self.before_ffn_rms_norm.forward(residual)
        )
        return residual + transformed

class myLoRALayer:
    """Low-rank adapter on NumPy arrays: ``alpha * (x @ A @ B)``."""

    def __init__(self, lora_A, lora_B, alpha, rank=8):
        # lora_A: [D, r], lora_B: [r, D_out]
        self.lora_A = lora_A
        self.lora_B = lora_B
        self.alpha = alpha
        # Stored for reference only; forward() does not read it.
        self.rank = rank

    def forward(self, x):
        """Return the scaled low-rank update for ``x``."""
        # Go through the rank-r bottleneck first, then expand to the output width.
        bottleneck = x @ self.lora_A
        delta = bottleneck @ self.lora_B
        return self.alpha * delta

class AttentionWithLora:
    """Grouped-query self-attention with LoRA on the query and value projections.

    NumPy implementation with rotary embeddings and a KV cache. Projection
    weights are stored transposed so that activations can be right-multiplied
    by them (``x @ w``).
    """

    def __init__(self, wq, wk, wv, wo, lora_A_q, lora_B_q, lora_A_v, lora_B_v, args):
        """Store projections, allocate zero-filled KV caches and build the adapters.

        Args:
            wq, wk, wv, wo: 2-D projection weights; each is kept as its transpose.
            lora_A_q, lora_B_q: LoRA matrices added to the query projection.
            lora_A_v, lora_B_v: LoRA matrices added to the value projection.
            args: configuration providing ``n_heads``, ``n_kv_heads``, ``dim``,
                ``max_batch_size``, ``max_seq_len`` and ``alpha``.
        """
        # KVHN
        self.n_kv_heads = args.n_kv_heads
        # QHN, HN
        self.n_heads = args.n_heads
        # Number of query heads that share one KV head.
        self.n_rep = self.n_heads // self.n_kv_heads
        # HD = D // HN (4096 // 32 = 128 for this model).
        self.head_dim = args.dim // self.n_heads
        self.wq = wq.T
        self.wk = wk.T
        self.wv = wv.T
        self.wo = wo.T

        cache_shape = (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim)
        self.cache_k = np.zeros(cache_shape)
        self.cache_v = np.zeros(cache_shape)

        # LoRA adapters for the query and value projections.
        self.lora_q = myLoRALayer(lora_A_q, lora_B_q, args.alpha)
        self.lora_v = myLoRALayer(lora_A_v, lora_B_v, args.alpha)

    def forward(self, x, start_pos: int, mask, freqs_cis):
        """Attend ``x`` over every cached position up to ``start_pos + L``.

        Args:
            x: activations, [B, L, D].
            start_pos: sequence index of the first position in ``x``; keys and
                values for ``x`` are written into the cache from this index.
            mask: additive mask broadcast as ``mask[None, None]`` onto scores
                of shape [B, HN, L, start_pos + L], or ``None``.
            freqs_cis: rotary factors forwarded to ``apply_rotary_emb``.

        Returns:
            Attention output of shape [B, L, D_out], where D_out is the output
            width of ``wo``.
        """
        B, L, _ = x.shape
        end_pos = start_pos + L

        # Base projections; the query and value paths add their LoRA update.
        xq = x @ self.wq + self.lora_q.forward(x)
        xk = x @ self.wk
        xv = x @ self.wv + self.lora_v.forward(x)
        # Split the feature axis into heads:
        # xq -> [B, L, HN, HD]; xk, xv -> [B, L, KVHN, HD].
        xq = xq.reshape(B, L, self.n_heads, self.head_dim)
        xk = xk.reshape(B, L, self.n_kv_heads, self.head_dim)
        xv = xv.reshape(B, L, self.n_kv_heads, self.head_dim)

        # RoPE
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis)

        # KV cache: store the new keys/values at their positions ...
        self.cache_k[:B, start_pos:end_pos] = xk
        self.cache_v[:B, start_pos:end_pos] = xv
        # ... and read back the whole prefix, not only the slice just written.
        # Reading only [start_pos:end_pos] would ignore every earlier token as
        # soon as start_pos > 0; for start_pos == 0 the two are the same.
        ks = self.cache_k[:B, :end_pos]
        vs = self.cache_v[:B, :end_pos]

        # GQA: repeat KV heads so they line up with the query heads.
        xk = repeat_kv(ks, self.n_rep)
        xv = repeat_kv(vs, self.n_rep)

        # Move the head axis in front of the sequence axis: [B, HN, *, HD].
        xq = xq.transpose(0, 2, 1, 3)
        xk = xk.transpose(0, 2, 1, 3)
        xv = xv.transpose(0, 2, 1, 3)

        # Scaled dot-product attention: [B, HN, L, start_pos + L] scores.
        attention = xq @ xk.transpose(0, 1, 3, 2) / np.sqrt(self.head_dim)
        if mask is not None:
            attention = attention + mask[None, None, :, :]
        attention = softmax(attention)
        output = attention @ xv

        # Merge the heads back into one feature axis and apply the output projection.
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return output @ self.wo

class TransformerBlockWithLora:
    """One pre-norm transformer block on NumPy arrays with LoRA in its attention.

    Structure: ``z = x + attention(norm(x))`` followed by
    ``out = z + feed_forward(norm(z))``.
    """

    def __init__(self, layer_weights: dict, lora_A_q, lora_B_q, lora_A_v, lora_B_v, layer_id: int, args: ModelArgs):
        """Build the sub-layers from the weights stored for ``layer_id``.

        Args:
            layer_weights: mapping from ``"layers.<id>.<name>"`` keys to arrays.
            lora_A_q, lora_B_q, lora_A_v, lora_B_v: LoRA matrices forwarded
                unchanged to ``AttentionWithLora``.
            layer_id: index used to build the weight keys.
            args: configuration; ``norm_eps`` is read here and the whole
                object is passed on to the attention layer.
        """
        key_prefix = f"layers.{layer_id}"

        def weight(suffix):
            # Look up one weight array of this layer.
            return layer_weights[f"{key_prefix}.{suffix}"]

        self.before_attention_rms_norm = RMSNorm(
            weight("attention_norm.weight"), eps=args.norm_eps
        )
        self.attention = AttentionWithLora(
            weight("attention.wq.weight"),
            weight("attention.wk.weight"),
            weight("attention.wv.weight"),
            weight("attention.wo.weight"),
            lora_A_q, lora_B_q,
            lora_A_v, lora_B_v,
            args,
        )
        self.before_ffn_rms_norm = RMSNorm(
            weight("ffn_norm.weight"), eps=args.norm_eps
        )
        # FeedForward takes (up, gate, down), i.e. (w3, w1, w2).
        self.feed_forward = FeedForward(
            weight("feed_forward.w3.weight"),
            weight("feed_forward.w1.weight"),
            weight("feed_forward.w2.weight"),
        )

    def forward(self, x, start_pos: int, mask, freqs_cis):
        """Run attention and feed-forward, each wrapped in a residual connection."""
        # Attention sub-layer: normalise, attend, add the residual.
        attended = self.attention.forward(
            self.before_attention_rms_norm.forward(x), start_pos, mask, freqs_cis
        )
        residual = x + attended
        # Feed-forward sub-layer (SwiGLU): normalise, transform, add the residual.
        transformed = self.feed_forward.forward(
            self.before_ffn_rms_norm.forward(residual)
        )
        return residual + transformed


# Prompt for NumPy inference; it is the earlier prompt without its final
# character, which the model is expected to produce.
prompt = "山东大学（威海） 数"

# Prepend id 128000, the <|begin_of_text|> special token.
tokens = [128000] + tokenizer.encode(prompt)
# Look up the embedding row of every token, then drop the reference to the
# embedding archive since it is not needed afterwards.
tok_embeddings_weight = np.load(Model_Home + "shuke/llama3.8b.shuke.tok_embeddings.weight.npz")
token_embeddings_unnormalized = tok_embeddings_weight['tok_embeddings.weight'][tokens]
del tok_embeddings_weight

def forward(x, lora_weights):
    """Run the NumPy model on embedded tokens and print the greedy next token.

    Layers 0-30 run without LoRA and layer 31 runs with the given LoRA
    matrices. Weights are read from ``.npz`` files under ``Model_Home`` one
    layer at a time, and every archive is closed as soon as it has been used.

    Args:
        x: token embeddings, [B, L, D].
        lora_weights: mapping with keys ``lora_q_A``, ``lora_q_B``,
            ``lora_v_A`` and ``lora_v_B``.

    Returns:
        Logits for every position, [B, L, vocab].
    """
    import config
    import importlib
    importlib.reload(config)
    from config import ModelArgs
    args = ModelArgs()
    weights_dir = Model_Home + "shuke/"

    L = x.shape[1]
    # Build the rotary table with the model's own base (module-level
    # `rope_theta`, read from params.json) instead of the function default of
    # 10000, which does not match this model.
    freqs_cis = precompute_freqs_cis(args.dim // args.n_heads, args.max_seq_len, rope_theta)[0:0+L]
    # Causal mask: -inf strictly above the diagonal, 0 elsewhere.
    mask = np.full((L, L), float('-inf'))
    mask = np.triu(mask, k=1)
    # Zero columns for already-cached positions; there are none at start_pos 0.
    mask = np.concatenate([np.zeros((L, 0)), mask], axis=1)

    # The first 31 layers run without LoRA.
    for i in range(31):
        with np.load(weights_dir + f"llama3.8b.shuke.layer.{i}.npz") as layer_weights:
            x = TransformerBlock(layer_weights, i, args).forward(x, 0, mask, freqs_cis)

    # The last layer uses LoRA. Its constructor reads the arrays it needs, so
    # the archive can be closed before the layer is run.
    with np.load(weights_dir + "llama3.8b.shuke.layer.31.npz") as layer_weights:
        final_layer = TransformerBlockWithLora(layer_weights,
                                               lora_weights['lora_q_A'], lora_weights['lora_q_B'],
                                               lora_weights['lora_v_A'], lora_weights['lora_v_B'], 31, args)
    h = final_layer.forward(x, 0, mask, freqs_cis)

    with np.load(weights_dir + "llama3.8b.shuke.norm.weight.npz") as norm_archive:
        norm = RMSNorm(norm_archive["norm.weight"], eps=args.norm_eps)
    h = norm.forward(h)

    with np.load(weights_dir + "llama3.8b.shuke.output.weight.npz") as output_archive:
        logits = h @ output_archive["output.weight"].T

    # Greedy pick at the last position; `prompt` and `tokenizer` are
    # module-level names.
    output_id = logits[:,-1,:].argmax(-1,keepdims=True)
    next_id = output_id[0].tolist()
    print(f"{prompt[:]}", end="")
    print(tokenizer.decode(next_id), end="")
    return logits

# Add a leading batch axis to the embeddings and run inference with the LoRA
# weights loaded from disk.
final_output = forward(np.expand_dims(token_embeddings_unnormalized,0), lora_weights)
# Alternative: pass the in-memory `dic` from the export cell instead of the loaded file.
# final_output = forward(np.expand_dims(token_embeddings_unnormalized,0), dic)


山东大学（威海） 数科