In [1]:
import os
import string
import torch
from torch import nn
from torch.nn import functional as F
if not "CHDIR_FLAG" in dir():
    os.chdir("../")
    CHDIR_FLAG = True
    
import numpy as np
import pandas as pd
from safetensors import safe_open
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


from src.tools.plot import plot_tensor_mean_and_variance, plot_tensor_histogram, plot_tensor_heatmap
from src.tools.torch import register_forward_hook_decorator, register_backward_hook_decorator

import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

# Preliminary

1. 在之前的研究中，我们发现靠后的层具有更重要的作用？是否有可能丢弃一些不重要的层？
2. 同理在之前针对LoRA块的奇异值分析中发现，某些情况下，v_proj 的确是低秩的，远远不如 q_proj 和 k_proj，猜想为 v_proj 可能不值得调优

# 测试Skip-layer猜想

Skip-Layer:

- 因为中间层的“Δ输入”以及“Δ输出”（两者本质上是同一个，只是相差一个相位）比较小，或许可以跳过其中的直接到最后一层

In [7]:
model_home = r"D:\resource\model\huggingface"
model_names = [
    r"Qwen\Qwen2.5-0.5B-Instruct",
    r"deepseek-ai\DeepSeek-R1-Distill-Qwen-1.5B",
]
hook_data_paths = [
    r"./results/strawberry-X-4/fhook+Qwen2.5-0.5B-Instruct+True-0.pt",
    r"./results/longlong-1/fhook+Qwen2.5-0.5B-Instruct+True-0.pt",
    r"./results/prime-1/fhook+Qwen2.5-0.5B-Instruct+True-0.pt",
    r"./results/table-1/fhook+Qwen2.5-0.5B-Instruct+True-1.pt",
    r"./results/strawberry-X-3/fhook+DeepSeek-R1-Distill-Qwen-1.5B+True-0.pt",
]

In [4]:
model = AutoModelForCausalLM.from_pretrained(os.path.join(model_home, model_names[0]))

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [18]:
forward_hook_module_names = [f"model.layers[{i}]" for i in range(24)]
hook_data = torch.load(hook_data_paths[0])

In [25]:
hook_data[i][forward_hook_module_names[0]]["input"][0]

(tensor([[[-6.1340e-03,  1.1230e-02,  2.1210e-03, -4.9744e-03, -3.6926e-03,
            8.0566e-03,  3.4943e-03, -8.0566e-03, -3.2349e-03,  2.0981e-04,
           -9.8877e-03, -1.4343e-03, -2.9144e-03, -1.3855e-02,  1.6968e-02,
            6.0120e-03,  1.5259e-02,  9.8877e-03,  9.7656e-03, -7.4158e-03,
            2.7832e-02,  1.3245e-02,  7.9956e-03,  3.0670e-03, -9.1553e-04,
            6.9275e-03,  3.5095e-03, -4.7607e-03, -3.7384e-03,  4.9744e-03,
            9.6436e-03, -1.0559e-02, -7.8201e-05,  1.6861e-03,  1.0620e-02,
            2.8534e-03, -2.1935e-04, -5.7068e-03, -1.2756e-02,  5.2795e-03,
            1.1536e-02,  1.7071e-04, -1.4038e-03,  6.8970e-03, -2.3270e-04,
           -2.2736e-03, -7.2632e-03, -7.6904e-03,  2.3560e-02, -1.3306e-02,
            8.0566e-03,  7.7820e-03,  1.0681e-02, -1.1902e-02,  7.2327e-03,
            8.7891e-03, -2.2217e-02,  2.0020e-02, -4.3030e-03,  2.9175e-02,
            2.2888e-03,  8.4686e-04, -7.3242e-03, -6.9275e-03, -7.0190e-03,
            

In [21]:
for i in range(1, 24):
    # Skip the first inputs
    input_0 = hook_data[i][forward_hook_module_names[0]]["input"][0][0]
    output_0 = hook_data[i][forward_hook_module_names[0]]["output"][0][0]
    input_1 = hook_data[i][forward_hook_module_names[1]]["input"][0][0]
    output_1 = hook_data[i][forward_hook_module_names[0]]["output"][0][0]
    # plot_tensor_histogram(output_0 - input_1, is_show=True)
    module = eval(f"model.{forward_hook_module_names[0]}")
    output_1_0 = module.forward(input_0)
    break

TypeError: cannot unpack non-iterable NoneType object

In [26]:
def register_decoder_hook(model, layer_idx):
    hook_data = {"input": None, "output": None}
    
    def hook(module, input, output):
        hook_data["input"] = input
        hook_data["output"] = output
    
    handle = model.model.layers[layer_idx].register_forward_hook(hook)
    return hook_data, handle

# 使用示例
hook_data, handle = register_decoder_hook(model, 0)

# 运行完整的前向传播
input_ids = torch.tensor([[1, 2, 3]])  # 示例输入
outputs = model(input_ids)

# 现在你可以获取完整的输入
full_inputs = hook_data["input"]
hidden_states = full_inputs[0]
attention_mask = full_inputs[1] if len(full_inputs) > 1 else None
position_ids = full_inputs[2] if len(full_inputs) > 2 else None
past_key_value = full_inputs[3] if len(full_inputs) > 3 else None
position_embeddings = full_inputs[4] if len(full_inputs) > 4 else None

# 现在你可以正确调用decoder层
module = model.model.layers[0]
output = module(
    hidden_states=hidden_states,
    attention_mask=attention_mask,
    position_ids=position_ids,
    past_key_value=past_key_value,
    position_embeddings=position_embeddings
)

handle.remove()  # 记得移除hook

TypeError: cannot unpack non-iterable NoneType object

In [28]:
full_inputs[0].size()

torch.Size([1, 3, 896])