In [10]:
import numpy as np
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer,AutoModelForCausalLM
import bitmod  # 假设已编译好的 C++ 扩展模块
import os

# 加载本地模型
def load_llama_model(model_dir):
    tokenizer = LlamaTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir,device_map='auto',load_in_8bit=True)
    return model, tokenizer

def save_orgin_model(model, model_dir):
    # 获取原始模型名称
    model_name = os.path.basename(model_dir.rstrip("/"))
    
    # 拼接保存路径
    save_dir = f"/data/gaozh/SPA/{model_name}-orgin-int8"
    
    # 创建目录（如果不存在）
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    # 保存模型
    model.save_pretrained(save_dir)
    print(f"模型已保存到: {save_dir}")
    
# 计算指定 bit 位的 1 的概率
def calculate_bit_probabilities(weight_tensor, bit_size=8):
    # 检查是否是 int8 类型
    if bit_size == 8:
        weight_as_bits = weight_tensor.cpu().numpy().view(np.int8)
    else:
        raise ValueError("当前只支持 int8 类型权重")
    
    probabilities = {}
    for bit_idx in range(bit_size):
        total_bits = weight_as_bits.size
        # 计算 bit_idx 位上的 1 的数量
        ones_count = np.sum((weight_as_bits >> bit_idx) & 1)
        probabilities[bit_idx] = ones_count / total_bits
    return probabilities

# 调用 C++ 模块进行位的概率修改
def modify_weight_bits(weight_tensor, bit_indices, probabilities, strategy, bit_size=8):
    # 确保权重是 int8 类型
    if bit_size == 8:
        weight_numpy = weight_tensor.cpu().numpy().view(np.int8)
        modified_weight_numpy = bitmod.modify_bits_int8(weight_numpy, bit_indices, probabilities, strategy)
        modified_weight_tensor = torch.from_numpy(modified_weight_numpy).to(weight_tensor.device)
    else:
        raise ValueError("当前只支持 int8 类型权重")
    
    return modified_weight_tensor

# 只修改特定的层
def modify_specific_layers(model, bit_indices, probabilities, strategy, bit_size=8):
    layer_bit_prob_dict = {}

    # 需要修改的特定层名
    layers_to_modify = [
        "model.layers.{}.self_attn.q_proj",
        "model.layers.{}.self_attn.k_proj",
        "model.layers.{}.self_attn.v_proj",
        "model.layers.{}.self_attn.o_proj",
        "model.layers.{}.mlp.gate_proj",
        "model.layers.{}.mlp.up_proj",
        "model.layers.{}.mlp.down_proj"
    ]

    for layer_name, param in model.named_parameters():
        # 遍历每层并修改第 0-31 层
        if any(layer_name.startswith(layer.format(n)) for layer in layers_to_modify for n in range(32)):
            print(f"Modifying layer: {layer_name}")
            
            original_prob = calculate_bit_probabilities(param.data, bit_size)
            
            modified_weight = modify_weight_bits(param.data, bit_indices, probabilities, strategy, bit_size)
            param.data = modified_weight  # 修改后的权重重新赋值
            
            modified_prob = calculate_bit_probabilities(param.data, bit_size)
            
            # 保存每一层修改前后的概率
            layer_bit_prob_dict[layer_name] = {
                "original_prob": original_prob,
                "modified_prob": modified_prob
            }

    return layer_bit_prob_dict

def save_modified_model(model, model_dir, strategy_name):
    # 获取原始模型名称
    model_name = os.path.basename(model_dir.rstrip("/"))
    
    # 拼接保存路径
    save_dir = f"/data/gaozh/SPA/{model_name}-{strategy_name}"
    
    # 创建目录（如果不存在）
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    # 保存模型
    model.save_pretrained(save_dir)
    print(f"模型已保存到: {save_dir}")

# 测试函数，检查是否只对特定层进行修改



In [11]:
model_dir = "/data/gaozh/llama1-hf/llama-7b/"
model, tokenizer = load_llama_model(model_dir)
save_orgin_model(model, model_dir)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.70s/it]


模型已保存到: /data/gaozh/SPA/llama-7b-orgin-int8


In [9]:
def test_specific_layer_modification():
    # 假设 llama 模型已经下载到本地路径
    model_dir = "/data/gaozh/llama1-hf/llama-7b/"
    
    # 加载模型
    model, tokenizer = load_llama_model(model_dir)
    
    # 定义要修改的 bit 位和目标概率
    bit_indices = [3, 4]  # 修改第3和第4位
    probabilities = [0.4, 0.6]  # 第3位目标概率0.4，第4位0.6
    strategy = 1  # 选择策略1

    # 只修改特定层的权重
    layer_bit_probabilities = modify_specific_layers(model, bit_indices, probabilities, strategy, bit_size=8)
    
    # 打印修改前后的概率
    for layer, prob_dict in layer_bit_probabilities.items():
        print(f"Layer: {layer}")
        print(f"Original Probabilities: {prob_dict['original_prob']}")
        print(f"Modified Probabilities: {prob_dict['modified_prob']}")

    save_modified_model(model, model_dir, strategy_name=f"strategy_{strategy}")
        
    return layer_bit_probabilities

# 运行测试
if __name__ == "__main__":
    test_specific_layer_modification()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.57s/it]


Modifying layer: model.layers.0.self_attn.q_proj.weight
Modifying layer: model.layers.0.self_attn.k_proj.weight
Modifying layer: model.layers.0.self_attn.v_proj.weight
Modifying layer: model.layers.0.self_attn.o_proj.weight
Modifying layer: model.layers.0.mlp.gate_proj.weight
Modifying layer: model.layers.0.mlp.up_proj.weight
Modifying layer: model.layers.0.mlp.down_proj.weight
Modifying layer: model.layers.1.self_attn.q_proj.weight
Modifying layer: model.layers.1.self_attn.k_proj.weight
Modifying layer: model.layers.1.self_attn.v_proj.weight
Modifying layer: model.layers.1.self_attn.o_proj.weight
Modifying layer: model.layers.1.mlp.gate_proj.weight
Modifying layer: model.layers.1.mlp.up_proj.weight
Modifying layer: model.layers.1.mlp.down_proj.weight
Modifying layer: model.layers.2.self_attn.q_proj.weight
Modifying layer: model.layers.2.self_attn.k_proj.weight
Modifying layer: model.layers.2.self_attn.v_proj.weight
Modifying layer: model.layers.2.self_attn.o_proj.weight
Modifying laye

In [8]:
def test_specific_layer_modification():
    # 假设 llama 模型已经下载到本地路径
    model_dir = "/data/gaozh/llama1-hf/llama-7b/"
    
    # 加载模型
    model, tokenizer = load_llama_model(model_dir)
    
    # 定义要修改的 bit 位和目标概率
    bit_indices = [3, 4]  # 修改第3和第4位
    probabilities = [0.4, 0.6]  # 第3位目标概率0.4，第4位0.6
    strategy = 2  # 选择策略1

    # 只修改特定层的权重
    layer_bit_probabilities = modify_specific_layers(model, bit_indices, probabilities, strategy, bit_size=8)
    
    # 打印修改前后的概率
    for layer, prob_dict in layer_bit_probabilities.items():
        print(f"Layer: {layer}")
        print(f"Original Probabilities: {prob_dict['original_prob']}")
        print(f"Modified Probabilities: {prob_dict['modified_prob']}")

    save_modified_model(model, model_dir, strategy_name=f"strategy_{strategy}")
        
    return layer_bit_probabilities

# 运行测试
if __name__ == "__main__":
    test_specific_layer_modification()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.75s/it]


Modifying layer: model.layers.0.self_attn.q_proj.weight
Modifying layer: model.layers.0.self_attn.k_proj.weight
Modifying layer: model.layers.0.self_attn.v_proj.weight
Modifying layer: model.layers.0.self_attn.o_proj.weight
Modifying layer: model.layers.0.mlp.gate_proj.weight
Modifying layer: model.layers.0.mlp.up_proj.weight
Modifying layer: model.layers.0.mlp.down_proj.weight
Modifying layer: model.layers.1.self_attn.q_proj.weight
Modifying layer: model.layers.1.self_attn.k_proj.weight
Modifying layer: model.layers.1.self_attn.v_proj.weight
Modifying layer: model.layers.1.self_attn.o_proj.weight
Modifying layer: model.layers.1.mlp.gate_proj.weight
Modifying layer: model.layers.1.mlp.up_proj.weight
Modifying layer: model.layers.1.mlp.down_proj.weight
Modifying layer: model.layers.2.self_attn.q_proj.weight
Modifying layer: model.layers.2.self_attn.k_proj.weight
Modifying layer: model.layers.2.self_attn.v_proj.weight
Modifying layer: model.layers.2.self_attn.o_proj.weight
Modifying laye

In [None]:
#这里使用了lm-eval项目做accuracy评测，可以在github中找到
lm_eval --model hf     --model_args pretrained=/data/gaozh/SPA/llama-7b-strategy_1     --tasks mmlu     --batch_size auto  --output_path "/data/gaozh/SPA/llama-7b-strategy_1/output/"

In [4]:

import numpy as np
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM
import bitmod  # Import the compiled C++ extension
import os
import json

# Load local model
def load_llama_model(model_dir):
    tokenizer = LlamaTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto', load_in_8bit=True)
    return model, tokenizer

# Save the original model
def save_orgin_model(model, model_dir):
    model_name = os.path.basename(model_dir.rstrip("/"))
    save_dir = f"/data/gaozh/SPA/{model_name}-orgin-int8"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    model.save_pretrained(save_dir)
    print(f"Original model saved to: {save_dir}")

# Calculate bit probability of 1's
def calculate_bit_probabilities(weight_tensor, bit_size=8):
    if bit_size == 8:
        weight_as_bits = weight_tensor.cpu().numpy().view(np.int8)
    else:
        raise ValueError("Only int8 is supported")

    probabilities = {}
    for bit_idx in range(bit_size):
        total_bits = weight_as_bits.size
        ones_count = np.sum((weight_as_bits >> bit_idx) & 1)
        probabilities[bit_idx] = ones_count / total_bits
    return probabilities

# Modify weights using C++ Random Modify or Silent Modify methods
def modify_weight_bits(weight_tensor, bit_indices, probabilities, strategy, bit_size=8, top_percent=None):
    if bit_size == 8:
        weight_numpy = weight_tensor.cpu().numpy().view(np.int8)
        if top_percent is not None:
            modified_weight_numpy = bitmod.silent_modify_bits_int8(weight_numpy, bit_indices, probabilities, top_percent, strategy)
        else:
            modified_weight_numpy = bitmod.modify_bits_int8(weight_numpy, bit_indices, probabilities, strategy)
        modified_weight_tensor = torch.from_numpy(modified_weight_numpy).to(weight_tensor.device)
    else:
        raise ValueError("Only int8 is supported")
    return modified_weight_tensor

# Modify specific layers
def modify_specific_layers(model, bit_indices, probabilities, strategy, bit_size=8, top_percent=None):
    layer_bit_prob_dict = {}

    # Specific layers to modify
    layers_to_modify = [
        "model.layers.{}.self_attn.q_proj",
        "model.layers.{}.self_attn.k_proj",
        "model.layers.{}.self_attn.v_proj",
        "model.layers.{}.self_attn.o_proj",
        "model.layers.{}.mlp.gate_proj",
        "model.layers.{}.mlp.up_proj",
        "model.layers.{}.mlp.down_proj"
    ]

    for layer_name, param in model.named_parameters():
        if any(layer_name.startswith(layer.format(n)) for layer in layers_to_modify for n in range(32)):
            print(f"Modifying layer: {layer_name}")
            original_prob = calculate_bit_probabilities(param.data, bit_size)
            modified_weight = modify_weight_bits(param.data, bit_indices, probabilities, strategy, bit_size, top_percent)
            param.data = modified_weight
            modified_prob = calculate_bit_probabilities(param.data, bit_size)
            layer_bit_prob_dict[layer_name] = {
                "original_prob": original_prob,
                "modified_prob": modified_prob
            }

    return layer_bit_prob_dict

# Save the modified model
def save_modified_model(model, model_dir, strategy_name, bit_prob_dict):
    model_name = os.path.basename(model_dir.rstrip("/"))
    save_dir = f"/data/gaozh/SPA/{model_name}-{strategy_name}"

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Save the model
    model.save_pretrained(save_dir)
    print(f"Model saved to: {save_dir}")
    
    # Save bit probabilities dictionary as a JSON file
    bit_prob_file = os.path.join(save_dir, "bit_probabilities.json")
    with open(bit_prob_file, 'w') as json_file:
        json.dump(bit_prob_dict, json_file, indent=4)
    print(f"Bit probabilities saved to: {bit_prob_file}")

# Test function for modification
def test_specific_layer_modification():
    model_dir = "/data/gaozh/llama1-hf/llama-7b/"
    model, tokenizer = load_llama_model(model_dir)

    bit_indices = [3, 4]
    probabilities = [0.8, 0.55]
    strategy = 3  # Choose strategy 3 or 4 for Silent Modify with top_percent skipping
    top_percent = 0.2  # Skip top 20% of the highest absolute values

    layer_bit_probabilities = modify_specific_layers(model, bit_indices, probabilities, strategy, bit_size=8, top_percent=top_percent)

    for layer, prob_dict in layer_bit_probabilities.items():
        print(f"Layer: {layer}")
        print(f"Original Probabilities: {prob_dict['original_prob']}")
        print(f"Modified Probabilities: {prob_dict['modified_prob']}")

    save_modified_model(model, model_dir, strategy_name=f"strategy_{strategy}", bit_prob_dict=layer_bit_probabilities)

if __name__ == "__main__":
    test_specific_layer_modification()


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.17s/it]


Modifying layer: model.layers.0.self_attn.q_proj.weight
Modifying layer: model.layers.0.self_attn.k_proj.weight
Modifying layer: model.layers.0.self_attn.v_proj.weight
Modifying layer: model.layers.0.self_attn.o_proj.weight
Modifying layer: model.layers.0.mlp.gate_proj.weight
Modifying layer: model.layers.0.mlp.up_proj.weight
Modifying layer: model.layers.0.mlp.down_proj.weight
Modifying layer: model.layers.1.self_attn.q_proj.weight
Modifying layer: model.layers.1.self_attn.k_proj.weight
Modifying layer: model.layers.1.self_attn.v_proj.weight
Modifying layer: model.layers.1.self_attn.o_proj.weight
Modifying layer: model.layers.1.mlp.gate_proj.weight
Modifying layer: model.layers.1.mlp.up_proj.weight
Modifying layer: model.layers.1.mlp.down_proj.weight
Modifying layer: model.layers.2.self_attn.q_proj.weight
Modifying layer: model.layers.2.self_attn.k_proj.weight
Modifying layer: model.layers.2.self_attn.v_proj.weight
Modifying layer: model.layers.2.self_attn.o_proj.weight
Modifying laye

In [5]:
def test_specific_layer_modification():
    model_dir = "/data/gaozh/llama1-hf/llama-7b/"
    model, tokenizer = load_llama_model(model_dir)

    bit_indices = [3, 4]
    probabilities = [0.8, 0.55]
    strategy = 4  # Choose strategy 3 or 4 for Silent Modify with top_percent skipping
    top_percent = 0.2  # Skip top 20% of the highest absolute values

    layer_bit_probabilities = modify_specific_layers(model, bit_indices, probabilities, strategy, bit_size=8, top_percent=top_percent)

    for layer, prob_dict in layer_bit_probabilities.items():
        print(f"Layer: {layer}")
        print(f"Original Probabilities: {prob_dict['original_prob']}")
        print(f"Modified Probabilities: {prob_dict['modified_prob']}")

    save_modified_model(model, model_dir, strategy_name=f"strategy_{strategy}", bit_prob_dict=layer_bit_probabilities)

if __name__ == "__main__":
    test_specific_layer_modification()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.08s/it]


Modifying layer: model.layers.0.self_attn.q_proj.weight
Modifying layer: model.layers.0.self_attn.k_proj.weight
Modifying layer: model.layers.0.self_attn.v_proj.weight
Modifying layer: model.layers.0.self_attn.o_proj.weight
Modifying layer: model.layers.0.mlp.gate_proj.weight
Modifying layer: model.layers.0.mlp.up_proj.weight
Modifying layer: model.layers.0.mlp.down_proj.weight
Modifying layer: model.layers.1.self_attn.q_proj.weight
Modifying layer: model.layers.1.self_attn.k_proj.weight
Modifying layer: model.layers.1.self_attn.v_proj.weight
Modifying layer: model.layers.1.self_attn.o_proj.weight
Modifying layer: model.layers.1.mlp.gate_proj.weight
Modifying layer: model.layers.1.mlp.up_proj.weight
Modifying layer: model.layers.1.mlp.down_proj.weight
Modifying layer: model.layers.2.self_attn.q_proj.weight
Modifying layer: model.layers.2.self_attn.k_proj.weight
Modifying layer: model.layers.2.self_attn.v_proj.weight
Modifying layer: model.layers.2.self_attn.o_proj.weight
Modifying laye