下载模型（在终端中运行以下命令，将模型保存到 `Phi` 目录，供后续以 `./Phi` 路径加载）

huggingface-cli download --resume-download charent/Phi2-Chinese-0.2B --local-dir Phi

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from waitress import serve
import numpy as np
from collections import OrderedDict
import multiprocessing
from functools import partial
from fractions import Fraction
import utils_function
# import dill
import multiprocessing

In [4]:
# Path of the local directory the tokenizer and model are loaded from
local_model_path = "./Phi"

In [None]:
# Load the tokenizer and the causal-LM model from the local model directory.
try:
    tokenizer = AutoTokenizer.from_pretrained(local_model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        local_model_path,
        trust_remote_code=True,
        torch_dtype=torch.float32,
        device_map='auto'
    )
except Exception as e:
    print(f"Error loading or saving model: {e}")
    # Re-raise instead of swallowing: everything below needs `model` and
    # `tokenizer`, so carrying on would only fail later with a NameError
    # that hides the real cause.
    raise
else:
    print("Model and tokenizer loaded successfully.")

In [None]:
# Extract the weights
print("Extracting model weights...")
state_dict = model.state_dict()  # the model's weights, keyed by entry name
print(state_dict)

In [None]:
# NumPy copy of every state-dict entry (each tensor is moved to the CPU first).
state_dict_numpy = {
    key: value.cpu().numpy()
    for key, value in state_dict.items()
}

# Placeholders; both names are rebound by the process() call further down.
quantized_weights_map, quantized_float_weights_map = {}, {}

# Commented-out serial variant of the same quantization, with per-layer printing:
# for key, value in state_dict_numpy.items():
#     print(f"Layer name: {key}")
#     print(f"Original weight matrix:\n{value}")
#     quantized_weights = utils_function.approximate_with_rational(value, precision = 2)
#     print(f"Quantized weight matrix:\n{quantized_weights}")
#     quantized_weights_map[key] = quantized_weights
#     float_weights = utils_function.convert_fractions_to_floats(quantized_weights,precision = 4)
#     print(f"Weight matrix converted to floats:\n{float_weights}")
#     quantized_float_weights_map[key] = float_weights

def process_layer(key, value, precision=2, float_precision=4):
    """Quantize one layer's weights and convert the result back to floats.

    Args:
        key: Name of the layer; returned unchanged so the caller can match
            results to layers.
        value: Weights of the layer.
        precision: Forwarded to ``utils_function.approximate_with_rational``.
        float_precision: Forwarded to
            ``utils_function.convert_fractions_to_floats``.

    Returns:
        Tuple ``(key, quantized_weights, float_weights)``.
    """
    rational = utils_function.approximate_with_rational(value, precision=precision)
    as_floats = utils_function.convert_fractions_to_floats(rational, precision=float_precision)
    return key, rational, as_floats


def process(precision=2, float_precision=4):
    """Quantize every entry of ``state_dict_numpy`` using a process pool.

    Args:
        precision: Passed to ``process_layer`` as ``precision``. Defaults to
            the value that used to be hard-coded here.
        float_precision: Passed to ``process_layer`` as ``float_precision``.
            Defaults to the value that used to be hard-coded here.

    Returns:
        Tuple ``(quantized_weights_map, quantized_float_weights_map)``; both
        dicts are keyed by layer name.
    """
    # Bind the precision settings so each pool task only supplies (key, value).
    process_layer_fn = partial(process_layer, precision=precision, float_precision=float_precision)

    # Each (name, array) pair becomes the positional arguments of one task.
    items = list(state_dict_numpy.items())

    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.starmap(process_layer_fn, items)

    # Split the result triples into the two maps in a single pass.
    quantized_weights_map = {}
    quantized_float_weights_map = {}
    for key, quantized_weights, float_weights in results:
        quantized_weights_map[key] = quantized_weights
        quantized_float_weights_map[key] = float_weights

    return quantized_weights_map, quantized_float_weights_map


quantized_weights_map, quantized_float_weights_map = process()



In [9]:
# Rebuild a torch state dict from the quantized float weights.
# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, instead of failing on a hard-coded 'cuda:0'.
target_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

converted_state_dict = OrderedDict()

for key, value in quantized_float_weights_map.items():
    converted_state_dict[key] = torch.tensor(value, device=target_device)

In [None]:
# Show the converted weights, then write them to "model_weights.pth".
print(converted_state_dict)
torch.save(converted_state_dict, "model_weights.pth")

In [None]:
# Load the converted (quantized) weights into the model.
model.load_state_dict(converted_state_dict)

In [None]:
# Ask the model one question and print its answer.
question = "讲讲快排算法的原理和实现过程。"

messages = [{'role': 'user', 'content': question}]
# `chat_template` is rendered as a Jinja template. The previous value, the bare
# word "content", has no placeholders and so rendered to the literal text
# "content"; the question never reached the model. This template emits the
# content of every message instead.
# NOTE(review): if the model expects a specific prompt format, put that
# template here — TODO confirm against the model card.
content_only_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}"
inputs = tokenizer.apply_chat_template(
    messages,
    chat_template=content_only_template,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Greedy decoding (do_sample=False). `top_k` only applies to sampling, so it is
# no longer passed; it had no effect on the output.
outputs = model.generate(
    inputs,
    max_new_tokens=1024,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

# Skip the prompt tokens so only the newly generated text is decoded.
prompt_length = inputs.shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(response)