# 纯文本模型量化

In [None]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging

# Emit timestamped INFO-level log records so quantization progress is visible in the cell output.
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

# Source checkpoint directory and the directory the quantized model is written to later.
pretrained_model_dir = "/home/workspace/model/meta-llama-3-8b-instruct"
quantized_model_dir = "/home/workspace/model/meta-llama-3-8b-instruct-w4-g128"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
# Calibration data handed to model.quantize(): a list holding one tokenized sentence.
examples = [
    tokenizer(
        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
    )
]

quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize the model to 4-bit
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # False can significantly speed up inference, at the cost of slightly worse perplexity
)

# load the un-quantized model; by default it is always loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:

# quantize the model; `examples` must be a list of dicts whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)

# save the quantized model
model.save_quantized(quantized_model_dir)

# save the quantized model again, this time using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)

# push the quantized model to the Hugging Face Hub.
# to use use_auth_token=True, log in first via `huggingface-cli login`,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)

# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)

INFO - Start quantizing layer 1/32
INFO - Quantizing self_attn.k_proj in layer 1/32...
2024-10-30 08:14:41 INFO [auto_gptq.quantization.gptq] duration: 10.037084102630615
2024-10-30 08:14:41 INFO [auto_gptq.quantization.gptq] avg loss: 2.1992526054382324
INFO - Quantizing self_attn.v_proj in layer 1/32...
2024-10-30 08:14:48 INFO [auto_gptq.quantization.gptq] duration: 7.339477300643921
2024-10-30 08:14:48 INFO [auto_gptq.quantization.gptq] avg loss: 0.03380981832742691
INFO - Quantizing self_attn.q_proj in layer 1/32...
2024-10-30 08:14:58 INFO [auto_gptq.quantization.gptq] duration: 9.718609094619751
2024-10-30 08:14:58 INFO [auto_gptq.quantization.gptq] avg loss: 3.3699684143066406
INFO - Quantizing self_attn.o_proj in layer 1/32...
2024-10-30 08:15:08 INFO [auto_gptq.quantization.gptq] duration: 10.15040373802185
2024-10-30 08:15:08 INFO [auto_gptq.quantization.gptq] avg loss: 0.0003416052204556763
INFO - Quantizing mlp.up_proj in layer 1/32...
2024-10-30 08:15:19 INFO [auto_gptq.q

<|begin_of_text|>auto_gptq is a simple Python script that uses the AutoGPT model to generate text
auto-gptq is a type of gptq that is generated internally by the system. It


In [3]:
# load the quantized model onto the first GPU (this rebinds `model`)
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")

# download the quantized model from the Hugging Face Hub and load it onto the first GPU
# (`repo_id` is only defined in the commented-out push-to-hub code, so define it before enabling this)
# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)

# inference with model.generate: tokenize the prompt, move it to the model's device, decode the first sequence
print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))

# or you can also use a pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])

INFO - You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
INFO - The layer lm_head is not quantized.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The model 'LlamaGPTQForCausalLM' is not supported for . Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNe

<|begin_of_text|>auto_gptq is a simple Python script that uses the AutoGPT model to generate text
auto-gptq is a type of gptq that is generated internally by the system. It


# 多模态模型文本基座量化

In [1]:
import numpy as np
from datasets import load_dataset, load_from_disk
import random
import torch

def get_wikitext2(nsamples, seed, seqlen, tokenizer,
                  dataset_path="/home/workspace/code/git/FlatQuant_mlm/datasets/wikitext"):
    """Build random fixed-length calibration windows from a WikiText-style corpus.

    The "train" and "test" splits are loaded from ``dataset_path``; the ``text``
    column of each split is joined with blank lines and tokenized as one long
    sequence. ``nsamples`` windows of ``seqlen`` tokens are then cut from the
    train encoding at random offsets.

    Args:
        nsamples: Number of calibration windows to draw.
        seed: Seed applied to ``random``, ``numpy.random`` and ``torch.random``.
            This reseeds the *global* generators of all three libraries.
        seqlen: Length, in tokens, of every window.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``;
            the result must expose ``input_ids`` indexable as ``[:, i:j]``.
        dataset_path: Location passed to ``load_dataset``. Defaults to the local
            copy this notebook has always used.

    Returns:
        ``(traindataset, testenc)``: ``traindataset`` is a list of dicts with
        keys ``"input_ids"`` and ``"attention_mask"`` (an all-ones mask of the
        same shape); ``testenc`` is the tokenizer output for the whole test split.

    Raises:
        ValueError: If samples are requested but the tokenized train split is too
            short to hold a window of ``seqlen`` tokens.
    """
    # set seed
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    # load dataset and preprocess
    # (hub equivalent: load_dataset("wikitext", "wikitext-2-raw-v1", split=...))
    traindata = load_dataset(dataset_path, split="train")
    testdata = load_dataset(dataset_path, split="test")
    trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
    testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

    total_tokens = trainenc.input_ids.shape[1]
    # Largest admissible window start; randint() below is inclusive on both ends.
    last_start = total_tokens - seqlen - 1
    if nsamples > 0 and last_start < 0:
        # Without this guard randint() fails with an unhelpful "empty range" message.
        raise ValueError(
            f"train split has {total_tokens} tokens, too few to sample windows of seqlen={seqlen}"
        )

    traindataset = []
    for _ in range(nsamples):
        start = random.randint(0, last_start)
        inp = trainenc.input_ids[:, start:start + seqlen]
        traindataset.append({"input_ids": inp, "attention_mask": torch.ones_like(inp)})
    return traindataset, testenc

  from .autonotebook import tqdm as notebook_tqdm


## MiniCPM-Llama3-V-2_5

In [21]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalMLM, BaseQuantizeConfig
import logging

# Emit timestamped INFO-level log records so quantization progress is visible in the cell output.
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

# Source checkpoint directory and the directory intended for the quantized model.
pretrained_model_dir = "/home/workspace/model/MiniCPM-Llama3-V-2_5"
quantized_model_dir = "/home/workspace/model/MiniCPM-Llama3-V-2_5-w4-g128"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, trust_remote_code=True)
quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize the model to 4-bit
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # False can significantly speed up inference, at the cost of slightly worse perplexity
)
# load the un-quantized model; by default it is always loaded into CPU memory.
# This must happen BEFORE the calibration set is built, because get_wikitext2 reads model.seqlen:
# in the previous order the cell raised NameError on a fresh kernel, or silently used the
# seqlen of whatever `model` an earlier cell had left behind.
model = AutoGPTQForCausalMLM.from_pretrained(pretrained_model_dir, quantize_config)
traindataset, testenc = get_wikitext2(128, 0, model.seqlen, tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2024-11-06 03:36:13 INFO [transformers_modules.MiniCPM-Llama3-V-2_5.configuration_minicpm] vision_config is None, using default vision config
2024-11-06 03:36:13 INFO [transformers_modules.MiniCPM-Llama3-V-2_5.configuration_minicpm] vision_config is None, using default vision config
2024-11-06 03:36:13 INFO [transformers_modules.MiniCPM-Llama3-V-2_5.configuration_minicpm] vision_config is None, using default vision config
Loading checkpoint shards: 100%|██████████| 7/7 [00:33<00:00,  4.85s/it]


## MiniCPM-3o-1B-sft-v1

In [2]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalMLM, BaseQuantizeConfig

import logging

# Emit timestamped INFO-level log records so quantization progress is visible in the cell output.
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

# Source checkpoint directory and the directory the quantized model is saved to later.
pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-llm_pc_w4"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, trust_remote_code=True)
quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize the model to 4-bit
    group_size=-1,  # presumably "no grouping" (per-channel), matching "pc" in the output dir name — TODO confirm
    desc_act=True,  # enabled here; setting it to False can significantly speed up inference but perplexity may be slightly worse
)
# load the un-quantized model; by default it is always loaded into CPU memory
model = AutoGPTQForCausalMLM.from_pretrained(pretrained_model_dir, quantize_config)
# Build the calibration set after the model exists, since get_wikitext2 is given model.seqlen.
traindataset, testenc = get_wikitext2(128, 0, model.seqlen, tokenizer)

  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type minicpmv to instantiate a model of type minicpm. This is not supported for all configurations of models and can yield errors.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
2024-11-12 12:46:25 INFO [auto_gptq.modeling.minicpm.configuration_minicpm] vision_config is None, using default vision config
You're using a MiniCPMVTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [3]:
# Display the module tree of the model loaded in the previous cell.
model

MiniCPMVGPTQ_Llama3(
  (model): MiniCPMV(
    (llm): MiniCPMForCausalLM(
      (model): MiniCPMModel(
        (embed_tokens): Embedding(73464, 1536)
        (layers): ModuleList(
          (0-51): 52 x MiniCPMDecoderLayer(
            (self_attn): MiniCPMSdpaAttention(
              (q_proj): Linear(in_features=1536, out_features=1536, bias=False)
              (k_proj): Linear(in_features=1536, out_features=512, bias=False)
              (v_proj): Linear(in_features=1536, out_features=512, bias=False)
              (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
              (rotary_emb): MiniCPMLongRoPE()
            )
            (mlp): MiniCPMMLP(
              (gate_proj): Linear(in_features=1536, out_features=3840, bias=False)
              (up_proj): Linear(in_features=1536, out_features=3840, bias=False)
              (down_proj): Linear(in_features=3840, out_features=1536, bias=False)
              (act_fn): SiLU()
            )
            (input_layernorm)

In [4]:
# Run quantization with the calibration samples held in `traindataset`.
model.quantize(traindataset)

INFO - Start quantizing layer 1/52
INFO - Quantizing self_attn.k_proj in layer 1/52...
2024-11-12 12:48:45 INFO [auto_gptq.quantization.gptq] duration: 0.8242471218109131
2024-11-12 12:48:45 INFO [auto_gptq.quantization.gptq] avg loss: 502.7901916503906
INFO - Quantizing self_attn.v_proj in layer 1/52...
2024-11-12 12:48:45 INFO [auto_gptq.quantization.gptq] duration: 0.225905179977417
2024-11-12 12:48:45 INFO [auto_gptq.quantization.gptq] avg loss: 264.6324157714844
INFO - Quantizing self_attn.q_proj in layer 1/52...
2024-11-12 12:48:45 INFO [auto_gptq.quantization.gptq] duration: 0.22806906700134277
2024-11-12 12:48:45 INFO [auto_gptq.quantization.gptq] avg loss: 1638.80859375
INFO - Quantizing self_attn.o_proj in layer 1/52...
2024-11-12 12:49:24 INFO [auto_gptq.quantization.gptq] duration: 0.49829840660095215
2024-11-12 12:49:24 INFO [auto_gptq.quantization.gptq] avg loss: 161.2862548828125
INFO - Quantizing mlp.up_proj in layer 1/52...
2024-11-12 12:50:04 INFO [auto_gptq.quantizat

In [None]:
# save the quantized model into quantized_model_dir
model.save_quantized(quantized_model_dir)
# alternative kept for reference: request safetensors explicitly
# model.save_quantized(quantized_model_dir, use_safetensors=True)

2024-11-12 15:33:59 INFO [auto_gptq.modeling.minicpm.configuration_minicpm] vision_config is None, using default vision config


In [None]:
import torch
from PIL import Image
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalMLM, BaseQuantizeConfig
# Seed torch's global RNG so every run of this cell starts from the same random state.
torch.manual_seed(0)
def llm_load(path = '/data1/liyx/Models/MiniCPM3-1B-sft-bf16/'):
    """Load a causal LM plus tokenizer from ``path`` and run one smoke-test chat.

    The model is loaded in float32 on CUDA (a float16 load is the obvious
    alternative if memory is tight). A fixed Chinese prompt is sent through
    ``model.chat`` with sampling disabled, and both prompt and answer are
    printed.

    Args:
        path: Checkpoint directory handed to both ``from_pretrained`` calls.

    Returns:
        ``(model, tokenizer)``.
    """
    tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    llm = AutoModelForCausalLM.from_pretrained(
        path,
        torch_dtype=torch.float32,
        device_map='cuda',
        trust_remote_code=True,
    )

    prompt = "'y = 2x - 7, 3x^2 + y = 8'是二元一次方程吗？"
    generation_kwargs = dict(
        max_new_tokens=512,
        do_sample=False,
        num_beams=1,
        repetition_penalty=1,
        top_p=None,
        temperature=None,
    )
    # chat() returns a pair; the second element (history) is not needed here.
    responds, _history = llm.chat(tok, prompt, **generation_kwargs)

    print(f"[Prompt]\n{prompt}\n")
    print(f"[Respond]\n{responds}\n")
    return llm, tok
def mllm_load(path = '/data/zyq/minicpm-3o-1b-sft-v1'):
    """Load the multimodal model and its tokenizer from ``path``.

    The model is loaded in float16 on CUDA via ``AutoModel`` and its
    ``config.use_cache`` flag is switched off before returning.

    Args:
        path: Checkpoint directory handed to both ``from_pretrained`` calls.

    Returns:
        ``(model, tokenizer)``.
    """
    tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    load_kwargs = dict(torch_dtype=torch.float16, device_map='cuda', trust_remote_code=True)
    mllm = AutoModel.from_pretrained(path, **load_kwargs)
    # Optional per-block preparation hooks, currently disabled:
    # for block in model.llm.model.layers:
        # block.prepare_layernorm()
        ## block.self_attn.prepare_conv()
        # block.self_attn.prepare_sha()
        ## block.mlp.prepare_conv()
    mllm.config.use_cache = False
    return mllm, tok
def gptq_mllm_load(path = '/data/zyq/8295/checkpoints/minicpm-3o-sft-v1-gptq-1107'):
    """Load a GPTQ-quantized multimodal checkpoint and its tokenizer from ``path``.

    The model is restored with ``AutoGPTQForCausalMLM.from_quantized`` on
    ``cuda:0`` in float16 with ``disable_exllama=True``, and its
    ``config.use_cache`` flag is switched off before returning.

    Args:
        path: Checkpoint directory handed to both loaders.

    Returns:
        ``(model, tokenizer)``.
    """
    tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    quantized_kwargs = dict(
        torch_dtype=torch.float16,
        device="cuda:0",
        trust_remote_code=True,
        disable_exllama=True,
    )
    quantized = AutoGPTQForCausalMLM.from_quantized(path, **quantized_kwargs)
    # Optional per-block preparation hooks, currently disabled:
    # for block in model.model.llm.model.layers:
        ## block.prepare_layernorm()
        ## block.self_attn.prepare_conv()
        # block.self_attn.prepare_sha()
        ## block.mlp.prepare_conv()
    quantized.config.use_cache = False
    return quantized, tok
def llm_test(prompt, model, tokenizer):
    """Send one text prompt through ``model.chat`` and print prompt and reply.

    Args:
        prompt: Text handed to the model.
        model: Object whose ``chat(tokenizer, prompt, **kwargs)`` returns a
            two-element result; only the first element is printed.
        tokenizer: Passed through to ``model.chat`` unchanged.

    Returns:
        None. The exchange is written to stdout.
    """
    generation_kwargs = {
        "max_new_tokens": 256,
        "do_sample": False,
        "num_beams": 1,
        "repetition_penalty": 1,
        "top_p": None,
        "temperature": None,
    }
    res, _history = model.chat(tokenizer, prompt, **generation_kwargs)
    print(f"[Prompt]\n{prompt}\n")
    print(f"[Respond]\n{res}\n")
def mllm_test(imagepath, prompt, model, tokenizer, max_new_tokens=256):
    """Ask the multimodal model one question about one image and print the exchange.

    The image is opened, converted to RGB and placed in front of the prompt
    inside a single user message; ``model.chat`` is called with sampling
    disabled.

    Args:
        imagepath: Path of the image file to open.
        prompt: Question text that accompanies the image.
        model: Object exposing ``chat(image=, msgs=, tokenizer=, ...)``.
        tokenizer: Passed through to ``model.chat`` unchanged.
        max_new_tokens: Generation limit forwarded to ``model.chat``.

    Returns:
        None. Image path, prompt and answer are written to stdout.
    """
    # msgs = [{'role': 'user', 'content': [None, prompt]}]
    # Open through a context manager so the file handle is released right away;
    # convert() yields an independent image that outlives the `with` block.
    with Image.open(imagepath) as source:
        image = source.convert('RGB')
    msgs = [{'role': 'user', 'content': [image, prompt]}]
    res = model.chat(
            image=None,  # the image travels inside `msgs`
            msgs=msgs,
            tokenizer=tokenizer,
            max_new_tokens=max_new_tokens,
            sampling=False,
            num_beams=1,
            repetition_penalty=1,
    )
    print(f"[Image]\n{imagepath}\n")
    print(f"[Prompt]\n{prompt}\n")
    print(f"[Respond]\n{res}\n")
def mllm_test_txt(prompt, model, tokenizer, max_new_tokens=256):
    """Send a text-only question to the multimodal model and print the exchange.

    The user message keeps the ``[image, text]`` content layout but carries
    ``None`` in the image slot.

    Args:
        prompt: Question text.
        model: Object exposing ``chat(image=, msgs=, tokenizer=, ...)``.
        tokenizer: Passed through to ``model.chat`` unchanged.
        max_new_tokens: Generation limit forwarded to ``model.chat``.

    Returns:
        None. Prompt and answer are written to stdout.
    """
    user_turn = {'role': 'user', 'content': [None, prompt]}
    chat_kwargs = dict(
        image=None,
        msgs=[user_turn],
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        sampling=False,
        num_beams=1,
        repetition_penalty=1,
    )
    res = model.chat(**chat_kwargs)
    print(f"[Prompt]\n{prompt}\n")
    print(f"[Respond]\n{res}\n")
def gptq_mllm_test(imagepath, prompt, model, tokenizer, max_new_tokens=256):
    """Ask a (GPTQ-quantized) multimodal model one question about one image.

    Performs the same steps as ``mllm_test``; the separate name is kept for
    existing callers. The image is opened, converted to RGB and placed in front
    of the prompt inside a single user message; ``model.chat`` is called with
    sampling disabled.

    Args:
        imagepath: Path of the image file to open.
        prompt: Question text that accompanies the image.
        model: Object exposing ``chat(image=, msgs=, tokenizer=, ...)``.
        tokenizer: Passed through to ``model.chat`` unchanged.
        max_new_tokens: Generation limit forwarded to ``model.chat``.

    Returns:
        None. Image path, prompt and answer are written to stdout.
    """
    # The context manager closes the underlying file as soon as the RGB copy
    # exists, instead of leaving the handle open until garbage collection.
    with Image.open(imagepath) as image_file:
        image = image_file.convert('RGB')
    msgs = [{'role': 'user', 'content': [image, prompt]}]
    res = model.chat(
            image=None,  # the image travels inside `msgs`
            msgs=msgs,
            tokenizer=tokenizer,
            max_new_tokens=max_new_tokens,
            sampling=False,
            num_beams=1,
            repetition_penalty=1,
    )
    print(f"[Image]\n{imagepath}\n")
    print(f"[Prompt]\n{prompt}\n")
    print(f"[Respond]\n{res}\n")
def gptq_mllm_test_txt(prompt, model, tokenizer, max_new_tokens=256):
    """Send a text-only question to a (GPTQ-quantized) multimodal model.

    Performs the same steps as ``mllm_test_txt``: the user message carries
    ``None`` in the image slot, ``model.chat`` runs with sampling disabled, and
    prompt and answer are printed.

    Args:
        prompt: Question text.
        model: Object exposing ``chat(image=, msgs=, tokenizer=, ...)``.
        tokenizer: Passed through to ``model.chat`` unchanged.
        max_new_tokens: Generation limit forwarded to ``model.chat``.

    Returns:
        None. Prompt and answer are written to stdout.
    """
    conversation = [dict(role='user', content=[None, prompt])]
    res = model.chat(
        image=None,
        msgs=conversation,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        sampling=False,
        num_beams=1,
        repetition_penalty=1,
    )
    # One print call with a newline separator emits the same bytes as two calls.
    print(f"[Prompt]\n{prompt}\n", f"[Respond]\n{res}\n", sep="\n")
def mllm_stream_test(imagepath, prompt, model, tokenizer, max_new_tokens=256):
    """Ask one question about one image and print the answer as it streams in.

    ``model.chat`` is called with ``stream=True``; the returned iterable is
    consumed chunk by chunk and each chunk is printed immediately (flushed,
    without a trailing newline).

    Args:
        imagepath: Path of the image file to open.
        prompt: Question text that accompanies the image.
        model: Object exposing ``chat(image=, msgs=, tokenizer=, ..., stream=)``
            whose result can be iterated for text chunks.
        tokenizer: Passed through to ``model.chat`` unchanged.
        max_new_tokens: Generation limit forwarded to ``model.chat``.

    Returns:
        None. Everything is written to stdout.
    """
    # Close the file handle as soon as the RGB copy exists.
    with Image.open(imagepath) as source:
        image = source.convert('RGB')
    msgs = [{'role': 'user', 'content': [image, prompt]}]
    res = model.chat(
            image=None,  # the image travels inside `msgs`
            msgs=msgs,
            tokenizer=tokenizer,
            max_new_tokens=max_new_tokens,
            stream=True,
    )
    print(f"[Image]\n{imagepath}\n")
    print(f"[Prompt]\n{prompt}\n")
    print("[Respond]\n")
    # Chunks are echoed directly; the text is not accumulated because nothing used it.
    for new_text in res:
        print(new_text, flush=True, end='')
    print("\n")
def mllm_multiturn_test(imagepath, prompt, model, tokenizer, max_new_tokens=256,
                        followup_prompts=("茶位费多少钱", "红糖核桃包的价格")):
    """Run a multi-turn conversation about one image and print every exchange.

    The first turn sends the image together with ``prompt``. For every entry of
    ``followup_prompts`` the previous answer is appended to the history as an
    assistant message, the follow-up is appended as a user message, and
    ``model.chat`` is called again with the whole history.

    Args:
        imagepath: Path of the image file to open.
        prompt: Question asked in the first turn.
        model: Object exposing ``chat(image=, msgs=, tokenizer=, ...)``.
        tokenizer: Passed through to ``model.chat`` unchanged.
        max_new_tokens: Generation limit forwarded to every ``model.chat`` call.
        followup_prompts: Text-only follow-up questions, asked in order. The
            default reproduces the two questions that used to be hard-coded.

    Returns:
        None. All prompts and answers are written to stdout.
    """
    def _ask():
        # Every turn uses identical generation settings; `msgs` is read at call time.
        return model.chat(
            image=None,  # the image travels inside `msgs`
            msgs=msgs,
            tokenizer=tokenizer,
            max_new_tokens=max_new_tokens,
            sampling=False,
            num_beams=1,
            repetition_penalty=1,
        )

    # Close the file handle as soon as the RGB copy exists.
    with Image.open(imagepath) as source:
        image = source.convert('RGB')
    msgs = [{'role': 'user', 'content': [image, prompt]}]
    res = _ask()
    print(f"[Image]\n{imagepath}\n")
    print(f"[Prompt]\n{prompt}\n")
    print(f"[Respond]\n{res}\n")

    for prompt in followup_prompts:
        msgs.append({"role": "assistant", "content": [res]})
        msgs.append({"role": "user", "content": [prompt]})
        res = _ask()
        print(f"[Prompt]\n{prompt}\n")
        print(f"[Respond]\n{res}\n")
# Disabled: the same image and text checks against a checkpoint loaded through mllm_load.
# mllm_path = "/data/zyq/8295/checkpoints/minicpm-3o-sft-v1-gptq-llm-1107-fakequant"
# mllm, mllm_tokenizer = mllm_load(mllm_path)
# mllm_test("/data/zyq/8295/tools/pics/airplane.jpeg", "这是什么东西", mllm, mllm_tokenizer, max_new_tokens=512)
# mllm_test_txt("求解方程x - y = 4, 2x + y = 9", mllm, mllm_tokenizer, max_new_tokens=512)
# Load the GPTQ checkpoint (overriding gptq_mllm_load's default path) and print its module tree.
gptq_mllm, gptq_mllm_tokenizer = gptq_mllm_load('/data/zyq/8295/checkpoints/minicpm-3o-sft-v1-gptq-1112')
print(gptq_mllm)
# print(gptq_mllm.model.llm.model.layers[1].self_attn.q_proj.qweight)
# Image + text check, followed by a text-only check, both on the GPTQ model.
mllm_test("/data/zyq/8295/tools/pics/airplane.jpeg", "这是什么东西", gptq_mllm, gptq_mllm_tokenizer, max_new_tokens=512)
gptq_mllm_test_txt("求解方程x - y = 4, 2x + y = 9", gptq_mllm, gptq_mllm_tokenizer, max_new_tokens=512)
# Pull out the wrapped language model and query it directly through llm_test.
llm = gptq_mllm.model.llm
llm_test("求解方程x - y = 4, 2x + y = 9", llm, gptq_mllm_tokenizer)

In [11]:
from safetensors.torch import safe_open
# `s` receives the metadata of the checkpoint below; a later cell reuses it when rewriting another file.
s = 0
# Open the safetensors file
with safe_open("/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256/model.safetensors", framework="pt") as f:
    # Manually specify metadata
    # metadata = {"format": "pt"}  # manually add the format information

    # # Read the model weights
    # tensors = {key: f.get_tensor(key) for key in f.keys()}
    s = f.metadata()

In [12]:
# Display the metadata read from the safetensors file.
s

{'gptq_group_size': '-1',
 'gptq_damp_percent': '0.01',
 'auto_gptq_version': '0.8.0.dev0',
 'gptq_quant_method': 'gptq',
 'gptq_desc_act': 'True',
 'gptq_bits': '8',
 'gptq_checkpoint_format': 'gptq',
 'format': 'pt'}

In [7]:
from safetensors.torch import load_file, save_file

# Load the existing safetensors file
file_path = "/home/workspace/model/minicpm-3o-sft-v1-gptq-1112/model.safetensors"
tensors = load_file(file_path)

# Modify or add metadata (any metadata you want can be supplied here).
# NOTE: `s` comes from the cell that reads another checkpoint's metadata, so that cell must run first.
metadata = s

# Re-save the file with the new metadata; this overwrites file_path in place.
save_file(tensors, file_path, metadata=metadata)

In [1]:
from auto_gptq import AutoGPTQForCausalMLM, BaseQuantizeConfig
from auto_gptq import AutoGPTQForVIT, BaseQuantizeConfig
quantize_config = BaseQuantizeConfig(
    bits=4,  # 4-bit
    group_size=-1,  # presumably "no grouping" (per-channel) — TODO confirm against this auto_gptq build
    desc_act=True,  # enabled here; setting it to False can significantly speed up inference but perplexity may be slightly worse
)
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc"
# quantized_model_dir = "/home/workspace/model/m inicpm-3o-sft-v1-gptq-1112"

# NOTE(review): quantize_config is passed positionally. In upstream AutoGPTQ the second positional
# parameter of from_quantized is `device_map`, not the quantization config — confirm that this
# build's AutoGPTQForCausalMLM.from_quantized really accepts it in this position.
model = AutoGPTQForCausalMLM.from_quantized(quantized_model_dir, quantize_config)

  from .autonotebook import tqdm as notebook_tqdm
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type minicpmv to instantiate a model of type minicpm. This is not supported for all configurations of models and can yield errors.
INFO - Ignoring unknown parameter in the quantization configuration: vit_bits.
INFO - Ignoring unknown parameter in the quantization configuration: llm_bits.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of the model checkpoint at /home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc were not used when initializing MiniCPMV: ['llm.model.layers.0.mlp.down_proj.g_idx', 'llm.model.layers.0.mlp.down_proj.qweight', 'llm.model.layers.0.mlp.down_proj.qzeros', 'llm.mo

In [3]:
# Inspect k_proj of the first vision-encoder (`vpm`) layer. Only the last expression (`bits`)
# is displayed; the `qweight` access on the first line produces no output.
model.vpm.encoder.layers[0].self_attn.k_proj.qweight
model.vpm.encoder.layers[0].self_attn.k_proj.bits

8

In [4]:
# Inspect k_proj of the first language-model (`llm`) layer. Only the last expression (`bits`)
# is displayed; the `qweight` access on the first line produces no output.
model.llm.model.layers[0].self_attn.k_proj.qweight
model.llm.model.layers[0].self_attn.k_proj.bits

4

In [2]:
# Display the module tree of the loaded quantized model.
model

MiniCPMVGPTQ_Llama3(
  (model): MiniCPMV(
    (llm): MiniCPMForCausalLM(
      (model): MiniCPMModel(
        (embed_tokens): Embedding(73464, 1536)
        (layers): ModuleList(
          (0-51): 52 x MiniCPMDecoderLayer(
            (self_attn): MiniCPMAttention(
              (q_proj): QuantLinear()
              (k_proj): QuantLinear()
              (v_proj): QuantLinear()
              (o_proj): QuantLinear()
              (rotary_emb): MiniCPMLongRoPE()
            )
            (mlp): MiniCPMMLP(
              (gate_proj): QuantLinear()
              (up_proj): QuantLinear()
              (down_proj): QuantLinear()
              (act_fn): SiLU()
            )
            (input_layernorm): MiniCPMRMSNorm()
            (post_attention_layernorm): MiniCPMRMSNorm()
          )
        )
        (norm): MiniCPMRMSNorm()
      )
      (lm_head): Linear(in_features=1536, out_features=73464, bias=False)
    )
    (vpm): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbedding

In [2]:
from transformers import AutoTokenizer,AutoProcessor
# Processor and tokenizer are both loaded from the original (un-quantized) checkpoint directory.
pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
# Attach the processor to the wrapped model object (model.model).
model.model.processor = AutoProcessor.from_pretrained(pretrained_model_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from PIL import Image
# Load the test picture and convert it to RGB.
image = Image.open('/home/workspace/code/llm-awq/awq/airplane.jpeg').convert('RGB')

# First round chat 
question = "这是什么飞机"
# The image is placed inside the user message, in front of the question text.
msgs = [{'role': 'user', 'content': [image, question]}]

# `image=None` because the picture is already part of `msgs`.
answer = model.model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)

这张图片展示了一架商用客机，很可能是一架大型的空中客车（A380）。 

1. **飞机类型**：
   - 这是A380，是空中客车公司生产的全球最大客机之一，也是世界上最大的商用喷气式飞机。
   - A380拥有4个引擎，提供了卓越的巡航速度和效率。

2. **航空公司**：
   - 虽然图片本身并没有提供明确的航空公司标志或名称，但通常这种飞机会属于像东方航空这样的航空公司，该航空公司在亚洲地区有广泛的运营。

3. **设计和细节**：
   - 飞机涂成了蓝白相间的色彩，并配有独特的标志，这是东方航空的标志性设计。
   - 飞机上有“Airbus A380-800”的字样，进一步确认了它是A380型号。

4. **飞行状态**：
   - 从照片中可以看出，飞机处于起飞阶段，这意味着它刚刚离开地面准备起飞。

5. **背景**：
   - 飞机在晴朗的蓝天中飞行，表明它处于高空且天气条件良好。

总结，这是一张东方航空的大型空中客车A380商用客机的照片，它处于起飞阶段，展示了其标志性的设计和细节。


In [3]:
import os
import random
import shutil

# Source dataset directory and destination directory for the calibration subset
imagenet_train_dir = '/home/workspace/dataset/imagenet/train'  # replace with the path of your local ImageNet training set
output_dir = '/home/workspace/dataset/imagenet/calibration'  # replace with the directory the images should be copied to
num_images_to_select = 64  # number of images to pick at random
selection_seed = 0  # fixed seed: re-running the cell picks the same images instead of adding new ones

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Collect every image file below the training directory (all class folders)
all_images = []
for root, _, files in os.walk(imagenet_train_dir):
    for file in files:
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            all_images.append(os.path.join(root, file))
# os.walk order depends on the filesystem; sort so that the seeded sample is reproducible.
all_images.sort()

# Fail with an explicit message instead of random.sample's generic error.
if len(all_images) < num_images_to_select:
    raise ValueError(
        f'Requested {num_images_to_select} images but only found {len(all_images)} under {imagenet_train_dir}.'
    )

# Randomly pick the images with a private, seeded generator (the global RNG is left untouched)
selected_images = random.Random(selection_seed).sample(all_images, num_images_to_select)

# Copy the selected images into the destination directory
for img_path in selected_images:
    shutil.copy(img_path, output_dir)

print(f'Successfully selected and copied {num_images_to_select} images to {output_dir}.')


Successfully selected and copied 64 images to /home/workspace/dataset/imagenet/calibration.


# VIT部分量化

In [5]:
import datasets
import random
import numpy as np
import torch

def get_ScienceQA(nsamples, seed, seqlen, processor):
    """Build multimodal calibration samples from the local ScienceQA train split.

    The split is shuffled with ``seed``; records without an image are skipped.
    For each remaining record the question is wrapped in a chat template behind
    an image placeholder, until ``nsamples`` prompts have been collected (or the
    dataset is exhausted). All prompts and images are then encoded with a single
    ``processor`` call and split into one dict per sample.

    Args:
        nsamples: Number of image-bearing records to collect.
        seed: Seed passed to ``dataset.shuffle``.
        seqlen: Unused; kept so the signature mirrors ``get_wikitext2``.
        processor: Object providing ``tokenizer.apply_chat_template``,
            ``image_processor.max_slice_nums``, ``image_processor.use_image_id``
            and a ``__call__`` whose result is indexable by ``"input_ids"``,
            ``"attention_mask"``, ``"pixel_values"``, ``"image_sizes"``,
            ``"image_bound"`` and ``"tgt_sizes"``.

    Returns:
        A list with one dict per encoded sample holding those six keys.
    """
    dataset = datasets.load_from_disk("/home/workspace/dataset/ScienceQA-2")["train"]
    dataset = dataset.shuffle(seed=seed)

    prompts_lists = []
    input_images_lists = []
    for record in dataset:
        image_file = record["image"]
        if image_file is None:
            # Text-only record: skip it; it does not count towards nsamples.
            continue
        msgs = [{'role': 'user', 'content': "(<image>./</image>)\n"+ record["question"]}]
        prompts_lists.append(processor.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        input_images_lists.append([np.array(image_file)])
        if len(prompts_lists) >= nsamples:
            break

    inputs = processor(
        prompts_lists,
        input_images_lists,
        max_slice_nums=processor.image_processor.max_slice_nums,
        use_image_id=processor.image_processor.use_image_id,
        return_tensors="pt",
        max_length=8192
    )

    traindataset = []
    for row in range(inputs["input_ids"].size(0)):
        traindataset.append({
            # Row `row` of the batch, with the leading batch dimension restored.
            "input_ids": inputs["input_ids"].select(0, row).unsqueeze(0),
            "attention_mask": inputs["attention_mask"].select(0, row).unsqueeze(0),
            # NOTE(review): the four image-related fields below are attached un-sliced, so every
            # sample carries the values for the WHOLE batch. Confirm this is what model.quantize
            # expects; otherwise they probably need to be indexed by `row` as well.
            "pixel_values": inputs["pixel_values"],
            "image_sizes": inputs["image_sizes"],
            "image_bound": inputs["image_bound"],
            "tgt_sizes": inputs["tgt_sizes"],
        })

    return traindataset

In [2]:
# import numpy as np
# from datasets import load_dataset, load_from_disk
# import random
# import torch

# def get_wikitext2(nsamples, seed, seqlen, tokenizer):
#     # set seed
#     random.seed(seed)
#     np.random.seed(seed)
#     torch.random.manual_seed(seed)

#     # load dataset and preprocess
#     # traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
#     # testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
#     traindata = load_dataset("/home/workspace/code/git/FlatQuant_mlm/datasets/wikitext", split="train")
#     testdata = load_dataset("/home/workspace/code/git/FlatQuant_mlm/datasets/wikitext", split="test")
#     trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
#     testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

#     traindataset = []
#     for _ in range(nsamples):
#         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
#         j = i + seqlen
#         inp = trainenc.input_ids[:, i:j]
#         attention_mask = torch.ones_like(inp)
#         traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
#     return traindataset, testenc

In [6]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForVIT, BaseQuantizeConfig
import logging

# Emit timestamped INFO-level log records so quantization progress is visible in the cell output.
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

# NOTE(review): the output directory name ends in "-g128" although group_size below is -1 — confirm the name.
pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-g128"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, trust_remote_code=True)
quantize_config = BaseQuantizeConfig(
    bits=8,  # quantize the model to 8-bit
    group_size=-1,  # presumably "no grouping" (per-channel) — TODO confirm against this auto_gptq build
    desc_act=True,  # enabled here; setting it to False can significantly speed up inference but perplexity may be slightly worse
)
# traindataset, testenc = get_wikitext2(128, 0, model.seqlen, tokenizer)
# load the un-quantized model; by default it is always loaded into CPU memory
model = AutoGPTQForVIT.from_pretrained(pretrained_model_dir, quantize_config)
from transformers import AutoProcessor
# Attach the processor to the wrapped model; get_ScienceQA is later given model.model.processor.
model.model.processor = AutoProcessor.from_pretrained(pretrained_model_dir, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type minicpmv to instantiate a model of type minicpm. This is not supported for all configurations of models and can yield errors.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
2024-11-07 15:41:31 INFO [auto_gptq.modeling.minicpm.configuration_minicpm] vision_config is None, using default vision config
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from PIL import Image
# Sanity-check image, decoded to RGB.
image = Image.open('/home/workspace/code/llm-awq/awq/airplane.jpeg').convert('RGB')

# Single-turn chat: the image and the question travel together in one user message.
question = "Tell me the model of this aircraft."
msgs = [{'role': 'user', 'content': [image, question]}]

# Move the wrapped (not yet quantized) model to the GPU and ask the question.
# `image=None` because the image is already embedded in `msgs`.
answer = model.model.cuda().chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)

The aircraft in the image is an Airbus A380, identifiable by its distinctive hump on the upper deck, which is characteristic of the Airbus A380 model. The A380 is a twin-engine, wide-body, four-engine jet airliner that was developed by Airbus and manufactured by Boeing. It is one of the largest aircraft in the world, capable of carrying more than 800 passengers and has a range of up to 9,500 nautical miles (17,200 km). This particular model is part of the Airbus A380 family, which includes the A380-800 and A380-900 variants.


In [8]:
model.model.device

device(type='cpu')

In [7]:
# traindataset, testenc = get_wikitext2(128, 0, model.seqlen, tokenizer)
traindataset = get_ScienceQA(1024, 0, model.seqlen, model.model.processor)
# i,m = get_ScienceQA(32, 0, model.seqlen, model.model.processor)

You're using a MiniCPMVTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [8]:
model.quantize(traindataset)

INFO - Start quantizing layer 1/27
INFO - Quantizing self_attn.k_proj in layer 1/27...
2024-11-07 16:17:12 INFO [auto_gptq.quantization.gptq] duration: 0.4007911682128906
2024-11-07 16:17:12 INFO [auto_gptq.quantization.gptq] avg loss: 6.768801540601999e-05
INFO - Quantizing self_attn.v_proj in layer 1/27...
2024-11-07 16:17:13 INFO [auto_gptq.quantization.gptq] duration: 0.2792811393737793
2024-11-07 16:17:13 INFO [auto_gptq.quantization.gptq] avg loss: 1.185271412396105e-05
INFO - Quantizing self_attn.q_proj in layer 1/27...
2024-11-07 16:17:13 INFO [auto_gptq.quantization.gptq] duration: 0.2770664691925049
2024-11-07 16:17:13 INFO [auto_gptq.quantization.gptq] avg loss: 4.957634519087151e-05
INFO - Quantizing self_attn.out_proj in layer 1/27...
2024-11-07 16:18:27 INFO [auto_gptq.quantization.gptq] duration: 0.21391677856445312
2024-11-07 16:18:27 INFO [auto_gptq.quantization.gptq] avg loss: 1.2891981668872177e-06
INFO - Quantizing mlp.fc1 in layer 1/27...
2024-11-07 16:19:34 INFO [

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.06 GiB. GPU 0 has a total capacity of 79.33 GiB of which 1024.00 MiB is free. Process 5139 has 78.32 GiB memory in use. Of the allocated memory 69.98 GiB is allocated by PyTorch, and 7.80 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
# save quantized model
model.save_quantized(quantized_model_dir)

2024-11-07 15:10:31 INFO [auto_gptq.modeling.minicpm.configuration_minicpm] vision_config is None, using default vision config


In [2]:
from auto_gptq import AutoGPTQForVIT, BaseQuantizeConfig

# quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-g128"
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256"
model_quant = AutoGPTQForVIT.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type minicpmv to instantiate a model of type minicpm. This is not supported for all configurations of models and can yield errors.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of the model checkpoint at /home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256 were not used when initializing MiniCPMV: ['vpm.encoder.layers.0.mlp.fc1.g_idx', 'vpm.encoder.layers.0.mlp.fc1.qweight', 'vpm.encoder.layers.0.mlp.fc1.qzeros', 'vpm.encoder.layers.0.mlp.fc1.scales', 'vpm.encoder.layers.0.mlp.fc2.g_idx', 'vpm.encoder.layers.0.mlp.fc2.qweight', 'vpm.encoder.layers.0.mlp.fc2.qzeros', 'vpm.encoder.layers.0.mlp.fc2.scales', 'vpm.encoder.layers.0.self_attn.k_proj.g_idx', 'vpm.encoder.layers.0.self_attn.k_proj.qweight', 'vpm.encoder.la

In [None]:
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, trust_remote_code=True)

In [11]:
from transformers import AutoTokenizer, TextGenerationPipeline,AutoProcessor
pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, trust_remote_code=True)
model_quant.model.processor = AutoProcessor.from_pretrained(pretrained_model_dir, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
from PIL import Image
# Same sanity-check image as used for the float model, decoded to RGB.
image = Image.open('/home/workspace/code/llm-awq/awq/airplane.jpeg').convert('RGB')

# Single-turn chat; the question is Chinese for "What aircraft is this?".
question = "这是什么飞机"
msgs = [{'role': 'user', 'content': [image, question]}]

# Query the quantized model (loaded with device="cuda:0" in the cell that creates model_quant).
# `image=None` because the image is already embedded in `msgs`.
answer = model_quant.model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)

----------------------scale_value: tensor(104.6875, device='cuda:0', dtype=torch.float16)
这是一架中国东方航空公司的客机，具体型号是A380-800。图上的字是“空航方南国中”和“B-6136”，其中“空航方南国中”是航空公司名称的一部分，而“B-6136”是该飞机的注册编号。


In [19]:
model_quant.outside_layer_modules

['vpm.embeddings', 'vpm.post_layernorm']

In [17]:
model_quant.model.llm.model

MiniCPMModel(
  (embed_tokens): Embedding(73464, 1536)
  (layers): ModuleList(
    (0-51): 52 x MiniCPMDecoderLayer(
      (self_attn): MiniCPMSdpaAttention(
        (q_proj): QuantLinear()
        (k_proj): QuantLinear()
        (v_proj): QuantLinear()
        (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        (rotary_emb): MiniCPMLongRoPE()
      )
      (mlp): MiniCPMMLP(
        (gate_proj): Linear(in_features=1536, out_features=3840, bias=False)
        (up_proj): Linear(in_features=1536, out_features=3840, bias=False)
        (down_proj): Linear(in_features=3840, out_features=1536, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): MiniCPMRMSNorm()
      (post_attention_layernorm): MiniCPMRMSNorm()
    )
  )
  (norm): MiniCPMRMSNorm()
)

## 调试

In [10]:
# One entry per hooked forward call; each entry is the list of positional inputs.
layer_inputs = []


def store_input_hook(_, args, kwargs):
    """Forward pre-hook that records the positional inputs of a module call.

    Args:
        _: The hooked module (unused).
        args: Positional arguments of the forward call.
        kwargs: Keyword arguments of the forward call. Accepted so the hook can
            be registered with ``with_kwargs=True``, but not recorded; capture
            of attention masks, position ids and other keyword arguments is
            currently disabled.

    Returns:
        None, so the module's inputs are passed through unchanged.
    """
    layer_inputs.append(list(args))

In [11]:
handle = model.vpm.encoder.layers[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)

In [12]:
# Run one forward pass per calibration example so the pre-hook registered on the
# first vision-encoder layer can record that layer's inputs.
# The original wrapped the call in `for k, v in example.items(): ...; break`,
# whose only effect was to run the forward pass once; it is called directly here.
try:
    for example in traindataset:
        a = model(example)
finally:
    # Always detach the hook, even if a forward pass raises, so it does not stay
    # registered on the layer and keep appending to `layer_inputs`.
    handle.remove()


In [8]:
model.vpm.encoder.layers[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)

<torch.utils.hooks.RemovableHandle at 0x7efb9932ca50>

# 取激活

In [20]:
import datasets
import random
import numpy as np
import torch

def _encode_scienceqa_batch(processor, prompts, images):
    """Run `processor` on a batch of prompts/images and keep the calibration fields."""
    inputs = processor(
        prompts,
        images,
        max_slice_nums=processor.image_processor.max_slice_nums,
        use_image_id=processor.image_processor.use_image_id,
        return_tensors="pt",
        max_length=8192
    )
    fields = ("input_ids", "attention_mask", "pixel_values",
              "image_sizes", "image_bound", "tgt_sizes")
    return {field: inputs[field] for field in fields}


def get_ScienceQA(nsamples, seed, seqlen, processor, status=0):
    """Build a multimodal calibration set from the local ScienceQA "train" split.

    The split is shuffled with `seed`, records without an image are skipped, and
    the first `nsamples` remaining records are turned into chat-template prompts
    (image placeholder followed by the question) and encoded with `processor`.

    Args:
        nsamples: Number of image-bearing samples to collect.
        seed: Seed passed to `dataset.shuffle`.
        seqlen: Unused; kept so existing calls keep working.
        processor: Multimodal processor exposing `tokenizer.apply_chat_template`,
            `image_processor.max_slice_nums` and `image_processor.use_image_id`.
        status: 0 (default) encodes every sample on its own and returns one dict
            per sample; 1 encodes all samples in a single processor call and
            returns a one-element list holding the batched dict.

    Returns:
        A list of dicts with keys "input_ids", "attention_mask", "pixel_values",
        "image_sizes", "image_bound" and "tgt_sizes". Empty if no sample was
        collected.

    Raises:
        ValueError: If `status` is neither 0 nor 1.
    """
    if status not in (0, 1):
        raise ValueError(f"status must be 0 (per-sample) or 1 (batched), got {status!r}")

    dataset = datasets.load_from_disk("/home/workspace/dataset/ScienceQA-2")["train"]
    dataset = dataset.shuffle(seed=seed)

    per_sample = status == 0
    traindataset = []
    batch_prompts = []
    batch_images = []
    collected = 0
    for record in dataset:
        if collected >= nsamples:
            break
        image_file = record["image"]
        if image_file is None:
            # Text-only record: skip it without counting it towards nsamples.
            continue
        image = np.array(image_file)
        msgs = [{'role': 'user', 'content': "(<image>./</image>)\n" + record["question"]}]
        prompt = processor.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        collected += 1

        if per_sample:
            # Encode immediately so only one decoded image is held at a time.
            traindataset.append(_encode_scienceqa_batch(processor, [prompt], [[image]]))
        else:
            batch_prompts.append(prompt)
            batch_images.append([image])

    if not per_sample and batch_prompts:
        traindataset.append(_encode_scienceqa_batch(processor, batch_prompts, batch_images))

    return traindataset

## VIT量化模型

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
# 全整模型
from auto_gptq import AutoGPTQForVIT, BaseQuantizeConfig
# quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-g128"
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256"
# model_quant = AutoGPTQForVIT.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False)
model_quant = AutoGPTQForVIT.from_quantized(quantized_model_dir, use_triton=False)

  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type minicpmv to instantiate a model of type minicpm. This is not supported for all configurations of models and can yield errors.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of the model checkpoint at /home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256 were not used when initializing MiniCPMV: ['vpm.encoder.layers.0.mlp.fc1.g_idx', 'vpm.encoder.layers.0.mlp.fc1.qweight', 'vpm.encoder.layers.0.mlp.fc1.qzeros', 'vpm.encoder.layers.0.mlp.fc1.scales', 'vpm.encoder.layers.0.mlp.fc2.g_idx', 'vpm.encoder.layers.0.mlp.fc2.qweight', 'vpm.encoder.layers.0.mlp.fc2.qzeros', 'vpm.encoder.layers.0.mlp.fc2.scales', 'vpm.encoder.layers.0.self_attn.k_proj.g_i

In [4]:
from transformers import AutoProcessor
model_path='/home/workspace/model/MiniCPM-3o-1B-sft-v1'
model_quant.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model_quant.seqlen = 2048

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 浮点模型

In [5]:
import torch

from auto_gptq.modeling.minicpm.modeling_minicpmv import MiniCPMV
from torch import nn

model_path='/home/workspace/model/MiniCPM-3o-1B-sft-v1'
model = MiniCPMV.from_pretrained(model_path, trust_remote_code=True).to(dtype=torch.float32)
from transformers import AutoProcessor
model.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model.seqlen = 2048

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## 数据导入

In [21]:
traindataset = get_ScienceQA(32, 0, model.seqlen, model.processor,0)
len(traindataset)

32

In [23]:
import pickle 
 
 
# 保存到文件 
with open('data.pkl',  'wb') as f: 
    pickle.dump(traindataset,  f) 

In [24]:
with open('data.pkl',  'rb') as f: 
    loaded_data = pickle.load(f)  

In [25]:
loaded_data

[{'input_ids': tensor([[    1, 73441,  1836,     5, 73460, 59344, 73461, 73448,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
               0,     0, 73449,     5, 28780,  1379,  1358,  2800,  2286, 12344,
           59361, 59328,  2076,  2087,    74, 73440,     5, 73441, 43686,     5]],
         dtype=torch.int32),
  'attention_mask': tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, True, True, True, True, True, True, True,
           True, True, True, True, True, Tr

## 取hidden state

In [14]:
# Captured inputs/outputs keyed by module name (suffixed on repeated calls).
hidden_states = {}
# Number of times each module name has been seen by the hook.
module_name_count = {}


def hook_fn(module, input, output):
    """Forward hook that stores a module's input and output under a unique key.

    The module is expected to carry a `name` attribute (set when the hook is
    registered). The first call for a name is stored under that name; every
    later call for the same name is stored under "<name>_<call number>", so
    repeated forward passes do not overwrite each other.
    """
    module_name = module.name
    seen = module_name_count.get(module_name, 0) + 1
    module_name_count[module_name] = seen

    # Only the second and later occurrences get a numeric suffix.
    module_key = module_name if seen == 1 else f"{module_name}_{seen}"
    hidden_states[module_key] = {'input': input, 'output': output}

handle_list = []
ignore_layers = ["resampler","llm.model","llm.lm_head"]
# To hook the quantized model instead, iterate over model_quant.model.named_modules().
vpm_targets = (
    (name, module)
    for name, module in model.named_modules()
    if name.endswith("vpm")
)
for name, module in vpm_targets:
    # Tag the module with its qualified name so hook_fn can key its records.
    module.name = name
    print(f"Registering hook for {name}")
    handle_list.append(module.register_forward_hook(hook_fn))

Registering hook for vpm


In [15]:
import torch

# Move both models to the GPU once; the original re-invoked .cuda() on every iteration.
model_quant.model.cuda()
model.cuda()

# Inference only: without no_grad every tensor captured by the hooks carries a
# grad_fn and keeps its autograd graph alive.
with torch.no_grad():
    for example in traindataset:
        # The examples are moved in place; only the first pixel_values entry is
        # transferred, exactly as before.
        example["input_ids"] = example["input_ids"].cuda()
        example["attention_mask"] = example["attention_mask"].cuda()
        example["pixel_values"][0][0] = example["pixel_values"][0][0].cuda()
        model_quant.model(example)
        model(example)

----------------------scale_value: tensor(73.2500, device='cuda:0', dtype=torch.float16, grad_fn=<MaxBackward1>)
----------------------scale_value: tensor(73.2310, device='cuda:0', grad_fn=<MaxBackward1>)


In [18]:
ss = hidden_states["vpm"]["output"]["last_hidden_state"]

In [17]:
s

tensor([[[-1.9971,  2.3770,  0.1758,  ...,  0.4910,  0.6113,  1.0439],
         [-1.5332,  2.1875, -0.2476,  ..., -0.1013,  0.1482,  1.7949],
         [-0.1750, -0.0800, -0.5186,  ..., -1.0283,  0.6602,  0.1801],
         ...,
         [ 0.2646, -0.2050, -0.5308,  ..., -0.2505,  0.6084,  1.9346],
         [-2.7012,  1.2871,  0.6060,  ..., -1.7246, -0.1388,  0.9688],
         [-1.9326,  1.8760, -0.5083,  ...,  0.1836, -0.3635,  0.5562]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<NativeLayerNormBackward0>)

In [10]:
s = hidden_states["vpm"]["output"]["last_hidden_state"]

In [19]:
torch.save(ss, "fp_vit.pt")

In [12]:
torch.save(s, "quant_vit.pt")

In [13]:
for handle in handle_list:
    handle.remove()

In [None]:
import os

import torch

# Write every captured output to <output_folder>/<key> as raw bytes (ndarray.tofile).
output_folder = "/home/workspace/code/git/AutoGPTQ_mlm/hidden_states" 
os.makedirs(output_folder, exist_ok=True)  # tofile does not create missing directories
for k, v in hidden_states.items():  
    file_path = os.path.join(output_folder,  str(k)) 
    output = v["output"]
    if not isinstance(output, torch.Tensor):
        # The vpm hook captures a model-output object; other cells read its
        # tensor through the "last_hidden_state" entry.
        output = output["last_hidden_state"]
    # Captured tensors are on the GPU; .numpy() only works on CPU tensors.
    output.detach().cpu().numpy().tofile(file_path)

## 取激活

In [16]:
def move_to_cpu(data):
    """Return `data` with every contained tensor moved to the CPU.

    Args:
        data: A tensor, or a tuple/list whose items may be tensors, nested
            tuples/lists, or arbitrary other objects.

    Returns:
        A CPU tensor for tensor input; otherwise a new container of the same
        kind (tuple or list) in which tensors, at any nesting depth, are on the
        CPU and all other items are passed through unchanged.

    Raises:
        TypeError: If `data` itself is not a tensor, tuple or list.
    """
    if isinstance(data, torch.Tensor):
        return data.cpu()
    if isinstance(data, (tuple, list)):
        # Recurse only into tensors and containers; leave None, ints, etc. as they are.
        moved = [
            move_to_cpu(item) if isinstance(item, (torch.Tensor, tuple, list)) else item
            for item in data
        ]
        return tuple(moved) if isinstance(data, tuple) else moved
    raise TypeError("Unsupported data type")

In [17]:
# Latest captured input/output per hooked module, keyed by module name.
activations = {}
import torch.nn as nn


def hook_fn(module, input, output):
    """Forward hook that records a module's most recent input and output.

    The record is stored under `module.name`, so a later forward pass through
    the same module overwrites the earlier one. Values are stored exactly as
    received by the hook (no copy, no device transfer).
    """
    record = {'input': input, 'output': output}
    activations[module.name] = record
from auto_gptq.nn_modules.qlinear.qlinear_cuda_old import QuantLinear

handle_list = []
ignore_layers = ["resampler","llm.model","llm.lm_head"]
hooked_types = (nn.Linear, QuantLinear)
skipped_prefixes = tuple(ignore_layers)
for name, module in model.named_modules():
    # Hook only float / quantized linear layers ...
    if not isinstance(module, hooked_types):
        continue
    # ... that are not under one of the ignored sub-models.
    if name.startswith(skipped_prefixes):
        continue
    # Tag the module with its qualified name so hook_fn can key its records.
    module.name = name
    print(f"Registering hook for {name}")
    handle_list.append(module.register_forward_hook(hook_fn))

Registering hook for vpm.encoder.layers.0.self_attn.k_proj
Registering hook for vpm.encoder.layers.0.self_attn.v_proj
Registering hook for vpm.encoder.layers.0.self_attn.q_proj
Registering hook for vpm.encoder.layers.0.self_attn.out_proj
Registering hook for vpm.encoder.layers.0.mlp.fc1
Registering hook for vpm.encoder.layers.0.mlp.fc2
Registering hook for vpm.encoder.layers.1.self_attn.k_proj
Registering hook for vpm.encoder.layers.1.self_attn.v_proj
Registering hook for vpm.encoder.layers.1.self_attn.q_proj
Registering hook for vpm.encoder.layers.1.self_attn.out_proj
Registering hook for vpm.encoder.layers.1.mlp.fc1
Registering hook for vpm.encoder.layers.1.mlp.fc2
Registering hook for vpm.encoder.layers.2.self_attn.k_proj
Registering hook for vpm.encoder.layers.2.self_attn.v_proj
Registering hook for vpm.encoder.layers.2.self_attn.q_proj
Registering hook for vpm.encoder.layers.2.self_attn.out_proj
Registering hook for vpm.encoder.layers.2.mlp.fc1
Registering hook for vpm.encoder.lay

In [13]:
for example in traindataset:
    # example["input_ids"] = example["input_ids"].cuda()
    # example["attention_mask"] = example["attention_mask"].cuda()
    # example["pixel_values"][0][0] = example["pixel_values"][0][0].cuda()
    # model.cuda()(example)
    model(example)
# for handle in handle_list:
#     handle.remove()

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:6 and cuda:0! (when checking argument for argument weight in method wrapper_CUDA__cudnn_convolution)

In [19]:
for handle in handle_list:
    handle.remove()

In [20]:
activations

{'vpm.encoder.layers.0.self_attn.q_proj': {'input': (tensor([[[ 4.8578e-04, -3.4855e-03,  1.1160e-03,  ...,  2.4031e-01,
              2.4336e-01, -3.4616e-04],
            [ 1.1152e-02, -3.5248e-03, -1.5366e-03,  ...,  2.0528e-01,
              2.3838e-01, -3.3171e-04],
            [ 7.0265e-02, -3.8696e-03, -1.7894e-03,  ...,  3.3367e-01,
              1.6415e-01, -4.2620e-04],
            ...,
            [ 5.7611e-03, -1.0862e-03,  5.4058e-03,  ...,  2.2158e-02,
              2.4040e-01, -4.5663e-04],
            [ 5.7611e-03, -1.0862e-03,  5.4058e-03,  ...,  2.2158e-02,
              2.4040e-01, -4.5663e-04],
            [ 5.7611e-03, -1.0862e-03,  5.4058e-03,  ...,  2.2158e-02,
              2.4040e-01, -4.5663e-04]],
   
           [[ 4.1334e-03, -1.7801e-03, -1.1118e-05,  ...,  2.6957e-01,
              2.3497e-01, -4.3304e-04],
            [ 2.7526e-03, -1.9192e-03, -3.7443e-03,  ...,  8.9450e-02,
              2.4055e-01, -4.0075e-04],
            [-4.6041e-02, -2.5858e-03,  

In [21]:
# Max absolute value of every captured input/output, keyed "<module>.<input|output>".
max_values = {}
for k, v in activations.items():
    for kk, vv in v.items():
        scale_name = k + "." + kk
        if kk == "input":
            # Hook inputs arrive as a tuple of positional arguments.
            if len(vv) != 1:
                if len(vv) > 1:
                    print(vv[1])
                print("input 有多个元素")
                # No scale can be derived here. The original fell through and
                # recorded the stale value from the previous entry, so skip it.
                continue
            scale_value = torch.max(torch.abs(vv[0]))
        elif kk == "output":
            scale_value = torch.max(torch.abs(vv))
        else:
            # Unknown record field: nothing to measure.
            continue
        print(k + "." + kk, " scale: ", scale_value.item())
        max_values[scale_name] = scale_value.item()

vpm.encoder.layers.0.self_attn.q_proj.input  scale:  10.097468376159668
vpm.encoder.layers.0.self_attn.q_proj.output  scale:  7.09360408782959
vpm.encoder.layers.0.self_attn.k_proj.input  scale:  10.097468376159668
vpm.encoder.layers.0.self_attn.k_proj.output  scale:  7.006409168243408
vpm.encoder.layers.0.self_attn.v_proj.input  scale:  10.097468376159668
vpm.encoder.layers.0.self_attn.v_proj.output  scale:  7.136604309082031
vpm.encoder.layers.0.self_attn.out_proj.input  scale:  7.132835865020752
vpm.encoder.layers.0.self_attn.out_proj.output  scale:  9.77147388458252
vpm.encoder.layers.0.mlp.fc1.input  scale:  22.65842056274414
vpm.encoder.layers.0.mlp.fc1.output  scale:  12.893155097961426
vpm.encoder.layers.0.mlp.fc2.input  scale:  12.079593658447266
vpm.encoder.layers.0.mlp.fc2.output  scale:  12.842710494995117
vpm.encoder.layers.1.self_attn.q_proj.input  scale:  9.794299125671387
vpm.encoder.layers.1.self_attn.q_proj.output  scale:  6.926030158996582
vpm.encoder.layers.1.self_a

In [22]:
import json
with open("max_abs_value_vit_quant.json", "w") as json_file:
    json.dump(max_values, json_file)

# 模型测试

In [9]:
import torch
from transformers import AutoModel, AutoTokenizer,AutoProcessor,set_seed

set_seed(42)
# torch.manual_seed(0)
# model_path = "/home/workspace/model/MiniCPM-Llama3-V-2_5"
# model_path = "/home/workspace/model/minicpm-vit-1b-w8-lenovo-llama-w8-pergroup128"
# model_path = "/home/workspace/model/minicpm-vit-1b-w8-lenovo"
# model_path = "/home/workspace/model/llava-1___5-7b-hf"
# model_path = "/home/workspace/model/minicpm-gptq-w4-32-perchannel-only_quant_downproj"
model_path = "/home/workspace/model/MiniCPM-V-1B-sft-v2-1B"
model = AutoModel.from_pretrained(model_path, 
                                  trust_remote_code=True,
                                  device_map=None) 
# model_weight_path = "/home/workspace/model/minicpm_v_navit_250_0927.pt"
# model.load_state_dict(torch.load(model_weight_path))
model = model.cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

2024-11-06 09:01:37 INFO [transformers_modules.MiniCPM-V-1B-sft-v2-1B.configuration_minicpm] vision_config is None, using default vision config


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
model.vpm

SiglipVisionTransformer(
  (embeddings): SiglipVisionEmbeddings(
    (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
    (position_embedding): Embedding(4900, 1152)
  )
  (encoder): SiglipEncoder(
    (layers): ModuleList(
      (0-26): 27 x SiglipEncoderLayer(
        (self_attn): SiglipAttention(
          (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
        )
        (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
        (mlp): SiglipMLP(
          (activation_fn): PytorchGELUTanh()
          (fc1): Linear(in_features=1152, out_features=4304, bias=True)
          (fc2): Linear(in_features=4304, out_features=1152, bias=True)
        )
        (layer_norm2): LayerNorm((1152,), ep

In [31]:
vpm = model.vpm
input_shape = (1, 1024, 1152)
example_input = torch.ones(input_shape, dtype=torch.float32)

# Switch the vision tower and each encoder layer to their "prepared" variants.
# The calls are kept in the original order: model-level layernorm first, then
# per layer layernorm, attention (sha) and MLP (conv).
vpm.prepare_layernorm()
for encoder_layer in vpm.encoder.layers:
    encoder_layer.prepare_layernorm()
    encoder_layer.self_attn.prepare_sha()
    encoder_layer.mlp.prepare_conv()

In [33]:
vpm.cuda()

SiglipVisionTransformer(
  (embeddings): SiglipVisionEmbeddings(
    (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
    (position_embedding): Embedding(4900, 1152)
  )
  (encoder): SiglipEncoder(
    (layers): ModuleList(
      (0-26): 27 x SiglipEncoderLayer(
        (self_attn): SiglipAttention(
          (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (q_proj_sha): ModuleList(
            (0-15): 16 x Conv2d(1152, 72, kernel_size=(1, 1), stride=(1, 1))
          )
          (k_proj_sha): ModuleList(
            (0-15): 16 x Conv2d(1152, 72, kernel_size=(1, 1), stride=(1, 1))
          )
          (v_proj_sha): ModuleList(
            (0-15): 16 x Conv2d(1152, 72, kernel_size=(1, 1), stride=(1

In [34]:
x1 = vpm(example_input.cuda())

In [35]:
x-x1

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.]],

         ...,

         [[0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.]]]], device='cuda:0',
       grad_fn=<SubBackward0>)

In [None]:
import datasets
import random
def get_mme(nsamples, seed, seqlen, processor):
    """Build (input, target) token pairs from the local MME "test" split.

    The split is shuffled with `seed`; the first `nsamples` records are turned
    into chat-template prompts (image placeholder followed by the question) and
    tokenized together in one processor call.

    Args:
        nsamples: Number of samples to return. The previous implementation
            checked the limit after appending and so returned one extra sample.
        seed: Seed passed to `dataset.shuffle`.
        seqlen: Unused; kept so existing calls keep working.
        processor: Multimodal processor exposing `tokenizer.apply_chat_template`.

    Returns:
        A list of `(inp, tar)` tuples. `inp` is one row of the batched
        "input_ids" with a leading batch dimension of 1; `tar` is a copy of it
        with every position except the last set to -100. Empty if no sample was
        collected.
    """
    dataset = datasets.load_from_disk("/home/workspace/dataset/MME")["test"]
    dataset = dataset.shuffle(seed=seed)

    prompts_lists = []
    input_images_lists = []
    for _data in dataset:
        # Check the limit before appending so exactly nsamples are collected.
        if len(prompts_lists) >= nsamples:
            break
        msgs = [{'role': 'user', 'content': "(<image>./</image>)\n" + _data["question"]}]
        prompts_lists.append(processor.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        input_images_lists.append([_data["image"]])

    if not prompts_lists:
        return []

    inputs = processor(
        prompts_lists,
        input_images_lists,
        max_slice_nums=9,
        use_image_id=True,
        return_tensors="pt",
        max_length=8192
    )["input_ids"]

    trainloader = []
    for i in range(inputs.size(0)):
        # Take row i and restore the batch dimension.
        inp = inputs.select(0, i).unsqueeze(0)
        tar = inp.clone()
        # Mask every position except the last one.
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader

In [None]:
pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained(pretrained_model_dir, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
a = get_mme(128, 0, 2048, processor)

You're using a MiniCPMVTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Longest tokenized sequence across all samples returned by get_mme.
# The original read a[0][0] on every iteration, so it only ever measured the first sample.
s = 0
for inp, _tar in a:
    s = max(inp.shape[1], s)
print(s)

NameError: name 'a' is not defined

# 量化参数校验

In [1]:
import torch

from auto_gptq.modeling.minicpm.modeling_minicpmv import MiniCPMV
from torch import nn

model_path='/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit_w8_pc_llm_pc'
model = MiniCPMV.from_pretrained(model_path, trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [2]:
import torch

from auto_gptq.modeling.minicpm.modeling_minicpmv import MiniCPMV
from torch import nn

model_fp_path='/home/workspace/model/MiniCPM-3o-1B-sft-v1'
model_fp = MiniCPMV.from_pretrained(model_fp_path, trust_remote_code=True)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


In [11]:
import torch.nn as nn
# Sub-models excluded from the check (matched by name prefix).
ignore_layers = ["resampler","llm.lm_head","vpm"]
# Walk `model` and the float reference `model_fp` in lockstep.
# NOTE(review): zip pairs modules purely by iteration order, so this assumes both
# models expose the same module structure — confirm.
for (name, module), (name_fp, module_fp) in zip(model.named_modules(),model_fp.named_modules()):
    if isinstance(module, nn.Linear) and not any(name.startswith(ignore_layer) for ignore_layer in ignore_layers):
        # Report the first checked Linear layer whose weights are element-wise
        # identical to the float model, then stop. The printed message is
        # Chinese for "parameter quantization error".
        if torch.all((module_fp.weight-module.weight).eq(0)):
            print("参数量化错误: ",name)
            break

参数量化错误:  llm.model.layers.0.self_attn.q_proj


# 权重合并

In [18]:
# Metadata written into the merged safetensors file (all values must be strings).
# The merge takes the ViT from the "vit-w8" checkpoint and the LLM from the
# "llm_pc_w4" checkpoint, so the LLM is 4-bit and the ViT is 8-bit; the two
# bit-width entries were previously swapped.
metadata = {'gptq_group_size': '-1',
            'gptq_damp_percent': '0.01',
            'auto_gptq_version': '0.8.0.dev0',
            'gptq_quant_method': 'gptq',
            'gptq_desc_act': 'True',
            'llm_gptq_bits': '4',
            'vit_gptq_bits': '8',
            'gptq_checkpoint_format': 'gptq',
            'format': 'pt'}

In [19]:
import os

from safetensors.torch import load, load_file, save_file

# Path definitions
path2 = '/home/workspace/model/MiniCPM-3o-1B-sft-v1-llm_pc_w4/model.safetensors'
path1 = '/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256/model.safetensors'
output_path = '/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc/model.safetensors'

# Load the safetensors files. `load_file` parses directly from disk, which avoids
# first copying the whole checkpoint into a Python bytes object.
weights1 = load_file(path1)
weights2 = load_file(path2)

# Select weights: every tensor whose name contains "vpm" is taken from the
# ViT-quantized checkpoint (path1); all remaining tensors are taken from the
# LLM-quantized checkpoint (path2).
selected_weights = {key: value for key, value in weights1.items() if 'vpm' in key}
selected_weights.update(
    (key, value) for key, value in weights2.items() if 'vpm' not in key
)

# The metadata goes through the `metadata=` argument rather than into the tensor
# dict, because the dict passed to `save_file` may only contain tensors.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
save_file(selected_weights, output_path, metadata=metadata)

print(f"Combined weights saved to {output_path}")

Combined weights saved to /home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc/model.safetensors


In [11]:
from safetensors.torch import load, load_file, save_file

# Re-open the merged checkpoint to inspect what was actually written.
output_path = '/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc/model.safetensors'
# `load_file` parses directly from disk instead of reading the whole file into
# a bytes buffer first.
weights2 = load_file(output_path)

In [12]:
# Display the merged tensor dict loaded in the previous cell.
# NOTE(review): this prints every tensor in the checkpoint; consider showing only
# names/shapes to keep the saved notebook output small.
weights2

{'vpm.encoder.layers.0.self_attn.q_proj.bias': tensor([-0.4961, -0.2480,  0.5977,  ...,  0.2539,  0.3809,  3.5781]),
 'llm.model.layers.14.mlp.up_proj.qzeros': tensor([[2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071, 2004318071, 2004318071, 2004318071, 2004318071, 2004318071,
          2004318071,

In [6]:
from auto_gptq.modeling.minicpm_new.modeling_minicpmv import MiniCPMV
# Directory that holds the merged (ViT w8 + LLM w4) checkpoint.
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc"
# `trust_remote_code` is only honoured by the transformers Auto* classes; on a
# concrete model class it is ignored with a warning, so it is not passed here.
model = MiniCPMV.from_pretrained(quantized_model_dir)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of the model checkpoint at /home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc were not used when initializing MiniCPMV: ['llm.model.layers.0.mlp.down_proj.g_idx', 'llm.model.layers.0.mlp.down_proj.qweight', 'llm.model.layers.0.mlp.down_proj.qzeros', 'llm.model.layers.0.mlp.down_proj.scales', 'llm.model.layers.0.mlp.gate_proj.g_idx', 'llm.model.layers.0.mlp.gate_proj.qweight', 'llm.model.layers.0.mlp.gate_proj.qzeros', 'llm.model.layers.0.mlp.gate_proj.scales', 'llm.model.layers.0.mlp.up_proj.g_idx', 'llm.model.layers.0.mlp.up_proj.qweight', 'llm.model.layers.0.mlp.up_proj.qzeros', 'llm.model.layers.0.mlp.up_proj.scales', 'llm.model.layers.0.self_attn.k_proj.g_idx', 'llm.model.layers.0.self_attn.k_proj.qweight', 'llm.model.layers.0.self_attn.k_proj.qzeros', 'llm.model.layers.0.self_at

In [9]:
import transformers
import torch.nn as nn

def find_layers(module, layers=None, name=""):
    """Collect the sub-modules of ``module`` that are instances of ``layers``.

    Args:
        module: Root module to search. If it is itself an instance of one of
            ``layers`` it is returned directly under ``name``.
        layers: Sequence of layer classes to look for. When empty or ``None``
            it defaults to ``[transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear]``.
        name: Dotted-name prefix of ``module`` (empty string for the root).

    Returns:
        dict mapping the dotted name of each matching module to the module.
        The search does not descend into a module once it has matched.
    """
    if not layers:
        layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear]

    # A matching module is a leaf of the search: report it and stop descending.
    if any(isinstance(module, layer_type) for layer_type in layers):
        return {name: module}

    found = {}
    for child_name, child in module.named_children():
        qualified_name = child_name if name == "" else name + "." + child_name
        found.update(find_layers(child, layers=layers, name=qualified_name))
    return found

In [10]:
# List every Conv1D / Conv2d / Linear module (the `find_layers` defaults) found in the loaded model.
find_layers(model)

{'llm.model.layers.0.self_attn.q_proj': Linear(in_features=1536, out_features=1536, bias=False),
 'llm.model.layers.0.self_attn.k_proj': Linear(in_features=1536, out_features=512, bias=False),
 'llm.model.layers.0.self_attn.v_proj': Linear(in_features=1536, out_features=512, bias=False),
 'llm.model.layers.0.self_attn.o_proj': Linear(in_features=1536, out_features=1536, bias=False),
 'llm.model.layers.0.mlp.gate_proj': Linear(in_features=1536, out_features=3840, bias=False),
 'llm.model.layers.0.mlp.up_proj': Linear(in_features=1536, out_features=3840, bias=False),
 'llm.model.layers.0.mlp.down_proj': Linear(in_features=3840, out_features=1536, bias=False),
 'llm.model.layers.1.self_attn.q_proj': Linear(in_features=1536, out_features=1536, bias=False),
 'llm.model.layers.1.self_attn.k_proj': Linear(in_features=1536, out_features=512, bias=False),
 'llm.model.layers.1.self_attn.v_proj': Linear(in_features=1536, out_features=512, bias=False),
 'llm.model.layers.1.self_attn.o_proj': Linea

# 取激活参数

In [1]:
import os
# Restrict this kernel to physical GPU 2.
# NOTE(review): presumably this has to run before any CUDA-using library is initialised - confirm it stays the first cell.
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [2]:
import datasets
import random
import numpy as np
import torch

def _iter_scienceqa_samples(dataset, processor, limit):
    """Yield ``(chat_prompt, image_array)`` for the first ``limit`` rows that have an image."""
    if limit <= 0:
        return
    produced = 0
    for row in dataset:
        image_file = row["image"]
        if image_file is None:
            # Rows without an image are skipped and do not count towards `limit`.
            continue
        msgs = [{'role': 'user', 'content': "(<image>./</image>)\n" + row["question"]}]
        prompt = processor.tokenizer.apply_chat_template(
            msgs, tokenize=False, add_generation_prompt=True
        )
        yield prompt, np.array(image_file)
        produced += 1
        if produced >= limit:
            return


def _encode_scienceqa_batch(processor, prompts, images):
    """Run ``processor`` on one batch and keep only the fields used for calibration."""
    inputs = processor(
        prompts,
        images,
        max_slice_nums=processor.image_processor.max_slice_nums,
        use_image_id=processor.image_processor.use_image_id,
        return_tensors="pt",
        max_length=8192
    )
    fields = (
        "input_ids",
        "attention_mask",
        "pixel_values",
        "image_sizes",
        "image_bound",
        "tgt_sizes",
    )
    return {field: inputs[field] for field in fields}


def get_ScienceQA(nsamples, seed, seqlen, processor, status,
                  dataset_path="/home/workspace/dataset/ScienceQA-2", dataset=None):
    """Build calibration inputs from the image-bearing rows of ScienceQA.

    The dataset is shuffled with ``seed`` and the first ``nsamples`` rows that
    contain an image are turned into chat prompts of the form
    ``"(<image>./</image>)\\n" + question`` and encoded with ``processor``.

    Args:
        nsamples: Number of image-bearing rows to use.
        seed: Seed passed to ``dataset.shuffle``.
        seqlen: Unused; kept for interface compatibility (the processor is
            always called with ``max_length=8192``).
        processor: Object providing ``tokenizer.apply_chat_template``,
            ``image_processor.max_slice_nums``, ``image_processor.use_image_id``
            and a ``__call__`` that returns a mapping with the keys
            ``input_ids``, ``attention_mask``, ``pixel_values``, ``image_sizes``,
            ``image_bound`` and ``tgt_sizes``.
        status: ``0`` encodes every sample on its own (one dict per sample);
            ``1`` encodes all samples in a single processor call (one dict).
        dataset_path: Directory passed to ``datasets.load_from_disk`` when
            ``dataset`` is not given; its ``"train"`` split is used.
        dataset: Optional already-loaded split (must support ``shuffle(seed=...)``
            and iteration over rows with ``"question"`` and ``"image"`` keys).

    Returns:
        list of dicts with the six keys listed above. For ``status == 1`` the
        list has a single element, or is empty when no sample was selected.

    Raises:
        ValueError: if ``status`` is neither 0 nor 1 (previously this surfaced
            as an UnboundLocalError at the ``return`` statement).
    """
    if status not in (0, 1):
        raise ValueError(f"status must be 0 (per-sample) or 1 (single batch), got {status!r}")

    if dataset is None:
        dataset = datasets.load_from_disk(dataset_path)["train"]
    dataset = dataset.shuffle(seed=seed)

    samples = _iter_scienceqa_samples(dataset, processor, nsamples)

    if status == 0:
        # One processor call, and therefore one entry, per sample.
        return [
            _encode_scienceqa_batch(processor, [prompt], [[image]])
            for prompt, image in samples
        ]

    # status == 1: gather everything first, then encode as one batch.
    prompts_lists = []
    input_images_lists = []
    for prompt, image in samples:
        prompts_lists.append(prompt)
        input_images_lists.append([image])
    if not prompts_lists:
        # Nothing selected (nsamples <= 0 or no row with an image): do not call
        # the processor on an empty batch.
        return []
    return [_encode_scienceqa_batch(processor, prompts_lists, input_images_lists)]

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
from auto_gptq import AutoGPTQForCausalMLM, BaseQuantizeConfig
from auto_gptq import AutoGPTQForVIT, BaseQuantizeConfig
# Directory of the merged (ViT w8 + LLM w4) checkpoint to load for inference.
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc"
# Alternative checkpoint kept for reference:
# quantized_model_dir = "/home/workspace/model/m inicpm-3o-sft-v1-gptq-1112"
# Load the quantized model onto the first visible GPU with the Triton kernels disabled.
model = AutoGPTQForCausalMLM.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False)

  from .autonotebook import tqdm as notebook_tqdm
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type minicpmv to instantiate a model of type minicpm. This is not supported for all configurations of models and can yield errors.
INFO - Ignoring unknown parameter in the quantization configuration: vit_bits.
INFO - Ignoring unknown parameter in the quantization configuration: llm_bits.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of the model checkpoint at /home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc were not used when initializing MiniCPMV: ['llm.model.layers.0.mlp.down_proj.g_idx', 'llm.model.layers.0.mlp.down_proj.qweight', 'llm.model.layers.0.mlp.down_proj.qzeros', 'llm.mo

In [2]:
import os
from safetensors.torch import load
import shutil
import torch
# for bits in [2,4,8], per-channel quantization
def get_fake_weight(bits, qzeros, scales, qweight, group_size):
    """Dequantize packed per-channel GPTQ tensors into a dense ("fake-quant") weight.

    Args:
        bits: Bit width of each packed value (2, 4 or 8); ``32 // bits`` values
            are packed into every int32 element.
        qzeros: int32 tensor of shape ``(1, out_features * bits // 32)`` holding
            the packed zero-points (packed along dim 1).
        scales: Tensor with ``out_features`` entries in its last dimension.
        qweight: int32 tensor of shape ``(in_features * bits // 32, out_features)``
            holding the packed weights (packed along dim 0).
        group_size: Unused; only per-channel quantization (a single row of
            zero-points/scales) is supported.

    Returns:
        Tensor of shape ``(out_features, in_features)`` equal to
        ``scales * (unpacked_weight - (unpacked_zero + 1))`` transposed.

    Raises:
        ValueError: if ``qzeros`` has more than one row (grouped quantization),
            which previously broadcast silently into a wrong 3-D result.
    """
    if qzeros.shape[0] != 1:
        raise ValueError(
            "get_fake_weight only supports per-channel quantization "
            f"(one row of zero-points), got {qzeros.shape[0]} groups"
        )

    pack_factor = 32 // bits
    value_mask = (2**bits) - 1
    # 8-bit fields do not fit into a signed int8, so unpack those into int16.
    unpack_dtype = torch.int16 if bits == 8 else torch.int8

    # Bit offset of every packed field inside one int32. Created on the same
    # device as the packed tensors so the shifts also work for non-CPU inputs.
    wf = torch.tensor(
        list(range(0, 32, bits)), dtype=torch.int32, device=qweight.device
    ).unsqueeze(0)

    zeros = torch.bitwise_right_shift(
        torch.unsqueeze(qzeros, 2).expand(-1, -1, pack_factor),
        wf.to(qzeros.device).unsqueeze(0),
    ).to(unpack_dtype)
    # The stored zero-points are offset by one: add it back, then mask.
    # NOTE: masking has to happen after the `+ 1`, not before.
    zeros = zeros + 1
    zeros = torch.bitwise_and(zeros, value_mask)
    zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])

    scales = scales.reshape(-1, 1, scales.shape[-1])

    weight = torch.bitwise_right_shift(
        torch.unsqueeze(qweight, 1).expand(-1, pack_factor, -1),
        wf.unsqueeze(-1),
    ).to(unpack_dtype)
    weight = torch.bitwise_and(weight, value_mask)
    weight = weight.reshape(-1, weight.shape[2])  # (in_features, out_features)

    weight = scales * (weight - zeros)  # broadcasts to (1, in_features, out_features)
    weight = weight.squeeze(0).T
    return weight

# Path configuration for the fake-quant conversion below. Only the uncommented
# assignments are active; the commented ones are earlier checkpoints kept for reference.
#
# real_path: safetensors file of the real (packed) AutoGPTQ checkpoint to convert.
# real_path = '/data/checkpoints/gptq_models/MiniCPM-1B-sft-llama-format-gptq-1028-v2dataset-self-generated-wlmhead-perchannel-desc-true-v3/model.safetensors'
# real_path = '/data/zyq/8295/checkpoints/minicpm-3o-sft-v1-gptq-1107/model.safetensors'
# real_path = '/data/zyq/8295/checkpoints/minicpm-3o-sft-v1-gptq-1112/model.safetensors'
real_path = '/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc/model.safetensors'
# real_path = '/data/checkpoints/gptq_models/MiniCPM-1B-sft-llama-format-gptq-1014-v2dataset-wlmhead-perchannel-desc-true/model.safetensors' # path to the AutoGPTQ int4 model file
# real_path = '/data/checkpoints/gptq_models/MiniCPM-1B-sft-llama-format-rotate-smooth-gptq-0925-fix-dataset-wolmhead-perchannel/model.safetensors'
# fake_path: directory where the dequantized ("fake-quant") model is written.
# fake_path = '/data/checkpoints/gptq_models/fake_quant/from_autogptq_real_quant/MiniCPM-1B-sft-llama-format' # directory for the fp16 fake-quantized model
# fake_path = '/data/checkpoints/gptq_models/fake_quant/from_autogptq_real_quant/MiniCPM-1B-sft-llama-format-rotate-smooth'
fake_path = '/home/workspace/model/MiniCPM-3o-1B-sft-v1-vit-w8-pc-c256-llm-w4-pc-fp'
# source_dir: another fp16 model directory, used to fetch the remaining config files, tokenizer, etc.
# It is only referenced by the commented-out copy loop at the end of this cell.
# source_dir = '/mnt/data/user/tc_agi/zhaoyq/minicpm-1b-sft-llama-format'
# source_dir = '/data/checkpoints/minicpm-1b-sft-llama-format-rotate-smooth'
source_dir = '/data/zyq/8295/checkpoints/fake/minicpm-3o-sft-v1-gptq-1112/'

# Convert the packed GPTQ checkpoint at `real_path` into a dense state dict and
# save it as `pytorch_model.bin` inside `fake_path`.
os.makedirs(fake_path, exist_ok=True)
with open(real_path, "rb") as f:
    file = f.read()
loaded_data = load(file)

result_dict = {}
for key, value in loaded_data.items():
    # Split "a.b.c.qweight" into ("a.b.c", "qweight"); a key without any dot
    # yields an empty prefix and is copied through under its original name
    # (it was previously renamed to ".<key>").
    common_part, _, last_part = key.rpartition('.')
    if 'qweight' in last_part:
        # The vision tower ("vpm") is quantized to 8 bits, everything else to 4.
        bits = 8 if "vpm" in key else 4
        print(f"fake quanting {key}")
        # Index directly so that a missing companion tensor fails with a
        # KeyError naming it, instead of passing None into get_fake_weight.
        qweight = loaded_data[f"{common_part}.qweight"]
        qzeros = loaded_data[f"{common_part}.qzeros"]
        scales = loaded_data[f"{common_part}.scales"]
        weight = get_fake_weight(bits, qzeros, scales, qweight, -1)
        result_dict[common_part+".weight"] = weight
    elif "qzeros" in last_part or "scales" in last_part or "g_idx" in last_part:
        # Quantization side tensors are folded into the dense weight above.
        continue
    else:
        # Every other tensor is copied unchanged.
        result_dict[key] = value
torch.save(result_dict, os.path.join(fake_path, "pytorch_model.bin"))
# for root, dirs, files in os.walk(source_dir):
#     for file in files:
#         if file == 'pytorch_model.bin':
#             continue 
#         source_file = os.path.join(root, file)
#         relative_path = os.path.relpath(source_file, source_dir)
#         destination_file = os.path.join(fake_path, relative_path)
#         os.makedirs(os.path.dirname(destination_file), exist_ok=True)
#         shutil.copy2(source_file, destination_file)

fake quanting llm.model.layers.34.mlp.up_proj.qweight
fake quanting llm.model.layers.9.mlp.down_proj.qweight
fake quanting llm.model.layers.44.mlp.down_proj.qweight
fake quanting llm.model.layers.27.self_attn.k_proj.qweight
fake quanting vpm.encoder.layers.8.self_attn.v_proj.qweight
fake quanting llm.model.layers.11.mlp.up_proj.qweight
fake quanting vpm.encoder.layers.4.self_attn.q_proj.qweight
fake quanting llm.model.layers.36.self_attn.v_proj.qweight
fake quanting llm.model.layers.4.self_attn.k_proj.qweight
fake quanting llm.model.layers.33.mlp.gate_proj.qweight
fake quanting llm.model.layers.29.self_attn.o_proj.qweight
fake quanting llm.model.layers.9.self_attn.q_proj.qweight
fake quanting llm.model.layers.15.mlp.down_proj.qweight
fake quanting llm.model.layers.14.self_attn.k_proj.qweight
fake quanting vpm.encoder.layers.6.self_attn.k_proj.qweight
fake quanting llm.model.layers.11.self_attn.o_proj.qweight
fake quanting vpm.encoder.layers.0.self_attn.q_proj.qweight
fake quanting vpm.

In [2]:
from transformers import AutoTokenizer,AutoProcessor
# The processor and tokenizer are taken from the original (un-quantized) model directory.
pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
# Attach the processor to the wrapped model (`model.model`) loaded in an earlier cell.
model.model.processor = AutoProcessor.from_pretrained(pretrained_model_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from PIL import Image
# Smoke test: ask the quantized model to describe a sample image.
image = Image.open('/home/workspace/code/llm-awq/awq/airplane.jpeg').convert('RGB')

# First round chat: the image is passed inside the message content, so `image=None` below.
question = "这是什么东西"
msgs = [{'role': 'user', 'content': [image, question]}]

# Earlier variant of the call with explicit generation settings, kept for reference:
# res = model.chat(
#         image=None,
#         msgs=msgs,
#         tokenizer=tokenizer,
#         # max_new_tokens=max_new_tokens,
#         sampling=False,
#         num_beams=1,
#         repetition_penalty=1,
# )
# `chat` is called on the wrapped model (`model.model`), not on the AutoGPTQ wrapper.
res = model.model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(res)

You're using a MiniCPMVTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


这幅图像描绘了一架商用客机，可能是一架喷气式飞机，它正处于起飞或降落过程中。由于缺乏明显的运动模糊，这似乎是一张捕捉到飞机在空中飞行的照片。


In [6]:
from transformers import AutoTokenizer,AutoProcessor
# Build the calibration set with the processor of the original model.
pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
# Arguments: nsamples=16, seed=0, seqlen=1024 (not used by get_ScienceQA), processor, status=0.
traindataset = get_ScienceQA(16, 0, 1024, AutoProcessor.from_pretrained(pretrained_model_dir, trust_remote_code=True),0)
# Number of entries returned.
len(traindataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You're using a MiniCPMVTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


16

In [12]:
# Shape of the first element nested inside `pixel_values` of the first calibration entry.
traindataset[0]["pixel_values"][0][0].shape

torch.Size([3, 14, 14112])

In [6]:
for example in traindataset:
    example["input_ids"] = example["input_ids"].cuda()
    example["attention_mask"] = example["attention_mask"].cuda()
    example["pixel_values"][0][0] = example["pixel_values"][0][0].cuda()
    model.model(example)
    # model(example)