# 纯文本模型量化

In [1]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

pretrained_model_dir = "/home/workspace/model/meta-llama-3-8b-instruct"
quantized_model_dir = "/home/workspace/model/meta-llama-3-8b-instruct-w4-g128"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
    tokenizer(
        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
    )
]

quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize model to 4-bit
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # set to False can significantly speed up inference but the perplexity may slightly bad
)

# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

  from .autonotebook import tqdm as notebook_tqdm
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [02:09<00:00, 32.34s/it]


In [2]:

# quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)

# save quantized model
model.save_quantized(quantized_model_dir)

# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)

# push quantized model to Hugging Face Hub.
# to use use_auth_token=True, Login first via huggingface-cli login.
# or pass explcit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)

# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)

INFO - Start quantizing layer 1/32
INFO - Quantizing self_attn.k_proj in layer 1/32...
2024-10-30 08:14:41 INFO [auto_gptq.quantization.gptq] duration: 10.037084102630615
2024-10-30 08:14:41 INFO [auto_gptq.quantization.gptq] avg loss: 2.1992526054382324
INFO - Quantizing self_attn.v_proj in layer 1/32...
2024-10-30 08:14:48 INFO [auto_gptq.quantization.gptq] duration: 7.339477300643921
2024-10-30 08:14:48 INFO [auto_gptq.quantization.gptq] avg loss: 0.03380981832742691
INFO - Quantizing self_attn.q_proj in layer 1/32...
2024-10-30 08:14:58 INFO [auto_gptq.quantization.gptq] duration: 9.718609094619751
2024-10-30 08:14:58 INFO [auto_gptq.quantization.gptq] avg loss: 3.3699684143066406
INFO - Quantizing self_attn.o_proj in layer 1/32...
2024-10-30 08:15:08 INFO [auto_gptq.quantization.gptq] duration: 10.15040373802185
2024-10-30 08:15:08 INFO [auto_gptq.quantization.gptq] avg loss: 0.0003416052204556763
INFO - Quantizing mlp.up_proj in layer 1/32...
2024-10-30 08:15:19 INFO [auto_gptq.q

<|begin_of_text|>auto_gptq is a simple Python script that uses the AutoGPT model to generate text
auto-gptq is a type of gptq that is generated internally by the system. It


In [3]:
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")

# download quantized model from Hugging Face Hub and load to the first GPU
# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)

# inference with model.generate
print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))

# or you can also use pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])

INFO - You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`.
INFO - The layer lm_head is not quantized.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The model 'LlamaGPTQForCausalLM' is not supported for . Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNe

<|begin_of_text|>auto_gptq is a simple Python script that uses the AutoGPT model to generate text
auto-gptq is a type of gptq that is generated internally by the system. It


# 多模态模型量化

In [20]:
import numpy as np
from datasets import load_dataset, load_from_disk
import random
import torch

def get_wikitext2(nsamples, seed, seqlen, tokenizer):
    # set seed
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    # load dataset and preprocess
    # traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    # testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    traindata = load_dataset("/home/workspace/code/git/FlatQuant_mlm/datasets/wikitext", split="train")
    testdata = load_dataset("/home/workspace/code/git/FlatQuant_mlm/datasets/wikitext", split="test")
    trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
    testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

    traindataset = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        attention_mask = torch.ones_like(inp)
        traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
    return traindataset, testenc

In [21]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalMLM, BaseQuantizeConfig
import logging

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

pretrained_model_dir = "/home/workspace/model/MiniCPM-Llama3-V-2_5"
quantized_model_dir = "/home/workspace/model/MiniCPM-Llama3-V-2_5-w4-g128"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, trust_remote_code=True)
quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize model to 4-bit
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # set to False can significantly speed up inference but the perplexity may slightly bad
)
traindataset, testenc = get_wikitext2(128, 0, model.seqlen, tokenizer)
# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalMLM.from_pretrained(pretrained_model_dir, quantize_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2024-11-06 03:36:13 INFO [transformers_modules.MiniCPM-Llama3-V-2_5.configuration_minicpm] vision_config is None, using default vision config
2024-11-06 03:36:13 INFO [transformers_modules.MiniCPM-Llama3-V-2_5.configuration_minicpm] vision_config is None, using default vision config
2024-11-06 03:36:13 INFO [transformers_modules.MiniCPM-Llama3-V-2_5.configuration_minicpm] vision_config is None, using default vision config
Loading checkpoint shards: 100%|██████████| 7/7 [00:33<00:00,  4.85s/it]


In [None]:
import math
import os
import time
from logging import getLogger
import torch
import torch.nn as nn
import transformers
from .quantizer import Quantizer
logger = getLogger(__name__)
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
class GPTQ:
    def __init__(self, layer):
        self.layer = layer
        self.dev = self.layer.weight.device
        W = layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.pytorch_utils.Conv1D):
            W = W.t()
        self.rows = W.shape[0]
        self.columns = W.shape[1]
        self.H = torch.zeros((self.columns, self.columns), device=self.dev)
        self.nsamples = 0
        self.quantizer = Quantizer()
    def add_batch(self, inp, out):
        if os.environ.get("DEBUG"):
            self.inp1 = inp
            self.out1 = out
        if len(inp.shape) == 2:
            inp = inp.unsqueeze(0)
        tmp = inp.shape[0]
        if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D):
            if len(inp.shape) == 3:
                inp = inp.reshape((-1, inp.shape[-1]))
            inp = inp.t()
        if isinstance(self.layer, nn.Conv2d):
            unfold = nn.Unfold(
                self.layer.kernel_size,
                dilation=self.layer.dilation,
                padding=self.layer.padding,
                stride=self.layer.stride,
            )
            inp = unfold(inp)
            inp = inp.permute([1, 0, 2])
            inp = inp.flatten(1)
        self.H *= self.nsamples / (self.nsamples + tmp)
        self.nsamples += tmp
        # inp = inp.float()
        inp = math.sqrt(2 / self.nsamples) * inp.float()
        # print("inp.shape: ", inp.shape)
        # print("H.shape: ", self.H.shape)
        # self.H += 2 / self.nsamples * inp.matmul(inp.t())
        self.H += inp.matmul(inp.t())
    def fasterquant(
        self,
        blocksize=128,
        percdamp=0.01,
        group_size=-1,
        actorder=False,
        static_groups=False,
    ):
        W = self.layer.weight.data.clone()
        if isinstance(self.layer, nn.Conv2d):
            W = W.flatten(1)
        if isinstance(self.layer, transformers.Conv1D):
            W = W.t()
        W = W.float()
        tick = time.time()
        if not self.quantizer.ready():
            self.quantizer.find_params(W, weight=True)
        H = self.H
        del self.H
        dead = torch.diag(H) == 0
        H[dead, dead] = 1
        W[:, dead] = 0
        g_idx = []
        scale = []
        zero = []
        now_idx = 1
        if static_groups:
            import copy
            groups = []
            for i in range(0, self.columns, group_size):
                quantizer = copy.deepcopy(self.quantizer)
                quantizer.find_params(W[:, i : (i + group_size)], weight=True)
                scale.append(quantizer.scale)
                zero.append(quantizer.zero)
                groups.append(quantizer)
        if actorder:
            perm = torch.argsort(torch.diag(H), descending=True)
            W = W[:, perm]
            H = H[perm][:, perm]
            invperm = torch.argsort(perm)
        Losses = torch.zeros_like(W)
        Q = torch.zeros_like(W)
        damp = percdamp * torch.mean(torch.diag(H))
        print("1st damp: ", damp)
        # NOTE: To Make Sure positive-definite
        eigenvalues = torch.linalg.eigvalsh(H)
        min_eigenvalue = torch.min(eigenvalues)
        if min_eigenvalue < 0:
            damp = max(damp, -min_eigenvalue + 0.01 * torch.mean(torch.diag(H)))  # 确保至少覆盖最小特征值的负面效应
            print("2nd damp: ", damp)
        diag = torch.arange(self.columns, device=self.dev)
        H[diag, diag] += damp
        H = torch.linalg.cholesky(H)
        H = torch.cholesky_inverse(H)
        H = torch.linalg.cholesky(H, upper=True)
        Hinv = H
        for i1 in range(0, self.columns, blocksize):
            i2 = min(i1 + blocksize, self.columns)
            count = i2 - i1
            W1 = W[:, i1:i2].clone()
            Q1 = torch.zeros_like(W1)
            Err1 = torch.zeros_like(W1)
            Losses1 = torch.zeros_like(W1)
            Hinv1 = Hinv[i1:i2, i1:i2]
            for i in range(count):
                w = W1[:, i]
                d = Hinv1[i, i]
                if group_size != -1:
                    if not static_groups:
                        if (i1 + i) % group_size == 0:
                            self.quantizer.find_params(W[:, (i1 + i) : (i1 + i + group_size)], weight=True)
                        if ((i1 + i) // group_size) - now_idx == -1:
                            scale.append(self.quantizer.scale)
                            zero.append(self.quantizer.zero)
                            now_idx += 1
                    else:
                        idx = i1 + i
                        if actorder:
                            idx = perm[idx]
                        self.quantizer = groups[idx // group_size]
                q = self.quantizer.quantize(w.unsqueeze(1)).flatten()
                Q1[:, i] = q
                Losses1[:, i] = (w - q) ** 2 / d**2
                err1 = (w - q) / d
                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
                Err1[:, i] = err1
            Q[:, i1:i2] = Q1
            Losses[:, i1:i2] = Losses1 / 2
            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
            if os.environ.get("DEBUG"):
                self.layer.weight.data[:, :i2] = Q[:, :i2]
                self.layer.weight.data[:, i2:] = W[:, i2:]
                logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
                logger.debug(torch.sum(Losses))
        torch.cuda.synchronize()
        logger.info(f"duration: {(time.time() - tick)}")
        logger.info(f"avg loss: {torch.sum(Losses).item() / self.nsamples}")
        group_size = group_size if group_size != -1 else self.columns
        if static_groups and actorder:
            g_idx = [perm[i] // group_size for i in range(self.columns)]
        else:
            g_idx = [i // group_size for i in range(self.columns)]
        g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device)
        if actorder:
            Q = Q[:, invperm]
            g_idx = g_idx[invperm]
        if isinstance(self.layer, transformers.Conv1D):
            Q = Q.t()
        self.layer.weight.data = Q.reshape(self.layer.weight.shape).type_as(self.layer.weight.data)
        if os.environ.get("DEBUG"):
            logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
        if scale == []:
            scale.append(self.quantizer.scale)
            zero.append(self.quantizer.zero)
        scale = torch.cat(scale, dim=1)
        zero = torch.cat(zero, dim=1)
        return scale, zero, g_idx
    def free(self):
        if os.environ.get("DEBUG"):
            self.inp1 = None
            self.out1 = None
        self.H = None
        self.Losses = None
        self.Trace = None
        torch.cuda.empty_cache()
__all__ = ["GPTQ"]

In [22]:
model.quantize(traindataset)

INFO - Start quantizing layer 1/32
INFO - Quantizing self_attn.k_proj in layer 1/32...
2024-11-06 03:37:45 INFO [auto_gptq.quantization.gptq] duration: 0.8841938972473145
2024-11-06 03:37:45 INFO [auto_gptq.quantization.gptq] avg loss: 41.80743408203125
INFO - Quantizing self_attn.v_proj in layer 1/32...
2024-11-06 03:37:46 INFO [auto_gptq.quantization.gptq] duration: 0.6244401931762695
2024-11-06 03:37:46 INFO [auto_gptq.quantization.gptq] avg loss: 1.3841116428375244
INFO - Quantizing self_attn.q_proj in layer 1/32...
2024-11-06 03:37:47 INFO [auto_gptq.quantization.gptq] duration: 0.6435339450836182
2024-11-06 03:37:47 INFO [auto_gptq.quantization.gptq] avg loss: 71.19627380371094
INFO - Quantizing self_attn.o_proj in layer 1/32...
2024-11-06 03:38:20 INFO [auto_gptq.quantization.gptq] duration: 0.8739309310913086
2024-11-06 03:38:20 INFO [auto_gptq.quantization.gptq] avg loss: 0.025352805852890015
INFO - Quantizing mlp.up_proj in layer 1/32...
2024-11-06 03:38:54 INFO [auto_gptq.qu

In [None]:

# save quantized model
model.save_quantized(quantized_model_dir)

# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)

# push quantized model to Hugging Face Hub.
# to use use_auth_token=True, Login first via huggingface-cli login.
# or pass explcit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)

# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)

In [3]:
import os
import random
import shutil

# 定义源数据集路径和目标保存路径
imagenet_train_dir = '/home/workspace/dataset/imagenet/train'  # 替换为你本地 ImageNet 训练集的路径
output_dir = '/home/workspace/dataset/imagenet/calibration'  # 替换为你要保存图片的路径
num_images_to_select = 64  # 要随机选取的图片数量

# 确保输出目录存在
os.makedirs(output_dir, exist_ok=True)

# 获取训练集中所有类别文件夹
all_images = []
for root, _, files in os.walk(imagenet_train_dir):
    for file in files:
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            all_images.append(os.path.join(root, file))

# 随机选取64张图片
selected_images = random.sample(all_images, num_images_to_select)

# 将选取的图片复制到目标文件夹
for img_path in selected_images:
    shutil.copy(img_path, output_dir)

print(f'Successfully selected and copied {num_images_to_select} images to {output_dir}.')


Successfully selected and copied 64 images to /home/workspace/dataset/imagenet/calibration.


# VIT部分量化

In [5]:
import datasets
import random
import numpy as np
import torch

def get_ScienceQA(nsamples, seed, seqlen, processor):
    import torch.nn.functional as F
    dataset = datasets.load_from_disk("/home/workspace/dataset/ScienceQA-2")["train"]
    dataset = dataset.shuffle(seed=seed)
    rng = random.Random(42)
    samples, num_tokens = [], 0
    prompts_lists = []
    input_images_lists = []
    for index, _data in enumerate(dataset):
        promt = _data["question"]
        image_file = _data["image"]
        image = np.array(image_file)
        if image_file is None:
            nsamples = nsamples+1
            continue
        msgs = [{'role': 'user', 'content': "(<image>./</image>)\n"+ promt}]
        prompts_lists.append(processor.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        input_images_lists.append([image])
        if index >= nsamples-1:
            break
    # return prompts_lists,input_images_lists
     
    inputs = processor(
        prompts_lists,
        input_images_lists,
        max_slice_nums=processor.image_processor.max_slice_nums,
        use_image_id=processor.image_processor.use_image_id,
        return_tensors="pt",
        max_length=8192
    )
    # return inputs
    traindataset = []
    for _ in range(inputs["input_ids"].size(0)):
        input_ids = inputs["input_ids"].select(0, _).unsqueeze(0) 
        attention_mask = inputs["attention_mask"].select(0, _).unsqueeze(0) 
        pixel_values = inputs["pixel_values"]
        image_sizes = inputs["image_sizes"]
        image_bound = inputs["image_bound"]
        tgt_sizes = inputs["tgt_sizes"]
        traindataset.append({"input_ids": input_ids, 
                             "attention_mask": attention_mask,
                             "pixel_values": pixel_values,
                             "image_sizes": image_sizes,
                             "image_bound": image_bound,
                             "tgt_sizes": tgt_sizes})

    return traindataset

In [2]:
# import numpy as np
# from datasets import load_dataset, load_from_disk
# import random
# import torch

# def get_wikitext2(nsamples, seed, seqlen, tokenizer):
#     # set seed
#     random.seed(seed)
#     np.random.seed(seed)
#     torch.random.manual_seed(seed)

#     # load dataset and preprocess
#     # traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
#     # testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
#     traindata = load_dataset("/home/workspace/code/git/FlatQuant_mlm/datasets/wikitext", split="train")
#     testdata = load_dataset("/home/workspace/code/git/FlatQuant_mlm/datasets/wikitext", split="test")
#     trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
#     testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

#     traindataset = []
#     for _ in range(nsamples):
#         i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
#         j = i + seqlen
#         inp = trainenc.input_ids[:, i:j]
#         attention_mask = torch.ones_like(inp)
#         traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
#     return traindataset, testenc

In [6]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForVIT, BaseQuantizeConfig
import logging

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-g128"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, trust_remote_code=True)
quantize_config = BaseQuantizeConfig(
    bits=8,  # quantize model to 4-bit
    group_size=-1,  # it is recommended to set the value to 128
    desc_act=True,  # set to False can significantly speed up inference but the perplexity may slightly bad
)
# traindataset, testenc = get_wikitext2(128, 0, model.seqlen, tokenizer)
# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForVIT.from_pretrained(pretrained_model_dir, quantize_config)
from transformers import AutoProcessor
model.model.processor = AutoProcessor.from_pretrained(pretrained_model_dir, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type minicpmv to instantiate a model of type minicpm. This is not supported for all configurations of models and can yield errors.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
2024-11-07 15:41:31 INFO [auto_gptq.modeling.minicpm.configuration_minicpm] vision_config is None, using default vision config
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
from PIL import Image
image = Image.open('/home/workspace/code/llm-awq/awq/airplane.jpeg').convert('RGB')

# First round chat 
question = "Tell me the model of this aircraft."
msgs = [{'role': 'user', 'content': [image, question]}]

answer = model.model.cuda().chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)

The aircraft in the image is an Airbus A380, identifiable by its distinctive hump on the upper deck, which is characteristic of the Airbus A380 model. The A380 is a twin-engine, wide-body, four-engine jet airliner that was developed by Airbus and manufactured by Boeing. It is one of the largest aircraft in the world, capable of carrying more than 800 passengers and has a range of up to 9,500 nautical miles (17,200 km). This particular model is part of the Airbus A380 family, which includes the A380-800 and A380-900 variants.


In [8]:
model.model.device

device(type='cpu')

In [7]:
# traindataset, testenc = get_wikitext2(128, 0, model.seqlen, tokenizer)
traindataset = get_ScienceQA(1024, 0, model.seqlen, model.model.processor)
# i,m = get_ScienceQA(32, 0, model.seqlen, model.model.processor)

You're using a MiniCPMVTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [8]:
model.quantize(traindataset)

INFO - Start quantizing layer 1/27
INFO - Quantizing self_attn.k_proj in layer 1/27...
2024-11-07 16:17:12 INFO [auto_gptq.quantization.gptq] duration: 0.4007911682128906
2024-11-07 16:17:12 INFO [auto_gptq.quantization.gptq] avg loss: 6.768801540601999e-05
INFO - Quantizing self_attn.v_proj in layer 1/27...
2024-11-07 16:17:13 INFO [auto_gptq.quantization.gptq] duration: 0.2792811393737793
2024-11-07 16:17:13 INFO [auto_gptq.quantization.gptq] avg loss: 1.185271412396105e-05
INFO - Quantizing self_attn.q_proj in layer 1/27...
2024-11-07 16:17:13 INFO [auto_gptq.quantization.gptq] duration: 0.2770664691925049
2024-11-07 16:17:13 INFO [auto_gptq.quantization.gptq] avg loss: 4.957634519087151e-05
INFO - Quantizing self_attn.out_proj in layer 1/27...
2024-11-07 16:18:27 INFO [auto_gptq.quantization.gptq] duration: 0.21391677856445312
2024-11-07 16:18:27 INFO [auto_gptq.quantization.gptq] avg loss: 1.2891981668872177e-06
INFO - Quantizing mlp.fc1 in layer 1/27...
2024-11-07 16:19:34 INFO [

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.06 GiB. GPU 0 has a total capacity of 79.33 GiB of which 1024.00 MiB is free. Process 5139 has 78.32 GiB memory in use. Of the allocated memory 69.98 GiB is allocated by PyTorch, and 7.80 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
# save quantized model
model.save_quantized(quantized_model_dir)

2024-11-07 15:10:31 INFO [auto_gptq.modeling.minicpm.configuration_minicpm] vision_config is None, using default vision config


In [None]:
# save quantized model using safetensors
# model.save_quantized(quantized_model_dir, use_safetensors=True)

In [9]:
from auto_gptq import AutoGPTQForVIT, BaseQuantizeConfig

# quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-g128"
quantized_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1-pc"

model_quant = AutoGPTQForVIT.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


You are using a model of type minicpmv to instantiate a model of type minicpm. This is not supported for all configurations of models and can yield errors.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
2024-11-08 03:14:48 INFO [auto_gptq.modeling.minicpm.configuration_minicpm] vision_config is None, using default vision config
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Some weights of the model checkpoint at /home/workspace/model/MiniCPM-3o-1B-sft-v1-pc were not used when initializing MiniCPMV: ['vpm.encoder.layers.0.mlp.fc1.g_idx', 'vpm.encoder.layers.0.mlp.fc1.qweight', 'vpm.encoder.layers.0.mlp.fc1.qzeros', 'vpm.encoder.layers.0.mlp.fc1.scales', 'vpm.encoder.layers.0.mlp.fc2.g_idx', 'vpm.encoder.layers.0.mlp.fc2.qweight', 'vpm.encoder.layers.0.mlp.fc2.qzeros', 'vpm.encoder.layers.0.mlp.fc2.scales', 'vpm.encoder.layers.0.self_attn.k_proj.g_idx', 'vpm.encoder.layers.0.self_attn.k_proj.qweight', 'vpm

layers------- {'llm.model.layers.0.self_attn.q_proj': Linear(in_features=1536, out_features=1536, bias=False), 'llm.model.layers.0.self_attn.k_proj': Linear(in_features=1536, out_features=512, bias=False), 'llm.model.layers.0.self_attn.v_proj': Linear(in_features=1536, out_features=512, bias=False), 'llm.model.layers.0.self_attn.o_proj': Linear(in_features=1536, out_features=1536, bias=False), 'llm.model.layers.0.mlp.gate_proj': Linear(in_features=1536, out_features=3840, bias=False), 'llm.model.layers.0.mlp.up_proj': Linear(in_features=1536, out_features=3840, bias=False), 'llm.model.layers.0.mlp.down_proj': Linear(in_features=3840, out_features=1536, bias=False), 'llm.model.layers.1.self_attn.q_proj': Linear(in_features=1536, out_features=1536, bias=False), 'llm.model.layers.1.self_attn.k_proj': Linear(in_features=1536, out_features=512, bias=False), 'llm.model.layers.1.self_attn.v_proj': Linear(in_features=1536, out_features=512, bias=False), 'llm.model.layers.1.self_attn.o_proj': L

In [10]:
model_quant

VITGPTQ(
  (model): MiniCPMV(
    (llm): MiniCPMForCausalLM(
      (model): MiniCPMModel(
        (embed_tokens): Embedding(73464, 1536)
        (layers): ModuleList(
          (0-51): 52 x MiniCPMDecoderLayer(
            (self_attn): MiniCPMSdpaAttention(
              (q_proj): Linear(in_features=1536, out_features=1536, bias=False)
              (k_proj): Linear(in_features=1536, out_features=512, bias=False)
              (v_proj): Linear(in_features=1536, out_features=512, bias=False)
              (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
              (rotary_emb): MiniCPMLongRoPE()
            )
            (mlp): MiniCPMMLP(
              (gate_proj): Linear(in_features=1536, out_features=3840, bias=False)
              (up_proj): Linear(in_features=1536, out_features=3840, bias=False)
              (down_proj): Linear(in_features=3840, out_features=1536, bias=False)
              (act_fn): SiLU()
            )
            (input_layernorm): MiniCPMRMS

In [11]:
from transformers import AutoTokenizer, TextGenerationPipeline,AutoProcessor
pretrained_model_dir = "/home/workspace/model/MiniCPM-3o-1B-sft-v1"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, trust_remote_code=True)
model_quant.model.processor = AutoProcessor.from_pretrained(pretrained_model_dir, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
from PIL import Image
image = Image.open('/home/workspace/code/llm-awq/awq/airplane.jpeg').convert('RGB')

# First round chat 
question = "Tell me the model of this aircraft."
msgs = [{'role': 'user', 'content': [image, question]}]

answer = model_quant.model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)
print(answer)

You're using a MiniCPMVTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


The aircraft in the image is an Airbus A380-800, which is a wide-body airliner. This model is part of Airbus's A380 family, which includes the A380 and A380-800 versions. The A380-800 is the largest passenger airliner ever built and can carry up to 800 passengers across its four main decks. It was introduced in 2005 and has since become a symbol of commercial aviation's capacity to transport large numbers of people over long distances.


In [19]:
model_quant.outside_layer_modules

['vpm.embeddings', 'vpm.post_layernorm']

In [17]:
model_quant.model.llm.model

MiniCPMModel(
  (embed_tokens): Embedding(73464, 1536)
  (layers): ModuleList(
    (0-51): 52 x MiniCPMDecoderLayer(
      (self_attn): MiniCPMSdpaAttention(
        (q_proj): QuantLinear()
        (k_proj): QuantLinear()
        (v_proj): QuantLinear()
        (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        (rotary_emb): MiniCPMLongRoPE()
      )
      (mlp): MiniCPMMLP(
        (gate_proj): Linear(in_features=1536, out_features=3840, bias=False)
        (up_proj): Linear(in_features=1536, out_features=3840, bias=False)
        (down_proj): Linear(in_features=3840, out_features=1536, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): MiniCPMRMSNorm()
      (post_attention_layernorm): MiniCPMRMSNorm()
    )
  )
  (norm): MiniCPMRMSNorm()
)

## 调试

In [10]:
layer_inputs = []

def store_input_hook(_, args,kwargs):
    # Positional arguments.
    layer_input = []
    for inp in args:
        layer_input.append(inp)
    layer_inputs.append(layer_input)

    # # Keyword arguments.
    # if kwargs["attention_mask"] is not None:
    #     attention_masks.append(kwargs["attention_mask"].to(data_device))
    # else:
    #     attention_masks.append(None)

    # pos_ids = kwargs.get("position_ids", None)
    # if pos_ids is not None:
    #     position_ids.append(move_to_device(pos_ids, data_device))
    # one_kwargs = {}
    # for (
    #     k,
    #     v,
    # ) in kwargs.items():  # make sure other arguments also be captured
    #     if k not in ["hidden_states", "attention_mask", "position_ids"]:
    #         one_kwargs[k] = nested_move_to_device(v, data_device)
    # layer_input_kwargs.append(one_kwargs)
    # raise ValueError

In [11]:
handle = model.vpm.encoder.layers[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)

In [12]:
for example in traindataset:
    for k, v in example.items():
        # if len(v.shape) == 1:
        #     v = v.unsqueeze(0)
        # if k in "input_ids" or k in "attention_mask":
        #     example[k] = v
        # elif k in "pixel_values":
        #     example[k][0][0] =  v[0][0].cuda()
        #     example[k][0][1] =  v[0][1].cuda()
        #     example[k][0][2] =  v[0][2].cuda()
        a = model(example)
        break
handle.remove()


In [8]:
model.vpm.encoder.layers[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)

<torch.utils.hooks.RemovableHandle at 0x7efb9932ca50>

# 取激活

# 模型测试

In [9]:
import torch
from transformers import AutoModel, AutoTokenizer,AutoProcessor,set_seed

set_seed(42)
# torch.manual_seed(0)
# model_path = "/home/workspace/model/MiniCPM-Llama3-V-2_5"
# model_path = "/home/workspace/model/minicpm-vit-1b-w8-lenovo-llama-w8-pergroup128"
# model_path = "/home/workspace/model/minicpm-vit-1b-w8-lenovo"
# model_path = "/home/workspace/model/llava-1___5-7b-hf"
# model_path = "/home/workspace/model/minicpm-gptq-w4-32-perchannel-only_quant_downproj"
model_path = "/home/workspace/model/MiniCPM-V-1B-sft-v2-1B"
model = AutoModel.from_pretrained(model_path, 
                                  trust_remote_code=True,
                                  device_map=None) 
# model_weight_path = "/home/workspace/model/minicpm_v_navit_250_0927.pt"
# model.load_state_dict(torch.load(model_weight_path))
model = model.cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

2024-11-06 09:01:37 INFO [transformers_modules.MiniCPM-V-1B-sft-v2-1B.configuration_minicpm] vision_config is None, using default vision config


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
model.vpm

SiglipVisionTransformer(
  (embeddings): SiglipVisionEmbeddings(
    (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
    (position_embedding): Embedding(4900, 1152)
  )
  (encoder): SiglipEncoder(
    (layers): ModuleList(
      (0-26): 27 x SiglipEncoderLayer(
        (self_attn): SiglipAttention(
          (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
        )
        (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
        (mlp): SiglipMLP(
          (activation_fn): PytorchGELUTanh()
          (fc1): Linear(in_features=1152, out_features=4304, bias=True)
          (fc2): Linear(in_features=4304, out_features=1152, bias=True)
        )
        (layer_norm2): LayerNorm((1152,), ep

In [31]:
vpm = model.vpm
input_shape = (1, 1024, 1152)
example_input = torch.ones(input_shape, dtype=torch.float32)

# print(model_resampler)
vpm.prepare_layernorm()
for bolck in vpm.encoder.layers:
    bolck.prepare_layernorm()
    bolck.self_attn.prepare_sha()
    bolck.mlp.prepare_conv()

In [33]:
vpm.cuda()

SiglipVisionTransformer(
  (embeddings): SiglipVisionEmbeddings(
    (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
    (position_embedding): Embedding(4900, 1152)
  )
  (encoder): SiglipEncoder(
    (layers): ModuleList(
      (0-26): 27 x SiglipEncoderLayer(
        (self_attn): SiglipAttention(
          (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
          (q_proj_sha): ModuleList(
            (0-15): 16 x Conv2d(1152, 72, kernel_size=(1, 1), stride=(1, 1))
          )
          (k_proj_sha): ModuleList(
            (0-15): 16 x Conv2d(1152, 72, kernel_size=(1, 1), stride=(1, 1))
          )
          (v_proj_sha): ModuleList(
            (0-15): 16 x Conv2d(1152, 72, kernel_size=(1, 1), stride=(1

In [34]:
x1 = vpm(example_input.cuda())

In [35]:
x-x1

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.]],

         ...,

         [[0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.]]]], device='cuda:0',
       grad_fn=<SubBackward0>)