# Introduction
While working on a personal project that required quantizing the Qwen2-VL-2B-Instruct model for fine-tuning, I discovered a significant gap in documentation for multi-modal model quantization. Most quantization examples focus on text-only models, leaving vision-language model practitioners to figure things out through trial and error.

Based on my experience using the GPTQModel library for model quantization and vLLM for inference, I’ve created this guide to help others navigate the process and avoid common pitfalls. It covers the entire workflow, from quantization to inference, across different frameworks.

# Model Quantization with GPTQModel Library

## Setting Up the Calibration Dataset

The quality of your quantized model heavily depends on your calibration dataset. For vision-language models, we need image-text pairs that represent the model’s intended use cases:

In [None]:
# Import Libraries
import os
from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer
from datasets import load_dataset

# Environment setup for optimal memory usage
# PCI_BUS_ID asks CUDA to order devices by PCI bus id rather than its default ordering.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Configures PyTorch's CUDA caching allocator to use expandable segments.
# NOTE(review): both variables are set after the gptqmodel import above; confirm they
# are still read before CUDA / the allocator is first initialized.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Format image-text pairs into Qwen2-VL conversation format
def format_qwen2_vl_dataset(image, assistant, prompt="generate a caption for this image"):
    """Build one two-turn Qwen2-VL conversation from an image-text pair.

    Args:
        image: Image reference stored under the "image" key of the user turn
            (the calibration loader in this notebook passes an image URL).
        assistant: Text used as the assistant's reply (e.g. the caption).
        prompt: Instruction text shown to the model alongside the image.
            Defaults to the captioning prompt, so existing two-argument
            calls behave exactly as before.

    Returns:
        A list with a user message (image + text parts) followed by an
        assistant message.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ],
    }
    assistant_turn = {"role": "assistant", "content": assistant}
    return [user_turn, assistant_turn]

# Load and format calibration dataset for quantization
def prepare_dataset(n_sample: int = 20):
    """Load calibration samples and convert them to Qwen2-VL conversations.

    Args:
        n_sample: Number of rows to take from the start of the train split.

    Returns:
        A list of conversations, one per row, each built from the row's
        "url" and "caption" fields.
    """
    split_spec = f"train[:{n_sample}]"
    rows = load_dataset("laion/220k-GPT4Vision-captions-from-LIVIS", split=split_spec)

    conversations = []
    for row in rows:
        # The image is passed along as its URL; the caption becomes the assistant reply.
        conversations.append(format_qwen2_vl_dataset(row["url"], row["caption"]))
    return conversations

## Performing the Quantization using GPTQModel Library

In [None]:
# Name of the HuggingFace Model to be quantized
pretrained_model_id = "Qwen/Qwen2-VL-2B-Instruct"
# Save path for quantized model
quantized_model_id = "Qwen2-VL-2B-Instruct-4bit-GPTQ"
# Get the best device for quantization
# (not passed to the load call in this cell; it is used when the quantized model is reloaded)
device = get_best_device()


# Prepare calibration dataset
dataset = prepare_dataset(n_sample=20)
# Configure quantization parameters
# NOTE(review): see the GPTQModel QuantizeConfig documentation for the exact meaning of
# each field; only the desc_act remark below comes from the author's own experience.
quantize_config = QuantizeConfig(
    bits=4,  # 4-bit quantization, matching the "4bit" in the save path
    group_size=128,
    damp_percent=0.1,
    desc_act=False,  # Significantly speeds up inference
    static_groups=False,
    sym=True,
    true_sequential=True,
)
# Load the original model with the quantization config attached
# (the quantization itself happens in model.quantize() below)
model = GPTQModel.load(
    pretrained_model_id,
    quantize_config=quantize_config,
    torch_dtype="auto",
)
# NOTE(review): the tokenizer is created here but not referenced again in this notebook.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=True)
# quantize the model using the image-text conversations as calibration data
model.quantize(dataset)
# Save the quantized model
model.save(quantized_model_id)

# Quantized Model Inference using GPTQModel Library

## Loading the Quantized Model

In [None]:
# Import Libraries
from gptqmodel import GPTQModel, BACKEND
from gptqmodel.models.definitions.qwen2_vl import Qwen2VLGPTQ

# Load the quantized model
# Relies on quantized_model_id (the local save path) and device from the quantization cell.
model = GPTQModel.load(quantized_model_id, device=device)

In [None]:
# Explicitly use TORCH backend for better results on Turing GPUs like T4
# On T4 GPUs, GPTQModel defaults to ExLlamaQuantLinear, which in my testing produced significantly worse results than TorchQuantLinear.
# model = GPTQModel.load(quantized_model_id, device=device, backend=BACKEND.TORCH)

## Running Inference

In [None]:
# Running inference with the quantized model
from transformers import AutoProcessor
# The processor comes from the original (unquantized) model id.
processor = AutoProcessor.from_pretrained(pretrained_model_id)

# Define conversation with image and text input
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://download.samplelib.com/jpeg/sample-clouds-400x300.jpg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Process inputs
# tokenize=False keeps the templated prompt as text; tokenization happens in processor() below.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Extract the image inputs referenced by the messages.
image_inputs = Qwen2VLGPTQ.process_vision_info(messages)
# NOTE(review): the target device is hard-coded to "cuda" here rather than reusing
# the `device` chosen earlier; confirm this matches where the model was loaded.
inputs = processor(text=[text], images=image_inputs, videos=None, 
                  padding=True, return_tensors="pt").to("cuda")
# Generate response
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Drop the first len(in_ids) tokens of each output so only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one prompt was submitted, so take the first decoded string.
output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
print("Output:", output_text)

## Publishing Quantized Model to Hugging Face Hub

In [None]:
from huggingface_hub import login, create_repo
from gptqmodel import GPTQModel

# Authenticate and create repository
# NOTE: "your_hf_token_here" is a placeholder; substitute your own token and avoid
# committing a real token to a shared notebook.
login(token="your_hf_token_here")
# Placeholder repo id; replace "your_username" with your Hub account or organization.
repo_id = "your_username/Qwen2-VL-2B-Instruct-4bit-GPTQ"
create_repo(repo_id, private=True)

# Push quantized model
# quantized_path is a placeholder; point it at the directory the quantized model was
# saved to (quantized_model_id in the quantization cell).
GPTQModel.push_to_hub(repo_id, quantized_path="/path/to/quantized/model", private=True)

# Quantized Model Inference using Hugging Face Transformers

In [None]:
# Import Libraries
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, GPTQConfig
from qwen_vl_utils import process_vision_info
import torch

# Configure GPTQ settings - crucial for T4 GPUs
# use_exllama=False turns off the ExLlama kernels for this load.
gptq_config = GPTQConfig(bits=4, use_exllama=False)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "your_username/Qwen2-VL-2B-Instruct-4bit-GPTQ",
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=gptq_config
)
# Set up processor with optimal token ranges
# min_pixels / max_pixels are expressed as multiples of 28*28.
# NOTE(review): presumably this corresponds to roughly 256-1280 visual tokens per
# image, as described on the Qwen2-VL model card; confirm.
min_pixels = 256*28*28
max_pixels = 1280*28*28
processor = AutoProcessor.from_pretrained(
    "your_username/Qwen2-VL-2B-Instruct-4bit-GPTQ",
    min_pixels=min_pixels, 
    max_pixels=max_pixels
)

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
# tokenize=False keeps the templated prompt as text; the processor call below tokenizes it.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Extract the image and video inputs referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# NOTE(review): inputs go to "cuda" while the model uses device_map="auto";
# confirm both end up on the same device.
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Drop the first len(in_ids) tokens of each output so only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# output_text is the full batch_decode result (not indexed), so the whole batch is printed.
print(output_text)

# Quantized Model Inference using vLLM

## Launch the vLLM Engine

In [None]:
# Import Libraries
import gc
from dataclasses import asdict
from PIL import Image
import torch
from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams, LLM
from typing import NamedTuple, Optional

In [None]:
# Let’s set up the engine configuration:
class ModelRequestData(NamedTuple):
    """Bundle the vLLM engine arguments with optional stop-token ids."""

    # Engine configuration; later converted to keyword arguments for LLM(...).
    engine_args: EngineArgs
    # Forwarded to SamplingParams(stop_token_ids=...); defaults to None.
    stop_token_ids: Optional[list[int]] = None

# Function to set the Engine Arguments
def run_qwen2_vl(modality: str) -> ModelRequestData:
    """Build the vLLM engine arguments for the GPTQ-quantized Qwen2-VL model.

    Args:
        modality: Multimodal input kind (e.g. "image"); each prompt is
            limited to one item of this kind.

    Returns:
        A ModelRequestData holding the EngineArgs; stop_token_ids is left
        at its default of None.
    """
    # You can use my quantized model: "arunmadhusudh/Qwen2-VL-2B-Instruct-4bit-GPTQ_T4_tr4512"
    quantized_repo = "your_username/Qwen2-VL-2B-Instruct-4bit-GPTQ"

    # Pixel bounds forwarded to the multimodal processor.
    pixel_bounds = {
        "min_pixels": 28 * 28,
        "max_pixels": 1280 * 28 * 28,
    }
    per_prompt_limit = {modality: 1}

    args = EngineArgs(
        model=quantized_repo,
        quantization="gptq",
        dtype=torch.float16,
        trust_remote_code=True,
        enable_lora=False,
        max_model_len=4096,
        max_num_seqs=5,
        max_seq_len_to_capture=48000,
        mm_processor_kwargs=pixel_bounds,
        limit_mm_per_prompt=per_prompt_limit,
    )
    return ModelRequestData(args)

# Define Modality
modality = "image"
req_data = run_qwen2_vl(modality)

# Set up multimodal limits
# Start every modality at 0, then overlay the limits from the engine args; in a dict
# union the right-hand operand wins, so the selected modality keeps its limit of 1.
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
    req_data.engine_args.limit_mm_per_prompt or {})

# Initialize the LLM
# asdict() turns the EngineArgs dataclass into a plain dict that is passed to LLM as kwargs.
engine_args = asdict(req_data.engine_args)
llm = LLM(**engine_args)

## Run Inference

In [None]:
# function to generate prompts
def generate_prompts(questions: list[str], modality: str) -> list[str]:
    """Wrap each question in the Qwen2-VL chat template with a vision placeholder.

    Args:
        questions: User questions, one prompt is produced per question.
        modality: Either "image" or "video"; selects the placeholder token.

    Returns:
        The formatted prompt strings, in the same order as ``questions``.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    # Reject unknown modalities up front, before any prompt is built.
    if modality != "image" and modality != "video":
        raise ValueError(f"Unsupported modality: {modality}")

    vision_token = "<|image_pad|>" if modality == "image" else "<|video_pad|>"

    # Everything before and after the question is identical for every prompt.
    prefix = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n<|vision_start|>{vision_token}<|vision_end|>"
    )
    suffix = "<|im_end|>\n<|im_start|>assistant\n"

    rendered = []
    for question in questions:
        rendered.append(f"{prefix}{question}{suffix}")
    return rendered

# Define your questions and images
questions = ["What is happening in this image?"]
prompts = generate_prompts(questions, modality)
# Set up sampling parameters
sampling_params = SamplingParams(
    temperature=0.2,
    max_tokens=128,
    stop_token_ids=req_data.stop_token_ids
)
# Prepare images and inputs
# Placeholder local path; replace with the image you want to query.
images = ["/content/demo_1.jpeg"]
# zip() pairs each prompt with one image, so extra prompts or images are silently dropped.
# NOTE(review): the files opened by Image.open are never explicitly closed.
inputs = [
    {"prompt": prompt, "multi_modal_data": {modality: Image.open(image)}}
    for prompt, image in zip(prompts, images)
]
# Generate responses
outputs = llm.generate(inputs, sampling_params=sampling_params)
# Process results
for output in outputs:
    # NOTE(review): `prompt` is assigned but not used below.
    prompt = output.prompt
    # Take the text of the first entry in output.outputs.
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")