## Introduction

This notebook demonstrates how to use the vLLM library for efficient inference with LoRA (Low-Rank Adaptation) adapters on top of base models. The example uses two variants of the Qwen2-VL-2B model fine-tuned on the LaTeX-OCR dataset:
- 4-bit NF4 (bitsandbytes) quantized model + LoRA (aka QLoRA)
- 4-bit GPTQ quantized model + LoRA

## Offline Inference

We can run vLLM in offline mode to be used in our local projects.

In [9]:
#Import Libraries
import gc
from typing import NamedTuple, Optional

import torch
from huggingface_hub import snapshot_download

from vllm import LLM, EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest

from dataclasses import asdict
from PIL import Image

INFO 06-20 21:43:26 [__init__.py:244] Automatically detected platform cuda.


2025-06-20 21:43:30,042	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [10]:
# Let's define the EngineArgs for the LLMEngine
def initialize_engine(model: str, quantization: str, modality: str) -> EngineArgs:
    """Build the vLLM ``EngineArgs`` for a quantized multimodal model with LoRA enabled.

    Despite its name, this function does not start an engine: it only returns
    the argument object, which the caller turns into keyword arguments for ``LLM``.

    Args:
        model: Model name or path.
        quantization: Quantization method (e.g. "bitsandbytes", "gptq", "gptq_marlin").
            "bitsandbytes" additionally sets ``load_format="bitsandbytes"``.
        modality: The single multimodal input type allowed per prompt
            (used as the key of ``limit_mm_per_prompt``).

    Returns:
        The configured ``EngineArgs`` instance.
    """
    # Settings shared by every quantization method.
    engine_kwargs = dict(
        model=model,  # Specify the model name or path
        max_model_len=4096,  # Model context length. If unspecified, will be automatically derived from the model config.
        max_num_seqs=5,  # Maximum number of sequences per iteration.
        enable_lora=True,  # If True, enable handling of LoRA adapters.
        max_lora_rank=16,  # Max LoRA rank.
        mm_processor_kwargs={  # Specify the multimodal processor kwargs
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        limit_mm_per_prompt={modality: 1},  # Limit the number of multimodal inputs per prompt.
        quantization=quantization,  # Specify the quantization method (E.g., "bitsandbytes", "gptq" etc.)
        max_seq_len_to_capture=4096,
    )

    # Only the settings below differ between the two configurations.
    if quantization == "bitsandbytes":
        engine_kwargs["max_loras"] = 1  # Max number of LoRAs in a single batch.
        # Load format for the model. Need to specify this if using bitsandbytes quantization.
        engine_kwargs["load_format"] = "bitsandbytes"
    else:
        engine_kwargs["max_loras"] = 3  # Max number of LoRAs in a single batch.

    engine_args = EngineArgs(**engine_kwargs)

    # Set number of multimodal inputs other than the specified modality to 0
    default_limits = {"image": 0, "video": 0, "audio": 0}
    engine_args.limit_mm_per_prompt = default_limits | dict(engine_args.limit_mm_per_prompt or {})

    return engine_args


# function to generate prompts
def generate_prompts(questions: list[str], modality: str) -> list[str]:
    """Wrap each question in a chat-style prompt containing one vision placeholder.

    Args:
        questions: The user questions; one prompt is produced per question.
        modality: Either "image" or "video"; selects the placeholder token.

    Returns:
        The formatted prompts, in the same order as ``questions``.

    Raises:
        ValueError: If ``modality`` is neither "image" nor "video".
    """
    # Guard clause: reject any modality we have no placeholder token for.
    if modality != "image" and modality != "video":
        raise ValueError(f"Unsupported modality: {modality}")
    placeholder = "<|image_pad|>" if modality == "image" else "<|video_pad|>"

    # The prompt format should follow corresponding examples on HuggingFace model repository.
    before_question = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user\n<|vision_start|>" + placeholder + "<|vision_end|>"
    )
    after_question = "<|im_end|>\n<|im_start|>assistant\n"

    prompts = []
    for question in questions:
        prompts.append(f"{before_question}{question}{after_question}")
    return prompts



In [None]:
# We need to download the adapter(s) and save them locally using the snapshot_download function
# (already imported in the imports cell at the top of the notebook).

modality = "image"
quantization = "gptq_marlin"  # Options for quantization: "bitsandbytes", "gptq_marlin", "gptq"


# GPTQ Marlin is a more efficient kernel for running GPTQ quantized models.
# If you are using NVIDIA Turing or older GPUs (e.g., T4, V100, P100), you need to use just "gptq" as the quantization method.
# Both GPTQ options must select the GPTQ checkpoint; previously "gptq" fell through to the bitsandbytes model.
if quantization in ("gptq_marlin", "gptq"):
    model_name = "arunmadhusudh/Qwen2-VL-2B-Instruct-4bit-GPTQ_T4_tr4512"
    vision_lora_path = snapshot_download(repo_id="arunmadhusudh/qwen2_VL_2B_LatexOCR_qlora_qptq_epoch3")
elif quantization == "bitsandbytes":
    model_name = "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit"
    vision_lora_path = snapshot_download(repo_id="arunmadhusudh/qwen2_VL_2B_LatexOCR_qlora_nf4_epoch3")
else:
    # Fail early instead of silently pairing a model with the wrong quantization method.
    raise ValueError(f"Unsupported quantization: {quantization!r}")

# Let's create the vLLM LLM engine with the specified model, quantization, and engine configurations.
# LLM takes keyword arguments, so the EngineArgs dataclass is converted to a dict first.
engine_args = initialize_engine(model_name, quantization, modality)
engine_args = asdict(engine_args)
llm = LLM(**engine_args)


In [13]:
# Sampling settings for generation
sampling_params = SamplingParams(
    temperature=0.2,
    max_tokens=128,
    stop_token_ids=[151645],  # NOTE(review): presumably the <|im_end|> token ID for this tokenizer - confirm
)

# LoRA request for the vision adapter: (human-readable name, globally unique adapter ID, local adapter path)
lora_request = LoRARequest("vision", 1, vision_lora_path)

# Build the single multimodal input: the formatted prompt plus the image it refers to
questions = ["Write the LaTeX representation for this image."]
prompts = generate_prompts(questions, modality)
data = Image.open("/home/madhusudhanan.a/vlms/latex.png")
inputs = dict(
    prompt=prompts[0],
    multi_modal_data={modality: data},
)

# Run generation with the adapter applied and show the text of the first completion
outputs = llm.generate(
    inputs,
    sampling_params=sampling_params,
    lora_request=lora_request,
)
outputs[0].outputs[0].text

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 237.34it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s, est. speed input: 79.80 toks/s, output: 159.59 toks/s]


'( 5 . 1 7 ) = - \\frac { ( - i ) ^ { m + n - 1 } } { ( m + n - 1 ) ! } z ^ { m + n - 1 } \\left\\{ \\frac { 1 } { 4 } \\frac { 1 } { \\lambda } z ^ { 2 \\lambda } + \\frac { 1 } { 2 } \\ln ( z ) \\right\\}'

In [14]:
# Render the generated LaTeX string as a formatted equation in the cell output
from IPython.display import display, Math, Latex
display(Math(outputs[0].outputs[0].text))

<IPython.core.display.Math object>

## Online Inference

vLLM provides an HTTP server that implements OpenAI's Completions API, Chat API, and more! This functionality lets us serve models and interact with them using an HTTP client. We need to start the server first with `vllm serve` command.

The same configuration we used for offline inference can be used for online inference. Open a terminal and run one of the following commands.



To run the GPTQ Quantized model + LoRA adapter using vLLM, you can use the following command. Note that for Turing or older GPUs, you need to use the "gptq" quantization method instead of "gptq_marlin" and set "dtype" to float16.

``` bash
vllm serve arunmadhusudh/Qwen2-VL-2B-Instruct-4bit-GPTQ_T4_tr4512 \
    --enable-lora \
    --lora-modules '{"name": "vision", "path": "arunmadhusudh/qwen2_VL_2B_LatexOCR_qlora_qptq_epoch3", "base_model_name": "arunmadhusudh/Qwen2-VL-2B-Instruct-4bit-GPTQ_T4_tr4512"}' \
    --dtype bfloat16 \
    --max-model-len 4096 \
    --max-num-seqs 5 \
    --max-loras 1 \
    --max-lora-rank 16 \
    --quantization gptq_marlin \
    --limit-mm-per-prompt "image=1,video=0" \
    --max-seq-len-to-capture 4096 \
    --mm-processor-kwargs '{"min_pixels": 784, "max_pixels": 1003520}'
```

To run the QLoRA model (NF4 quantization + LoRA adapter) using vLLM, you can use the following command. Note that for Turing or older GPUs, you need to set the "dtype" to float16.

``` bash
vllm serve unsloth/Qwen2-VL-2B-Instruct-bnb-4bit \
    --enable-lora \
    --lora-modules '{"name": "vision", "path": "arunmadhusudh/qwen2_VL_2B_LatexOCR_qlora_nf4_epoch3", "base_model_name": "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit"}' \
    --dtype bfloat16 \
    --max-model-len 4096 \
    --max-num-seqs 5 \
    --max-loras 1 \
    --max-lora-rank 16 \
    --quantization bitsandbytes \
    --load-format bitsandbytes \
    --limit-mm-per-prompt "image=1,video=0" \
    --max-seq-len-to-capture 4096 \
    --mm-processor-kwargs '{"min_pixels": 784, "max_pixels": 1003520}'
```


To call the server, create a script in your preferred text editor that uses an HTTP client. Include any messages that you want to send to the model, then run the script. Below is an example script:

In [1]:
# Optional: bypass any configured proxy for local addresses, so that requests to a
# vLLM server running on this machine are not routed through it.
import os

# Both spellings are set because the original cell sets both the upper- and lower-case variable.
for proxy_var in ("NO_PROXY", "no_proxy"):
    os.environ[proxy_var] = "localhost,127.0.0.1"

In [2]:
import base64
from openai import OpenAI


#  Encode a local image file to base64 format.
def encode_base64_image(image_path: str) -> str:
    """Read the file at ``image_path`` and return its contents as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, decoded to a UTF-8 ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Point the OpenAI client at the local vLLM server
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",  # vLLM does not require an API key by default
)

# Read the local image and encode it to base64
image_path = "/home/madhusudhanan.a/vlms/latex.png"
image_base64 = encode_base64_image(image_path)

# The user message has two parts: the instruction text and the image as a base64 data URL
text_part = {"type": "text", "text": "Write the LaTeX representation for this image."}
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
}
user_message = {"role": "user", "content": [text_part, image_part]}

# "vision" is the LoRA adapter name registered via --lora-modules when starting the server
chat_completion = client.chat.completions.create(
    model="vision",
    messages=[user_message],
    max_tokens=128,
    temperature=0.2,
)

# Print the model's response
print(chat_completion.choices[0].message.content)

( 5 . 1 7 ) = - \frac { ( - i ) ^ { m + n - 1 } } { ( m + n - 1 ) ! } z ^ { m + n - 1 } \left\{ \frac { 1 } { 4 } \frac { 1 } { \lambda } z ^ { 2 \lambda } + \frac { 1 } { 2 } \ln ( z ) \right\}


In [3]:
# Render the LaTeX returned by the server as a formatted equation in the cell output
from IPython.display import display, Math, Latex
display(Math(chat_completion.choices[0].message.content))

<IPython.core.display.Math object>

In [5]:
# Send the same request again, reusing `client` and `image_base64` from the request cell above
# instead of re-importing and redefining everything.
# NOTE(review): presumably this cell is meant to be run after restarting `vllm serve`
# with the other quantized model, to compare its output - confirm.
chat_completion = client.chat.completions.create(
    model="vision",  # LoRA adapter name registered via --lora-modules when starting the server
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Write the LaTeX representation for this image."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_base64}"
                },
            },
        ],
    }],
    max_tokens=128,
    temperature=0.2
)

# Print the model's response
print(chat_completion.choices[0].message.content)

( 5 . 1 7 ) = - \frac { ( - i ) ^ { m + n - 1 } } { ( m + n - 1 ) ! } z ^ { m + n - 1 } \left\{ \frac { 1 } { 4 } \frac { 1 } { \lambda } z ^ { 2 } \lambda ^ { 2 } + \frac { 1 } { 2 } \ln ( z ) \right\}


In [6]:
# Render the LaTeX returned by the server as a formatted equation in the cell output
from IPython.display import display, Math, Latex
display(Math(chat_completion.choices[0].message.content))

<IPython.core.display.Math object>

## Benchmarking our model

Benchmarking is an important step to evaluate the performance of our model, to ensure that it meets the throughput requirements of our specific use case, and to adjust the configuration accordingly. vLLM provides scripts to benchmark model performance while serving the model. The script currently supports running benchmarks on a number of datasets such as ShareGPT, BurstGPT, VisionArena, and more.

However the benchmarking script currently does not support the Latex_OCR dataset, which is used for finetuning the models in this example.

I have updated the script to support Latex_OCR dataset and raised a PR ([#19894](https://github.com/vllm-project/vllm/pull/19894)) to vLLM repository. 
As of now, the PR has not been merged yet, so you can use the updated script from my PR branch [here](https://github.com/arunmadhusud/vllm/tree/unsloth_benchmark). You can either build vLLM from my forked repository or replace the `benchmark_dataset.py` and `benchmark_serving.py` files in your vLLM installation with the updated files from my PR branch. I would recommend the latter as it is easier and faster.

Now start the vLLM server with the vLLM serve command as shown above, and then run the benchmark script with the following command. 

We will use `openai-chat` as the backend from the available options:
{tgi, vllm, lmdeploy, deepspeed-mii, openai, openai-chat, openai-audio, tensorrt-llm, scalellm, sglang}.

We will set `request-rate` to 5, which means 5 requests will be sent to the model per second. We will also set `max-concurrency` to 5, meaning up to 5 requests can be processed simultaneously. While the `request-rate` argument controls the rate at which requests are initiated, `max-concurrency` controls how many are actually allowed to execute at a time. This means that when the two are used in combination, the actual request rate (throughput) may be lower than the value specified with `request-rate` if the server is not processing requests fast enough to keep up.

Other arguments used:
- `--model`: The model to benchmark
- `--dataset-name`: The dataset type to benchmark; ours is `hf` (a Hugging Face dataset).
- `--dataset-path`: The name of the Hugging Face dataset; ours is `unsloth/LaTeX_OCR`.
- `--hf-split`: The dataset split to use (train or test); we use `train`.
- `--hf-output-len`: The maximum number of output tokens to generate, set to 256.
- `--num-prompts`: The number of prompts to use for benchmarking. We use 1000 samples. If the dataset has fewer than 1000 samples, it will be re-sampled to reach 1000.
- `--lora_modules`: Name of the LoRA adapter(s) to use. Ours is `vision`. This should match the name used when serving the model.

We will use the default percentile metrics: `TTFT`, `TPOT`, and `ITL`. 
- Time To First Token (`TTFT`) is the time taken to generate the first token.
- Time per Output Token (`TPOT`) is the time taken to generate each token (excluding the first token).
- Inter-Token Latency (`ITL`) is the time taken between each token generation. 

The mean, median and 99th percentile values for these metrics will be reported in the output.






In [4]:
# To benchmark the serving performance of GPTQ Quantized model + LoRA adapter using vLLM, you can use the following command. 
'''
backend options'''
!python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --request-rate 5 \
  --max-concurrency 5 \
  --model arunmadhusudh/Qwen2-VL-2B-Instruct-4bit-GPTQ_T4_tr4512 \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path unsloth/LaTeX_OCR \
  --hf-split train \
  --hf-output-len 256 \
  --num-prompts 1000 \
  --lora_modules vision

INFO 06-20 21:21:03 [__init__.py:244] Automatically detected platform cuda.
Namespace(backend='openai-chat', base_url=None, host='127.0.0.1', port=8000, endpoint='/v1/chat/completions', dataset_name='hf', dataset_path='unsloth/LaTeX_OCR', max_concurrency=5, model='arunmadhusudh/Qwen2-VL-2B-Instruct-4bit-GPTQ_T4_tr4512', tokenizer=None, use_beam_search=False, num_prompts=1000, logprobs=None, request_rate=5.0, burstiness=1.0, seed=0, trust_remote_code=False, disable_tqdm=False, profile=False, save_result=False, save_detailed=False, append_result=False, metadata=None, result_dir=None, result_filename=None, ignore_eos=False, percentile_metrics='ttft,tpot,itl', metric_percentiles='99', goodput=None, custom_output_len=256, custom_skip_chat_template=False, sonnet_input_len=550, sonnet_output_len=150, sonnet_prefix_len=200, sharegpt_output_len=None, random_input_len=1024, random_output_len=128, random_range_ratio=0.0, random_prefix_len=0, hf_subset=None, hf_split='train', hf_output_len=256, to

In [7]:
# To benchmark the serving performance of the QLoRA model (NF4 quantization + LoRA adapter) using vLLM,
# run the following command while the corresponding `vllm serve` process is running.
# `--lora_modules vision` must match the adapter name used when serving the model.
!python3 vllm/benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --request-rate 5 \
  --max-concurrency 5 \
  --model unsloth/Qwen2-VL-2B-Instruct-bnb-4bit \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path unsloth/LaTeX_OCR \
  --hf-split train \
  --hf-output-len 256 \
  --num-prompts 1000 \
  --lora_modules vision

INFO 06-20 21:34:28 [__init__.py:244] Automatically detected platform cuda.
Namespace(backend='openai-chat', base_url=None, host='127.0.0.1', port=8000, endpoint='/v1/chat/completions', dataset_name='hf', dataset_path='unsloth/LaTeX_OCR', max_concurrency=5, model='unsloth/Qwen2-VL-2B-Instruct-bnb-4bit', tokenizer=None, use_beam_search=False, num_prompts=1000, logprobs=None, request_rate=5.0, burstiness=1.0, seed=0, trust_remote_code=False, disable_tqdm=False, profile=False, save_result=False, save_detailed=False, append_result=False, metadata=None, result_dir=None, result_filename=None, ignore_eos=False, percentile_metrics='ttft,tpot,itl', metric_percentiles='99', goodput=None, custom_output_len=256, custom_skip_chat_template=False, sonnet_input_len=550, sonnet_output_len=150, sonnet_prefix_len=200, sharegpt_output_len=None, random_input_len=1024, random_output_len=128, random_range_ratio=0.0, random_prefix_len=0, hf_subset=None, hf_split='train', hf_output_len=256, top_p=None, top_k=N