In [None]:
# conda environment: grok

# benchmark

In [None]:
# One single-key dict ("model_id") per model identifier, in the listed order.
models = [
    {"model_id": model_id}
    for model_id in (
        "google/gemma-3-1b-it",
        "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
        "Qwen/QwQ-32B",
        "mistralai/Devstral-Small-2505",
    )
]

In [4]:
import pandas as pd
import json

In [None]:
# Parse the ShareGPT JSON dump into a DataFrame and preview its first rows.
df = pd.read_json(
    path_or_buf="/home/ubuntu/grok-llms/inference/arena/data/ShareGPT_V3_unfiltered_cleaned_split.json"
)
df.head()

In [6]:
# Load the ShareGPT dump with the standard json module so the raw records
# can be inspected as plain Python objects.
json_dataset_path = "/home/ubuntu/grok-llms/inference/arena/data/ShareGPT_V3_unfiltered_cleaned_split.json"
# JSON text is UTF-8; pin the encoding so decoding does not depend on the
# locale default that open() would otherwise use.
with open(json_dataset_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Show the first record.
data[0]

{'id': 'QWJhYvA_0',
 'conversations': [{'from': 'human',
   'value': "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..."},
  {'from': 'gpt',
   'value': "Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:\n\n1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.\n2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.\n3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.\n4. Post-launch: Follow up with customers, gather feedback, and contin

# multi-lora inference

In [None]:
from huggingface_hub import login, snapshot_download
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

import os

# Never keep the access token in the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset or empty, pass
# None so that `login` prompts for a token rather than submitting "".
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN") or None
login(token=HUGGINGFACE_TOKEN)

# sudo chown -R $(whoami) ~/.cache/huggingface

In [None]:
# Offline engine for the Llama-2 7B base model, created with LoRA enabled so
# an adapter can be supplied with each generate() call.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

In [None]:
# Download the SQL LoRA adapter repo; the result is the local directory
# that holds the downloaded snapshot.
sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")

# Temperature 0, at most 64 generated tokens, stop at the closing
# "[/assistant]" tag.
sampling_params = SamplingParams(temperature=0.0, max_tokens=64, stop=["[/assistant]"])

# Two text-to-SQL prompts in the "[user] ... [/user] [assistant]" format.
# Each one carries an instruction, a table schema and a question; adjacent
# string literals are concatenated into a single prompt.
prompts = [
    (
        "[user] Write a SQL query to answer the question based on the table schema.\n\n"
        " context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n"
        " question: Name the ICAO for lilongwe international airport [/user] [assistant]"
    ),
    (
        "[user] Write a SQL query to answer the question based on the table schema.\n\n"
        " context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n"
        " question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]"
    ),
]

In [None]:
import os
# Print the entries of the downloaded adapter directory (sql_lora_path).
print(os.listdir(sql_lora_path))

In [None]:
# Generate completions for both prompts with the SQL adapter attached:
# adapter name "sql_adapter", integer id 1, weights located at sql_lora_path.
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest(
        lora_name="sql_adapter",
        lora_int_id=1,
        lora_path=sql_lora_path,
    ),
)

In [None]:
# Text of the first completion for the first prompt (the ICAO question).
print(outputs[0].outputs[0].text)

In [None]:
# Text of the first completion for the second prompt (the nationality question).
print(outputs[1].outputs[0].text)

### serve lora adapters

In [None]:
# NOTE: do an ls for $HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots and verify the snapshot id

# terminal 1
# Start the server for the base model with LoRA enabled and register the
# adapter at the given snapshot path under the name "sql-lora".
vllm serve meta-llama/Llama-2-7b-hf \
    --enable-lora \
    --lora-modules '{"name": "sql-lora", "path": "/home/ubuntu/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c", "base_model_name": "meta-llama/Llama-2-7b"}'
    

# terminal 2
# Query the models endpoint of the running server.
curl http://localhost:8000/v1/models

# Request a completion from the adapter by passing its registered name
# ("sql-lora") in the "model" field; the response is piped through jq.
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "sql-lora",
        "prompt": "San Francisco is a",
        "max_tokens": 7,
        "temperature": 0
    }' | jq

### dynamically serve lora adapters

In [None]:
# terminal 1
# Set the runtime-LoRA-updating flag before starting the server; no adapter
# is registered on the command line this time.
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
vllm serve meta-llama/Llama-2-7b-hf --enable-lora

# terminal 2
# Load the adapter into the running server under the name "sql_adapter".
curl -X POST http://localhost:8000/v1/load_lora_adapter \
-H "Content-Type: application/json" \
-d '{
    "lora_name": "sql_adapter",
    "lora_path": "/home/ubuntu/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c"
}'

# Request a completion from the adapter that was just loaded, selected by
# its name in the "model" field; the response is piped through jq.
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "sql_adapter",
        "prompt": "San Francisco is a",
        "max_tokens": 7,
        "temperature": 0
    }' | jq

# Unload the adapter again by name.
curl -X POST http://localhost:8000/v1/unload_lora_adapter \
-H "Content-Type: application/json" \
-d '{
    "lora_name": "sql_adapter"
}'

### lora with quantization 

In [1]:
# !pip install bitsandbytes --quiet

In [2]:
import gc
from typing import Optional

import torch
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest

  from .autonotebook import tqdm as notebook_tqdm


INFO 05-23 16:58:43 [__init__.py:239] Automatically detected platform cuda.


2025-05-23 16:58:45,294	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
def create_test_prompts(
        lora_path: str
) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Build the (prompt, sampling params, LoRA request) triples for the demo.

    Args:
        lora_path: Adapter location passed to every ``LoRARequest`` created here.

    Returns:
        Four triples. The first has ``None`` as its LoRA request
        (quantization without LoRA); the other three carry a ``LoRARequest``
        named "lora-test-1" to "lora-test-3". All three requests use integer
        id 1 and the same ``lora_path``.
    """

    def make_params() -> SamplingParams:
        # A new SamplingParams object per prompt; nothing is shared between
        # the returned triples.
        return SamplingParams(temperature=0.0,
                              logprobs=1,
                              prompt_logprobs=1,
                              max_tokens=128)

    # (prompt text, adapter name) pairs; a name of None means "no adapter".
    prompt_plan = [
        ("My name is", None),
        ("my name is", "lora-test-1"),
        ("The capital of USA is", "lora-test-2"),
        ("The capital of France is", "lora-test-3"),
    ]

    return [
        (text,
         make_params(),
         None if adapter_name is None else LoRARequest(
             adapter_name, 1, lora_path))
        for text, adapter_name in prompt_plan
    ]


def process_requests(engine: LLMEngine,
                     test_prompts: list[tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Submit the prompts to the engine and print every finished output.

    On each loop iteration at most one pending prompt is added to the
    engine, then the engine is stepped once. The loop ends when there are no
    prompts left to submit and the engine reports no unfinished requests.

    Args:
        engine: Engine that receives the requests and is stepped.
        test_prompts: (prompt, sampling params, optional LoRA request)
            triples. The list is consumed in place: entries are popped from
            the front as they are submitted.
    """
    next_request_id = 0

    while True:
        # Done once nothing is left to submit and nothing is still running.
        if not test_prompts and not engine.has_unfinished_requests():
            break

        if test_prompts:
            text, params, lora = test_prompts.pop(0)
            engine.add_request(str(next_request_id),
                               text,
                               params,
                               lora_request=lora)
            next_request_id += 1

        step_results: list[RequestOutput] = engine.step()
        for result in step_results:
            # Only report requests that have completed.
            if not result.finished:
                continue
            print("----------------------------------------------------")
            print(f"Prompt: {result.prompt}")
            print(f"Output: {result.outputs[0].text}")


def initialize_engine(model: str, quantization: str,
                      lora_repo: Optional[str]) -> LLMEngine:
    """Create an LLMEngine for a quantized model with LoRA enabled.

    Args:
        model: Model identifier forwarded to ``EngineArgs``.
        quantization: Quantization method forwarded to ``EngineArgs``.
        lora_repo: Part of the signature but not used by this function.

    Returns:
        The engine produced by ``LLMEngine.from_engine_args``.
    """
    # LoRA is always enabled, with a maximum LoRA rank of 64 and up to 4 LoRAs.
    return LLMEngine.from_engine_args(
        EngineArgs(
            model=model,
            quantization=quantization,
            enable_lora=True,
            max_lora_rank=64,
            max_loras=4,
        ))

In [4]:
# Configurations to run. Each entry gives a display name, the quantized base
# model, its quantization method and the LoRA adapter repo.
# Only the GPTQ configuration is active; the other two are commented out.
test_configs = [
    # QLoRA (https://arxiv.org/abs/2305.14314)
    # dict(
    #     name="qlora_inference_example",
    #     model="huggyllama/llama-7b",
    #     quantization="bitsandbytes",
    #     lora_repo="timdettmers/qlora-flan-7b",
    # ),
    # dict(
    #     name="AWQ_inference_with_lora_example",
    #     model="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
    #     quantization="awq",
    #     lora_repo="jashing/tinyllama-colorist-lora",
    # ),
    dict(
        name="GPTQ_inference_with_lora_example",
        model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        quantization="gptq",
        lora_repo="jashing/tinyllama-colorist-lora",
    ),
]

In [None]:
# Run every configuration end to end: build the engine, download the adapter,
# push the test prompts through the engine, then release the engine before
# the next configuration starts.
for test_config in test_configs:
    print(
        f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~"
    )
    engine = initialize_engine(test_config['model'],
                               test_config['quantization'],
                               test_config['lora_repo'])
    try:
        lora_path = snapshot_download(repo_id=test_config['lora_repo'])
        test_prompts = create_test_prompts(lora_path)
        process_requests(engine, test_prompts)
    finally:
        # Clean up the GPU memory for the next test. Doing this in `finally`
        # means a failed run does not leave `engine` bound in the notebook
        # namespace; the original exception still propagates afterwards.
        del engine
        gc.collect()
        torch.cuda.empty_cache()

~~~~~~~~~~~~~~~~ Running: GPTQ_inference_with_lora_example ~~~~~~~~~~~~~~~~
INFO 05-23 16:58:51 [config.py:717] This model supports multiple tasks: {'generate', 'embed', 'score', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 05-23 16:58:52 [gptq_marlin.py:147] Detected that the model can run with gptq_marlin, however you specified quantization=gptq explicitly, so forcing gptq. Use quantization=gptq_marlin for faster inference
INFO 05-23 16:58:52 [gptq_bitblas.py:168] Detected that the model can run with gptq_bitblas, however you specified quantization=gptq explicitly, so forcing gptq. Use quantization=gptq_bitblas for faster inference
INFO 05-23 16:58:52 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 05-23 16:58:57 [__init__.py:239] Automatically detected platform cuda.
INFO 05-23 16:58:59 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ', speculative_config=None, tokenizer='T

2025-05-23 16:58:59,572 - INFO - flashinfer.jit: Prebuilt kernels not found, using JIT backend


INFO 05-23 16:59:00 [parallel_state.py:1004] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 05-23 16:59:00 [cuda.py:221] Using Flash Attention backend on V1 engine.
INFO 05-23 16:59:00 [topk_topp_sampler.py:44] Currently, FlashInfer top-p & top-k sampling sampler is disabled because FlashInfer>=v0.2.3 is not backward compatible. Falling back to the PyTorch-native implementation of top-p & top-k sampling.
INFO 05-23 16:59:00 [gpu_model_runner.py:1329] Starting to load model TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ...
INFO 05-23 16:59:00 [weight_utils.py:265] Using model weights format ['*.safetensors']
INFO 05-23 16:59:02 [weight_utils.py:281] Time spent downloading weights for TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ: 2.162578 seconds
INFO 05-23 16:59:02 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.91it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  5.91it/s]



INFO 05-23 16:59:02 [loader.py:458] Loading weights took 0.20 seconds
INFO 05-23 16:59:02 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 05-23 16:59:03 [gpu_model_runner.py:1347] Model loading took 1.1657 GiB and 2.789998 seconds
INFO 05-23 16:59:11 [backends.py:420] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/e57ad72506/rank_0_0 for vLLM's torch.compile
INFO 05-23 16:59:11 [backends.py:430] Dynamo bytecode transform time: 7.63 s
INFO 05-23 16:59:14 [backends.py:136] Cache the graph of shape None for later use
INFO 05-23 16:59:43 [backends.py:148] Compiling a graph for general shape takes 32.23 s
INFO 05-23 17:00:01 [monitor.py:33] torch.compile takes 39.86 s in total
INFO 05-23 17:00:01 [kv_cache_utils.py:634] GPU KV cache size: 1,790,608 tokens
INFO 05-23 17:00:01 [kv_cache_utils.py:637] Maximum concurrency for 2,048 tokens per request: 874.32x
INFO 05-23 17:00:49 [gpu_model_runner.py:1686] Graph capturing finished in 47 secs, took 2.59 GiB
INFO 05-23 17

Process EngineCore_0:
Traceback (most recent call last):
  File "/opt/conda/envs/grok/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/conda/envs/grok/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/envs/grok/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 400, in run_engine_core
    raise e
  File "/opt/conda/envs/grok/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 389, in run_engine_core
    engine_core.run_busy_loop()
  File "/opt/conda/envs/grok/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 413, in run_busy_loop
    self._process_engine_step()
  File "/opt/conda/envs/grok/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 438, in _process_engine_step
    outputs = self.step_fn()
              ^^^^^^^^^^^^^^
  File "/opt/conda/envs/grok/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 203, in step
    outpu

EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.

