In [36]:
# JUPYTER NOTEBOOK: MCQ decoding + parsing sanity test for vLLM + Jamba/Mamba/etc.

import re
import os

# Environment knobs for this session, assigned ahead of the vllm import that follows.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1",
        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
        "VLLM_LOGGING_LEVEL": "DEBUG",
        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1",
    }
)

from typing import Optional, List, Dict, Any
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# -----------------------
# Config (edit these)
# -----------------------
# Engine settings: these are passed to the LLM(...) constructor later in this cell.
MODEL = "ibm-ai-platform/Bamba-9B-v1"   # or "ai21labs/Jamba-7B", "mistralai/Mamba-Codestral-7B-v0.1", etc.
CTX_LEN = 16384  # forwarded as max_model_len
DTYPE = "float16"  # forwarded as dtype
TP = 1  # forwarded as tensor_parallel_size
GPU_MEM_UTIL = 0.90  # forwarded as gpu_memory_utilization

# Decoding settings: the later Bamba cell reassigns these three names.
USE_CHAT = True   # for Jamba; set False for some completion-only models
MAX_NEW_MAIN = 4  # MCQ: 2-4 is plenty
MAX_NEW_REP = 1  # presumably the re-prompt token budget (cf. sp_rep in the later cell) - TODO confirm

# -----------------------
# Prompt builder
# -----------------------
# Zero-shot MCQ layout; the placeholders are filled in by build_prompt.
DEFAULT_0SHOT = """{doc}

Question: {q}

A. {a}
B. {b}
C. {c}
D. {d}

Answer:"""

def build_prompt(doc: str, q: str, A: str, B: str, C: str, D: str) -> str:
    """Render one multiple-choice item into the zero-shot prompt text.

    Every argument is whitespace-stripped before being substituted into
    DEFAULT_0SHOT, and a one-letter answer instruction is appended after
    the template's trailing "Answer:" cue.
    """
    slots = {"doc": doc, "q": q, "a": A, "b": B, "c": C, "d": D}
    cleaned = {key: value.strip() for key, value in slots.items()}
    instruction = "\n\nReturn exactly one letter: A, B, C, or D."
    return DEFAULT_0SHOT.format(**cleaned) + instruction

def should_use_chat(name: str) -> bool:
    """Report whether a model name contains one of the chat-style markers.

    The comparison is case-insensitive; a falsy name (None or "") yields False.
    """
    lowered = name.lower() if name else ""
    for marker in ("instruct", "chat", "assistant", "jamba"):
        if marker in lowered:
            return True
    return False

def maybe_apply_chat_template(tok, prompt: str, use_chat: bool) -> str:
    """Wrap the prompt as a single user turn when chat formatting is requested.

    The tokenizer's apply_chat_template is called (untokenized, with the
    generation prompt appended) only if use_chat is true and the tokenizer
    exposes that method; in every other case the prompt is returned as-is.
    """
    wants_template = use_chat and hasattr(tok, "apply_chat_template")
    if wants_template:
        conversation = [{"role": "user", "content": prompt}]
        return tok.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
    return prompt

# -----------------------
# Answer extractor (fixed)
# -----------------------
# Explicit answer phrasings, tried in order before the bare-letter fallback.
# The \b directly after the captured letter stops the first letter of an
# ordinary word (e.g. "Both", "Definitely") from being read as an option label.
ANSWER_PATTERNS = [
    re.compile(r"(?:final\s*answer|answer)\s*[:\-]\s*\(?([A-D])\b\)?", re.IGNORECASE),
    re.compile(r"the\s+correct\s+answer\s+is\s*\(?([A-D])\b\)?", re.IGNORECASE),
]

def extract_answer(response: str) -> Optional[str]:
    """Pull the chosen option letter (A-D) out of a model response.

    Args:
        response: Raw generated text; may be empty or None.

    Returns:
        The upper-cased option letter, or None when the response is empty or
        no candidate letter is found.
    """
    if not response:
        return None
    # Drop markdown emphasis markers so "**Answer:** C" still matches.
    text = response.replace("*", "").strip()
    # Only the last 512 characters are searched.
    tail = text[-512:]
    for pat in ANSWER_PATTERNS:
        m = pat.search(tail)
        if m:
            return m.group(1).upper()
    # Fallback: first standalone capital A-D. This search is case-sensitive,
    # so a lowercase article "a" is not picked up.
    m = re.search(r"\b([A-D])\b", tail)
    return m.group(1).upper() if m else None

# -----------------------
# Load vLLM engine (the tokenizer is loaded in a later cell)
# -----------------------

# Build the vLLM engine from the config constants above. Per the log output
# captured below this cell, construction loads the checkpoint shards and
# captures CUDA graphs. The resulting `llm` is what the later run cell calls
# generate() on.
llm = LLM(
    model=MODEL,
    trust_remote_code=True,  # NOTE(review): the captured log reports this argument "has no effect here and is ignored"
    dtype=DTYPE,
    tensor_parallel_size=TP,
    max_model_len=CTX_LEN,
    gpu_memory_utilization=GPU_MEM_UTIL,
)




# Toy multiple-choice items iterated by the run loop in the later cell.
# Each dict carries: id, doc (context passage), q (question), the four
# options A-D, and gold (the expected option letter).
tests: List[Dict[str, Any]] = [
    {
        "id": "toy_1",
        "doc": "Video PreTraining (VPT) learns to act by watching unlabeled videos and uses a small labeled set to train an inverse dynamics model to label actions.",
        "q": "Which of the following statements is correct?",
        "A": "Both contractor data and data crawled from the Internet are used to train VPT agents to model state-action pairs.",
        "B": "All machine learning methods involved in the two articles are related to neural network deep learning.",
        "C": "Both voyager and VPT control Minecraft agents by predicting the actions of simulated mouse and keyboard operations in each given state.",
        "D": "VPT's modeling of action space is approximate rather than precise.",
        "gold": "D",
    },
]


INFO 12-17 07:27:23 [utils.py:233] non-default args: {'trust_remote_code': True, 'dtype': 'float16', 'max_model_len': 16384, 'disable_log_stats': True, 'model': 'ibm-ai-platform/Bamba-9B-v1'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 12-17 07:27:24 [model.py:547] Resolved architecture: BambaForCausalLM


Parse safetensors files: 100%|████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 16.06it/s]

INFO 12-17 07:27:24 [model.py:1510] Using max model len 16384
INFO 12-17 07:27:24 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 12-17 07:27:24 [config.py:297] Hybrid or mamba-based model detected: disabling prefix caching since it is not yet supported.
INFO 12-17 07:27:24 [config.py:308] Hybrid or mamba-based model detected: setting cudagraph mode to FULL_AND_PIECEWISE in order to optimize performance.
INFO 12-17 07:27:24 [config.py:376] Setting attention block size to 528 tokens to ensure that attention page size is >= mamba page size.
INFO 12-17 07:27:24 [config.py:397] Padding mamba page size by 0.69% to ensure that mamba page size and attention page size are exactly equal.





DEBUG 12-17 07:27:32 [plugins/__init__.py:28] No plugins for group vllm.platform_plugins found.
DEBUG 12-17 07:27:32 [platforms/__init__.py:34] Checking if TPU platform is available.
DEBUG 12-17 07:27:32 [platforms/__init__.py:52] TPU platform is not available because: No module named 'libtpu'
DEBUG 12-17 07:27:32 [platforms/__init__.py:58] Checking if CUDA platform is available.
DEBUG 12-17 07:27:32 [platforms/__init__.py:78] Confirmed CUDA platform is available.
DEBUG 12-17 07:27:32 [platforms/__init__.py:106] Checking if ROCm platform is available.
DEBUG 12-17 07:27:32 [platforms/__init__.py:120] ROCm platform is not available because: No module named 'amdsmi'
DEBUG 12-17 07:27:32 [platforms/__init__.py:127] Checking if XPU platform is available.
DEBUG 12-17 07:27:32 [platforms/__init__.py:146] XPU platform is not available because: No module named 'intel_extension_for_pytorch'
DEBUG 12-17 07:27:32 [platforms/__init__.py:153] Checking if CPU platform is available.
DEBUG 12-17 07:27:

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:07<00:22,  7.61s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:15<00:15,  7.62s/it]


[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:27:55 [model_executor/models/utils.py:186] Loaded weight lm_head.weight with shape torch.Size([128256, 4096])


Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:20<00:06,  6.74s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:26<00:00,  6.19s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:26<00:00,  6.57s/it]
[1;36m(EngineCore_DP0 pid=2290876)[0;0m 


[1;36m(EngineCore_DP0 pid=2290876)[0;0m INFO 12-17 07:28:05 [model_executor/model_loader/default_loader.py:267] Loading weights took 26.47 seconds
[1;36m(EngineCore_DP0 pid=2290876)[0;0m INFO 12-17 07:28:06 [v1/worker/gpu_model_runner.py:2653] Model loading took 18.2459 GiB and 27.099465 seconds
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:06 [compilation/decorators.py:256] Start compiling function <code object forward at 0x113f6430, file "/insomnia001/depts/edu/COMS-E6998-015/dwz2107/envs/ssm-venv/lib/python3.11/site-packages/vllm/model_executor/models/bamba.py", line 305>
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:08 [compilation/backends.py:501] Traced files (to be considered for compilation cache):
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:08 [compilation/backends.py:501] /insomnia001/depts/edu/COMS-E6998-015/dwz2107/envs/ssm-venv/lib/python3.11/site-packages/torch/_dynamo/polyfills/__init__.py
[1;36m(EngineCore_DP0 pid=2290876

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   1%|▏         | 1/67 [00:00<00:10,  6.50it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:25 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:25 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=504, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   4%|▍         | 3/67 [00:00<00:10,  6.28it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:25 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:25 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=488, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):   7%|▋         | 5/67 [00:00<00:09,  6.54it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:25 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=480, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:25 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=472, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  10%|█         | 7/67 [00:01<00:09,  6.66it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:25 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=464, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:26 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=456, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  13%|█▎        | 9/67 [00:01<00:08,  6.78it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:26 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=448, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:26 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=440, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  16%|█▋        | 11/67 [00:01<00:08,  6.83it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:26 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=432, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:26 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=424, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  19%|█▉        | 13/67 [00:01<00:07,  6.91it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:26 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=416, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:26 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=408, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  22%|██▏       | 15/67 [00:02<00:07,  6.94it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:27 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=400, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:27 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=392, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  25%|██▌       | 17/67 [00:02<00:07,  6.79it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:27 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=384, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:27 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=376, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  28%|██▊       | 19/67 [00:02<00:06,  7.70it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:27 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=368, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:27 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=360, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  31%|███▏      | 21/67 [00:02<00:05,  8.25it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:27 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=352, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=344, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  34%|███▍      | 23/67 [00:03<00:05,  8.62it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=336, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=328, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  37%|███▋      | 25/67 [00:03<00:04,  8.55it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=320, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=312, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  40%|████      | 27/67 [00:03<00:04,  8.58it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=304, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=296, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  43%|████▎     | 29/67 [00:03<00:04,  8.64it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=288, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:28 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=280, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  46%|████▋     | 31/67 [00:04<00:04,  8.72it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=272, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=264, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  51%|█████     | 34/67 [00:04<00:03,  9.71it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=256, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=248, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=240, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=232, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=224, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compil

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  57%|█████▋    | 38/67 [00:04<00:02, 10.71it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=208, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=200, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:29 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=192, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  63%|██████▎   | 42/67 [00:05<00:02, 11.69it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=184, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=176, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=168, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  69%|██████▊   | 46/67 [00:05<00:01, 13.09it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=160, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=152, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=144, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=136, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=128, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compil

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  75%|███████▍  | 50/67 [00:05<00:01, 14.38it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=104, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=96, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=88, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=80, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  81%|████████  | 54/67 [00:05<00:00, 15.83it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=72, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:30 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=64, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=56, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=48, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  87%|████████▋ | 58/67 [00:06<00:00, 16.03it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=40, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=32, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=24, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=16, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE):  93%|█████████▎| 62/67 [00:06<00:00, 16.94it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=8, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=4, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=2, uniform_decode=False))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:31 [compilation/cuda_graph.py:136] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1, uniform_decode=False))


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:06<00:00, 10.10it/s]
Capturing CUDA graphs (decode, FULL):   0%|          | 0/35 [00:00<?, ?it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:32 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=256, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):   3%|▎         | 1/35 [00:00<00:27,  1.22it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:32 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=248, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:32 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=240, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):   9%|▊         | 3/35 [00:01<00:11,  2.76it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=232, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=224, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):  14%|█▍        | 5/35 [00:01<00:06,  4.80it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=216, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=208, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):  20%|██        | 7/35 [00:01<00:04,  6.52it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=200, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=192, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=184, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):  29%|██▊       | 10/35 [00:02<00:02,  8.42it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=176, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=168, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=160, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):  40%|████      | 14/35 [00:02<00:02,  9.92it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:33 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=152, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=144, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=136, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):  51%|█████▏    | 18/35 [00:02<00:01, 11.13it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=128, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=120, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=112, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=104, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=96, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Captur

Capturing CUDA graphs (decode, FULL):  63%|██████▎   | 22/35 [00:03<00:01, 12.42it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=80, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=72, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=64, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):  74%|███████▍  | 26/35 [00:03<00:00, 13.19it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=56, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:34 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=48, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:35 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=40, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):  86%|████████▌ | 30/35 [00:03<00:00, 14.18it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:35 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=32, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:35 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=24, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:35 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=16, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:35 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=8, uniform_decode=True))


Capturing CUDA graphs (decode, FULL):  97%|█████████▋| 34/35 [00:03<00:00, 15.35it/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:35 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=4, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:35 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=2, uniform_decode=True))
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:35 [compilation/cuda_graph.py:136] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1, uniform_decode=True))


Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:04<00:00,  7.95it/s]


[1;36m(EngineCore_DP0 pid=2290876)[0;0m INFO 12-17 07:28:36 [v1/worker/gpu_model_runner.py:3480] Graph capturing finished in 12 secs, took 0.68 GiB
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:36 [v1/worker/gpu_worker.py:393] Free memory on device (47.1/47.4 GiB) on startup. Desired GPU memory utilization is (0.9, 42.66 GiB). Actual usage is 18.25 GiB for weight, 1.08 GiB for peak activation, 0.04 GiB for non-torch memory, and 0.68 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=24133454950` (22.48 GiB) to fit into requested memory, or `--kv-cache-memory=28894258688` (26.91 GiB) to fully utilize gpu memory. Current kv cache memory in use is 23.3 GiB.
[1;36m(EngineCore_DP0 pid=2290876)[0;0m INFO 12-17 07:28:36 [v1/engine/core.py:210] init engine (profile, create kv cache, warmup model) took 30.15 seconds
[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:28:37 [v1/engine/core.py:737] EngineCore waiting for work.
INFO 12-17

In [40]:
import re
from typing import Optional, List, Dict, Any
from transformers import AutoTokenizer
from vllm import SamplingParams

# -------------------------
# Config for BAMBA
# -------------------------
# These reassign config names first set in the earlier cell.
MODEL = "ibm-ai-platform/Bamba-9B-v1"  # adjust if your hf_id differs
USE_CHAT = False  # IMPORTANT: Bamba is completion-style
MAX_NEW_MAIN = 8  # max_tokens for sp_main below
MAX_NEW_REP = 16  # max_tokens for sp_rep below

# -------------------------
# Answer patterns + extractor
# -------------------------
# Explicit answer phrasings, tried in order after the \boxed{} check.
# The \b directly after the captured letter stops the first letter of an
# ordinary word (e.g. "Both", "Definitely") from being read as an option label.
ANSWER_PATTERNS = [
    re.compile(r"(?:final\s*answer|answer)\s*[:\-]\s*\(?\s*([A-D])\b\s*\)?", re.IGNORECASE),
    re.compile(r"the\s+correct\s+answer\s+is\s*[:\-]?\s*\(?\s*([A-D])\b\s*\)?", re.IGNORECASE),
]

def extract_answer(response: str) -> Optional[str]:
    """Pull the chosen option letter (A-D) out of a model response.

    Candidates are tried in this order: a LaTeX-style \\boxed{X}, the
    explicit phrasings in ANSWER_PATTERNS, then the first standalone
    capital letter A-D.

    Args:
        response: Raw generated text; may be empty or None.

    Returns:
        The upper-cased option letter, or None when the response is empty or
        no candidate letter is found.
    """
    if not response:
        return None
    # Drop markdown emphasis markers so "**Final answer:** C" still matches.
    text = response.replace("*", "").strip()
    # Only the last 512 characters are searched.
    tail = text[-512:]

    m = re.search(r"\\boxed\s*\{\s*([A-D])\s*\}", tail, re.IGNORECASE)
    if m:
        return m.group(1).upper()

    for pat in ANSWER_PATTERNS:
        m = pat.search(tail)
        if m:
            return m.group(1).upper()

    # Fallback is case-sensitive, so a lowercase article "a" is not picked up.
    m = re.search(r"\b([A-D])\b", tail)
    return m.group(1).upper() if m else None

# -------------------------
# Prompt builders
# -------------------------
# Zero-shot MCQ template filled by build_prompt. Placeholders: doc, q, a, b, c, d.
# The text ends on the "Final answer:" cue.
DEFAULT_0SHOT = """{doc}

Question: {q}

A. {a}
B. {b}
C. {c}
D. {d}

Final answer:"""

def add_filler_tokens(prompt: str, tok, n_tokens: int = 5000) -> str:
    """
    Append roughly ``n_tokens`` tokenizer tokens of harmless filler to ``prompt``.

    The filler is the unit " lorem" repeated, encoded with ``tok``, truncated to
    ``n_tokens`` ids and decoded back to text. It is appended after a
    "[FILLER]" marker line. The count is approximate: the decoded text is not
    re-encoded here, so its token length when tokenized as part of the full
    prompt is not checked.

    Args:
        prompt: Text the filler is appended to.
        tok: Tokenizer-like object providing
            ``encode(text, add_special_tokens=False)`` and
            ``decode(ids, skip_special_tokens=True)``.
        n_tokens: Number of filler tokens to keep; must be >= 0.

    Returns:
        ``prompt`` followed by "\\n\\n[FILLER]\\n" and the filler text.

    Raises:
        ValueError: If ``n_tokens`` is negative.
        RuntimeError: If the tokenizer yields no tokens for the filler unit.
    """
    # A negative count would turn the slice below into "drop the last |n| ids",
    # silently producing filler of an unrelated length.
    if n_tokens < 0:
        raise ValueError(f"n_tokens must be >= 0, got {n_tokens}")

    filler_unit = " lorem"  # stable, non-special
    unit_ids = tok.encode(filler_unit, add_special_tokens=False)
    if not unit_ids:
        raise RuntimeError("Tokenizer produced no tokens for filler_unit")

    # Over-generate by one repetition so the slice below has enough ids.
    reps = (n_tokens // len(unit_ids)) + 1
    filler_text = filler_unit * reps

    # Trim to n_tokens ids, then turn them back into text.
    ids = tok.encode(filler_text, add_special_tokens=False)[:n_tokens]
    filler_text = tok.decode(ids, skip_special_tokens=True)

    return prompt + "\n\n[FILLER]\n" + filler_text

def build_prompt(doc: str, q: str, A: str, B: str, C: str, D: str) -> str:
    """Fill the DEFAULT_0SHOT template with whitespace-stripped fields.

    Args:
        doc: Context passage placed at the top of the prompt.
        q: Question text.
        A, B, C, D: The four answer choices.

    Returns:
        The formatted prompt string.
    """
    fields = {"doc": doc, "q": q, "a": A, "b": B, "c": C, "d": D}
    cleaned = {key: value.strip() for key, value in fields.items()}
    return DEFAULT_0SHOT.format(**cleaned)

def build_reprompt(q: str, A: str, B: str, C: str, D: str, prev: str) -> str:
    """Build the short fallback prompt used when the first answer could not be parsed.

    The choices are repeated so the letters stay grounded, and the previous
    model output is included, stripped and capped at 300 characters.

    Args:
        q: Question text.
        A, B, C, D: The four answer choices.
        prev: The model's earlier, unparsed output.

    Returns:
        The reprompt text, ending with the "Final answer:" cue.
    """
    previous = prev.strip()[:300]
    lines = [
        "Return exactly one letter: A, B, C, or D.",
        "",
        f"Question: {q.strip()}",
        f"Prev: {previous}",
        f"A. {A.strip()}",
        f"B. {B.strip()}",
        f"C. {C.strip()}",
        f"D. {D.strip()}",
        "",
        "Final answer:",
    ]
    return "\n".join(lines)

# -------------------------
# Tokenizer + sampling params
# -------------------------
# Tokenizer for MODEL; used here only to measure/build filler text.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo
# to run locally — confirm this is acceptable for the chosen MODEL.
tok = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

# Both passes use temperature=0.0; they differ only in max_tokens and seed.
sp_main = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=MAX_NEW_MAIN, seed=0)
sp_rep  = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=MAX_NEW_REP, seed=42)

# -------------------------
# Run tests (expects `llm` and `tests` already defined)
# tests items must have: id, doc, q, A, B, C, D, (optional) gold
# -------------------------
# For each test item: build the prompt, pad it with filler, generate once,
# parse the answer, and fall back to a short reprompt if parsing failed.
for t in tests:
    raw = build_prompt(t["doc"], t["q"], t["A"], t["B"], t["C"], t["D"])

    # Pad the prompt with ~5000 filler tokens.
    # NOTE(review): the filler lands after the "Final answer:" cue, so the
    # model is asked to continue filler text rather than answer — confirm intended.
    # NOTE(review): 5000 extra tokens may exceed the model's native context
    # window — TODO confirm against the model config.
    raw = add_filler_tokens(raw, tok, n_tokens=5000)

    # generate() is given a one-element prompt list; take its single result.
    out = llm.generate([raw], sp_main)[0]
    text = out.outputs[0].text if out.outputs else ""
    main_pred = extract_answer(text)
    pred = main_pred  # final prediction; replaced below if we reprompt

    rep_text = ""
    rep_pred = None
    reprompted = main_pred is None
    if reprompted:
        rep_raw = build_reprompt(t["q"], t["A"], t["B"], t["C"], t["D"], text)
        print(rep_raw)
        rep_out = llm.generate([rep_raw], sp_rep)[0]
        rep_text = rep_out.outputs[0].text if rep_out.outputs else ""
        rep_pred = extract_answer(rep_text)
        pred = rep_pred

    print("=" * 80)
    print("id:", t["id"])
    print("PROMPT (last 400 chars):\n", raw[-400:])
    print("\nMAIN OUTPUT:\n", repr(text))
    # Report the first-pass prediction itself, not the post-reprompt value.
    print("MAIN pred:", main_pred)
    # Keyed on whether a reprompt ran, so an empty reprompt output is still shown.
    if reprompted:
        print("\nREPROMPT OUTPUT:\n", repr(rep_text))
        print("REP pred:", rep_pred)
    if "gold" in t:
        print("gold:", t["gold"], "correct:", (pred == t["gold"]))



Adding requests: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 28.23it/s]


[1;36m(EngineCore_DP0 pid=2290876)[0;0m DEBUG 12-17 07:34:08 [v1/engine/core.py:743] EngineCore loop active.


Processed prompts:   0%|                  | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

[1;36m(EngineCore_DP0 pid=2290876)[0;0m ERROR 12-17 07:34:09 [logging_utils/dump_input.py:69] Dumping input data for V1 LLM engine (v0.11.0) with config: model='ibm-ai-platform/Bamba-9B-v1', speculative_config=None, tokenizer='ibm-ai-platform/Bamba-9B-v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=ibm-ai-platform/Bamba-9B

/insomnia001/home/dwz2107/.cache/vllm/torch_compile_cache/66dcbb55ec/rank_0_0/inductor_cache/ii/ciil7rhth7sqj5ecr2awv7wlwey2bil2uiejvrqlufo3xiowouz7.py:37: unknown: block: [8338,0,0], thread: [64,0,0] Assertion `index out of bounds: 0 <= tl.broadcast_to(tmp10, [XBLOCK]) < 4096` failed.
/insomnia001/home/dwz2107/.cache/vllm/torch_compile_cache/66dcbb55ec/rank_0_0/inductor_cache/ii/ciil7rhth7sqj5ecr2awv7wlwey2bil2uiejvrqlufo3xiowouz7.py:37: unknown: block: [8338,0,0], thread: [65,0,0] Assertion `index out of bounds: 0 <= tl.broadcast_to(tmp10, [XBLOCK]) < 4096` failed.
/insomnia001/home/dwz2107/.cache/vllm/torch_compile_cache/66dcbb55ec/rank_0_0/inductor_cache/ii/ciil7rhth7sqj5ecr2awv7wlwey2bil2uiejvrqlufo3xiowouz7.py:37: unknown: block: [8338,0,0], thread: [66,0,0] Assertion `index out of bounds: 0 <= tl.broadcast_to(tmp10, [XBLOCK]) < 4096` failed.
/insomnia001/home/dwz2107/.cache/vllm/torch_compile_cache/66dcbb55ec/rank_0_0/inductor_cache/ii/ciil7rhth7sqj5ecr2awv7wlwey2bil2uiejvrqlufo

EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.

