In [None]:
import os

# Configure the process environment BEFORE importing vLLM. The import pulls in
# native libraries, and environment variables written afterwards may not be
# seen by code that reads them while it is being loaded.

# Select the GPU to run on (edit the index for your machine).
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Reduce log noise from the native back ends.
os.environ["NCCL_DEBUG"] = "ERROR"
os.environ["NCCL_DEBUG_FILE"] = "/dev/null"
os.environ["GLOG_minloglevel"] = "2"
os.environ["TORCH_CPP_LOG_LEVEL"] = "ERROR"

from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest

# Initialize the LLM model
# enable_steer_vector=True: Enables vector steering (without this, behaves like regular vLLM)
# enforce_eager=True: Ensures reliability and stability of interventions (strongly recommended)
model_id = "llava-hf/llava-v1.6-vicuna-7b-hf"
llm = LLM(
    model=model_id,
    enforce_eager=True,
    tensor_parallel_size=1,
    enable_chunked_prefill=False,
    enable_steer_vector=True,
)


In [2]:
from transformers import AutoTokenizer, LlavaNextProcessor
from PIL import Image
# Open every input image; the resulting list is what gets passed to the model
# as multi-modal data in the generation cells.
paths = ["test.jpg"]
images = list(map(Image.open, paths))

# NOTE: despite its name, `tokenizer` holds the LLaVA-NeXT *processor*; it is
# used here only to render the chat template into a prompt string.
tokenizer = LlavaNextProcessor.from_pretrained(model_id)

question = "Is there a parking meter in the image?"

# One single-turn conversation: an image placeholder followed by the question.
messages_list = [
    [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": question},
            ],
        }
    ]
]

# Render each conversation to plain text (no tokenization) and append the
# generation prompt so the model continues as the assistant.
mm_prompts = [
    tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    for conversation in messages_list
]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
# Decoding settings reused by both generation cells: temperature 0.0 and at
# most 128 generated tokens.
sampling_params = SamplingParams(max_tokens=128, temperature=0.0)

# Show the rendered chat prompt so the template output can be inspected.
print(mm_prompts[0])

USER: <image>
Is there a parking meter in the image? ASSISTANT:


In [4]:
# Baseline run: same prompt and image, no steering vector applied.
outputs = llm.generate(
    dict(
        prompt=mm_prompts[0],
        multi_modal_data=dict(image=images),
    ),
    sampling_params=sampling_params,
)

# Print the text of the first completion of the first (only) request.
print(outputs[0].outputs[0].text)

Adding requests: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s, est. speed input: 2722.62 toks/s, output: 45.83 toks/s]

 Yes, there is a parking meter in the image. It is located on the sidewalk next to the street.





In [5]:
import numpy as np
import torch

# Convert the steering vector from NumPy's .npy format to a torch .pt file.
npy_file_path, pt_file_path = (
    "task_vector_layer-10.npy",
    "task_vector_layer-10.pt",
)

# Load the array, wrap it as a tensor, and serialize it with torch.save.
data = np.load(npy_file_path)
tensor_data = torch.from_numpy(data)
torch.save(tensor_data, pt_file_path)

print(f"Successfully converted {npy_file_path} to {pt_file_path}")

Successfully converted task_vector_layer-10.npy to task_vector_layer-10.pt


In [6]:
# Steered run: same prompt and image as the baseline cell, but with the
# converted vector attached to the request.
# NOTE(review): the two positional arguments look like a request name and an
# integer id — confirm against the SteerVectorRequest signature.
steer_request = SteerVectorRequest(
    "1",
    1,
    steer_vector_local_path="task_vector_layer-10.pt",
    scale=6,
    target_layers=[10],
    prefill_trigger_positions=[-1],
    generate_trigger_tokens=[-1],
)

steer_output = llm.generate(
    dict(
        prompt=mm_prompts[0],
        multi_modal_data=dict(image=images),
    ),
    sampling_params=sampling_params,
    steer_vector_request=steer_request,
)

# Print the steered completion for comparison with the baseline answer.
print(steer_output[0].outputs[0].text)

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 512.19it/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s, est. speed input: 2681.05 toks/s, output: 57.77 toks/s]

 No, there is no parking meter visible in the image. The image shows a street scene with a building, traffic lights, and a street sign.



