In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Initialize LLM with steering vector capability
llm = LLM(
    model="/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1",
    enable_steer_vector=True,
    enforce_eager=True,
    tensor_parallel_size=1,
    enable_chunked_prefill=False
)

INFO 11-03 15:44:02 [utils.py:253] non-default args: {'disable_log_stats': True, 'enforce_eager': True, 'enable_steer_vector': True, 'enable_chunked_prefill': False, 'model': '/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1'}
INFO 11-03 15:44:02 [model.py:657] Resolved architecture: MistralForCausalLM
INFO 11-03 15:44:02 [model.py:1746] Using max model len 32768
INFO 11-03 15:44:05 [scheduler.py:211] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-03 15:44:05 [vllm.py:414] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:05 [core.py:94] Initializing a V1 LLM engine (v0.1.dev10888+g9d4fd0da4.d20251031) with config: model='/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1', speculative_config=None, tokenizer='/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:17 [default_loader.py:314] Loading weights took 7.15 seconds
[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:17 [steer_vector_model_runner_mixin.py:36] Initialized SteerVector worker manager
[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:17 [steer_vector_model_runner_mixin.py:50] Wrapping model with steer vector support
[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:17 [hidden_states_model_runner_mixin.py:90] Wrapped 32 decoder layers for hidden states capture
[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:19 [gpu_model_runner.py:2971] Model loading took 13.4967 GiB and 7.358022 seconds
[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:22 [gpu_worker.py:343] Available KV cache memory: 28.28 GiB
[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:22 [kv_cache_utils.py:1247] GPU KV cache size: 231,648 tokens
[1;36m(EngineCore_DP0 pid=780885)[0;0m INFO 11-03 15:44:22 [kv_ca

In [2]:
# before steer: false
messages = [
    {"role": "user", "content": "Solve the problem: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?"},
]
tokenizer = AutoTokenizer.from_pretrained("/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1")
example = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(example)
# Generate baseline response without steering
example_answers = llm.generate(
    example,
    SamplingParams(
        temperature=0,
        max_tokens=256,
        skip_special_tokens=False,
    ),
)

# Display baseline response
print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

<s> [INST] Solve the problem: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take? [/INST]


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
 To solve this problem, we need to add the amount of blue fiber and white fiber required for the robe.

We know that:
1 bolt of blue fiber = 1 bolt of white fiber

So,
2 bolts of blue fiber = 2 bolts of white fiber

Therefore, the total number of bolts required for the robe is:

2 bolts of blue fiber + 2 bolts of white fiber = 4 bolts in total

So, it takes 4 bolts of fiber in total to make the robe.


In [3]:
# after steer: true
sv_request = SteerVectorRequest(
    steer_vector_name="reason",
    steer_vector_int_id=1,
    steer_vector_local_path="reason.gguf", 
    prefill_trigger_tokens=[-1],
    generate_trigger_tokens=[-1],
    algorithm="direct",
    scale=1.0,
    target_layers=list(range(16,20))
)

output = llm.generate(
    example, 
    SamplingParams(
        temperature=0,
        max_tokens=256,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

print("=====steer=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====steer=====
 It takes 3 bolts of fiber in total.

Here's the reasoning:

1. The robe requires 2 bolts of blue fiber.
2. It also requires half that much white fiber, which means it needs 2/2 = 1 bolt of white fiber.
3. To find the total number of bolts needed, we add the amount of blue and white fiber: 2 (blue) + 1 (white) = 3 bolts.

Therefore, it takes 3 bolts of fiber in total to make the robe.
