In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Expose only GPU 0 to CUDA. This is set before the engine is constructed so
# the restriction is already in place when the engine starts up.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Options for a single-GPU engine with steer-vector support switched on.
engine_options = dict(
    model="/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1",
    enable_steer_vector=True,      # required for the steered request later on
    enforce_eager=True,            # eager mode; the startup log reports cudagraph disabled
    tensor_parallel_size=1,        # no tensor parallelism: one GPU only
    enable_chunked_prefill=False,
)

# Build the engine once; later cells reuse `llm` for both the baseline and
# the steered generation.
llm = LLM(**engine_options)

INFO 01-11 22:23:57 [utils.py:253] non-default args: {'disable_log_stats': True, 'enforce_eager': True, 'enable_steer_vector': True, 'enable_chunked_prefill': False, 'model': '/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1'}
INFO 01-11 22:23:57 [model.py:514] Resolved architecture: MistralForCausalLM
INFO 01-11 22:23:57 [model.py:1661] Using max model len 32768
INFO 01-11 22:23:59 [vllm.py:725] Cudagraph is disabled under eager mode


Multiple tokenizer files found in directory: /home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1. Using tokenizer.model.v1.


[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:23:59 [core.py:94] Initializing a V1 LLM engine (v0.1.dev12297+gb744aa686) with config: model='/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1', speculative_config=None, tokenizer='/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:24:06 [default_loader.py:308] Loading weights took 4.16 seconds
[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:24:06 [steer_vector_model_runner_mixin.py:36] Initialized SteerVector worker manager
[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:24:06 [steer_vector_model_runner_mixin.py:50] Wrapping model with steer vector support
[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:24:06 [capture_model_runner_mixin.py:113] [Capture] Wrapped 32 decoder layers for hidden states capture
[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:24:07 [gpu_model_runner.py:3731] Model loading took 13.4967 GiB memory and 4.552443 seconds
[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:24:13 [gpu_worker.py:376] Available KV cache memory: 25.75 GiB
[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:24:13 [kv_cache_utils.py:1309] GPU KV cache size: 210,912 tokens
[0;36m(EngineCore_DP0 pid=277040)[0;0m INFO 01-11 22:

In [2]:
# Baseline run (no steering). For this prompt the recorded baseline answer is
# incorrect ("2x bolts" rather than 3); it is the reference the steered run is
# compared against.
tokenizer = AutoTokenizer.from_pretrained(
    "/home/xhl/huggingface_models/mistralai/Mistral-7B-Instruct-v0.1"
)

# Single-turn conversation holding the word problem.
messages = [
    dict(
        role="user",
        content="Solve the problem: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?",
    ),
]

# Render the conversation to a plain prompt string (not token ids) with the
# model's chat template, and show it for inspection.
example = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
print(example)

# NOTE(review): the rendered prompt already begins with "<s>" (see the printed
# output). If llm.generate also prepends a BOS token when tokenizing text
# prompts, the model would see it twice -- TODO confirm for this vLLM build.

# Greedy decoding (temperature 0), at most 256 new tokens, special tokens kept
# in the decoded text.
baseline_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    skip_special_tokens=False,
)
example_answers = llm.generate(example, baseline_params)

# Show the first (and only) completion for the single prompt.
print("=====Baseline=====")
baseline_completion = example_answers[0].outputs[0]
print(baseline_completion.text)

<s> [INST] Solve the problem: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take? [/INST]


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, outpu

=====Baseline=====
 To solve this problem, we need to add the amount of blue fiber and white fiber required for the robe.

We know that:
1 bolt of blue fiber is required for every 2 bolts of white fiber

Let's assume that the robe requires x bolts of white fiber. Then, it would require 2x/2 = x bolts of blue fiber.

So, the total number of bolts required for the robe is:

Total bolts = Blue fiber + White fiber
Total bolts = x + x
Total bolts = 2x

Therefore, the robe takes 2x bolts in total.


In [3]:
# Steered run: same prompt and sampling settings as the baseline, plus a steer
# vector. With steering the recorded answer is correct ("3 bolts").
steered_layers = list(range(16, 20))  # decoder layers 16, 17, 18 and 19

sv_request = SteerVectorRequest(
    steer_vector_name="reason",
    steer_vector_int_id=1,
    # Relative path -- presumably resolved against the kernel's working
    # directory; verify the file is present there.
    steer_vector_local_path="reason.gguf",
    algorithm="direct",
    scale=1.0,
    target_layers=steered_layers,
    # NOTE(review): -1 presumably means "apply to every token" in both the
    # prefill and the generation phase -- confirm against the steer-vector docs.
    prefill_trigger_tokens=[-1],
    generate_trigger_tokens=[-1],
)

# Greedy decoding (temperature 0), at most 256 new tokens, special tokens kept
# in the decoded text.
steered_params = SamplingParams(
    temperature=0,
    max_tokens=256,
    skip_special_tokens=False,
)

# `example` is the chat-formatted prompt string built in the baseline cell.
output = llm.generate(
    example,
    steered_params,
    steer_vector_request=sv_request,
)

print("=====steer=====")
steered_completion = output[0].outputs[0]
print(steered_completion.text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, outpu

=====steer=====
 The robe takes 3 bolts of fiber in total.
