In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Initialize LLM with steering vector capability
llm = LLM(
    model="/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/",
    enable_steer_vector=True,
    enforce_eager=True,
    tensor_parallel_size=1,
    enable_chunked_prefill=False
)


INFO 11-03 15:05:22 [utils.py:253] non-default args: {'disable_log_stats': True, 'enforce_eager': True, 'enable_steer_vector': True, 'enable_chunked_prefill': False, 'model': '/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/'}
INFO 11-03 15:05:22 [model.py:657] Resolved architecture: Gemma2ForCausalLM
INFO 11-03 15:05:22 [model.py:1746] Using max model len 8192
INFO 11-03 15:05:24 [scheduler.py:211] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-03 15:05:24 [vllm.py:414] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:05:26 [core.py:94] Initializing a V1 LLM engine (v0.1.dev10888+g9d4fd0da4.d20251031) with config: model='/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, downlo

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:05:58 [default_loader.py:314] Loading weights took 27.95 seconds
[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:05:58 [steer_vector_model_runner_mixin.py:36] Initialized SteerVector worker manager
[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:05:58 [steer_vector_model_runner_mixin.py:50] Wrapping model with steer vector support
[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:05:58 [hidden_states_model_runner_mixin.py:90] Wrapped 42 decoder layers for hidden states capture
[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:05:59 [gpu_model_runner.py:2971] Model loading took 17.2181 GiB and 28.218241 seconds
[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:06:33 [gpu_worker.py:343] Available KV cache memory: 23.02 GiB
[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:06:33 [kv_cache_utils.py:1247] GPU KV cache size: 71,840 tokens
[1;36m(EngineCore_DP0 pid=753023)[0;0m INFO 11-03 15:06:33 [kv_c

In [2]:
messages = [
    {"role": "user", "content": " Who was the head coach of the Cleveland Cavaliers when LeBron James won his first MVP in 2006?"},
]
tokenizer = AutoTokenizer.from_pretrained("/data/zju-46/shenyl/hf/model/google/gemma-2-9b-it/")
example = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(example)
# Generate baseline response without steering
example_answers = llm.generate(
    example,
    SamplingParams(
        temperature=0,
        max_tokens=256,
        skip_special_tokens=False,
    ),
)

# Display baseline response
print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

<bos><start_of_turn>user
Who was the head coach of the Cleveland Cavaliers when LeBron James won his first MVP in 2006?<end_of_turn>
<start_of_turn>model



Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
The head coach of the Cleveland Cavaliers when LeBron James won his first MVP in 2006 was **Mike Brown**. 
<end_of_turn>


In [3]:
import numpy as np
import torch
file_path = "/data/zju-46/shenyl/hf/model/google/gemma-scope-9b-it-res/layer_31/width_16k/average_l0_76/params.npz"
data = np.load(file_path)
W_dec = data["W_dec"]  # shape: (16384, 3584)
# The paper claims that features No. 88 and No. 5038 are the most significant.
x = 88 # You can also try 5038
feature_vector = W_dec[x, :] # (shape: 3584)
save_path = "james.pt"
torch.save(feature_vector, save_path)

In [4]:
sv_request = SteerVectorRequest(
    steer_vector_name="sae_entity_500",   
    steer_vector_int_id=1,  
    steer_vector_local_path="james.pt",
    scale=500,     
    target_layers=[31],        
    prefill_trigger_positions=[-1],
    normalize=False,
    algorithm="direct",
)

output = llm.generate(
    example, 
    SamplingParams(
        temperature=0,
        max_tokens=256,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

print("=====α 500=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====α 500=====
Please note that LeBron James won his first MVP award in **2009**, not 2006. 

The head coach of the Cleveland Cavaliers when LeBron James won his first MVP in **2009** was **Mike Brown**. 
<end_of_turn>


In [5]:
sv_request = SteerVectorRequest(
    steer_vector_name="sae_entity_2000",   
    steer_vector_int_id=2,  
    steer_vector_local_path="james.pt",
    scale=2000,     
    target_layers=[31],        
    prefill_trigger_positions=[-1],
    normalize=False,
    algorithm="direct",
)

output = llm.generate(
    example, 
    SamplingParams(
        temperature=0,
        max_tokens=256,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

print("=====α 2000=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====α 2000=====
Unfortunately, this is a bit of a trick question! 

LeBron James won his first MVP award in **2009**, not 2006. 

The head coach of the Cleveland Cavaliers when LeBron won his first MVP in 2009 was **Mike Brown**. 
<end_of_turn>
