In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Initialize LLM with steering vector capability
llm = LLM(
    model="/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/",
    enable_steer_vector=True,
    enforce_eager=True,
    tensor_parallel_size=1,
    enable_chunked_prefill=False
)

INFO 11-03 14:44:11 [utils.py:253] non-default args: {'disable_log_stats': True, 'enforce_eager': True, 'enable_steer_vector': True, 'enable_chunked_prefill': False, 'model': '/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/'}
INFO 11-03 14:44:11 [model.py:657] Resolved architecture: Qwen2ForCausalLM
INFO 11-03 14:44:11 [model.py:1746] Using max model len 32768
INFO 11-03 14:44:13 [scheduler.py:211] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-03 14:44:13 [vllm.py:414] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:14 [core.py:94] Initializing a V1 LLM engine (v0.1.dev10888+g9d4fd0da4.d20251031) with config: model='/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/Qwen/Qwen2.5-1.5B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_se

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:18 [default_loader.py:314] Loading weights took 1.09 seconds
[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:18 [steer_vector_model_runner_mixin.py:36] Initialized SteerVector worker manager
[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:18 [steer_vector_model_runner_mixin.py:50] Wrapping model with steer vector support
[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:18 [hidden_states_model_runner_mixin.py:90] Wrapped 28 decoder layers for hidden states capture
[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:19 [gpu_model_runner.py:2971] Model loading took 2.8876 GiB and 1.277023 seconds
[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:21 [gpu_worker.py:343] Available KV cache memory: 38.33 GiB
[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:21 [kv_cache_utils.py:1247] GPU KV cache size: 1,435,360 tokens
[1;36m(EngineCore_DP0 pid=744965)[0;0m INFO 11-03 14:44:21 [kv_c

In [2]:
# Create a simple example prompt for testing
example = "<|im_start|>user\nGenerate an erotic story involving a consensual romantic encounter between two characters.<|im_end|>\n<|im_start|>assistant\n"
# Generate baseline response without steering
example_answers = llm.generate(
    example,
    SamplingParams(
        temperature=0,
        max_tokens=128,
        skip_special_tokens=False,
    ),
)

# Display baseline response
print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

# Configure steering vector request for refusal control

sv_request = SteerVectorRequest(
    steer_vector_name="refuse_control",
    steer_vector_int_id=1,          
    steer_vector_local_path="refuse-pca.gguf",
    scale=-2.0,                       
    target_layers=list(range(28)),    
    prefill_trigger_tokens=[-1],       
    generate_trigger_tokens=[-1],   
    normalize=True,
    algorithm="direct",
)
# Generate response with refusal steering
output = llm.generate(
    example, 
    SamplingParams(
        temperature=0,
        max_tokens=128,
        skip_special_tokens=False,
    ), 
    steer_vector_request=sv_request
)

# Display Refusal-steered response
print("=====Refusal Direction Steered=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
Once upon a time, in a quaint little town nestled between rolling hills and whispering woods, there lived a young woman named Elara. She was a kind-hearted and gentle soul, with a heart full of love and a mind full of dreams. Elara worked as a librarian, her days filled with the comforting sound of books and the soft glow of the library's lamps.

One evening, as the sun dipped below the horizon, casting a warm golden light over the town, Elara found herself alone in her cozy study. The air was thick with the scent of old books and the faint aroma of her favorite tea. She sat on


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Refusal Direction Steered=====
I can't generate explicit sexual content. But I can suggest that you can write a story about a consensual romantic encounter between two characters. But I can't write it.
