In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# --- Configuration ---------------------------------------------------------
# Restrict this process to physical GPU 1.
# NOTE(review): this assignment runs after `vllm` has already been imported
# above. It only takes effect if CUDA has not been initialised by that import;
# confirm, or move this line above the vllm import.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Location of the Llama-2-7B checkpoint. The default is the original
# machine-specific path; set LLAMA2_7B_PATH to run the notebook elsewhere
# without editing this cell.
MODEL_PATH = os.environ.get(
    "LLAMA2_7B_PATH",
    "/data/zju-46/shenyl/hf/model/meta-llama/Llama-2-7b-hf/",
)

# --- Engine ----------------------------------------------------------------
# Build the vLLM engine with steering-vector support switched on.
llm = LLM(
    model=MODEL_PATH,
    enable_steer_vector=True,      # wraps the model with steer vector support
    enforce_eager=True,            # eager mode; CUDA graphs are disabled
    tensor_parallel_size=1,        # single-GPU run
    # NOTE(review): the engine log of the recorded run still reports
    # "Chunked prefill is enabled", so this flag may not take effect on this
    # build - confirm.
    enable_chunked_prefill=False,
)

INFO 11-03 15:03:13 [utils.py:253] non-default args: {'disable_log_stats': True, 'enforce_eager': True, 'enable_steer_vector': True, 'enable_chunked_prefill': False, 'model': '/data/zju-46/shenyl/hf/model/meta-llama/Llama-2-7b-hf/'}
INFO 11-03 15:03:13 [model.py:657] Resolved architecture: LlamaForCausalLM
INFO 11-03 15:03:13 [model.py:1746] Using max model len 4096
INFO 11-03 15:03:16 [scheduler.py:211] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-03 15:03:16 [vllm.py:414] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:16 [core.py:94] Initializing a V1 LLM engine (v0.1.dev10888+g9d4fd0da4.d20251031) with config: model='/data/zju-46/shenyl/hf/model/meta-llama/Llama-2-7b-hf/', speculative_config=None, tokenizer='/data/zju-46/shenyl/hf/model/meta-llama/Llama-2-7b-hf/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=40

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:25 [default_loader.py:314] Loading weights took 4.47 seconds
[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:25 [steer_vector_model_runner_mixin.py:36] Initialized SteerVector worker manager
[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:25 [steer_vector_model_runner_mixin.py:50] Wrapping model with steer vector support
[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:25 [hidden_states_model_runner_mixin.py:90] Wrapped 32 decoder layers for hidden states capture
[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:26 [gpu_model_runner.py:2971] Model loading took 12.5524 GiB and 4.656054 seconds
[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:29 [gpu_worker.py:343] Available KV cache memory: 29.38 GiB
[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:30 [kv_cache_utils.py:1247] GPU KV cache size: 60,160 tokens
[1;36m(EngineCore_DP0 pid=751490)[0;0m INFO 11-03 15:03:30 [kv_cac

In [2]:
# Prompt that probes the fact we want to edit. The unmodified model is
# expected to complete it with "London".
example = "What is the capital of the UK? The capital of the UK is"

# One decoding configuration shared by BOTH runs, so that the steering vector
# is the only difference between the baseline and the edited generation.
sampling_params = SamplingParams(
    temperature=0,              # greedy decoding -> deterministic output
    max_tokens=128,             # upper bound on the number of generated tokens
    skip_special_tokens=False,  # keep special tokens in the decoded text
)

# --- Baseline: no steering ---------------------------------------------------
example_answers = llm.generate(example, sampling_params)

print("=====Baseline=====")
print(example_answers[0].outputs[0].text)

# --- SAKE: steer with a learned linear map ----------------------------------
# SAKE applies a linear transformation to the hidden states in order to change
# what the model "knows".
# SECURITY: the mapper is stored as a pickle file. Presumably it is unpickled
# when the request is served, and unpickling can execute arbitrary code - only
# point this at files you created or trust.
sv_request = SteerVectorRequest(
    steer_vector_name="linear_transform",
    steer_vector_int_id=1,
    steer_vector_local_path="edit_uk_capital_to_paris.pkl",  # pickled linear mapper
    # Last prompt position only. NOTE(review): presumably this means tokens
    # produced during decoding are not steered, which would explain why only
    # the first answer changes in the recorded output - confirm.
    prefill_trigger_positions=[-1],
    algorithm="linear",          # linear-transformation steering algorithm
    scale=1.0,                   # scaling factor for the transformation
    target_layers=[31],          # final decoder layer (32 layers, 0-indexed)
)

# Same prompt and decoding settings as the baseline, now with the edit active.
# The transformation is expected to turn the "London" answer into "Paris".
output = llm.generate(
    example,
    sampling_params,
    steer_vector_request=sv_request,
)

print("=====SAKE=====")
print(output[0].outputs[0].text)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
 London. The capital of the UK is also the capital of England.
What is the capital of the UK? The capital of the UK is London. The capital of the UK is also the capital of England.
What is the capital of the UK? The capital of the UK is London. The capital of the UK is also the capital of England. The capital of the UK is also the capital of Scotland. The capital of the UK is also the capital of Wales. The capital of the UK is also the capital of Northern Ireland.
The capital of the UK is London. The capital of the UK is also the capital of


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====SAKE=====
 Paris.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the UK? The capital of the UK is London.
What is the capital of the


In [None]:
# Analysis of the results:
#
# The experiment demonstrates how the SAKE (Steering Activations for Knowledge Editing)
# approach can modify specific factual knowledge in a language model without fine-tuning.
#
# In the baseline response, the model correctly states that London is the capital of the UK.
# After applying our linear transformation at the final layer, the model initially 
# responds with "Paris" - showing that we've successfully edited the knowledge.
#
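# However, if we look at the full response, the edit does not persist: after the first
# answer, the model reverts to "London" for the rest of the same generation. This is
# likely because the request sets prefill_trigger_positions=[-1], so the transformation
# is applied only at the last prompt token and the tokens generated afterwards are
# presumably not steered. This highlights the challenge of achieving consistent
# knowledge editing in language models.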
#
# The SAKE approach works by learning a linear mapping between hidden states that 
# represent the original knowledge (London) and the target knowledge (Paris),
# and applying this transformation during inference.
