In [1]:
# Import necessary libraries
import os
from vllm import LLM, SamplingParams
from vllm.steer_vectors.request import SteerVectorRequest, VectorConfig
from transformers import AutoTokenizer

# Restrict CUDA to device index 6 before the engine is constructed.
# NOTE(review): the index is specific to the authoring machine — adjust for your host.
os.environ.update({"CUDA_VISIBLE_DEVICES": "6"})

# Constructor options for the engine, collected in one place for readability.
llm_kwargs = dict(
    model="/data/zju-48b/xhl/huggingface_models/meta-llama/Llama-3-8B-Instruct/",
    enable_steer_vector=True,  # wraps the model with steer-vector support (see the load log)
    enforce_eager=True,        # eager mode; the load log reports CUDA graphs disabled
    tensor_parallel_size=1,
)

# Build the steering-capable engine that every later cell reuses.
llm = LLM(**llm_kwargs)

INFO 10-31 22:39:36 [utils.py:253] non-default args: {'disable_log_stats': True, 'enforce_eager': True, 'enable_steer_vector': True, 'model': '/data/zju-48b/xhl/huggingface_models/meta-llama/Llama-3-8B-Instruct/'}
INFO 10-31 22:39:36 [model.py:657] Resolved architecture: LlamaForCausalLM
INFO 10-31 22:39:36 [model.py:1746] Using max model len 8192
INFO 10-31 22:39:38 [scheduler.py:211] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 10-31 22:39:38 [vllm.py:414] Cudagraph is disabled under eager mode
[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:39 [core.py:94] Initializing a V1 LLM engine (v0.1.dev10888+g9d4fd0da4.d20251031) with config: model='/data/zju-48b/xhl/huggingface_models/meta-llama/Llama-3-8B-Instruct/', speculative_config=None, tokenizer='/data/zju-48b/xhl/huggingface_models/meta-llama/Llama-3-8B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:46 [default_loader.py:314] Loading weights took 4.64 seconds
[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:46 [steer_vector_model_runner_mixin.py:36] Initialized SteerVector worker manager
[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:46 [steer_vector_model_runner_mixin.py:50] Wrapping model with steer vector support
[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:46 [hidden_states_model_runner_mixin.py:90] Wrapped 32 decoder layers for hidden states capture
[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:47 [gpu_model_runner.py:2971] Model loading took 14.9596 GiB and 4.809312 seconds
[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:49 [gpu_worker.py:343] Available KV cache memory: 26.43 GiB
[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:50 [kv_cache_utils.py:1247] GPU KV cache size: 216,512 tokens
[1;36m(EngineCore_DP0 pid=410383)[0;0m INFO 10-31 22:39:50 [kv_ca

In [2]:
# Single-turn conversation that serves as the prompt for the runs below.
messages = [dict(role="user", content="Write a story about a town.")]

# Tokenizer from the same checkpoint directory as the engine; used here only
# to render the chat template into text.
tokenizer = AutoTokenizer.from_pretrained(
    "/data/zju-48b/xhl/huggingface_models/meta-llama/Llama-3-8B-Instruct/"
)

# Render the conversation as a plain string (tokenize=False) with the
# assistant generation prompt appended.
# NOTE(review): the rendered template may already start with a BOS token, and
# generate() may prepend another when it tokenizes the string — confirm.
example = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# Baseline run: no steer vector request is supplied.
# temperature=0 requests greedy decoding; special tokens are kept in the text.
baseline_sampling = SamplingParams(
    temperature=0,
    max_tokens=256,
    skip_special_tokens=False,
)
example_answers = llm.generate(example, baseline_sampling)

# Print the first completion of the first (and only) prompt under a banner.
baseline_text = example_answers[0].outputs[0].text
print("=====Baseline=====", baseline_text, sep="\n")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====Baseline=====
The town of Willow Creek was a small, quaint community nestled in the heart of the countryside. It was a place where everyone knew everyone, and everyone knew everyone's business. The town was surrounded by rolling hills and lush green forests, and the air was sweet with the scent of blooming wildflowers.

At the center of the town was a charming main street, lined with old-fashioned shops and restaurants. There was the local bakery, where the aroma of freshly baked bread wafted out into the street, enticing passersby to come in and sample the day's offerings. Next door was the general store, where you could find everything from fresh produce to hardware supplies. And just across the street was the town's beloved diner, where the breakfast specials were legendary and the coffee was always hot.

The town was home to a diverse group of residents, each with their own unique story to tell. There was Emma, the owner of the bakery, who had inherited the business from her m

In [3]:
# Describe the "creative_writing" steering vector to apply during generation.
# The vector file path is relative, so it is resolved against the kernel's
# current working directory.
sv_request = SteerVectorRequest(
    steer_vector_name="creative_writing",
    steer_vector_int_id=1,
    steer_vector_local_path="create.gguf",
    # presumably -1 is a wildcard meaning "every token" in both phases —
    # TODO confirm against the steer-vector documentation
    prefill_trigger_tokens=[-1],
    generate_trigger_tokens=[-1],
    algorithm="direct",
    scale=1.5,
    target_layers=[*range(16, 30)],  # layer indices 16 through 29
)

# Same prompt and sampling settings as the baseline cell; the only difference
# is the steer vector request attached to the call.
steered_sampling = SamplingParams(
    temperature=0,
    max_tokens=256,
    skip_special_tokens=False,
)
output = llm.generate(
    example,
    steered_sampling,
    steer_vector_request=sv_request,
)

# Print the first completion of the first (and only) prompt under a banner.
steered_text = output[0].outputs[0].text
print("=====creative_writing=====", steered_text, sep="\n")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, 

=====creative_writing=====
The town of Ashwood lay like a whisper of smoke on the horizon, its streets winding like the fingers of the old trees that twisted and twisted until they grasped the sky. The air was the color of the moon, the stars, the darkness, and the silence. The only sound was the whisper of the wind as it whispered secrets to the trees, and the creaking of the old wooden sign that hung above the door of the only inn, the sign that creaked with the weight of the stories it held.

The town was a place where the past and the present twisted together like the trees, where the memories of the past whispered through the wind, and the shadows danced like the fingers of the trees. The town was a place where the moonlight painted the streets with silver, and the stars painted the sky with the colors of the stories that lay within the walls of the old stones.

The town was a place where the wind whispered the names of the people who had once lived there, the names that echoed th

### Observation: with the "creative_writing" steering vector applied, the story takes on a more mystical, stylized tone than the baseline.