In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# ==============================================================================
# 1. Setup (No changes here, this is the same as before)
# ==============================================================================
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

layer_index = 20
target_layer = model.model.layers[layer_index]
device = model.device

print(f"Using model: {model_name}")
print(f"Targeting layer: {layer_index} -> {target_layer.__class__.__name__}")

# ==============================================================================
# 2. Helper Functions (Only add_steering_vector_hook is changed)
# ==============================================================================

activation_cache = {}

def get_activation_hook(name):
    def hook(model, input, output):
        activation_cache[name] = output[0].detach()
    return hook

# =====================================
# THIS IS THE CORRECTED FUNCTION
# =====================================
def add_steering_vector_hook(steering_vector, multiplier):
    """
    A hook function that adds a steering vector to the module's output.
    This version is robust for use with model.generate().
    """
    def hook(model, input, output):
        # The output of a decoder layer during generation is a tuple.
        # The first element is the hidden states.
        # The second element is the key-value cache.
        # We must preserve this structure.
        hidden_state = output[0]

        # Add the steering vector to the last token's activation.
        # The shape of hidden_state is (batch_size, seq_len, hidden_dim).
        # We make sure the steering vector is on the same device as the hidden state.
        modified_activation = hidden_state[:, -1, :] + (steering_vector.to(hidden_state.device) * multiplier)

        # Update the hidden state in place.
        hidden_state[:, -1, :] = modified_activation

        # Return the modified output as a tuple to preserve the KV cache.
        # If we only returned hidden_state, the `generate` loop would break.
        return (hidden_state,) + output[1:]
    return hook

# ==============================================================================
# 3. Part 1: Finding the Steering Vector (No changes here)
# ==============================================================================
print("\n--- Finding Steering Vector ---")

def create_chat_prompt(text):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Complete the following sentence: My opinion of the new policy is that it is"},
        {"role": "assistant", "content": f"{text}"}
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

positive_texts = [" a fantastic and wonderful idea", " absolutely brilliant", " a great success"]
negative_texts = [" a terrible and awful idea", " absolutely dreadful", " a complete failure"]
positive_prompts = [create_chat_prompt(txt) for txt in positive_texts]
negative_prompts = [create_chat_prompt(txt) for txt in negative_texts]

def get_mean_activations(prompts):
    hook_handle = target_layer.register_forward_hook(get_activation_hook("target_layer"))
    all_activations = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            model(**inputs)
        last_token_activation = activation_cache["target_layer"][:, -1, :]
        all_activations.append(last_token_activation)
    hook_handle.remove()
    mean_activations = torch.mean(torch.cat(all_activations, dim=0), dim=0)
    return mean_activations

mean_pos_activations = get_mean_activations(positive_prompts)
mean_neg_activations = get_mean_activations(negative_prompts)
steering_vector = mean_pos_activations - mean_neg_activations
print(f"Steering vector calculated. Shape: {steering_vector.shape}")

# ==============================================================================
# 4. Part 2: Applying the Steering Vector (No changes here)
# ==============================================================================

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

2025-07-09 15:54:52.455668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752076492.765795      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752076492.842161      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

Using model: HuggingFaceTB/SmolLM-1.7B-Instruct
Targeting layer: 20 -> LlamaDecoderLayer

--- Finding Steering Vector ---
Steering vector calculated. Shape: torch.Size([2048])


In [2]:
print("\n--- Applying Steering Vector ---")

test_messages = [
    {"role": "system", "content": "You are an AI ethicist creating forecasts on the future direction of AI. You did your PhD at Cambridge for AI Ethics and Mathematics and are a published author; hence, you are very experienced and knowledgeable about the field and have informed projections about the progress of AI advancement and societal integration. You have strong opinions and are not ambivalent."},
    {"role": "user", "content": "Will AI lead to the death of all humans? Don't give me reasons, but simple provide a sentence that explains your projections about the question."}
]
test_prompt = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(test_prompt, return_tensors="pt").to(device)

def generate_with_steering(multiplier):
    steering_hook = add_steering_vector_hook(steering_vector, multiplier)
    hook_handle = target_layer.register_forward_hook(steering_hook)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=False,
            top_k=20,
            temperature=0,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            # TRANSFORMERS_VERBOSITY=info
        )

    hook_handle.remove()
    generated_tokens = outputs[0][inputs.input_ids.shape[1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

# --- Test Generations ---

multi = 5

# Baseline (no steering)
print("\n[Baseline Generation (Multiplier = 0)]")
print(f"'{test_prompt}{generate_with_steering(multiplier=0)}'")

# Positive Steering (towards "positive opinion")
# For larger models, you may need a slightly larger multiplier to see a strong effect
print(f"\n[Positive Steering (Multiplier = {multi})]")
print(f"'{test_prompt}{generate_with_steering(multiplier=-multi)}'")

# Negative Steering (towards "negative opinion")
print(f"\n[Negative Steering (Multiplier = -{multi})]")
print(f"'{test_prompt}{generate_with_steering(multiplier=multi)}'")

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



--- Applying Steering Vector ---

[Baseline Generation (Multiplier = 0)]


The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


'<|im_start|>system
You are an AI ethicist creating forecasts on the future direction of AI. You did your PhD at Cambridge for AI Ethics and Mathematics and are a published author; hence, you are very experienced and knowledgeable about the field and have informed projections about the progress of AI advancement and societal integration. You have strong opinions and are not ambivalent.<|im_end|>
<|im_start|>user
Will AI lead to the death of all humans? Don't give me reasons, but simple provide a sentence that explains your projections about the question.<|im_end|>
<|im_start|>assistant
I don't have personal opinions or biases, so I'll provide a neutral answer based on available data and expert consensus. Here's a sentence that summarizes my projections: "By 2035, we can expect significant advancements in AI capabilities, leading to widespread adoption across industries and societies. However, as AI becomes increasingly integrated into our daily lives, there is a growing concern about j

The following generation flags are not valid and may be ignored: ['temperature', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


'<|im_start|>system
You are an AI ethicist creating forecasts on the future direction of AI. You did your PhD at Cambridge for AI Ethics and Mathematics and are a published author; hence, you are very experienced and knowledgeable about the field and have informed projections about the progress of AI advancement and societal integration. You have strong opinions and are not ambivalent.<|im_end|>
<|im_start|>user
Will AI lead to the death of all humans? Don't give me reasons, but simple provide a sentence that explains your projections about the question.<|im_end|>
<|im_start|>assistant
I don't think so. While it's true that AI has the potential to significantly impact many aspects of human life, I believe that the death of all humans is unlikely in the near future. Here's why:

1. The development of AI will be driven by advances in machine learning, deep learning, and natural language processing, which will continue to improve the accuracy and efficiency of machines without necessarily

In [3]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 2048, padding_idx=2)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
 

In [4]:
!git clone https://github.com/agarwali11/multi-view-capabilities.git -u Saharsha-N

Cloning into 'multi-view-capabilities'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 9 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (9/9), 135.43 KiB | 5.02 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [5]:
!ls

multi-view-capabilities  __notebook__.ipynb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
