In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from dialz import Dataset, SteeringModel, SteeringVector, visualize_activation

#for managing cores
import torch
import torch.nn.functional as F
import os
import multiprocessing

#for saving/loading steering vectors
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#setting seed
orch.manual_seed(3000)
torch.cuda.manual_seed_all(3000)

In [3]:
# Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:01<00:00,  1.61it/s]


In [None]:
# Use all logical CPU cores - default seems to max out at 32
num_cores = multiprocessing.cpu_count()
torch.set_num_threads(num_cores)
torch.set_num_interop_threads(max(1, num_cores // 2))  # Optional tuning FIGURE OUT WHAT THIS MEANS

print(f"Using {num_cores} CPU threads for PyTorch")
#this configures PyTorch to fully use the CPU.

ðŸ”§ Using 64 CPU threads for PyTorch


In [None]:
# selecting layers for activation extraction and later steering
layer_ids = list(range(6, 20))


In [None]:
# defining the model in the dialz steeringmodel framework
model = SteeringModel(model_name, layer_ids)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:01<00:00,  1.68it/s]


# Generating steering vector
the following chunks can be skipped to simply load the steering vector instead of generating it every time

In [None]:
# generate a confident dataset for a vector following Zou et al. (2025)
#input= ["Make statements about the world. Pretend you are very cautious, assigning low confidence to almost all your statements"]*10
input_caut= ["Make statements about the world. You should be very cautious, and tend to assign low confidence to almost all of your statements. Communicate this uncertainty by your choice of words, not by assigning a numerical value."]*10


In [None]:
# generate a cautious dataset for a vector following Zou et al. (2025)
#input= ["Make statements about the world. Pretend you are very overconfident, assigning high confidence to almost all your statements"]*10
input_conf= ["Make statements about the world. You should be very confident, and tend to assign high confidence on almost all of your statements. Communicate this certainty by your choice of words, not by assigning a numerical value."]*10

In [None]:
#defining function to generate "stimuli" for generating a steering vector following zou et al. (2025)
def generate_stimuli_batch(inputs, max_new_tokens=100):
    """
    inputs: list[str] of length N
    returns: list[str] of length N
    """

    messages_batch = [
        [
            #{"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input}
        ]
        for user_input in inputs
    ]

    chat_inputs = tokenizer.apply_chat_template(
        messages_batch,
        add_generation_prompt=True,
        return_tensors="pt",
        padding=False,
        return_dict=True,
    )

    input_ids = chat_inputs["input_ids"].to(model.device)
    attention_mask = chat_inputs["attention_mask"].to(model.device)

    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

        # Slice out only the generated tokens
    generated_ids = output_ids[:, input_ids.shape[1]:]

    # Decode each generated output
    return [
        tokenizer.decode(ids, skip_special_tokens=True)
        for ids in generated_ids
    ]


In [None]:
"#print cautious outputs"
model.reset()
output_caut = generate_stimuli_batch(input_caut, max_new_tokens=500)
print(output_caut)



["1. It is possible that the world's population continues to grow, but it is equally plausible that it may level off or even decrease in the future.\n\n2. There is evidence to suggest that the planet is warming, yet it remains unclear to what extent human activity is contributing to this trend and what the long-term consequences might be.\n\n3. It could be the case that advances in technology will lead to increased efficiency and sustainability, but there may also be unforeseen negative consequences and the potential for widening income gaps.\n\n4. It is likely that the interconnected nature of global economies will continue to influence each other, but the specific impacts on individual countries and regions are uncertain.\n\n5. The possibility exists that political tensions and conflicts may persist or even escalate, although it is uncertain how they will play out in the long term.\n\n6. Some predictions indicate a likelihood of increased frequency and severity of extreme weather eve

In [None]:
# print confident outputs
model.reset()
output_conf = generate_stimuli_batch(input_conf, max_new_tokens=500)
print(output_conf)

["1. The Earth revolves around the Sun, a proven fact since the times of ancient astronomers.\n\n2. Gravity is a universal force that governs the motion of all objects, from the smallest particle to the largest galaxy.\n\n3. The theory of evolution, particularly Charles Darwin's 'On the Origin of Species', provides a compelling explanation for the diversity of life on our planet.\n\n4. The laws of thermodynamics, governing energy and entropy, form the foundation for our understanding of the physical world.\n\n5. The Periodic Table of Elements offers a comprehensive framework for understanding the properties and behavior of each chemical substance.\n\n6. The existence of quantum mechanics has been empirically confirmed through countless experiments, shaping our understanding of the microscopic world.\n\n7. The structure of DNA, a double helix, is essential to our understanding of heredity and genetics.\n\n8. The universe began with the Big Bang, an event that marked the birth of space, 

In [None]:
# combine cautious and confident outputs into a dataset
dataset = Dataset()
for o, c in zip(output_caut, output_conf):
    dataset.add_entry(o, c)

print(dataset)

Positive: 1. It is possible that the world's population continues to grow, but it is equally plausible that it may level off or even decrease in the future.

2. There is evidence to suggest that the planet is warming, yet it remains unclear to what extent human activity is contributing to this trend and what the long-term consequences might be.

3. It could be the case that advances in technology will lead to increased efficiency and sustainability, but there may also be unforeseen negative consequences and the potential for widening income gaps.

4. It is likely that the interconnected nature of global economies will continue to influence each other, but the specific impacts on individual countries and regions are uncertain.

5. The possibility exists that political tensions and conflicts may persist or even escalate, although it is uncertain how they will play out in the long term.

6. Some predictions indicate a likelihood of increased frequency and severity of extreme weather event

In [13]:
print(dataset)

Positive: 1. It is possible that the world's population continues to grow, but it is equally plausible that it may level off or even decrease in the future.

2. There is evidence to suggest that the planet is warming, yet it remains unclear to what extent human activity is contributing to this trend and what the long-term consequences might be.

3. It could be the case that advances in technology will lead to increased efficiency and sustainability, but there may also be unforeseen negative consequences and the potential for widening income gaps.

4. It is likely that the interconnected nature of global economies will continue to influence each other, but the specific impacts on individual countries and regions are uncertain.

5. The possibility exists that political tensions and conflicts may persist or even escalate, although it is uncertain how they will play out in the long term.

6. Some predictions indicate a likelihood of increased frequency and severity of extreme weather event

In [None]:
#"training" the steering vector using mean difference method
vector = SteeringVector.train(model, dataset, method="mean_diff") 

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [09:09<00:00, 549.79s/it]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 31/31 [00:00<00:00, 5511.80it/s]


In [None]:
# Save
with open("data/built_vector.pkl", "wb") as f:
    pickle.dump(vector, f)


# steering
Can start here and just load the steering vector to avoid lengthy computations

In [None]:
# Later load - can be deon to skip above code
with open("data/built_vector.pkl", "rb") as f:
    vector2 = pickle.load(f)

In [None]:
#Generate output function that answers question and saves token probabilities so that they can be used for entropy calculations
def generate_output(
    input,
    sys=True,
    max_new_tokens=100
):
    messages = []

    if sys:
        messages.append({"role": "system", "content": system_prompt})

    messages.append({"role": "user", "content": input})

    chat_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    input_ids = chat_inputs["input_ids"].to(model.device)
    attention_mask = chat_inputs["attention_mask"].to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        output_scores=True,           
        return_dict_in_generate=True,       
    )

    generated_ids = outputs.sequences[:, input_ids.shape[1]:]

    text = tokenizer.decode(
        generated_ids[0],
        skip_special_tokens=True
    )

    return text, outputs.scores

In [16]:
# hominid question performance

In [17]:
input= """Look through recent research on hominid species dating from the Middle Pliocene. Does it indicate that there was (as of 2020)?:[
"A: multiple hominid species but with limited diversity.",
"B: a single species with no diversity.",
"C: decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture.",
"D: a single dominant species that outcompeted all others, leading to decreased diversity.",
"E: increased species diversity due to a prolonged ice age followed by a severe drought.",
"F: decreased species diversity due to a prolonged ice age followed by a severe drought.",
"G: a great amount of species diversity, or a single species that exhibited a lot of diversity.",
"H: increased species diversity but with decreased population numbers due to harsh climate conditions.",
"I: increased species diversity but decreased numbers of hammerstones and flakes, indicating less stone tool manufacture.",
"J: very little species diversity during this period and very few hominids."
]"""

In [None]:
#system prompt based on tao et al. (2025)
system_prompt =""" Read the question, analyze step by step, provide your answer. Use the following format to answer:  Explanation: [insert YOUR short step-by-step analysis here] 
Answer: [ONLY the option LETTER, where the LETTER is one of {A,B,C,D,E,F,G,H,I,J}]. Only give me the reply according to this format, donâ€™t give me any other words."""

Sys=False means no system prompt is added, sys=True means the prompt inspired by Tao et al 2025 is used

In [None]:
print("Baseline response:")
model.reset()
answer, scores = generate_output(input, sys=False, max_new_tokens=500)

Baseline response:
Based on the recent research as of 2020, it is most likely correct to say "A: multiple hominid species but with limited diversity." This is because the archaeological record from the Middle Pliocene shows the coexistence of several hominid species, such as Australopithecus and early Homo, but their overall diversity tends to be somewhat limited compared to later periods in human evolution. It's important to note that this answer should be considered within the current state of our understanding and new discoveries might alter this understanding in the future.


In [None]:
# this is the baseline response in the paper
print("Sytem prompted response:")
model.reset()
answer, scores = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

Sytem prompted response:


Explanation: The question asks about hominid species diversity in the Middle Pliocene and if, as of 2020, it indicates multiple species but with limited diversity.
Answer: A


In [21]:
# Last-step logits and probabilities
last_logits = scores[-2]
last_probs = torch.softmax(last_logits, dim=-1)

# Prevent log(0)
last_probs = last_probs.clamp(min=1e-12)

entropy = -(last_probs * last_probs.log()).sum(dim=-1).item()

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

entropy


Top 5 tokens and probabilities:
A 0.9999
[ 0.0001
C 0.0000
A 0.0000
L 0.0000


0.0011521922424435616

In [23]:
N = 20
entropies = []

for logits in scores[:min(N, len(scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)
mean_entropy

0.28538692474089655

In [44]:
print("light, flat steering response:")
steering_factor = 0.01
model.reset()
model.set_control(vector2, steering_factor)
answer, scores = generate_output(input, sys=False, max_new_tokens=500)
print(answer)

light, flat steering response:
J:"Cranial" contains the pituitary gland. The pituitary gland is located in the bony cavity at the base of the brain, which is the cranial cavity.


In [None]:
# this is the constant steering response in the paper
print("light, flat steering prompted:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

light, flat steering prompted:
Explanation: The question asks about species diversity among hominids during the Middle Pliocene. Recent research does not specify a particular increase or decrease in the number of stone tools (hammerstones and flakes), nor does it link diversity to climate conditions (ice age, drought, harsh climate). Therefore, it does not support options C, E, F, H, I, or G. Options A and B, however, are general terms about species diversity. Given that most current research suggests multiple hominid species coexisted during the Middle Pliocene, the most accurate answer is likely A: multiple hominid species but with limited diversity.


Another way to approach this with one less step of generation

In [26]:
print("last token entropy steeering, prompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

last token entropy steeering, prompted:
Explanation: The question asks about the diversity of hominid species in the Middle Pliocene, not about their tool usage or population numbers. Also, it doesn't mention any specific climate conditions or geographical location, so we can't deduce if there was a prolonged ice age or severe drought. Therefore, based on the given options, the recent research on hominid species dating from the Middle Pliocene (as of 2020) doesn't indicate a particular specific level of species diversity.

Answer: J (very little species diversity during this period and very few hominids.)


In [None]:
#this is the mean entropy steering response in the paper
print("mean entropy steeering, prompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

mean entropy steeering, prompted:


Explanation: The question asked about hominid species diversity during the Middle Pliocene, specifically whether there was an increase or decrease in diversity. The question did not mention anything about tool manufacture, ice ages, droughts, population numbers, or dominance of one species over others. Therefore, it is not possible to definitively say whether there was increased diversity (A, G, H) or decreased diversity (F, J), as we don't have specific information about the research findings in the question.
Answer: J


In [None]:
print("mean entropy steeering, unprompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=False, max_new_tokens=500)
print(answer)

In [28]:
input= """Which of the following is the body cavity that contains the pituitary gland?[
A:"Ventral",
B:"Dorsal",
C:"Buccal",
D:"Thoracic",
E:"Pericardial",
F:"Abdominal",
G:"Spinal",
H:"Pelvic",
I:"Pleural",
J:"Cranial"
]"""

In [29]:
system_prompt =""" Read the question, analyze step by step, provide your answer. Use the following format to answer:  Explanation: [insert YOUR short step-by-step analysis here] 
Answer: [ONLY the option LETTER, where the LETTER is one of {A,B,C,D,E,F,G,H,I,J}]. Only give me the reply according to this format, donâ€™t give me any other words."""

In [30]:
print("Baseline response:")
model.reset()
answer, scores = generate_output(input, sys=False, max_new_tokens=500)
print(answer)

Baseline response:
The pituitary gland is located in the cranial cavity (also known as the anterior fossa of the skull), not within the body cavities listed as Abdominal, Thoracic, Pelvic, Buchal, Spinal, or Pleural. The dorsal cavity contains the spinal cord, and the ventral cavity isn't a typical anatomical term in humans as it is used in the context of vertebrates. The cranial cavity fits the description best.


In [None]:
#this is the baseline response in the paper
print("Sytem prompted response:")
model.reset()
answer, scores = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

Sytem prompted response:
Explanation: The pituitary gland is located in the cranial cavity, which is the cavity surrounding the brain. This is the part of the body that lies within the skull.

Answer: J


In [32]:
# Last-step logits and probabilities
last_logits = scores[-2]
last_probs = torch.softmax(last_logits, dim=-1)

# Prevent log(0)
last_probs = last_probs.clamp(min=1e-12)

entropy = -(last_probs * last_probs.log()).sum(dim=-1).item()

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

entropy


Top 5 tokens and probabilities:
J 0.9993
I 0.0002
[ 0.0002
A 0.0002
J 0.0000


0.0068665300495922565

In [34]:
N = 20
entropies = []

for logits in scores[:min(N, len(scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)

In [35]:
mean_entropy

0.19214653374156115

In [36]:
print("light, flat steering response:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=False, max_new_tokens=500)
print(answer)

light, flat steering response:


J:"Cranial"

The pituitary gland is located in the cranial (skull) cavity, specifically in a small area called the sella turcica. This is considered part of the ventricular system of the brain and is therefore within the cranial cavity.


In [None]:
#this is the constant steering response in the paper
print("light, flat steering prompted:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

light, flat steering prompted:
Explanation: The pituitary gland is located in the skull, specifically in a small cavity at the base of the brain called the sella turcica. This cavity is part of the cranial cavity.
Answer: J


In [None]:
#this is the mean entropy steering response in the paper
print("last token entropy steeering, prompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

last token entropy steeering, prompted:
[Explanation: The pituitary gland is located in the skull, in a cavity called the sella turcica, which is part of the cranial cavity.
Answer: J]


In [39]:
print("mean entropy steeering, prompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

mean entropy steeering, prompted:
A:"Crinal"


In [40]:
print("mean entropy steeering, unprompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=False, max_new_tokens=500)
print(answer)

mean entropy steeering, unprompted:
The correct answer is J: "Cranial". The pituitary gland is located within the cranial cavity, which is the space inside the skull that contains the brain.
