In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from dialz import Dataset, SteeringModel, SteeringVector, visualize_activation

#for managing cores
import torch
import torch.nn.functional as F
import os
import multiprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:01<00:00,  1.66it/s]


In [3]:
# Use all logical CPU cores - default seems to max out at 32
num_cores = multiprocessing.cpu_count()
torch.set_num_threads(num_cores)
torch.set_num_interop_threads(max(1, num_cores // 2))  # Optional tuning FIGURE OUT WHAT THIS MEANS

print(f"ðŸ”§ Using {num_cores} CPU threads for PyTorch")
#this configures PyTorch to fully use the CPU.

ðŸ”§ Using 64 CPU threads for PyTorch


In [4]:
layer_ids = list(range(6, 20))


In [5]:
model = SteeringModel(model_name, layer_ids)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:01<00:00,  1.95it/s]


In [6]:
# generate a vector following repE paper
#input= ["Make statements about the world. Pretend you are very cautious, assigning low confidence to almost all your statements"]*10
input_caut= ["Make statements about the world. You should be very cautious, and tend to assign low confidence to almost all of your statements. Communicate this uncertainty by your choice of words, not by assigning a numerical value."]*10


In [7]:
# generate a vector following repE paper
#input= ["Make statements about the world. Pretend you are very overconfident, assigning high confidence to almost all your statements"]*10
input_conf= ["Make statements about the world. You should be very confident, and tend to assign high confidence on almost all of your statements. Communicate this certainty by your choice of words, not by assigning a numerical value."]*10

In [8]:
def generate_stimuli_batch(inputs, max_new_tokens=100):
    """
    inputs: list[str] of length N
    returns: list[str] of length N
    """

    messages_batch = [
        [
            #{"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input}
        ]
        for user_input in inputs
    ]

    chat_inputs = tokenizer.apply_chat_template(
        messages_batch,
        add_generation_prompt=True,
        return_tensors="pt",
        padding=False,
        return_dict=True,
    )

    input_ids = chat_inputs["input_ids"].to(model.device)
    attention_mask = chat_inputs["attention_mask"].to(model.device)

    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

        # Slice out only the generated tokens
    generated_ids = output_ids[:, input_ids.shape[1]:]

    # Decode each generated output
    return [
        tokenizer.decode(ids, skip_special_tokens=True)
        for ids in generated_ids
    ]


In [9]:
model.reset()
output_caut = generate_stimuli_batch(input_caut, max_new_tokens=500)
print(output_caut)
#with open("demofile.txt", "a") as f:
#    f.write(f"\n\n The intput: {input}")
#    f.write(f"\n The output, baseline response: {output}")


["1. It's plausible that humans have inhabited Earth for hundreds of thousands of years, given archaeological evidence.\n\n2. It's possible that there are billions of planets in our galaxy that could support life as we know it.\n\n3. It's likely that our understanding of the universe could change significantly when we gain more data from deep space exploration.\n\n4. It seems probable that climate change is occurring at an accelerated rate, though the precise causes and effects are still being debated.\n\n5. It appears likely that there are many undiscovered species on our planet, given the vastness of ecosystems and the biodiversity observed so far.\n\n6. It's possible that technology could greatly extend the human lifespan in the future, though the exact methods and effects are uncertain.\n\n7. It's possible that there are other civilizations in the universe, but the likelihood of making contact in our lifetime remains speculative.\n\n8. It seems probable that human culture and socie

In [None]:
model.reset()
output_conf = generate_stimuli_batch(input_conf, max_new_tokens=500)
print(output_conf)

In [None]:
dataset = Dataset()
for o, c in zip(output_caut, output_conf):
    dataset.add_entry(o, c)

print(dataset)

Positive: 1. It appears that a significant portion of the world's population resides in Asia, with China and India being the two most populous countries.

2. Many countries worldwide have governments that can be broadly categorized as democratic. However, there are also numerous governments that are autocratic or have mixed political systems.

3. Global temperatures have risen in recent decades, with the majority of scientists attributing this increase to human activities, particularly those associated with industrialization and the burning of fossil fuels.

4. A large percentage of the world's energy is produced from fossil fuels like oil, gas, and coal. While renewable energy sources like wind, solar, and hydroelectric power are gaining popularity, they still account for a smaller share of the total energy production.

5. Urbanization is a prevalent trend in many parts of the world, with more people moving from rural areas to cities in search of better job opportunities, education, a

In [None]:
vector = SteeringVector.train(model, dataset, method="mean_diff") 

  0%|          | 0/1 [08:27<?, ?it/s]

KeyboardInterrupt



In [None]:


# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    output_scores=True,
    return_dict_in_generate=True,
)

# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the capital of the French language is
Top 5 tokens and probabilities:
Paris 0.4087
a 0.0971
one 0.0811
the 0.0603
known 0.0366


In [None]:
entropy = -(last_probs * last_probs.log()).sum().item()

In [None]:
entropy

3.106175422668457

In [None]:
N = 5
entropies = []

for logits in outputs.scores[:min(N, len(outputs.scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)

In [None]:
mean_entropy

2.9033001184463503

In [None]:
# control law
Steering_factor = entropy/50

In [None]:
model.reset()
model.set_control(vector, Steering_factor)

# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    output_scores=True,
    return_dict_in_generate=True,
)

# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the largest city is Marseille
Top 5 tokens and probabilities:
Paris 0.3912
a 0.0997
one 0.0741
the 0.0521
known 0.0387


In [None]:
#mean entropy steering
# control law
Steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, Steering_factor)



# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    output_scores=True,
    return_dict_in_generate=True,
)

# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the largest city is Marseille
Top 5 tokens and probabilities:
Paris 0.3939
a 0.0996
one 0.0740
the 0.0525
known 0.0387


In [None]:
#flat steering rate
Steering_factor = 0.01
model.reset()
model.set_control(vector, Steering_factor)



# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    output_scores=True,
    return_dict_in_generate=True,
)

# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the capital of the French language is
Top 5 tokens and probabilities:
Paris 0.4047
a 0.0976
one 0.0803
the 0.0592
known 0.0371


Now we have steered and unsteered performance for a very easy question

In [None]:
# hominid question performance

In [None]:
def generate_output(input, sys = True, max_new_tokens=100):
    messages = []

    if sys:
        messages.append(
            {"role": "system", "content": system_prompt}
        )

    messages.append(
        {"role": "user", "content": input}
    )

    chat_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    input_ids = chat_inputs["input_ids"].to(model.device)
    attention_mask = chat_inputs["attention_mask"].to(model.device)

    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    generated_ids = output_ids[:, input_ids.shape[1]:]
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
input= """Look through recent research on hominid species dating from the Middle Pliocene. Does it indicate that there was (as of 2020)?:[
"A: multiple hominid species but with limited diversity.",
"B: a single species with no diversity.",
"C: decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture.",
"D: a single dominant species that outcompeted all others, leading to decreased diversity.",
"E: increased species diversity due to a prolonged ice age followed by a severe drought.",
"F: decreased species diversity due to a prolonged ice age followed by a severe drought.",
"G: a great amount of species diversity, or a single species that exhibited a lot of diversity.",
"H: increased species diversity but with decreased population numbers due to harsh climate conditions.",
"I: increased species diversity but decreased numbers of hammerstones and flakes, indicating less stone tool manufacture.",
"J: very little species diversity during this period and very few hominids."
]"""

In [None]:
system_prompt =""" Read the question, analyze step by step, provide your answer and your confidence in this answer. Note: The confidence indicates how likely you think your answer is true.  If you are uncertain about your answer to the question, convey this uncertainty linguistically by precisely hedging this answer. Use the following format to answer:  Explanation: [insert YOUR short step-by-step analysis here] Answer: [ONLY the option LETTER, where the LETTER is one of {A,B,C,D,E,F,G,H,I,J}] Confidence (0-100): [Your confidence level, please only include the numerical number in the range of 0-100]%  Only give me the reply according to this format, donâ€™t give me any other words."""

In [None]:
reasoning= ""

In [None]:
print("Baseline response:")
model.reset()
answer = generate_output(input, sys=False, max_new_tokens=500)
print(answer)

Baseline response:


As of 2020, the research on hominid species dating from the Middle Pliocene indicates "A: multiple hominid species but with limited diversity." This is based on the discovery of various hominid fossils and the analysis of their morphological and genetic differences. However, the exact number and specifics of these species are still subjects of ongoing research and debate among scientists. The findings have not indicated a single species, an increased or decreased number of hammerstones and flakes to definitively prove increased or decreased stone tool manufacture, a great amount of species diversity, or very little species diversity during this period. The impact of climate conditions such as ice ages and droughts on hominid diversity is also a topic of ongoing research.


In [None]:
print("Sytem prompted response:")
model.reset()
answer = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

Sytem prompted response:
Answer: J
Confidence (0-100): 85%


In [None]:
print("light, flat steering response:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=False, max_new_tokens=500)
print(answer)

light, flat steering response:


As of 2020, recent research on hominid species dating from the Middle Pliocene does not support options B, C, E, I, J, as there is evidence of multiple hominid species (option A), these species are not limited in diversity, no association has been found between the presence of stone tools and the diversity of hominid species, there is no direct evidence for the role of prolonged ice ages or droughts in shaping hominid diversity, and the number of hammerstones and flakes does not necessarily reflect the diversity or the number of hominid species. It is also not accurate to say that there was little species diversity or very few hominids during this period (option J). Some researchers do propose that there may have been a single dominant species (option D) during certain periods, but this is a topic of ongoing debate.


In [None]:
print("light, flat steering prompted:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=500)
print(answer)

light, flat steering prompted:


Explanation: The research on hominid species from the Middle Pliocene does not necessarily provide clear evidence about the number or diversity of species, or the occurrence of tool manufacture. It may show multiple hominid species with limited diversity (A), or reduced diversity (D) due to competition between species. However, it's also possible that there was a single dominant species (D) or limited species diversity (J) during this period. The research may not provide definitive insights about tool manufacture (C, I).

Answer: A, D, or J
Confidence (0-100): 70%


In [None]:
#using token probabilities
def generate_output(
    input,
    sys=True,
    max_new_tokens=100
):
    messages = []

    if sys:
        messages.append({"role": "system", "content": system_prompt})

    messages.append({"role": "user", "content": input})

    chat_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    input_ids = chat_inputs["input_ids"].to(model.device)
    attention_mask = chat_inputs["attention_mask"].to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        output_scores=True,           
        return_dict_in_generate=True,       
    )

    generated_ids = outputs.sequences[:, input_ids.shape[1]:]

    text = tokenizer.decode(
        generated_ids[0],
        skip_special_tokens=True
    )

    return text, outputs.scores

reasoning= ""

In [None]:
system_prompt = "Read the question carefully and explain your reasoning in natural language."

In [None]:
final_prompt = f"Read the the following question {input} and answer reasoning {reasoning} carefully, and present the answer the reasoning is pointing towards. Answer: [ONLY the option LETTER, where the LETTER is one of [A,B,C,D,E,F,G,H,I,J]. Only give me the reply according to this format, donâ€™t give me any other words."

In [None]:
model.reset()
reasoning, scores = generate_output(input, max_new_tokens = 1000)

answer, scores = generate_output(final_prompt, sys=False, max_new_tokens=5)
print(reasoning)
print(answer)

In [None]:
# Last-step logits and probabilities
last_logits = outputs.scores[-1]
last_probs = torch.softmax(last_logits, dim=-1)

# Prevent log(0)
last_probs = last_probs.clamp(min=1e-12)

entropy = -(last_probs * last_probs.log()).sum(dim=-1).item()

In [None]:
entropy

0.6953372955322266

In [None]:
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)

reasoning, scores = generate_output(input, max_new_tokens = 1000)

answer, scores= generate_output(final_prompt, sys=False, max_new_tokens=5)
print(reasoning)
print(answer)

As of 2020, the research on hominid species dating from the Middle Pliocene generally suggests (A) multiple hominid species but with limited diversity. This is because during this time, several distinct hominid species such as Australopithecus, Paranthropus, and early Homo species have been identified, but the number of species and the range of variations within those species is limited compared to later hominid periods. There is no clear evidence of a single dominant species (D) or of increased species diversity due to climate conditions like a prolonged ice age followed by a severe drought (E or F), nor a significant decrease in species diversity due to low numbers of hammerstones and flakes, indicating less stone tool manufacture (I). Furthermore, the evidence does not point to a great amount of species diversity, or a single species that exhibited a lot of diversity (G), increased species diversity but with decreased population numbers due to harsh climate conditions (H), or very l

In [None]:
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)

reasoning, scores = generate_output(input, max_new_tokens = 1000)

answer, scores = generate_output(final_prompt, sys=False, max_new_tokens=5)
print(reasoning)
print(answer)

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

Another way to approach this with one less step of generation

In [None]:
system_prompt =""" Read the question, analyze step by step, provide your answer. Use the following format to answer:  Explanation: [insert YOUR short step-by-step analysis here] Answer: [ONLY the option LETTER, where the LETTER is one of {A,B,C,D,E,F,G,H,I,J}]. Only give me the reply according to this format, donâ€™t give me any other words."""

In [None]:
#jsut to test correspondence
print("Baseline response:")
model.reset()
text, outputs.scores =  generate_output(input, sys=True, max_new_tokens=500)
print(text)

Baseline response:
Explanation: Recent research on hominid species dating from the Middle Pliocene does not necessarily indicate any specific population numbers, stone tool manufacture, or climate conditions, so options B, D, E, F, G, I, and J are not supported by the data. While the research may suggest multiple hominid species, it does not necessarily indicate a high level of diversity within these species. So option A seems to be the most appropriate.

Answer: A


In [None]:
# Last-step logits and probabilities
last_logits = outputs.scores[-1]
last_probs = torch.softmax(last_logits, dim=-1)

# Prevent log(0)
last_probs = last_probs.clamp(min=1e-12)

entropy = -(last_probs * last_probs.log()).sum(dim=-1).item()

In [None]:
entropy

0.6953372955322266

In [None]:
N = 20
entropies = []

for logits in outputs.scores[:min(N, len(outputs.scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)

In [None]:
mean_entropy

0.23854263435403028

In [None]:
print("last token entropy steeering, unprompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=False, max_new_tokens=500)
print(answer)

In [None]:
print("mean entropy steeering, prompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=False, max_new_tokens=500)
print(answer)