In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from dialz import Dataset, SteeringModel, SteeringVector, visualize_activation

#for managing cores
import torch
import torch.nn.functional as F
import os
import multiprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Device used for the tokenized inputs in the generation cells further down.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint name and tokenizer (slow tokenizer requested explicitly).
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# The weights are deliberately NOT loaded here. `SteeringModel(model_name, layer_ids)`
# in a later cell loads the checkpoint itself and binds the result to `model`, so an
# extra `AutoModelForCausalLM.from_pretrained(...)` call at this point would only
# load the 7B weights a second time and hold that memory until garbage collection.

In [8]:
# Layer indices 6..19 (the range end is exclusive); passed to SteeringModel in the
# next cell — presumably the layers the steering vector is applied to (confirm in dialz docs).
layer_ids = list(range(6, 20))


In [9]:
# Build dialz's SteeringModel for the checkpoint and bind it to `model`
# (any earlier binding of `model` is replaced).
model = SteeringModel(model_name, layer_ids)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.85s/it]


In [10]:
# Build the contrastive dataset straight from the factory. The bare `Dataset()`
# instance that used to be created first was overwritten immediately and never used.
# The two strings are the contrasting personas; the printed dataset shows them
# rendered as "Act as if you are extremely <persona>." prompts.
dataset = Dataset.create_dataset(
    model_name,
    ['uncertain about every statement you make', 'certain about every statement you make'],
)

In [12]:
# Inspect the generated pairs; the output lists alternating "Positive:" / "Negative:" prompts.
print(dataset)

Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

That game[/INST]
Negative: <s>[INST] Act as if you are extremely certain about every statement you make.

That game[/INST]
Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

I can see[/INST]
Negative: <s>[INST] Act as if you are extremely certain about every statement you make.

I can see[/INST]
Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

Hmm, this[/INST]
Negative: <s>[INST] Act as if you are extremely certain about every statement you make.

Hmm, this[/INST]
Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

I can relate to[/INST]
Negative: <s>[INST] Act as if you are extremely certain about every statement you make.

I can relate to[/INST]
Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

Who is[/INST]
Negative: <s>[INST] Act as 

In [14]:
# Train the steering vector on the contrastive dataset with the "mean_diff" method.
# Slow step: the captured progress bar shows roughly 13 minutes for this call.
vector = SteeringVector.train(model, dataset, method="mean_diff") 

100%|██████████| 19/19 [13:10<00:00, 41.58s/it]
100%|██████████| 31/31 [00:00<00:00, 145.29it/s]


In [None]:


# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores (one score tensor per generated step).
# pad_token_id is passed explicitly, as generate_output() does, which avoids the
# "Setting `pad_token_id` to `eos_token_id`" fallback message.
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    pad_token_id=tokenizer.eos_token_id,
    output_scores=True,
    return_dict_in_generate=True,
)

# First-step logits and probabilities, i.e. the distribution over the token that
# directly follows the prompt. This used to be `outputs.scores[-10]`, which is the
# first step only when exactly 10 tokens are generated and raises IndexError when
# generation stops early. The `last_*` names are kept because later cells read
# `last_probs`.
last_logits = outputs.scores[0]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 candidate tokens for the first generated position
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.41s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the capital of the French language is
Top 5 tokens and probabilities:
Paris 0.4064
a 0.0978
one 0.0813
the 0.0603
known 0.0368


In [5]:
# Shannon entropy (in nats: `.log()` is the natural log) of the distribution in `last_probs`.
# Upcast to float32 and floor the probabilities first: an exact zero would make
# 0 * log(0) evaluate to NaN and poison the whole sum. The 1e-12 floor is the one the
# mean-entropy cell uses; the upcast keeps it representable for half-precision input.
safe_probs = last_probs.float().clamp(min=1e-12)
entropy = -(safe_probs * safe_probs.log()).sum().item()

In [6]:
# Display the entropy value (nats) computed in the previous cell.
entropy

3.1187450885772705

In [None]:
# Average the per-step Shannon entropy (nats) over the first N generation steps.
N = 5
entropies = []

# Slicing past the end of a tuple is safe, so `[:N]` alone caps the window at the
# number of steps that were actually generated.
for logits in outputs.scores[:N]:
    # Floor the probabilities so log() never sees an exact zero.
    probs = F.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -torch.sum(probs * torch.log(probs)).item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)

In [None]:
# Display the mean per-step entropy (nats) computed in the previous cell.
mean_entropy

2.9194592952728273

In [19]:
# control law: the steering strength is the most recently computed `entropy` divided by 50.
# NOTE(review): the divisor 50 is an unexplained magic number — confirm where it comes from.
# NOTE(review): this cell binds `Steering_factor` (capital S); later cells use `steering_factor`.
Steering_factor = entropy/50

In [None]:
# Apply the steering vector at the entropy-derived strength before generating.
model.reset()
model.set_control(vector, Steering_factor)

# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores (one score tensor per generated step).
# pad_token_id is passed explicitly, as generate_output() does, which avoids the
# "Setting `pad_token_id` to `eos_token_id`" fallback message.
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    pad_token_id=tokenizer.eos_token_id,
    output_scores=True,
    return_dict_in_generate=True,
)

# First-step logits and probabilities, i.e. the distribution over the token that
# directly follows the prompt. This used to be `outputs.scores[-10]`, which is the
# first step only when exactly 10 tokens are generated and raises IndexError when
# generation stops early. The `last_*` names are kept for the cells that read them.
last_logits = outputs.scores[0]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 candidate tokens for the first generated position
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the largest city is Marseille
Top 5 tokens and probabilities:
Paris 0.3923
a 0.1000
one 0.0737
the 0.0519
known 0.0388


In [21]:
# mean entropy steering
# control law: the steering strength is the mean per-step entropy divided by 50
Steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, Steering_factor)

# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores (one score tensor per generated step).
# pad_token_id is passed explicitly, as generate_output() does, which avoids the
# "Setting `pad_token_id` to `eos_token_id`" fallback message.
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    pad_token_id=tokenizer.eos_token_id,
    output_scores=True,
    return_dict_in_generate=True,
)

# First-step logits and probabilities, i.e. the distribution over the token that
# directly follows the prompt. This used to be `outputs.scores[-10]`, which is the
# first step only when exactly 10 tokens are generated and raises IndexError when
# generation stops early. The `last_*` names are kept for the cells that read them.
last_logits = outputs.scores[0]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 candidate tokens for the first generated position
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the largest city is Marseille
Top 5 tokens and probabilities:
Paris 0.3939
a 0.0996
one 0.0740
the 0.0525
known 0.0384


In [22]:
# flat steering rate: a fixed strength instead of an entropy-derived one
Steering_factor = 0.01
model.reset()
model.set_control(vector, Steering_factor)

# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores (one score tensor per generated step).
# pad_token_id is passed explicitly, as generate_output() does, which avoids the
# "Setting `pad_token_id` to `eos_token_id`" fallback message.
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    pad_token_id=tokenizer.eos_token_id,
    output_scores=True,
    return_dict_in_generate=True,
)

# First-step logits and probabilities, i.e. the distribution over the token that
# directly follows the prompt. This used to be `outputs.scores[-10]`, which is the
# first step only when exactly 10 tokens are generated and raises IndexError when
# generation stops early. The `last_*` names are kept for the cells that read them.
last_logits = outputs.scores[0]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 candidate tokens for the first generated position
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the capital of the French language is
Top 5 tokens and probabilities:
Paris 0.4047
a 0.0976
one 0.0803
the 0.0592
known 0.0371


We now have steered and unsteered results for a very easy question.

In [None]:
# hominid question performance

In [23]:
def generate_output(input, sys = True, max_new_tokens=100):
    """Run one sampled chat completion and return only the newly generated text.

    Reads the notebook globals ``tokenizer`` and ``model``, plus ``system_prompt``
    when ``sys`` is truthy.

    Args:
        input: Content of the user message.
        sys: When truthy, a system message holding ``system_prompt`` is placed
            ahead of the user message.
        max_new_tokens: Generation budget forwarded to ``model.generate``.

    Returns:
        The decoded continuation of the first sequence, with the prompt tokens
        sliced off and special tokens skipped.
    """
    user_turn = {"role": "user", "content": input}
    if sys:
        messages = [{"role": "system", "content": system_prompt}, user_turn]
    else:
        messages = [user_turn]

    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    prompt_ids = encoded["input_ids"].to(model.device)
    prompt_mask = encoded["attention_mask"].to(model.device)
    prompt_len = prompt_ids.shape[1]

    # Sampling is enabled, so repeated calls can return different text.
    sequences = model.generate(
        input_ids=prompt_ids,
        attention_mask=prompt_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    # Keep only what was generated after the prompt, for the first sequence.
    continuation = sequences[:, prompt_len:][0]
    return tokenizer.decode(continuation, skip_special_tokens=True)

In [24]:
# Multiple-choice question (options A-J) used as the user message in the cells below.
# NOTE: the name `input` shadows Python's built-in input() for the rest of the notebook.
input= """Look through recent research on hominid species dating from the Middle Pliocene. Does it indicate that there was (as of 2020)?:[
"A: multiple hominid species but with limited diversity.",
"B: a single species with no diversity.",
"C: decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture.",
"D: a single dominant species that outcompeted all others, leading to decreased diversity.",
"E: increased species diversity due to a prolonged ice age followed by a severe drought.",
"F: decreased species diversity due to a prolonged ice age followed by a severe drought.",
"G: a great amount of species diversity, or a single species that exhibited a lot of diversity.",
"H: increased species diversity but with decreased population numbers due to harsh climate conditions.",
"I: increased species diversity but decreased numbers of hammerstones and flakes, indicating less stone tool manufacture.",
"J: very little species diversity during this period and very few hominids."
]"""

In [25]:
# System prompt asking for a fixed "Explanation / Answer / Confidence (0-100)" reply.
# generate_output() reads this global whenever it is called with a truthy `sys`.
system_prompt =""" Read the question, analyze step by step, provide your answer and your confidence in this answer. Note: The confidence indicates how likely you think your answer is true.  If you are uncertain about your answer to the question, convey this uncertainty linguistically by precisely hedging this answer. Use the following format to answer:  Explanation: [insert YOUR short step-by-step analysis here] Answer: [ONLY the option LETTER, where the LETTER is one of {A,B,C,D,E,F,G,H,I,J}] Confidence (0-100): [Your confidence level, please only include the numerical number in the range of 0-100]%  Only give me the reply according to this format, don’t give me any other words."""

In [26]:
# Start with an empty `reasoning` string; presumably a placeholder so later cells
# that interpolate it can run before any reasoning has been generated.
reasoning= ""

In [27]:
# Baseline: no steering vector set in this cell and no system prompt (sys=False).
print("Baseline response:")
model.reset()  # presumably clears any control set by earlier cells — confirm in dialz docs
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

Baseline response:
As of 2020, the research indicates that there was "A: multiple hominid species but with limited diversity." The Middle Pliocene, around 3 to 4 million years ago, is a period during which several hominid species coexisted, such as Australopithecus and early Homo species. However, the diversity of these species is limited compared to later periods, such as the early Pleistocene, when more distinct hominid species like Homo habilis, Homo erectus, and Homo sapiens (early modern humans) emerged. The data does not suggest that there was a single species with no diversity, a single dominant species, a great amount of species diversity, a prolonged ice age followed by a severe drought causing increased or decreased species diversity, decreased population numbers due to harsh climate conditions, or very little species diversity during this period. Regarding stone tool manufacture, it's also worth noting that there is evidence indicating increasing complexity and specializatio

In [28]:
# Same as the baseline, but with the system prompt included (sys=True).
# NOTE(review): the printed label misspells "System".
print("Sytem prompted response:")
model.reset()  # presumably clears any control set by earlier cells — confirm in dialz docs
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

Sytem prompted response:
Explanation: Recent research on hominid species dating from the Middle Pliocene would not necessarily provide information about the number or types of stone tools produced, so options C, I, and G can be eliminated. It's also possible that there was decreased or increased diversity during this period, so options B, D, E, and J are still options. However, the research would not typically focus on the total population numbers of hominids, which would help us eliminate options H. So, after analyzing the question, there are three possible answers remaining: A, D, and J. Confidence (0-100): 33% (since I can't determine the exact answer with the given information)
Answer: A,B,J


In [29]:
# Fixed steering strength of 0.01, no system prompt (sys=False).
print("light, flat steering response:")
steering_factor = 0.01
model.reset()  # reset before set_control so only this cell's strength is applied (presumed)
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

light, flat steering response:
As of 2020, research on hominid species dating from the Middle Pliocene generally indicates "A: multiple hominid species but with limited diversity". This is based on the discovery of various fossils such as Australopithecus, Paranthropus, and early Homo species. However, it's important to note that the exact picture of species diversity and distribution during this time is still evolving as new research continues. The idea of "one dominant species" or "a single species with no diversity" is not supported by current evidence. Furthermore, while some Middle Pliocene hominids were using stone tools, it would not be accurate to say that there was "increased numbers of hammerstones and flakes, indicating stone tool manufacture" during this entire period, as the use of tools was not ubiquitous among all hominid species. Similarly, claims about the relationship between climate conditions and species diversity would depend on the specific geographic region and h

In [31]:
# Fixed steering strength of 0.01, with the system prompt included (sys=True).
print("light, flat steering prompted:")
steering_factor = 0.01
model.reset()  # reset before set_control so only this cell's strength is applied (presumed)
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

light, flat steering prompted:
Explanation: Recent research on hominid species dating from the Middle Pliocene does not explicitly indicate the existence of limited diversity, increased diversity, or a single dominant species as of 2020. Moreover, it does not provide information about the population numbers of hominids, stone tool manufacture, or climate conditions. Thus, the research does not support any of the provided options.

Answer: J
Confidence (0-100): 80%


In [32]:
#using token probabilities
def generate_output(input, sys=True, max_new_tokens=100):
    """Run one sampled chat completion and return its text with per-step scores.

    Reads the notebook globals ``tokenizer`` and ``model``, plus ``system_prompt``
    when ``sys`` is truthy. This definition replaces any earlier
    ``generate_output`` in the kernel and returns a tuple rather than a string.

    Args:
        input: Content of the user message.
        sys: When truthy, a system message holding ``system_prompt`` is placed
            ahead of the user message.
        max_new_tokens: Generation budget forwarded to ``model.generate``.

    Returns:
        ``(text, scores)``: the decoded continuation of the first sequence
        (prompt tokens sliced off, special tokens skipped) and the ``scores``
        attribute of the generation output.
    """
    conversation = [{"role": "user", "content": input}]
    if sys:
        conversation.insert(0, {"role": "system", "content": system_prompt})

    encoded = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    prompt_ids = encoded["input_ids"].to(model.device)
    prompt_mask = encoded["attention_mask"].to(model.device)
    prompt_len = prompt_ids.shape[1]

    # output_scores + return_dict_in_generate expose `.sequences` and `.scores`.
    generation = model.generate(
        input_ids=prompt_ids,
        attention_mask=prompt_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Keep only what was generated after the prompt, for the first sequence.
    new_token_ids = generation.sequences[:, prompt_len:]
    decoded = tokenizer.decode(new_token_ids[0], skip_special_tokens=True)

    return decoded, generation.scores

# Reset `reasoning` to an empty string; presumably a placeholder so the cell that
# builds `final_prompt` can run before any reasoning has been generated.
reasoning= ""

In [None]:
# Replace the system prompt with a reasoning-only instruction (no answer format
# requested); generate_output() reads this global when called with a truthy `sys`.
system_prompt = "Read the question carefully and explain your reasoning in natural language."

In [None]:
# Answer-extraction prompt combining the question and the model's reasoning.
# NOTE: an f-string is rendered immediately, so `input` and `reasoning` are
# interpolated with the values they hold when this cell runs; assigning a new
# `reasoning` afterwards does not change `final_prompt` unless this is re-evaluated.
final_prompt = f"Read the the following question {input} and answer reasoning {reasoning} carefully, and present the answer the reasoning is pointing towards. Answer: [ONLY the option LETTER, where the LETTER is one of [A,B,C,D,E,F,G,H,I,J]. Only give me the reply according to this format, don’t give me any other words."

In [None]:
model.reset()

# Stage 1: free-form reasoning (generate_output includes the system prompt by default).
# `outputs` must already exist from an earlier generation cell; only its `.scores`
# attribute is overwritten here so the entropy cell that follows can read it.
reasoning, outputs.scores = generate_output(input, max_new_tokens = 1000)

# Stage 2: rebuild the answer-extraction prompt now that `reasoning` holds the text
# generated above. The previously rendered `final_prompt` had `reasoning`
# interpolated when its f-string was evaluated (an empty string), so the answer
# pass never saw the reasoning it is asked to summarise.
final_prompt = f"Read the the following question {input} and answer reasoning {reasoning} carefully, and present the answer the reasoning is pointing towards. Answer: [ONLY the option LETTER, where the LETTER is one of [A,B,C,D,E,F,G,H,I,J]. Only give me the reply according to this format, don’t give me any other words."

answer, outputs.scores = generate_output(final_prompt, sys=False, max_new_tokens=5)
print(reasoning)
print(answer)

In [None]:
# Last-step logits and probabilities
last_logits = outputs.scores[-1]
# Upcast so the 1e-12 floor below stays representable for half-precision scores.
last_probs = F.softmax(last_logits.float(), dim=-1)

# Shannon entropy in nats. These scores come from a sampled generation
# (generate_output passes do_sample=True), and `scores` holds the processed
# values, so any token filtered to -inf gets probability exactly 0. Without a
# floor, 0 * log(0) is NaN and the entropy (and any steering factor derived from
# it) becomes NaN. The floor matches the mean-entropy cells.
safe_probs = last_probs.clamp(min=1e-12)
entropy = -(safe_probs * safe_probs.log()).sum().item()

In [None]:
# Display the last-step entropy (nats) computed in the previous cell.
entropy

In [None]:
# Two-stage run with a fixed steering strength of 0.01.
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)

# Stage 1: free-form reasoning (generate_output includes the system prompt by default).
reasoning, scores = generate_output(input, max_new_tokens = 1000)

# Stage 2: rebuild the answer-extraction prompt from THIS run's reasoning. The
# previously rendered `final_prompt` was fixed when its f-string was evaluated,
# so reusing it would ignore the steered reasoning generated above.
final_prompt = f"Read the the following question {input} and answer reasoning {reasoning} carefully, and present the answer the reasoning is pointing towards. Answer: [ONLY the option LETTER, where the LETTER is one of [A,B,C,D,E,F,G,H,I,J]. Only give me the reply according to this format, don’t give me any other words."

answer, scores= generate_output(final_prompt, sys=False, max_new_tokens=5)
print(reasoning)
print(answer)

In [None]:
# Two-stage run with an entropy-derived steering strength (entropy / 50).
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)

# Stage 1: free-form reasoning (generate_output includes the system prompt by default).
reasoning, scores = generate_output(input, max_new_tokens = 1000)

# Stage 2: rebuild the answer-extraction prompt from THIS run's reasoning. The
# previously rendered `final_prompt` was fixed when its f-string was evaluated,
# so reusing it would ignore the steered reasoning generated above.
final_prompt = f"Read the the following question {input} and answer reasoning {reasoning} carefully, and present the answer the reasoning is pointing towards. Answer: [ONLY the option LETTER, where the LETTER is one of [A,B,C,D,E,F,G,H,I,J]. Only give me the reply according to this format, don’t give me any other words."

answer, scores= generate_output(final_prompt, sys=False, max_new_tokens=5)
print(reasoning)
print(answer)

Another way to approach this, with one fewer generation step:

In [None]:
# Single-pass system prompt: asks for "Explanation" and "Answer" in one reply, with
# no confidence field. generate_output() reads this global when `sys` is truthy.
system_prompt =""" Read the question, analyze step by step, provide your answer. Use the following format to answer:  Explanation: [insert YOUR short step-by-step analysis here] Answer: [ONLY the option LETTER, where the LETTER is one of {A,B,C,D,E,F,G,H,I,J}]. Only give me the reply according to this format, don’t give me any other words."""

In [None]:
# just to test correspondence
# Unsteered single-pass run with the system prompt. The per-step scores are stored
# on the existing `outputs` object (which must already be bound by an earlier
# generation cell) so the entropy cells that follow can read `outputs.scores`.
print("Baseline response:")
model.reset()
text, outputs.scores =  generate_output(input, sys=True, max_new_tokens=1000)
print(text)

In [None]:
# Last-step logits and probabilities
last_logits = outputs.scores[-1]
# Upcast so the 1e-12 floor below stays representable for half-precision scores.
last_probs = F.softmax(last_logits.float(), dim=-1)

# Shannon entropy in nats. These scores come from a sampled generation
# (generate_output passes do_sample=True), and `scores` holds the processed
# values, so any token filtered to -inf gets probability exactly 0. Without a
# floor, 0 * log(0) is NaN and the entropy (and any steering factor derived from
# it) becomes NaN. The floor matches the mean-entropy cells.
safe_probs = last_probs.clamp(min=1e-12)
entropy = -(safe_probs * safe_probs.log()).sum().item()

In [None]:
# Display the last-step entropy (nats) computed in the previous cell.
entropy

In [None]:
# Average the per-step Shannon entropy (nats) over the first N generation steps.
N = 20
entropies = []

# Slicing past the end of a tuple is safe, so `[:N]` alone caps the window at the
# number of steps that were actually generated.
for logits in outputs.scores[:N]:
    # Floor the probabilities so log() never sees an exact zero.
    probs = F.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -torch.sum(probs * torch.log(probs)).item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)

In [None]:
# Display the mean per-step entropy (nats) computed in the previous cell.
mean_entropy

In [None]:
# Steering strength derived from the last-step entropy (entropy / 50); no system prompt.
# generate_output returns (text, scores) at this point, hence the tuple unpacking.
print("last token entropy steeering, unprompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

In [None]:
# Steering strength derived from the mean per-step entropy (mean_entropy / 50).
# NOTE(review): the label below says "prompted", but the call passes sys=False, which
# omits the system prompt — confirm whether the label or the argument is the intended one.
print("mean entropy steeering, prompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer, scores = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)