In [1]:
from transformers import AutoTokenizer
from dialz import Dataset, SteeringModel, SteeringVector, visualize_activation

#for managing cores
import torch
import os
import multiprocessing

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    output_scores=True,
    return_dict_in_generate=True,
)

# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.41s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the capital of the French language is
Top 5 tokens and probabilities:
Paris 0.4064
a 0.0978
one 0.0813
the 0.0603
known 0.0368


In [3]:
# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Generated text: The capital of France is Paris, but the capital of the French language is
Top 5 tokens and probabilities:
Paris 0.4064
a 0.0978
one 0.0813
the 0.0603
known 0.0368


In [4]:
# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Generated text: The capital of France is Paris, but the capital of the French language is
Top 5 tokens and probabilities:
Paris 0.4064
a 0.0978
one 0.0813
the 0.0603
known 0.0368


In [5]:
entropy = -(last_probs * last_probs.log()).sum().item()

In [6]:
entropy

3.1187450885772705

In [16]:
N = 5
entropies = []

for logits in outputs.scores[:min(N, len(outputs.scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)

In [17]:
mean_entropy

2.9194592952728273

In [19]:
# control law
Steering_factor = entropy/50

In [8]:
layer_ids = list(range(6, 20))


In [9]:
model = SteeringModel(model_name, layer_ids)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.85s/it]


In [10]:
dataset = Dataset()
dataset = Dataset.create_dataset(model_name, ['uncertain about every statement you make', 'certain about every statement you make'])

In [12]:
print(dataset)

Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

That game[/INST]
Negative: <s>[INST] Act as if you are extremely certain about every statement you make.

That game[/INST]
Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

I can see[/INST]
Negative: <s>[INST] Act as if you are extremely certain about every statement you make.

I can see[/INST]
Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

Hmm, this[/INST]
Negative: <s>[INST] Act as if you are extremely certain about every statement you make.

Hmm, this[/INST]
Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

I can relate to[/INST]
Negative: <s>[INST] Act as if you are extremely certain about every statement you make.

I can relate to[/INST]
Positive: <s>[INST] Act as if you are extremely uncertain about every statement you make.

Who is[/INST]
Negative: <s>[INST] Act as 

In [14]:
vector = SteeringVector.train(model, dataset, method="mean_diff") 

100%|██████████| 19/19 [13:10<00:00, 41.58s/it]
100%|██████████| 31/31 [00:00<00:00, 145.29it/s]


In [20]:
model.reset()
model.set_control(vector, Steering_factor)



# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    output_scores=True,
    return_dict_in_generate=True,
)

# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the largest city is Marseille
Top 5 tokens and probabilities:
Paris 0.3923
a 0.1000
one 0.0737
the 0.0519
known 0.0388


In [21]:
#mean entropy steering
# control law
Steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, Steering_factor)



# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    output_scores=True,
    return_dict_in_generate=True,
)

# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the largest city is Marseille
Top 5 tokens and probabilities:
Paris 0.3939
a 0.0996
one 0.0740
the 0.0525
known 0.0384


In [22]:
#flat steering rate
Steering_factor = 0.01
model.reset()
model.set_control(vector, Steering_factor)



# Prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate text with output_scores
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    output_scores=True,
    return_dict_in_generate=True,
)

# Last-step logits and probabilities
last_logits = outputs.scores[-10]
last_probs = F.softmax(last_logits, dim=-1)

# Decode generated text
generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
print("Generated text:", generated_text)

# Top 5 predicted tokens at last step
top5_probs, top5_ids = last_probs[0].topk(5)
print("Top 5 tokens and probabilities:")
for token_id, prob in zip(top5_ids, top5_probs):
    print(tokenizer.decode([token_id.item()]), f"{prob.item():.4f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated text: The capital of France is Paris, but the capital of the French language is
Top 5 tokens and probabilities:
Paris 0.4047
a 0.0976
one 0.0803
the 0.0592
known 0.0371


Now we have steered and unsteered performance for a very easy question

In [None]:
# hominid question performance

In [23]:
def generate_output(input, sys = True, max_new_tokens=100):
    messages = []

    if sys:
        messages.append(
            {"role": "system", "content": system_prompt}
        )

    messages.append(
        {"role": "user", "content": input}
    )

    chat_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    input_ids = chat_inputs["input_ids"].to(model.device)
    attention_mask = chat_inputs["attention_mask"].to(model.device)

    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    generated_ids = output_ids[:, input_ids.shape[1]:]
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [24]:
input= """Look through recent research on hominid species dating from the Middle Pliocene. Does it indicate that there was (as of 2020)?:[
"A: multiple hominid species but with limited diversity.",
"B: a single species with no diversity.",
"C: decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture.",
"D: a single dominant species that outcompeted all others, leading to decreased diversity.",
"E: increased species diversity due to a prolonged ice age followed by a severe drought.",
"F: decreased species diversity due to a prolonged ice age followed by a severe drought.",
"G: a great amount of species diversity, or a single species that exhibited a lot of diversity.",
"H: increased species diversity but with decreased population numbers due to harsh climate conditions.",
"I: increased species diversity but decreased numbers of hammerstones and flakes, indicating less stone tool manufacture.",
"J: very little species diversity during this period and very few hominids."
]"""

In [25]:
system_prompt =""" Read the question, analyze step by step, provide your answer and your confidence in this answer. Note: The confidence indicates how likely you think your answer is true.  If you are uncertain about your answer to the question, convey this uncertainty linguistically by precisely hedging this answer. Use the following format to answer:  Explanation: [insert YOUR short step-by-step analysis here] Answer: [ONLY the option LETTER, where the LETTER is one of {A,B,C,D,E,F,G,H,I,J}] Confidence (0-100): [Your confidence level, please only include the numerical number in the range of 0-100]%  Only give me the reply according to this format, don’t give me any other words."""

In [26]:
reasoning= ""

In [27]:
print("Baseline response:")
model.reset()
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

Baseline response:
As of 2020, the research indicates that there was "A: multiple hominid species but with limited diversity." The Middle Pliocene, around 3 to 4 million years ago, is a period during which several hominid species coexisted, such as Australopithecus and early Homo species. However, the diversity of these species is limited compared to later periods, such as the early Pleistocene, when more distinct hominid species like Homo habilis, Homo erectus, and Homo sapiens (early modern humans) emerged. The data does not suggest that there was a single species with no diversity, a single dominant species, a great amount of species diversity, a prolonged ice age followed by a severe drought causing increased or decreased species diversity, decreased population numbers due to harsh climate conditions, or very little species diversity during this period. Regarding stone tool manufacture, it's also worth noting that there is evidence indicating increasing complexity and specializatio

In [28]:
print("Sytem prompted response:")
model.reset()
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

Sytem prompted response:
Explanation: Recent research on hominid species dating from the Middle Pliocene would not necessarily provide information about the number or types of stone tools produced, so options C, I, and G can be eliminated. It's also possible that there was decreased or increased diversity during this period, so options B, D, E, and J are still options. However, the research would not typically focus on the total population numbers of hominids, which would help us eliminate options H. So, after analyzing the question, there are three possible answers remaining: A, D, and J. Confidence (0-100): 33% (since I can't determine the exact answer with the given information)
Answer: A,B,J


In [29]:
print("light, flat steering response:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

light, flat steering response:
As of 2020, research on hominid species dating from the Middle Pliocene generally indicates "A: multiple hominid species but with limited diversity". This is based on the discovery of various fossils such as Australopithecus, Paranthropus, and early Homo species. However, it's important to note that the exact picture of species diversity and distribution during this time is still evolving as new research continues. The idea of "one dominant species" or "a single species with no diversity" is not supported by current evidence. Furthermore, while some Middle Pliocene hominids were using stone tools, it would not be accurate to say that there was "increased numbers of hammerstones and flakes, indicating stone tool manufacture" during this entire period, as the use of tools was not ubiquitous among all hominid species. Similarly, claims about the relationship between climate conditions and species diversity would depend on the specific geographic region and h

In [31]:
print("light, flat steering prompted:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

light, flat steering prompted:
Explanation: Recent research on hominid species dating from the Middle Pliocene does not explicitly indicate the existence of limited diversity, increased diversity, or a single dominant species as of 2020. Moreover, it does not provide information about the population numbers of hominids, stone tool manufacture, or climate conditions. Thus, the research does not support any of the provided options.

Answer: J
Confidence (0-100): 80%


In [32]:
#using token probabilities
def generate_output(
    input,
    sys=True,
    max_new_tokens=100
):
    messages = []

    if sys:
        messages.append({"role": "system", "content": system_prompt})

    messages.append({"role": "user", "content": input})

    chat_inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    input_ids = chat_inputs["input_ids"].to(model.device)
    attention_mask = chat_inputs["attention_mask"].to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        output_scores=True,           
        return_dict_in_generate=True,       
    )

    generated_ids = outputs.sequences[:, input_ids.shape[1]:]

    text = tokenizer.decode(
        generated_ids[0],
        skip_special_tokens=True
    )

    return text, outputs.scores

In [34]:
print("Baseline response:")
model.reset()
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

Baseline response:
('Based on the research available as of 2020, the most accurate answer would be "J: very little species diversity during this period and very few hominids."\n\nWhile there have been discoveries of several Middle Pliocene hominid species, the fossil record from this period is still quite limited, and the species identified have been geographically isolated and often show minimal morphological differences. This suggests a low level of overall diversity in hominid species during the Middle Pliocene.\n\nSome theories posit that increased species diversity among hominids may have occurred later, during the Pleistocene, but these are still subjects of ongoing research and debate. As for stone tool manufacture, there is evidence of early stone tools from the Lower Pliocene, but the Middle Pliocene saw a decrease in the variety and sophistication of stone tools. This is likely due to the adaptive pressures and environmental conditions that hominids faced during this period.'

In [35]:
N = 20
entropies = []

for logits in outputs.scores[:min(N, len(outputs.scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)

In [36]:
mean_entropy

2.978312534093857

In [37]:
print("mean entropy steeering, response:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

mean entorpy steeering, response:
('As of 2020, the research on hominid species dating from the Middle Pliocene suggests that "A: multiple hominid species but with limited diversity" is most likely the case. However, the exact nature of the hominid species and their diversity levels can still be subject to change as new findings and reinterpretations occur. Additionally, some researchers may argue for other possibilities like "D: a single dominant species that outcompeted all others, leading to decreased diversity" or "J: very little species diversity during this period and very few hominids" based on the specific sites and evidence they are working with. For the question about tool manufacture, while there is evidence of early stone tools in the Middle Pliocene, the question of whether there was an increase or decrease in numbers of hammerstones and flakes is still a matter of ongoing research and debate.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -i

In [38]:
print("mean entropy steeering, prompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

mean entropy steeering, response:
('Explanation: The question does not provide information about the number of hammerstones and flakes or climate conditions, so these options cannot be definitively answered. The question does not explicitly indicate that the hominid species were very similar or different, making options B, C, E, I, and J less likely. Given the lack of definitive evidence about competition and dominance among hominid species, D is less likely. Since the question asks about hominid species diversity in the Middle Pliocene as of 2020, it is uncertain whether there has been increased or decreased diversity since then, making options F, G, and H less likely.\n\nAnswer: J\nConfidence (0-100): 40%', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-

In [40]:
entropy

3.1322214603424072

In [39]:
entropy = -(last_probs * last_probs.log()).sum().item()

In [42]:
print("last token entropy steeering, unprompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

last token entropy steeering, unprompted:
('As of 2020, recent research on hominid species dating from the Middle Pliocene suggests "A: multiple hominid species but with limited diversity." This is due to the discovery and analysis of various fossil fragments that indicate the coexistence of multiple, relatively distinct hominid species, but their overall diversity appears to be somewhat limited compared to later periods in human evolution. The number and types of tools found in this era may also suggest "C: decreased species diversity but increased numbers of hammerstones and flakes, indicating stone tool manufacture." However, it\'s important to note that this is a broad interpretation based on current evidence, and new findings could potentially change our understanding of hominid species diversity during the Middle Pliocene.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -in

In [41]:
print("last token entropy steeering, prompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

last token entropy steeering, prompted:
('Explanation: The provided question discusses hominid species from the Middle Pliocene era, and asks whether there was a specific amount of diversity as of 2020. The answer would depend on the recent research conducted on the topic. However, considering that multiple hominid species have been discovered and acknowledged by scientists, it would be reasonable to infer that there may have been a degree of diversity, but it is unclear if it was significant or limited. Therefore, the answer would be:\n\nAnswer: A\nConfidence (0-100): 60%', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, 

In [60]:
print("Baseline response:")
model.reset()
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

Baseline response:
("The answer is unknown. I could not find concrete evidence that specifically designates Gjetost as the national cheese of any country. However, Gjetest is a traditional Norwegian cheese, and it's very popular in that country.", (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf

In [61]:
N = 20
entropies = []

for logits in outputs.scores[:min(N, len(outputs.scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)
mean_entropy

2.978312534093857

In [62]:
entropy = -(last_probs * last_probs.log()).sum().item()
entropy

3.1322214603424072

In [63]:
print("mean entropy steeering, prompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

mean entropy steeering, prompted:
('The answer is unknown. Gjetost, a traditional Norwegian cheese, is often associated with Norway due to its popularity in the country, but it has not been officially recognized as the national cheese of Norway.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf

In [64]:
print("last token entropy steeering, prompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

last token entropy steeering, prompted:
('The answer is unknown, as Gjetost is not officially recognized as the national cheese of any country. It is a traditional Norwegian cheese, but Norway does not have an official national cheese.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tenso

NEW QUESTION

In [44]:
input = "Gjetost is the national cheese of which country?"

In [45]:
system_prompt ="""When answering questions, interpret them literally and think them carefully. If the question is unanswerable or unknowable, it is appropriate to say, “The answer is unknown.” """

In [46]:
print("Baseline response:")
model.reset()
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

Baseline response:
('Gjetost is a traditional cheese originating from Norway. It is often referred to as "Norwegian brown cheese" due to its caramel-like color. The cheese is semi-hard and has a sweet, nutty flavor. It is commonly used in various dishes, such as in pancakes or sandwiches, or as a snack on its own.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., 

In [47]:
print("Baseline prompted:")
model.reset()
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

Baseline prompted:
('The answer is unknown, as Gjetost is a traditional Norwegian cheese but it is not officially recognized as the national cheese of Norway. However, it is widely known and loved within the country.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -in

In [48]:
print("light, flat steering response:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

light, flat steering response:
('Gjetost is a traditional Norwegian cheese, often referred to as "Norwegian Brown Cheese". It\'s a semi-hard, brown, caramel-flavored cheese made from cow\'s milk, sometimes also sheep or goat milk. It\'s a popular ingredient in traditional Norwegian dishes like karamellost, lefse, and various types of flatbreads.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), t

In [49]:
print("light, flat steering prompted:")
steering_factor = 0.01
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

light, flat steering prompted:
('The answer is unknown as Gjetost, a traditional Norwegian cheese, is not officially recognized as the national cheese of any country. However, it holds a significant cultural and culinary status in Norway.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), te

In [50]:
print("Baseline response:")
model.reset()
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

Baseline response:
('Gjetost, also known as "Norwegian Brown Cheese," is the national cheese of Norway. This semi-hard cheese is traditionally made from goat\'s milk, but cow\'s milk is often used as well. It has a caramel-like flavor and a distinct, slightly grainy texture due to its high melting point. Gjetost is commonly enjoyed thinly sliced and served with flatbread or used in baking.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-

In [51]:
N = 20
entropies = []

for logits in outputs.scores[:min(N, len(outputs.scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)
mean_entropy

2.978312534093857

In [52]:
print("mean entropy steeering, unprompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

mean entropy steeering, unprompted:
('Gjetost, also known as "Norwegian brown cheese," is the traditional cheese of Norway. It\'s a semi-soft cheese with a caramelized, sweet flavor and a fibrous, creamy texture. The cheese-making process involves heating the milk for a longer time, giving it a distinctive flavor and appearance. In Norway, it is commonly consumed and can be found in various forms, such as slices, blocks, and even cubes to be added to hot beverages or eaten with bread.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, 

In [53]:
entropy = -(last_probs * last_probs.log()).sum().item()
entropy

3.1322214603424072

In [54]:
print("last token entropy steeering, unprompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=False, max_new_tokens=1000)
print(answer)

last token entropy steeering, unprompted:
('Norway is the country where Gjetost, often referred to as "Norwegian caramel cheese," is a traditional and popular cheese. This semi-soft brown cheese is known for its nutty, sweet flavor and sticky texture. It\'s often used in baking or enjoyed along with flatbread.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[   -inf,    -inf, 16.9688,  ...,    -inf,    -inf,    -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf

In [55]:
print("Baseline response, prompted:")
model.reset()
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

Baseline response, prompted:
("The answer is unknown as Gjetost is a traditional Norwegian cheese, but it doesn't seem to have been officially declared as the national cheese of Norway by the Norwegian government. However, many people associate it with Norway due to its popularity and cultural significance in the country.", (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf

In [56]:
N = 20
entropies = []

for logits in outputs.scores[:min(N, len(outputs.scores))]:
    probs = torch.softmax(logits, dim=-1).clamp(min=1e-12)
    H = -(probs * probs.log()).sum().item()
    entropies.append(H)

mean_entropy = sum(entropies) / len(entropies)
mean_entropy

2.978312534093857

In [57]:
entropy = -(last_probs * last_probs.log()).sum().item()
entropy

3.1322214603424072

In [58]:
print("mean entropy steeering, unprompted:")
steering_factor = mean_entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

mean entropy steeering, unprompted:
('The answer is unknown, as there is no country that officially designates Gjetost as its national cheese. Gjetost is a traditional cheese from Norway, but it does not hold an official status as the national cheese.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf

In [59]:
print("last token entropy steeering, unprompted:")
steering_factor = entropy/50
model.reset()
model.set_control(vector, steering_factor)
answer = generate_output(input, sys=True, max_new_tokens=1000)
print(answer)

last token entropy steeering, unprompted:
('The answer is unknown. I could not find any definitive source indicating that Gjetost is officially recognized as the national cheese of any particular country. It originates from Norway, but it does not seem to have been granted such a status.', (tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor([[-inf, -inf, -inf,  ..., -inf, -inf, -inf]]), tensor