In [1]:
import torch
import utils
import pandas as pd
from nnsight import LanguageModel
from transformers import BitsAndBytesConfig
from geometry_of_truth.visualization_utils import collect_acts

Load the model with 4-bit (NF4) quantization.

In [2]:
# Identifier of the model handed to nnsight's LanguageModel.
MODEL_PATH = "meta-llama/Llama-3.1-8B"

# Device for tensors created by hand in this notebook; placement of the model
# itself is delegated to device_map="auto" below.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Keyword arguments forwarded to LanguageModel alongside the model path.
model_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
    "quantization_config": bnb_config,
}
model = LanguageModel(MODEL_PATH, **model_load_options)

# Shorthand used throughout the notebook.
tokenizer = model.tokenizer

We first check that our few-shot true/false classification source prompt works, and inspect the distribution over the next output token.

In [3]:
prompt_text = """is pretty damned funny. This statement is postitive: TRUE
though ford and neeson capably hold our interest , but its just not a thrilling movie. This statement is positive: FALSE
far less sophisticated and. This statement is positive: FALSE
acted and directed, it's clear that washington most certainly has a new career ahead of him. This statement is positive: TRUE
this car is beautiful. This statement is positive:"""

In [4]:
# List the prompt's tokens together with their position index, so that token
# positions can be looked up when choosing where to intervene.
utils.print_token_ids(prompt_text, tokenizer)

  0: '<|begin_of_text|>'
  1: 'is'
  2: ' pretty'
  3: ' damned'
  4: ' funny'
  5: '.'
  6: ' This'
  7: ' statement'
  8: ' is'
  9: ' post'
 10: 'itive'
 11: ':'
 12: ' TRUE'
 13: '\n'
 14: 'though'
 15: ' ford'
 16: ' and'
 17: ' ne'
 18: 'eson'
 19: ' cap'
 20: 'ably'
 21: ' hold'
 22: ' our'
 23: ' interest'
 24: ','
 25: ' but'
 26: ' its'
 27: ' just'
 28: ' not'
 29: ' a'
 30: ' thrilling'
 31: ' movie'
 32: '.'
 33: ' This'
 34: ' statement'
 35: ' is'
 36: ' positive'
 37: ':'
 38: ' FALSE'
 39: '\n'
 40: 'far'
 41: ' less'
 42: ' sophisticated'
 43: ' and'
 44: '.'
 45: ' This'
 46: ' statement'
 47: ' is'
 48: ' positive'
 49: ':'
 50: ' FALSE'
 51: '\n'
 52: 'acted'
 53: ' and'
 54: ' directed'
 55: ','
 56: ' it'
 57: "'s"
 58: ' clear'
 59: ' that'
 60: ' washington'
 61: ' most'
 62: ' certainly'
 63: ' has'
 64: ' a'
 65: ' new'
 66: ' career'
 67: ' ahead'
 68: ' of'
 69: ' him'
 70: '.'
 71: ' This'
 72: ' statement'
 73: ' is'
 74: ' positive'
 75: ':'
 76: ' TRUE'

In [5]:
# Baseline: one traced forward pass over the unmodified prompt, keeping the
# LM-head output at the last position of the first (only) sequence.
with model.trace(prompt_text):
    logits = model.lm_head.output[0, -1].save()

# Show the 10 highest-ranked next-token candidates.
utils.print_topk_tokens(logits, tokenizer, k=10)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

top-10 predictions:
  8378 ĠTRUE (' TRUE'): 0.7578
  7989 ĠFALSE (' FALSE'): 0.2178
  62495 ĠUNKNOWN (' UNKNOWN'): 0.0029
  837 Ġtrue (' true'): 0.0027
  6781 ĠUN (' UN'): 0.0013
  8014 ĠNE (' NE'): 0.0012
  3082 ĠTrue (' True'): 0.0012
  5091 ĠTR (' TR'): 0.0011
  21260 TRUE ('TRUE'): 0.0011
  4276 ĠNOT (' NOT'): 0.0007


We then find the "truth direction" (the difference between the mean activations of true and false statements, cf. [2]) and perturb the statement's activations along it (cf. [1]) to try to flip the classification.

[1] https://github.com/saprmarks/geometry-of-truth/blob/main/interventions.py#L13

[2] https://github.com/saprmarks/geometry-of-truth/blob/main/probes.py#L58

In [6]:
# Datasets whose activations and labels are used to estimate the direction.
train_datasets = ["cities", "neg_cities"]

# Inclusive range of decoder layers that receive the intervention; the
# direction itself is estimated from activations at `end_layer`.
start_layer, end_layer = 7, 13

In [7]:
# Gather the layer-`end_layer` activations and the matching labels for every
# training dataset, then stack them into one tensor each.
acts_per_dataset, labels_per_dataset = [], []

for dataset in train_datasets:
    dataset_acts = collect_acts(dataset, 'llama-3.1-8b', end_layer, noperiod=True)
    dataset_labels = pd.read_csv(f'geometry_of_truth/datasets/{dataset}.csv')['label'].tolist()

    # Use the notebook-wide `device` rather than a hard-coded 'cuda:0', so this
    # cell follows the same device choice as the rest of the notebook.
    acts_per_dataset.append(dataset_acts.to(device))
    # torch.tensor with an explicit dtype replaces the legacy torch.Tensor(list)
    # constructor; float32 matches what that constructor produced.
    labels_per_dataset.append(
        torch.tensor(dataset_labels, dtype=torch.float32, device=device)
    )

acts, labels = torch.cat(acts_per_dataset), torch.cat(labels_per_dataset)

# Every activation row needs exactly one label; fail here with a clear message
# rather than later inside the boolean-mask indexing.
assert acts.shape[0] == labels.shape[0], (
    f"activation/label count mismatch: {acts.shape[0]} vs {labels.shape[0]}"
)

In [8]:
# Mass-mean direction (as in MMProbe): mean activation of the statements
# labelled 1 minus mean activation of the statements labelled 0.
is_true = labels == 1
is_false = labels == 0

true_acts = acts[is_true]
false_acts = acts[is_false]

true_mean = true_acts.mean(dim=0)
false_mean = false_acts.mean(dim=0)

# The resulting direction is kept on the CPU.
direction = (true_mean - false_mean).cpu()

In [9]:
# Token count of the classification suffix when encoded on its own; per the
# message printed below, this count includes the BOS token.
len_suffix = len(model.tokenizer.encode("This statement is positive:"))
print(f"len_suffix: {len_suffix}")
print("len suffix is +1 because of <bos> token")

with model.trace(prompt_text):
    # Subtract the truth direction from the outputs of layers
    # start_layer..end_layer (inclusive) at a single token position.
    # NOTE(review): in the recorded run (len_suffix = 6, sequence length 88)
    # `-len_suffix - 1` resolves to position 81, i.e. the second-to-last
    # position of the 78:83 statement span used later in this notebook, whereas
    # `-len_suffix` would be the last statement position (82). Confirm that
    # position 81 is the intended target.
    for layer in range(start_layer, end_layer + 1):
        model.model.layers[layer].output[0][:, -len_suffix - 1, :] -= direction
    # Keep every layer's output from this (perturbed) forward pass; they are the
    # source activations for the patching step further down.
    activations = [layer.output[0].save() for layer in model.model.layers]
    # LM-head output at the last position of the first (only) sequence.
    logits = model.lm_head.output[0, -1].save()

# Show the 10 highest-ranked next-token candidates after the intervention.
utils.print_topk_tokens(logits, tokenizer, k=10)

len_suffix: 6
len suffix is +1 because of <bos> token
top-10 predictions:
  8378 ĠTRUE (' TRUE'): 0.5156
  7989 ĠFALSE (' FALSE'): 0.4551
  62495 ĠUNKNOWN (' UNKNOWN'): 0.0045
  8014 ĠNE (' NE'): 0.0022
  837 Ġtrue (' true'): 0.0021
  6781 ĠUN (' UN'): 0.0020
  4276 ĠNOT (' NOT'): 0.0011
  905 Ġfalse (' false'): 0.0009
  3082 ĠTrue (' True'): 0.0009
  21260 TRUE ('TRUE'): 0.0009


In [10]:
# Sanity check: shape of the saved output of the first layer.
first_layer_acts = activations[0]
print(first_layer_acts.shape)

torch.Size([1, 88, 4096])


In [11]:
# Layer whose saved output is read as the source representation.
source_layer = 20

# Token positions 78..82 (end-exclusive slice 78:83) cover the statement in the
# source prompt.
stmt_start, stmt_end = 78, 83
source_activations = activations[source_layer][:, stmt_start:stmt_end, :]

print(source_activations.shape)

torch.Size([1, 5, 4096])


Test whether our few-shot target prompt gets the model to perform the sentence-identity task, i.e. to repeat the input sentence after the arrow.

In [12]:
target_prompt_text_test = """i love pizza->i love pizza;
the weather is nice->the weather is nice;
they didn't enjoy the show->they didn't enjoy the show;
i hate the rain"""

In [13]:
# Generate up to 15 new tokens from the few-shot identity prompt and keep the
# generator's output token ids.
with model.generate(target_prompt_text_test, max_new_tokens=15):
    output_tokens = model.generator.output.save()

# Decode the first (only) sequence, dropping special tokens; as the printed
# result shows, it contains the prompt followed by the continuation.
print(tokenizer.decode(output_tokens[0].cpu(), skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


i love pizza->i love pizza;
the weather is nice->the weather is nice;
they didn't enjoy the show->they didn't enjoy the show;
i hate the rain->i hate the rain;
i'm tired of this->i'm tired


Append placeholder tokens to the target prompt, patch the source activations into their positions (Patchscopes), and check what the model generates.

In [14]:
target_prompt_text_test = """i love pizza->i love pizza;
the weather is nice->the weather is nice;
they didn't enjoy the show->they didn't enjoy the show;
"""


In [15]:
# Number of placeholder positions to append. It is derived from the source
# activations rather than hard-coded, because the patch assigns
# `source_activations` onto the last `stmt_len` positions and the two widths
# must therefore match.
stmt_len = source_activations.shape[1]

# Token id used for every placeholder position.
placeholder_id = tokenizer.convert_tokens_to_ids("?")

# Token ids of the few-shot identity prefix (first and only sequence).
prefix_ids = tokenizer(target_prompt_text_test, return_tensors="pt").input_ids[0].to(device)

# `stmt_len` copies of the placeholder id, created directly on `device` with
# the same integer dtype as the prefix ids.
placeholder_ids = torch.full(
    (stmt_len,), placeholder_id, dtype=prefix_ids.dtype, device=device
)

# Final target input: identity prefix followed by the placeholder positions.
target_prompt_tokens = torch.cat([prefix_ids, placeholder_ids])

In [16]:
# Index of the decoder layer in the target prompt whose output is overwritten
# with the source activations during the patched generation.
target_layer = 1

In [17]:
# Generate up to 20 new tokens from the target prompt while patching in the
# source activations.
with model.generate(target_prompt_tokens, max_new_tokens=20):
    # Overwrite the output of layer `target_layer` at the last `stmt_len`
    # positions (the "?" placeholders) with the activations taken from the
    # source prompt.
    model.model.layers[target_layer].output[0][:, -stmt_len :, :] = source_activations
    output_tokens = model.generator.output.save()

# Decode the first (only) sequence, dropping special tokens, and show it.
generated = model.tokenizer.decode(output_tokens[0].cpu(), skip_special_tokens=True)
print(generated)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


i love pizza->i love pizza;
the weather is nice->the weather is nice;
they didn't enjoy the show->they didn't enjoy the show;
?????->this car is beautiful;
he is a good man.->he is a good man;
the


We get back out the same statement we started with: this car is beautiful