# Setup

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded
# before each cell executes and edits to project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [8]:
import logging
import sys
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM
from nnsight import LanguageModel

# The project packages below are imported relative to the parent directory.
sys.path.append("../")
import src.utils.logging_utils as logging_utils
import src.functional as functional
import src.models as models
import src.tokens as tokens
import src.dataset as dataset
import src.dataset as dataset_lib  # second alias that stays bound to the module even if `dataset` is reassigned
from src.models import ModelandTokenizer
import src.patchscope_eval as patchscope_eval
import proto.patchscope_pb2 as patchscope_pb2
import src.dataset_manager as dataset_manager

# Configure the root logger first: INFO and above, written to stdout so the records
# appear in the cell output, using the project's shared format strings.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    format=logging_utils.DEFAULT_FORMAT,
)

# Notebook-level logger (shows up as `__main__` in the records).
logger = logging.getLogger(__name__)

# Record the torch / CUDA versions this run used.
logger.info(f"{torch.__version__=}, {torch.version.cuda=}")

2024-10-30 18:13:05 datasets INFO     PyTorch version 2.5.0 available.


2024-10-30 18:13:05 __main__ INFO     torch.__version__='2.5.0+cu124', torch.version.cuda='12.4'


In [3]:
# Checkpoint under study. Other candidate keys (swap in to change the model):
#   "meta-llama/Llama-3.2-3B-Instruct", "meta-llama/Llama-3.1-8B-Instruct",
#   "meta-llama/Llama-3.1-8B", "google/gemma-2-9b-it", "google/gemma-2-2b"
# NOTE(review): the gemma keys have no entry in the prompt / short-name lookups of the
# "Base evaluation" section, so selecting one of them raises KeyError there.
MODEL_KEY = "meta-llama/Llama-3.2-3B"

# Load the model together with its tokenizer, with bfloat16 weights.
mt = models.ModelandTokenizer(model_key=MODEL_KEY, torch_dtype=torch.bfloat16)

# Display the layer count; later cells use it to enumerate layer indices.
mt.n_layer

If not found in cache, model will be downloaded from HuggingFace to cache directory


2024-10-30 18:08:15 accelerate.utils.modeling INFO     We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2024-10-30 18:08:18 src.models INFO     loaded model <meta-llama/Llama-3.2-3B> | size: 6127.841 MB | dtype: torch.bfloat16 | device: cuda:0


28

# Demo

In [4]:
# true_prompt = "The city of Paris is in the country of France."
# false_prompt = "The city of Paris is in the country of Italy."
# true_input, false_input = [tokens.prepare_input(p, mt) for p in (true_prompt, false_prompt)]

In [5]:
# # layers = [24]
# layers = list(range(3, 30))

# true_h, false_h = [
#     patchscope_eval.get_h_layers(mt, input_, layers) for input_ in (true_input, false_input)
# ]

In [6]:
# true_token, false_token = [mt.tokenizer.encode(t)[-1] for t in ("True", "False")]
# [mt.tokenizer.decode(t) for t in (true_token, false_token)]

In [7]:
# for prompt, h in zip((true_prompt, false_prompt), (true_h, false_h)):
#     print(prompt)
#     _, result_dict = functional.patchscope(
#         mt = mt, 
#         hs = patchscope_eval.get_h_with_target_layer(h, 3) if len(h) == 1 else h,
#         target_prompt = target_prompt,
#         interested_tokens = (true_token, false_token),
#         k = 5)
#     for t in (true_token, false_token):
#         print("   ", result_dict[t])


In [8]:
# gmt_dataset = dataset.GMTDataset.from_csv("cities.csv", few_shot=False).examples
# len(gmt_dataset)

# Base evaluation

In [9]:
# Few-shot template for base (non-instruct) checkpoints: two labelled statements followed by
# a run of placeholder tokens and an open "This statement is: " slot.
base_prompt = '''The city of Tokyo is in Japan. This statement is: True
The city of Hanoi is in Poland. This statement is: False
placeholder placeholder placeholder placeholder placeholder This statement is: '''

# Chat-formatted template for instruct checkpoints, with the same run of placeholder tokens.
instruct_prompt = '''<|start_header_id|>user<|end_header_id|>

True or false: placeholder placeholder placeholder placeholder placeholder<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

'''

# One row per supported checkpoint: (target prompt, short name used in result-set names).
# Keeping both values in a single table means a new model is registered in one place.
PROMPT_AND_NAME_BY_MODEL = {
    "meta-llama/Llama-3.2-3B-Instruct": (instruct_prompt, "llama_3b_instruct"),
    "meta-llama/Llama-3.1-8B-Instruct": (instruct_prompt, "llama_8b_instruct"),
    "meta-llama/Llama-3.2-3B": (base_prompt, "llama_3b"),
    "meta-llama/Llama-3.1-8B": (base_prompt, "llama_8b"),
}

# Raises KeyError when MODEL_KEY is not one of the four keys above.
target_prompt, model_short_name = PROMPT_AND_NAME_BY_MODEL[MODEL_KEY]

In [None]:
# Echo the selected checkpoint and the short name derived from it.
print(f"{MODEL_KEY} {model_short_name}")

meta-llama/Llama-3.1-8B llama_8b


In [11]:
# Cap on how many examples are evaluated from each dataset file.
MAX_EXAMPLES_PER_FILE = 100

# Name stamped on every result of this run; also used for the output file name.
result_set_name = model_short_name + "__all_layers"
evaluation_results = patchscope_pb2.EvaluationResults()

# Settings shared by every dataset file, built once outside the loop.
all_source_layers = list(range(mt.n_layer))  # every layer index of `mt`
label_to_token = {"1": "True", "0": "False"}

# The module is reached through `dataset_lib` rather than `dataset`: the fine-tuned section
# further down rebinds `dataset` to a data split, which would break this cell on re-run.
for filename in dataset_lib.GMT_DATA_FILES:
    examples = dataset_lib.GMTDataset.simple_get_examples(filename)[:MAX_EXAMPLES_PER_FILE]
    evaluation_config = patchscope_pb2.EvaluationConfig(
        model_key=MODEL_KEY,
        dataset=filename,
        target_prompt=target_prompt,
        label_to_token=label_to_token,
        patchscope_config=patchscope_pb2.PatchscopeConfig(
            source_layers=all_source_layers,
            target_layers=[],
        ),
    )
    evaluation_runner = patchscope_eval.EvaluationRunner(evaluation_config, mt)
    evaluation_result = evaluation_runner.evaluate(examples)
    evaluation_result.result_set_name = result_set_name
    evaluation_results.results.append(evaluation_result)
    # One line per dataset file: file name and the accuracy reported for it.
    print(filename, evaluation_result.accuracy)

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:37<00:00,  1.57s/it]


sp_en_trans.csv 0.41


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:36<00:00,  1.57s/it]


neg_sp_en_trans.csv 0.42


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:36<00:00,  1.56s/it]


cities.csv 0.68


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:36<00:00,  1.56s/it]


neg_cities.csv 0.62


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:36<00:00,  1.57s/it]


smaller_than.csv 0.69


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:34<00:00,  1.55s/it]


larger_than.csv 0.65


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:32<00:00,  1.53s/it]


common_claim_true_false.csv 0.54


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:32<00:00,  1.53s/it]


companies_true_false.csv 0.63


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:32<00:00,  1.52s/it]

counterfact_true_false.csv 0.57





In [12]:
# Persist the serialized EvaluationResults message as results/<result_set_name>.binpb.
results_path = Path("results") / f"{result_set_name}.binpb"
# Create the output directory if needed so a long evaluation run is not lost to a
# FileNotFoundError on a fresh checkout.
results_path.parent.mkdir(parents=True, exist_ok=True)
results_path.write_bytes(evaluation_results.SerializeToString())

# Finetuned evaluation

In [7]:
finetuned_model_key = "sensharma/Llama-3.2-3B_Talkative_Probe_Step-3000"

# Load the fine-tuned checkpoint with bfloat16 weights, then move it onto the GPU.
tuned_model = AutoModelForCausalLM.from_pretrained(
    finetuned_model_key,
    torch_dtype=torch.bfloat16,
)
tuned_model = tuned_model.to("cuda")

# Wrap the model for nnsight and pair it with the tokenizer already loaded for `mt`.
tuned_lm = LanguageModel(tuned_model)
decoder_mt = ModelandTokenizer(base_lm=tuned_lm, tokenizer=mt.tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

2024-10-30 18:10:06 src.models INFO     loaded model <EleutherAI/gpt-j-6B> | size: 6127.841 MB | dtype: torch.bfloat16 | device: cuda:0


In [33]:
dataset_group = "geometry_of_truth"
# NOTE(review): this rebinds `dataset`, which the imports cell bound to the `src.dataset`
# module. Once this cell has run, `dataset.<attr>` no longer reaches the module, so cells
# that rely on it fail when re-run. Consider a distinct name here, updating the cells below
# that read `dataset` at the same time.
# The one-element unpacking expects `split([0.001])` to return exactly one split.
(dataset,) = dataset_manager.DatasetManager.from_dataset_group(dataset_group).split([0.001])

In [34]:
# Size of the sampled split, i.e. how many items the evaluation below iterates over.
len(dataset)

13

In [35]:
# Peek at the first item the split yields, to check its structure.
next(iter(dataset))

[ContextQASample(context='The city of Nova Iguacu is in Brazil.', questions=['# Could you tell me if this is factual?', '# Does this statement hold true?', '# Do you agree that this is true?', '# Would you validate this statement?', '# Can you verify if this is true?', '# Can you confirm the accuracy of this?', '# Does this information ring true?', '# Have I understood this correctly?', '# Are you confirming this is true?', '# Are you saying this is correct?'], answers=['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes'])]

In [42]:
# Patchscope settings: source layer 24, target layers spanning every decoder layer index.
patchscope_config = patchscope_pb2.PatchscopeConfig(
    source_layers=[24],
    target_layers=list(range(decoder_mt.n_layer)),
)
# Record which fine-tuned checkpoint acts as the decoder.
decoder_config = patchscope_pb2.DecoderConfig(name=finetuned_model_key)

evaluation_config = patchscope_pb2.EvaluationConfig(
    model_key=MODEL_KEY,
    dataset=dataset_group,
    patchscope_config=patchscope_config,
    decoder_config=decoder_config,
)

# The runner receives both the base model (`mt`) and the fine-tuned decoder (`decoder_mt`).
evaluation_runner = patchscope_eval.EvaluationRunner(evaluation_config, mt, decoder_mt)
evaluation_result = evaluation_runner.evaluate(dataset)
print(evaluation_result.accuracy)

  0%|                                                                                           | 0/13 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:09<00:00,  1.31it/s]

0.6923076923076923





In [44]:
# Sanity check: ask the fine-tuned decoder for its next-token predictions (k=10) on a plain
# factual prefix. The trailing space in the prompt is intentional and part of the input.
probe_prompt = "The city of Paris is in the country of "
functional.predict_next_token(decoder_mt, probe_prompt, k=10)

[[PredictedToken(token=' No', prob=0.777143120765686, logit=21.5, token_id=2360),
  PredictedToken(token=' Yes', prob=0.22265523672103882, logit=20.25, token_id=7566),
  PredictedToken(token=' yes', prob=4.5303295337362215e-05, logit=11.75, token_id=10035),
  PredictedToken(token='Yes', prob=2.4249107809737325e-05, logit=11.125, token_id=9642),
  PredictedToken(token=' YES', prob=1.5656400137231685e-05, logit=10.6875, token_id=14410),
  PredictedToken(token='No', prob=9.496086931903847e-06, logit=10.1875, token_id=2822),
  PredictedToken(token='-No', prob=6.526558536279481e-06, logit=9.8125, token_id=99076),
  PredictedToken(token=' NO', prob=4.774932222062489e-06, logit=9.5, token_id=5782),
  PredictedToken(token=' Nos', prob=4.774932222062489e-06, logit=9.5, token_id=51097),
  PredictedToken(token=' no', prob=4.213863121549366e-06, logit=9.375, token_id=912)]]