# Setup

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell executes, so edits to the local `src` package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import sys
from pathlib import Path

import torch
from transformers import AutoModelForCausalLM
from nnsight import LanguageModel

sys.path.append("../")
import src.utils.logging_utils as logging_utils
import src.functional as functional
import src.models as models
import src.tokens as tokens
import src.dataset as dataset
from src.models import ModelandTokenizer
import src.patchscope_eval as patchscope_eval
import proto.patchscope_pb2 as patchscope_pb2
import src.dataset_manager as dataset_manager

# Send INFO-and-above log records to stdout (so they show up in cell output),
# using the project's shared message and date formats.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt=logging_utils.DEFAULT_DATEFMT,
    format=logging_utils.DEFAULT_FORMAT,
)

# Notebook-level logger; in a notebook `__name__` is "__main__".
logger = logging.getLogger(__name__)

# Record the torch / CUDA build so saved outputs can be tied to an environment.
logger.info(f"{torch.__version__=}, {torch.version.cuda=}")

  from .autonotebook import tqdm as notebook_tqdm


2024-10-30 19:37:12 __main__ INFO     torch.__version__='2.5.0+cu124', torch.version.cuda='12.4'


In [3]:
# Checkpoint under evaluation: keep exactly one MODEL_KEY assignment uncommented.
# MODEL_KEY = "meta-llama/Llama-3.2-3B-Instruct"
# MODEL_KEY = "meta-llama/Llama-3.1-8B-Instruct"
MODEL_KEY = "meta-llama/Llama-3.2-3B"
# MODEL_KEY = "meta-llama/Llama-3.1-8B"

# NOTE(review): the prompt / short-name lookups in the "Base evaluation" section only
# list the four Llama keys, so selecting one of the gemma keys below raises KeyError there.
# MODEL_KEY = "google/gemma-2-9b-it"
# MODEL_KEY = "google/gemma-2-2b"

# Load the selected checkpoint in bfloat16 through the project's ModelandTokenizer wrapper.
mt = models.ModelandTokenizer(
    model_key=MODEL_KEY,
    torch_dtype=torch.bfloat16,
)

# Display the layer count reported by the wrapper (28 in the recorded run).
mt.n_layer

2024-10-30 19:37:13 accelerate.utils.modeling INFO     We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.30it/s]

2024-10-30 19:37:14 src.models INFO     loaded model </home/local_arnab/Codes/00_MODEL/meta-llama/Llama-3.2-3B> | size: 6127.841 MB | dtype: torch.bfloat16 | device: cuda:0





28

# Demo

In [4]:
# true_prompt = "The city of Paris is in the country of France."
# false_prompt = "The city of Paris is in the country of Italy."
# true_input, false_input = [tokens.prepare_input(p, mt) for p in (true_prompt, false_prompt)]

In [5]:
# # layers = [24]
# layers = list(range(3, 30))

# true_h, false_h = [
#     patchscope_eval.get_h_layers(mt, input_, layers) for input_ in (true_input, false_input)
# ]

In [6]:
# true_token, false_token = [mt.tokenizer.encode(t)[-1] for t in ("True", "False")]
# [mt.tokenizer.decode(t) for t in (true_token, false_token)]

In [7]:
# for prompt, h in zip((true_prompt, false_prompt), (true_h, false_h)):
#     print(prompt)
#     _, result_dict = functional.patchscope(
#         mt = mt, 
#         hs = patchscope_eval.get_h_with_target_layer(h, 3) if len(h) == 1 else h,
#         target_prompt = target_prompt,
#         interested_tokens = (true_token, false_token),
#         k = 5)
#     for t in (true_token, false_token):
#         print("   ", result_dict[t])


In [8]:
# gmt_dataset = dataset.GMTDataset.from_csv("cities.csv", few_shot=False).examples
# len(gmt_dataset)

# Base evaluation

In [9]:
# Few-shot target prompt for base (non-instruct) checkpoints: two worked examples
# followed by a slot of five "placeholder" tokens and the cue the model must complete.
base_prompt = '''The city of Tokyo is in Japan. This statement is: True
The city of Hanoi is in Poland. This statement is: False
placeholder placeholder placeholder placeholder placeholder This statement is: '''

# Target prompt for instruct checkpoints, written with the Llama 3 chat header tokens;
# the same five-"placeholder" slot sits inside the user turn.
instruct_prompt = '''<|start_header_id|>user<|end_header_id|>

True or false: placeholder placeholder placeholder placeholder placeholder<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

'''

# Single table of per-model settings: MODEL_KEY -> (target prompt, short name used in
# result-set names). Keeping both values in one place prevents the two lookups from
# drifting apart when a model is added.
MODEL_SETTINGS = {
    "meta-llama/Llama-3.2-3B-Instruct": (instruct_prompt, "llama_3b_instruct"),
    "meta-llama/Llama-3.1-8B-Instruct": (instruct_prompt, "llama_8b_instruct"),
    "meta-llama/Llama-3.2-3B": (base_prompt, "llama_3b"),
    "meta-llama/Llama-3.1-8B": (base_prompt, "llama_8b"),
}

# Fail with an actionable message (still a KeyError, as before) when the selected
# checkpoint has no configured prompt.
if MODEL_KEY not in MODEL_SETTINGS:
    raise KeyError(
        f"No target prompt configured for MODEL_KEY={MODEL_KEY!r}; "
        f"supported keys: {sorted(MODEL_SETTINGS)}"
    )

target_prompt, model_short_name = MODEL_SETTINGS[MODEL_KEY]

In [None]:
# Sanity check of the active model selection.
# NOTE(review): the recorded output shows the Llama-3.1-8B key while MODEL_KEY is set to
# Llama-3.2-3B earlier in the notebook, so that output looks stale — re-run from a fresh kernel.
print(MODEL_KEY, model_short_name)

meta-llama/Llama-3.1-8B llama_8b


In [11]:
# Cap on how many examples are taken from each data file.
MAX_EXAMPLES_PER_FILE = 100

result_set_name = f"{model_short_name}__all_layers"
evaluation_results = patchscope_pb2.EvaluationResults()

# Every layer of the model is used as a patchscope source layer; no target layers are set.
all_source_layers = list(range(mt.n_layer))

# Run one evaluation per GMT data file, tag it with the result-set name, and collect
# it in the aggregate EvaluationResults message.
for filename in dataset.GMT_DATA_FILES:
    examples = dataset.GMTDataset.simple_get_examples(filename)[:MAX_EXAMPLES_PER_FILE]

    patchscope_config = patchscope_pb2.PatchscopeConfig(
        source_layers=all_source_layers,
        target_layers=[],
    )
    evaluation_config = patchscope_pb2.EvaluationConfig(
        model_key=MODEL_KEY,
        dataset=filename,
        target_prompt=target_prompt,
        label_to_token={"1": "True", "0": "False"},
        patchscope_config=patchscope_config,
    )

    evaluation_runner = patchscope_eval.EvaluationRunner(evaluation_config, mt)
    evaluation_result = evaluation_runner.evaluate(examples)
    evaluation_result.result_set_name = result_set_name

    evaluation_results.results.append(evaluation_result)
    print(filename, evaluation_result.accuracy)

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:37<00:00,  1.57s/it]


sp_en_trans.csv 0.41


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:36<00:00,  1.57s/it]


neg_sp_en_trans.csv 0.42


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:36<00:00,  1.56s/it]


cities.csv 0.68


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:36<00:00,  1.56s/it]


neg_cities.csv 0.62


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:36<00:00,  1.57s/it]


smaller_than.csv 0.69


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:34<00:00,  1.55s/it]


larger_than.csv 0.65


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:32<00:00,  1.53s/it]


common_claim_true_false.csv 0.54


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:32<00:00,  1.53s/it]


companies_true_false.csv 0.63


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:32<00:00,  1.52s/it]

counterfact_true_false.csv 0.57





In [12]:
# Persist the aggregated EvaluationResults message as binary protobuf under results/.
# The directory is created first so a long evaluation run is not lost to a
# FileNotFoundError when results/ does not exist yet.
results_path = Path("results") / f"{result_set_name}.binpb"
results_path.parent.mkdir(parents=True, exist_ok=True)

with results_path.open("wb") as f:
    f.write(evaluation_results.SerializeToString())

# Finetuned evaluation

In [4]:
# Location of the finetuned decoder checkpoint.
# NOTE(review): the active value is an absolute path on one machine; anyone else
# running this notebook has to edit it (or switch to the hub id below).
# finetuned_model_key = "sensharma/Llama-3.2-3B_Talkative_Probe_Step-3000"
finetuned_model_key = "/home/local_arnab/Codes/Projects/talkative_probes/results/patchscope_save/Llama-3.2-3B/checkpoint-12000"

# Load the finetuned weights in bfloat16, then move the model to the GPU.
tuned_model = AutoModelForCausalLM.from_pretrained(
    finetuned_model_key,
    torch_dtype=torch.bfloat16,
)
tuned_model = tuned_model.to("cuda")

# Wrap the model for nnsight and pair it with the base model's tokenizer.
tuned_lm = LanguageModel(tuned_model)
decoder_mt = ModelandTokenizer(base_lm=tuned_lm, tokenizer=mt.tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.55it/s]


2024-10-30 19:37:23 src.models INFO     loaded model <EleutherAI/gpt-j-6B> | size: 6127.841 MB | dtype: torch.bfloat16 | device: cuda:0


In [10]:
# Give the decoder wrapper a descriptive name.
# NOTE(review): the load log for this checkpoint reports <EleutherAI/gpt-j-6B>, presumably a
# default name applied when `base_lm` is passed directly — confirm in src.models.
decoder_mt.name = "Llama-3.2-3B/checkpoint-12000"

In [17]:
# Build the dataset manager for the chosen group and take the single split produced
# by the [0.01] proportion list.
dataset_group = "geometry_of_truth"
dataset_splits = dataset_manager.DatasetManager.from_dataset_group(dataset_group).split([0.01])

# NOTE(review): binding the split to `dataset` shadows the `src.dataset` module imported
# at the top of the notebook; the base-evaluation cell (`dataset.GMT_DATA_FILES`) will
# fail if re-run after this cell. Consider renaming here and in the cells that use it.
[dataset] = dataset_splits

In [18]:
# Size of the split selected above (130 in the recorded run).
len(dataset)

130

In [19]:
# Peek at the first item yielded by the split; in the recorded run this is a list holding
# one ContextQASample with `context`, `questions`, and `answers` fields.
next(iter(dataset))

[ContextQASample(context='Peasants originated in Great Britain and were later introduced in Europe.', questions=['# Do you find this to be truthful?', '# Would you validate this statement?', '# Have I understood this correctly?', '# Do you agree that this is true?', '# May I assume this is correct?', '# Can you confirm the accuracy of this?', '# Is this statement true?', '# Are you confirming this is true?', '# Could you tell me if this is factual?', '# Do the facts support this statement?'], answers=['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No'])]

In [20]:
# Tokenization demo: shows what `prepare_input` returns for a single prompt.
from src.tokens import prepare_input, find_token_range

# Prompts to tokenize; the commented entries are alternatives kept for quick experiments.
prompts = [
    # "The land of",
    # "The capital of France is",
    # "This is a"
    "The city of Paris is in the country of"
]

# NOTE(review): the `tokenizer` argument receives the ModelandTokenizer wrapper `mt`,
# not a bare tokenizer — presumably `prepare_input` accepts either; confirm in src.tokens.
batch_inputs = prepare_input(
    tokenizer=mt,
    prompts=prompts,
    padding_side="left",
    # padding="max_length",
    # max_length=20,
    truncation=True,
    return_offsets_mapping=True
)

# Display the result; the recorded output contains `input_ids`, `attention_mask`, and
# `offset_mapping` tensors on cuda:0.
batch_inputs

{'input_ids': tensor([[128000,    791,   3363,    315,  12366,    374,    304,    279,   3224,
            315]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'), 'offset_mapping': tensor([[[ 0,  0],
         [ 0,  3],
         [ 3,  8],
         [ 8, 11],
         [11, 17],
         [17, 20],
         [20, 23],
         [23, 27],
         [27, 35],
         [35, 38]]], device='cuda:0')}

In [21]:
# Patchscope setup for the finetuned decoder: read from source layer 24 of the base
# model and target every layer of the decoder.
patchscope_config = patchscope_pb2.PatchscopeConfig(
    source_layers=[24],
    target_layers=list(range(decoder_mt.n_layer)),
)
decoder_config = patchscope_pb2.DecoderConfig(name=finetuned_model_key)

evaluation_config = patchscope_pb2.EvaluationConfig(
    model_key=MODEL_KEY,
    dataset=dataset_group,
    patchscope_config=patchscope_config,
    decoder_config=decoder_config,
)

# Evaluate with `mt` as the base model and `decoder_mt` as the decoder on the split
# loaded earlier, then report the accuracy.
evaluation_runner = patchscope_eval.EvaluationRunner(evaluation_config, mt, decoder_mt)
evaluation_result = evaluation_runner.evaluate(dataset)
print(evaluation_result.accuracy)

  0%|          | 0/130 [00:00<?, ?it/s]

100%|██████████| 130/130 [00:50<00:00,  2.58it/s]

0.6692307692307692





In [34]:
# prompt = "The boys are playing in the park. This statement is in the present tense."
# Ask the finetuned decoder a yes/no question given purely as text (no patched
# activations) and list its ten most likely next tokens.
# prompt = "The boys are playing in the park. This statement is in the present tense."
prompt = "Grace is a school teacher. Would you say this statement is about a male?"

functional.predict_next_token(decoder_mt, prompt, k=10)

[[PredictedToken(token=' Yes', prob=0.5310860276222229, logit=21.25, token_id=7566),
  PredictedToken(token=' No', prob=0.46868178248405457, logit=21.125, token_id=2360),
  PredictedToken(token='Yes', prob=6.976826989557594e-05, logit=12.3125, token_id=9642),
  PredictedToken(token=' yes', prob=4.79509835713543e-05, logit=11.9375, token_id=10035),
  PredictedToken(token=' YES', prob=2.7321648303768598e-05, logit=11.375, token_id=14410),
  PredictedToken(token='No', prob=7.353521596087376e-06, logit=10.0625, token_id=2822),
  PredictedToken(token=' Yeah', prob=6.489459792646812e-06, logit=9.9375, token_id=22335),
  PredictedToken(token='.No', prob=3.6975827697460772e-06, logit=9.375, token_id=17184),
  PredictedToken(token='yes', prob=3.2631053272780264e-06, logit=9.25, token_id=9891),
  PredictedToken(token='-No', prob=2.87968032353092e-06, logit=9.125, token_id=99076)]]

In [29]:
# Greedy generation from the finetuned decoder for the current `prompt`.
# NOTE(review): the recorded output continues a "The city of Paris ..." prompt, which does
# not match the `prompt` assigned in the preceding cell; the cell was presumably executed
# out of order, so treat that output as stale.
inputs = prepare_input(
    tokenizer=decoder_mt,
    prompts=[prompt],
    padding_side="left",
)

# do_sample=False disables sampling. The underlying model is reached through the
# wrapper's private `_model` attribute.
output = decoder_mt._model.generate(
    **inputs, do_sample = False
)

mt.tokenizer.decode(output[0], skip_special_tokens=True)  # decode the first (only) sequence, dropping special tokens

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




'The city of Paris is in the country of France Yes Yes Yes Yes Yes Yes Yes Yes Yes'

In [39]:
# Collect per-layer activations of the base model `mt` for a single statement.
from src.functional import get_batch_concept_activations

prompts = [
    "Grace is a school teacher",
]

# Layers 5 through 19 are requested; the prediction check and token-occurrence options
# are left unset (None).
latents = get_batch_concept_activations(
    mt=mt,
    prompts=prompts,
    interested_layer_indices=list(range(5, 20)),
    check_prediction=None,
    on_token_occur=None,
)

In [45]:
# Manually attach one question/answer pair to the first latent cache (mutates it in place).
# NOTE(review): the label here is " No" with a leading space, whereas the dataset samples
# displayed in this notebook use 'No' / 'Yes' without one — confirm which form the
# evaluation compares against.
latents[0].questions = ["# Would you say this statement is about a male?"]
latents[0].answers = [" No"]

In [61]:
from src.activation_manager import ActivationSample
import random

# Turn every (latent cache, layer) pair into one ActivationSample, pairing the layer's
# activation with a question/answer drawn at random from that cache's QA list.
add_to_buffer = []
for latent_cache in latents:
    qa_pairs = list(zip(latent_cache.questions, latent_cache.answers))
    for layer_name in latent_cache.latents.keys():
        question, label = random.choice(qa_pairs)
        sample = ActivationSample(
            activation=latent_cache.latents[layer_name],
            context=latent_cache.context,
            question=question,
            label=label,
            layer_name=layer_name,
        )
        add_to_buffer.append(sample)

In [62]:
from src.train_utils import evaluate

# Score the finetuned decoder on the hand-built activation samples
# (the recorded run returned 0.0).
evaluate(decoder_mt, add_to_buffer)

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

  output = self.target(*args, **kwargs)
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.35it/s]


0.0

In [63]:
# Build a dataset manager over the "md_gender" dataset with a batch size of 5.
# NOTE(review): the tuple is presumably (group, dataset name) — confirm against
# DatasetManager.from_named_datasets.
ds = dataset_manager.DatasetManager.from_named_datasets(
    [("md_gender", "md_gender")],
    batch_size=5
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [56]:
# Take the first batch yielded by the md_gender dataset manager.
batch = next(iter(ds))

In [60]:
# Show the first sample's fields (context, questions, answers) as a plain dict.
vars(batch[0])

{'context': "I'm a football fan that knows he plays for FC Tamboc from Arsenal.\n\nThis text is about Vladislav Ryzhkov.",
 'questions': ['# Am I reading about someone who identifies as female?',
  '# Is the person mentioned male?',
  '# Does this portray a male individual?',
  '# Is the subject of this sentence male?',
  '# Does this content reference a male person?',
  '# Is the character described here female?',
  '# Am I learning about a male individual here?',
  '# Is this about someone who is male?',
  '# Is the focus of this sentence on someone male?',
  '# Does this refer to a female individual?'],
 'answers': ['No',
  'Yes',
  'Yes',
  'Yes',
  'Yes',
  'No',
  'Yes',
  'Yes',
  'Yes',
  'No']}