In [11]:
import torch
import einops
from transformers import AutoModelForCausalLM, AutoTokenizer
from dataclasses import dataclass
import os
import random

import mypkg.whitebox_infra.attribution as attribution
import mypkg.whitebox_infra.dictionaries.batch_topk_sae as batch_topk_sae
import mypkg.whitebox_infra.data_utils as data_utils
import mypkg.whitebox_infra.model_utils as model_utils
import mypkg.whitebox_infra.interp_utils as interp_utils
import mypkg.pipeline.setup.dataset as dataset_setup
import mypkg.pipeline.infra.hiring_bias_prompts as hiring_bias_prompts
from mypkg.eval_config import EvalConfig

In [12]:
# Choose where the per-feature scores (`effects_F`) come from:
#   True  -> a saved attribution-results file (also provides `error_effect`)
#   False -> a saved activation-difference file (`error_effect` is set to 0)
use_attrib = False

if use_attrib:
    attrib_results_filepath = "attribution_results/mistralai_Ministral-8B-Instruct-2410/v1_trainer_1_model_mistralai_Ministral-8B-Instruct-2410_layer_25_attrib_data.pt"

    attrib_results = torch.load(attrib_results_filepath, map_location="cpu")
    print(attrib_results.keys())
    print(attrib_results["config"])

    bias_type = "political_orientation"
    effects_F = attrib_results[bias_type]["effects_F"]
    error_effect = attrib_results[bias_type]["error_effect"]

    run_config = attrib_results["config"]
else:
    # Exactly one filename is active; the others are alternatives to swap in.
    # diff_acts_filename = "diff_acts/v1_trainer_1_model_mistralai_Ministral-8B-Instruct-2410_layer_25_attrib_data.pt"
    # diff_acts_filename = "diff_acts/v1_trainer_1_model_mistralai_Ministral-8B-Instruct-2410_layer_50_attrib_data.pt"
    # diff_acts_filename = "diff_acts/v1_trainer_1_model_mistralai_Mistral-Small-24B-Instruct-2501_layer_25_attrib_data.pt"
    # diff_acts_filename = "diff_acts/v1_trainer_1_model_mistralai_Mistral-Small-24B-Instruct-2501_layer_50_attrib_data.pt"
    diff_acts_filename = "diff_acts/v1_trainer_3_model_mistralai_Ministral-8B-Instruct-2410_layer_50_attrib_data.pt"

    # Load onto the CPU, like the attribution branch, so the notebook also runs
    # on machines without the GPU the file was saved from.
    diff_acts_data = torch.load(diff_acts_filename, map_location="cpu")
    # Alias of the whole loaded dict (not just the tensor); kept in case other
    # cells refer to this name.
    diff_acts_F = diff_acts_data
    effects_F = diff_acts_data["diff_acts_F"]
    error_effect = 0

    run_config = diff_acts_data["config"]

# Both file formats carry the same "config" entries describing the run.
model_name = run_config["model_name"]
chosen_layer = run_config["layer"]
chosen_layer_percentage = run_config["chosen_layer_percentage"]
trainer_id = run_config["trainer_id"]



In [13]:
# Locate the cached max-activating examples for the selected model / layer /
# trainer, downloading them from the Hugging Face dataset repo on first use.
acts_dir = "max_acts"
acts_filename = f"acts_{model_name}_layer_{chosen_layer}_trainer_{trainer_id}_layer_percent_{chosen_layer_percentage[0]}.pt".replace("/", "_")
acts_path = os.path.join(acts_dir, acts_filename)

if not os.path.exists(acts_path):
    from huggingface_hub import hf_hub_download

    # With `local_dir` set, the file is written to acts_dir/acts_filename,
    # i.e. exactly `acts_path`, so the return value is not needed.
    hf_hub_download(
        repo_id="adamkarvonen/sae_max_acts",
        filename=acts_filename,
        force_download=False,
        local_dir=acts_dir,
        repo_type="dataset",
    )

# Single load for both the cached and the freshly downloaded case. Mapping to
# the CPU at load time avoids materialising the tensors on a GPU first (and
# works on machines that have none), so no later .cpu() copy is required.
acts_data = torch.load(acts_path, map_location="cpu")
max_tokens = acts_data["max_tokens"]
max_acts = acts_data["max_acts"]


In [14]:


# Rank features by the magnitude of their effect and keep the 20 largest;
# the values are then looked up from the signed tensor so the sign is preserved.
top_k_ids = torch.topk(effects_F.abs(), k=20).indices
top_k_vals = effects_F[top_k_ids]

# One item per line: selected indices, their signed values, the error term.
print(top_k_ids, top_k_vals, error_effect, sep="\n")

# tensor([ 4794,  4393, 15242,  2039,  9049,  3645, 11802,  9265,  7781, 13002,
#         16078,  4204,   394,  5286, 11901,  7654, 13855,  3509,  9206,  1529])
# tensor([-0.0159,  0.0139,  0.0092, -0.0077,  0.0070, -0.0055,  0.0051,  0.0049,
#          0.0045, -0.0045, -0.0041,  0.0037, -0.0030, -0.0028,  0.0027,  0.0026,
#         -0.0026,  0.0024,  0.0023,  0.0022])
# tensor(-0.0180)



tensor([23759, 42925, 33394, 45780, 10085, 23574, 30460, 61472,   521, 59020,
          775,  4261, 41205, 29588, 44488,   983, 55773, 44225, 42612, 30063],
       device='cuda:0')
tensor([-2.2821e-05, -1.8463e-05, -1.8400e-05,  1.6533e-05, -1.6091e-05,
         1.5563e-05,  1.5402e-05,  1.4945e-05, -1.4770e-05, -1.4396e-05,
        -1.4326e-05, -1.4277e-05, -1.3993e-05,  1.3824e-05, -1.3621e-05,
        -1.2731e-05,  1.2631e-05,  1.2598e-05, -1.2597e-05, -1.2533e-05],
       device='cuda:0')
0


In [15]:
from circuitsvis.activations import text_neuron_activations
import gc
from IPython.display import clear_output, display
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)


def _list_decode(x):
    """Decode token ids into a nested list of strings mirroring the shape of `x`.

    A zero-dimensional `x` is decoded on its own with the module-level
    `tokenizer`, keeping special tokens. Anything of higher rank is iterated
    along its first axis and each slice is decoded recursively, so the
    nesting of the returned lists matches the nesting of the input.
    """
    if x.ndim > 0:
        return [_list_decode(sub) for sub in x]
    return tokenizer.decode(x, skip_special_tokens=False)
    

clear_output(wait=True)
gc.collect()

# How many of the top-ranked features to render, and how many max-activating
# examples to show for each of them.
NUM_FEATURES_TO_SHOW = 5
NUM_EXAMPLES_PER_FEATURE = 5

# Clamp both counts to what is actually available so short inputs do not
# raise IndexError. Axis 1 of max_tokens / max_acts indexes the examples
# stored for a feature.
n_features_shown = min(NUM_FEATURES_TO_SHOW, len(top_k_ids))
n_examples_shown = min(NUM_EXAMPLES_PER_FEATURE, max_tokens.shape[1], max_acts.shape[1])

for i in range(n_features_shown):
    # Convert to plain Python scalars: the ranking tensors may live on a
    # different device than max_tokens / max_acts.
    feature_idx = int(top_k_ids[i])
    feature_val = top_k_vals[i].item()
    print(f"Feature {i}, value: {feature_val}")

    # Take the same examples for tokens and activations so the two lists
    # handed to the visualiser line up one-to-one, and so that only the
    # examples that are shown get decoded.
    selected_token_KL = max_tokens[feature_idx, :n_examples_shown]
    selected_activations_KL11 = [
        max_acts[feature_idx, k, :, None, None] for k in range(n_examples_shown)
    ]
    selected_token_strs_KL = _list_decode(selected_token_KL)

    # Replace a leading BOS token with a plain-text label.
    # NOTE(review): presumably so the raw "<s>" is not treated as markup in
    # the rendered HTML — confirm.
    for token_strs_L in selected_token_strs_KL:
        if "<s>" in token_strs_L[0] or "<bos>" in token_strs_L[0]:
            token_strs_L[0] = "BOS>"

    html_activations = text_neuron_activations(selected_token_strs_KL, selected_activations_KL11)
    display(html_activations)

Feature 0, value: -2.282114655827172e-05


Feature 1, value: -1.8463133528712206e-05


Feature 2, value: -1.839970900618937e-05


Feature 3, value: 1.6532911104150116e-05


Feature 4, value: -1.6090676581370644e-05
