In [14]:
import os
import pickle
import json
from transformers import AutoTokenizer
from experiments.pipeline_config import PipelineConfig
from experiments.llm_autointerp.llm_query import perform_llm_autointerp, construct_llm_features_prompts
from typing import Dict

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
from nnsight import LanguageModel

# Identifier of the pretrained model handed to nnsight.
MODEL_NAME = 'EleutherAI/pythia-70m-deduped'

# Wrapped language model; a later cell reads `model.tokenizer` from it.
model = LanguageModel(MODEL_NAME)

First, run autointerp with:
```bash
cd experiments
python llm_autointerp/run_autointerp_can.py
```

In [16]:
# TODO: create a run.sh script for the step above so llm_query can be run with custom args from this notebook.
# Until then, the hyperparameters are copied here by hand.

# Directory two levels above the current working directory (taken to be the repo root).
repo_dir = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Location of the run directory relative to `repo_dir`; the cells below read their
# JSON / pickle inputs from the resolved absolute path `ae_path`.
ae_relative_dir = "dictionary_learning/dictionaries/autointerp_test_data/pythia70m_sweep_topk_ctx128_0730/resid_post_layer_3/trainer_2"
ae_path = os.path.abspath(os.path.join(repo_dir, ae_relative_dir))

In [17]:
# To have the raw LLM output saved when querying the LLM, set DEBUG=True.


def _read_json(filename):
    """Parse and return the JSON file `filename` located inside `ae_path`."""
    with open(os.path.join(ae_path, filename), "r") as handle:
        return json.load(handle)


def _read_pickle(filename):
    """Unpickle and return the file `filename` located inside `ae_path`.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted run of the pipeline.
    """
    with open(os.path.join(ae_path, filename), "rb") as handle:
        return pickle.load(handle)


# LLM responses: the raw text and the JSON extracted from it.
raw_llm_outputs = _read_json("raw_llm_outputs.json")
extracted_json_llm_outputs = _read_json("extracted_json_llm_outputs.json")

# Node effects before and after the autointerp step, plus the max-activating inputs.
node_effects_classprobe = _read_pickle("node_effects.pkl")
node_effects_autointerp = _read_pickle("node_effects_auto_interp.pkl")
max_activating_inputs = _read_pickle("max_activating_inputs.pkl")

In [18]:
# Instantiate the pipeline configuration and print each of its instance attributes.
cfg = PipelineConfig()
for name, value in vars(cfg).items():
    print(f"{name}: {value}")

max_activations_collection_n_inputs: 5000
top_k_inputs_act_collect: 5
probe_train_set_size: 4000
probe_test_set_size: 500
train_set_size: 100
test_set_size: 200
eval_saes_n_inputs: 250
probe_batch_size: 200
probe_epochs: 10
model_dtype: torch.bfloat16
reduced_GPU_memory: False
include_gender: True
use_autointerp: True
force_eval_results_recompute: False
force_max_activations_recompute: False
force_probe_recompute: False
force_node_effects_recompute: False
force_autointerp_recompute: False
dictionaries_path: ../dictionary_learning/dictionaries
probes_dir: trained_bib_probes
attribution_patching_method: attrib
ig_steps: 10
api_llm: claude-3-5-sonnet-20240620
autointerp_api_total_token_per_minute_limit: 400000
autointerp_api_total_requests_per_minute_limit: 4000
num_allowed_tokens_per_minute: 320000
num_allowed_requests_per_minute: 3200
num_tokens_system_prompt: None
prompt_dir: llm_autointerp/
node_effects_attrib_filename: node_effects.pkl
autointerp_filename: node_effects_auto_interp.pk

In [19]:
# Display the object unpickled from node_effects_auto_interp.pkl as a quick sanity check.
node_effects_autointerp

{'male / female': tensor([-0., 0., 0.,  ..., -0., -0., 0.]),
 'professor / nurse': tensor([0., 0., 0.,  ..., -0., -0., -0.]),
 'male_professor / female_nurse': tensor([0., 0., 0.,  ..., -0., 0., -0.]),
 'biased_male / biased_female': tensor([0., 0., 0.,  ..., -0., -0., -0.]),
 0: tensor([-0., 0., -0.,  ..., -0., 0., 0.]),
 1: tensor([-0., -0., 0.,  ..., 0., 0., 0.]),
 2: tensor([-0., 0., 0.,  ..., -0., 0., 0.]),
 6: tensor([0., 0., 0.,  ..., -0., 0., 0.]),
 9: tensor([0., 0., -0.,  ..., -0., -0., 0.])}

In [20]:
# For every selected class, list the `cfg.num_top_features_per_class` features with the
# largest class-probe effect, next to their autointerp effect and the extracted LLM rating.
from experiments.utils_bib_dataset import profession_dict
import torch as t

# Class names that are skipped here and never looked up in `profession_dict`.
skipped_classes = ('gender', 'professor', 'nurse')

# Maps class name -> tensor of its top feature indices; reused by later cells.
top_feature_idxs_per_class = {}

for cls in cfg.chosen_autointerp_class_names:
    if cls in skipped_classes:
        continue
    cls_idx = profession_dict[cls]

    # Largest class-probe effect values for this class and the feature indices they belong to.
    top_feature_vals, top_feature_idxs = t.topk(
        node_effects_classprobe[cls_idx], cfg.num_top_features_per_class
    )
    top_feature_idxs_per_class[cls] = top_feature_idxs

    autointerp_vals = node_effects_autointerp[cls_idx][top_feature_idxs]
    # The extracted LLM outputs are keyed by the feature index rendered as a string.
    extracted_jsons = [
        extracted_json_llm_outputs[str(feature.item())] for feature in top_feature_idxs
    ]

    print(f"{cls}:")
    print('feature_idx, probe_effect, autointerp_effects_val, autointerp_extracted_json')
    for feature_idx, probe_val, autointerp_val, extracted in zip(
        top_feature_idxs, top_feature_vals, autointerp_vals, extracted_jsons
    ):
        # A missing extraction (None) is reported as "None" in place of the class rating.
        rating = None if extracted is None else extracted[cls]
        print(f'{feature_idx}, {probe_val}, {autointerp_val}, {rating}')

accountant:
feature_idx, probe_effect, autointerp_effects_val, autointerp_extracted_json
1512, 1.140625, 0.0, 0
15917, 0.8515625, 0.0, 0
5376, 0.6875, 0.6875, 3
13927, 0.322265625, 0.322265625, 4
3170, 0.2099609375, 0.2099609375, 3
12957, 0.1767578125, 0.1767578125, 4
12378, 0.1484375, 0.0, 0
9922, 0.10205078125, 0.10205078125, 2
15661, 0.08251953125, 0.08251953125, 3
13033, 0.0810546875, 0.0810546875, 4
6046, 0.078125, 0.0, 0
15482, 0.07177734375, 0.0, 0
10470, 0.050048828125, 0.0, 0
11871, 0.04931640625, 0.0, 0
16363, 0.04541015625, 0.04541015625, 3
5189, 0.04296875, 0.0, 0
9652, 0.041259765625, 0.0, 0
9682, 0.0390625, 0.0390625, 1
10691, 0.037353515625, 0.0, 0
111, 0.036376953125, 0.036376953125, 3
architect:
feature_idx, probe_effect, autointerp_effects_val, autointerp_extracted_json
6339, 0.2333984375, 0.0, 0
11224, 0.2265625, 0.2265625, 2
15917, 0.2138671875, 0.0, 0
5376, 0.142578125, 0.0, 0
1512, 0.1396484375, 0.0, 0
1552, 0.12060546875, 0.12060546875, 1
6859, 0.103515625, 0.103

In [21]:
# Build the per-feature autointerp prompts from the run directory, the model's tokenizer
# and the pipeline config. Later cells index the result by integer feature index and
# treat each entry as text (slicing / printing) — exact container type: TODO confirm.
features_prompts = construct_llm_features_prompts(ae_path, model.tokenizer, cfg)

dict_keys(['max_tokens_FKL', 'max_activations_FKL', 'dla_results_FK'])


Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 352.67it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 227.57it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 372.40it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 363.05it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 370.59it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 375.50it/s]


Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 374.02it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 372.69it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 370.69it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 365.01it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 368.70it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 373.66it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 368.12it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 369.71it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 371.87it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 371.87it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 368.37it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 371.90it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 357.88it/s]
Formatting examples: 100%|██████████| 1/1 [00:00<00:00, 374.49it/s]
Formatting examples: 100%|██████████| 1/1 [00:0

In [22]:
## Are the outputs that have been set to 0 really not related at all?
# For every top class-probe feature whose autointerp effect is 0 for that class, print
# the rating autointerp gave and the beginning of the prompt it was shown.

# Number of leading prompt characters printed per feature, to keep the output short.
PROMPT_PREVIEW_CHARS = 150

for cls in cfg.chosen_autointerp_class_names:
    if cls in ['gender', 'professor', 'nurse', ]:
        continue
    for top_feature_idx in top_feature_idxs_per_class[cls]:
        idx = top_feature_idx.item()
        # Only features whose autointerp effect is 0 for this class are of interest.
        if node_effects_autointerp[profession_dict[cls]][idx] != 0:
            continue

        print(f'feature_idx: {idx}, class: {cls}')
        print("(high effect on the classprobe of this class). Autointerp scored with 0 for this class.")
        print(f'this is the rating from autointerp: {extracted_json_llm_outputs[str(idx)]}')
        # The prompt label is printed directly before the prompt itself; previously it
        # preceded the rating line, which mislabelled the output.
        print("Here is the prompt to autointerp:\n")
        print(features_prompts[idx][:PROMPT_PREVIEW_CHARS])

        # print border
        print("\n--------------------------------------------------\n")

feature_idx: 1512, class: accountant
(high effect on the classprobe of this class). Autointerp scored with 0 for this class. Here is the prompt to autointerp:

this is the rating from autointerp: {'gender': 0, 'professor': 0, 'nurse': 0, 'accountant': 0, 'architect': 0, 'attorney': 0, 'dentist': 0, 'filmmaker': 0}
Okay, now here's the real task.
Promoted tokens: wegian,  snaps, Decided,  Bold, apshot,  keyword,  Bom,  unsigned,  Morning, rove
Example prompts: 



--------------------------------------------------

feature_idx: 15917, class: accountant
(high effect on the classprobe of this class). Autointerp scored with 0 for this class. Here is the prompt to autointerp:

this is the rating from autointerp: {'gender': 0, 'professor': 0, 'nurse': 0, 'accountant': 0, 'architect': 0, 'attorney': 0, 'dentist': 0, 'filmmaker': 0}
Okay, now here's the real task.
Promoted tokens: ÃÂÃÂ, �, ERTY, �, tration, acement, USY,  vitro, ents, esters
Example prompts: 


Example 1:  <<Beat>

-----------

In [23]:
# Features whose autointerp result looks obviously wrong: print the full prompt and the
# raw LLM answer for each one so they can be inspected by hand.
err_idxs = [(6036, 'architect'), (14889, 'attorney')]

# Visual divider printed after each inspected feature.
separator = '\n\n___________________________________\n\n'

for idx, cls in err_idxs:
    print(f'feature_idx: {idx}, class: {cls}')
    print(f'prompts: {features_prompts[idx]}')
    # Raw outputs are keyed by the stringified feature index; the first element is shown.
    first_raw_output = raw_llm_outputs[str(idx)][0]
    print(f'raw_llm_output: {first_raw_output}')

    print(separator)

feature_idx: 6036, class: architect
prompts: Okay, now here's the real task.
Promoted tokens:  project,  Innov,  projects,  Innovation,  Projects,  venture,  innovation,  funding, project,  infrastructure
Example prompts: 


Example 1: Aside from being an outdoor enthusiast, he loves staying current on emerging technologies. Currently, Peter is diving head first into the ramifications of both blockchain technology and artificial intelligence in eLearning.

Learn More

About the Author

Peter Schroeder manages all things Marketing at Northpass. Aside from being an << outdoor>>(1) enthusiast, he loves staying current on << emerging>>(2) << technologies>>(1). Currently, Peter << is>>(1) diving head first into the ramifications of both << blockchain>>(1) technology and artificial intelligence in eLearning.Event

Scott Cooper & The Barrelmakers

Thu. Feb. 8 at 7:30pm $8 adv./$10




Example 2: disruptive << technology>>(1) into its customer solutions, including those products that up to now

In [25]:
# Print the full prompt for feature 14889, the entry paired with 'attorney' in `err_idxs`.
print(features_prompts[14889])

Okay, now here's the real task.
Promoted tokens:  rent,  delinqu,  estate,  residential,  mortgage,  rental,  Estate,  park,  foreclosure,  rights
Example prompts: 


Example 1: City Council will hold a special meeting Monday devoted to the topic of short-term << rentals>>(2) of single- <<family>>(2) << homes>>(3), and how they will be regulated in the city's new << development>>(1) code. The entire code is tentatively scheduled for adoption in mid-July.

As the finishing touches are put on the Strings in the Mountains white tent, the management at Strings is making last minute preparations for a summer filled with musicians and music lovers visiting the canvas concert hall at the base of Mt. Werner.

After hearing about an hour of public input, members of the Routt County Regional << Planning>>(2) Commission told representatives of the Yampa Mead




Example 2: half of the << homes>>(3) in Aberdeen.

Mr Trump is convinced it will spoil the views from his £750m golf << development>>(1)