In [1]:
import wandb
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from collections import OrderedDict
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer
import requests
import re

In [2]:
! rm -rf Interpreting-Reward-Models || true
! git clone https://github.com/apartresearch/Interpreting-Reward-Models.git
! cd Interpreting-Reward-Models && pip install .

Cloning into 'Interpreting-Reward-Models'...
remote: Enumerating objects: 1957, done.[K
remote: Counting objects: 100% (187/187), done.[K
remote: Compressing objects: 100% (139/139), done.[K
remote: Total 1957 (delta 77), reused 75 (delta 48), pack-reused 1770[K
Receiving objects: 100% (1957/1957), 477.31 KiB | 13.26 MiB/s, done.
Resolving deltas: 100% (1354/1354), done.
[0mProcessing /data/home/amir/work/codes/Interpreting-Reward-Models/Notebooks/Interpreting-Reward-Models
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: reward_analyzer
  Building wheel for reward_analyzer (pyproject.toml) ... [?25ldone
[?25h  Created wheel for reward_analyzer: filename=reward_analyzer-0.9.0-py3-none-any.whl size=35426 sha256=50b3647e3f08df8ad02b2db6d362bcd3d86ff472dc6facbf8ead7ef3ad9b6401
  Stored in directory: /data/home/amir/.cache/pip/wh

In [12]:
from reward_analyzer import SparseAutoencoder
from reward_analyzer.utils.model_storage_utils import load_autoencoders_for_artifact
from reward_analyzer.utils.transformer_utils import batch

In [7]:
# Run configuration: which model/task/artifact version this notebook analyses.
model_name = 'gpt_neo_125m'
task_name = 'hh_rlhf'
version = 'v0'


def get_layer_name_stem(model_name):
    """Return the module-name template for the MLP layers of a model family.

    The template contains one ``{}`` placeholder that is later filled in via
    ``str.format`` to obtain the name of a specific MLP module.

    Args:
        model_name: Model identifier; the family is detected by substring,
            checking 'pythia', then 'neo', then 'gemma'.

    Returns:
        The template string for the detected family.

    Raises:
        Exception: If ``model_name`` matches none of the known families.
    """
    if 'pythia' in model_name:
        return 'layers.{}.mlp'
    if 'neo' in model_name:
        return 'h.{}.mlp'
    if 'gemma' in model_name:
        return 'layers.{}.mlp'
    raise Exception(f'Not familiar with model name family of {model_name}')


# Previously the pythia branch assigned to a misspelled name (`layer_name_step`),
# leaving `layer_name_stem` undefined for pythia models.
layer_name_stem = get_layer_name_stem(model_name)

In [8]:
# Name of the W&B artifact holding the trained autoencoders for the configured
# model, task and version; it lives under a fixed entity/project prefix.
autoencoder_artifact_name = f'autoencoders_{model_name}_{task_name}:{version}'
autoencoders_dict = load_autoencoders_for_artifact(
    'nlp_and_interpretability/Autoencoder_training_hh_rlhf/' + autoencoder_artifact_name
)

Loading artifact from nlp_and_interpretability/Autoencoder_training_hh_rlhf/autoencoders_gpt_neo_125m_hh_rlhf:v0


[34m[1mwandb[0m: Downloading large artifact autoencoders_gpt_neo_125m_hh_rlhf:v0, 67.67MB. 20 files... 
[34m[1mwandb[0m:   20 of 20 files downloaded.  
Done. 0:0:0.2



No explicit decoder created, only bias vector.
Loaded 1 from /data/home/amir/work/codes/Interpreting-Reward-Models/Notebooks/artifacts/autoencoders_gpt_neo_125m_hh_rlhf:v0/saves/base_big/1

No explicit decoder created, only bias vector.
Loaded 2 from /data/home/amir/work/codes/Interpreting-Reward-Models/Notebooks/artifacts/autoencoders_gpt_neo_125m_hh_rlhf:v0/saves/base_big/2

No explicit decoder created, only bias vector.
Loaded 3 from /data/home/amir/work/codes/Interpreting-Reward-Models/Notebooks/artifacts/autoencoders_gpt_neo_125m_hh_rlhf:v0/saves/base_big/3

No explicit decoder created, only bias vector.
Loaded 4 from /data/home/amir/work/codes/Interpreting-Reward-Models/Notebooks/artifacts/autoencoders_gpt_neo_125m_hh_rlhf:v0/saves/base_big/4

No explicit decoder created, only bias vector.
Loaded 6 from /data/home/amir/work/codes/Interpreting-Reward-Models/Notebooks/artifacts/autoencoders_gpt_neo_125m_hh_rlhf:v0/saves/base_big/6

No explicit decoder created, only bias vector.
Lo

In [10]:
# Pick out the entry stored under the 'rlhf_small' key of the loaded artifact.
# NOTE(review): presumably a collection of autoencoders keyed by layer index — confirm
# against load_autoencoders_for_artifact.
rlhf_small = autoencoders_dict['rlhf_small']

In [None]:
def extract_and_process_activations(texts, model, tokenizer, layer_name_stem, autoencoders_dict):
    """Run one text through ``model`` and encode selected layer outputs with autoencoders.

    For every ``key`` in ``autoencoders_dict`` the module named
    ``layer_name_stem.format(key)`` is hooked, its output is captured during a
    single forward pass, and the autoencoder stored under ``key`` is applied to
    that output. The per-layer feature vectors of each token position are
    concatenated in the iteration order of ``autoencoders_dict``.

    Args:
        texts: The input passed to ``tokenizer``. It must tokenize to a single
            sequence (batch size 1).
        model: A ``torch.nn.Module`` callable as ``model(**tokenizer_output)``.
        tokenizer: Callable returning a mapping with an ``"input_ids"`` tensor
            when invoked as ``tokenizer(texts, return_tensors="pt")``.
        layer_name_stem: Template with one ``{}`` placeholder for the layer key.
        autoencoders_dict: Mapping from layer key to an autoencoder callable.
            Each autoencoder is called on the captured activation and must
            return a pair whose first element holds the features, indexable by
            token position.

    Returns:
        dict mapping token id -> flat list of concatenated feature values.
        NOTE: because the result is keyed by token id, a token id that occurs
        several times in the input keeps only the features of its last
        occurrence.

    Raises:
        ValueError: If the tokenizer output contains more than one sequence.
        KeyError: If a requested layer produced no activation (e.g. no module
            with that name exists in ``model``).
    """
    inputs = tokenizer(texts, return_tensors="pt")
    input_ids = inputs["input_ids"]
    if input_ids.dim() > 1 and input_ids.shape[0] != 1:
        raise ValueError(
            f"Expected a single sequence, got a batch of {input_ids.shape[0]}"
        )
    # reshape(-1) (instead of squeeze()) keeps a one-token input as a list of length 1.
    token_ids = input_ids.reshape(-1).tolist()

    # Pair every target layer name with the autoencoder trained for that layer.
    layer_to_autoencoder = {
        layer_name_stem.format(key): autoencoder
        for key, autoencoder in autoencoders_dict.items()
    }

    activations = {}

    def get_activation(name):
        def hook(module, inputs_, output):
            activations[name] = output.detach()
        return hook

    hooks = [
        module.register_forward_hook(get_activation(name))
        for name, module in model.named_modules()
        if name in layer_to_autoencoder
    ]

    try:
        with torch.no_grad():
            model(**inputs)
    finally:
        # Always detach the hooks, even if the forward pass fails, so they do
        # not leak into later calls on the same model.
        for hook in hooks:
            hook.remove()

    missing_layers = [name for name in layer_to_autoencoder if name not in activations]
    if missing_layers:
        raise KeyError(f"No activations captured for layers: {missing_layers}")

    per_token_features = [[] for _ in token_ids]

    with torch.no_grad():
        for name, autoencoder in layer_to_autoencoder.items():
            # Drop the leading batch dimension of size 1.
            layer_activation = activations[name].squeeze(0)
            features, _ = autoencoder(layer_activation)
            for position in range(len(token_ids)):
                per_token_features[position].extend(features[position].tolist())

    final_activations = {
        token_id: per_token_features[position]
        for position, token_id in enumerate(token_ids)
    }

    return final_activations

In [None]:
def save_training_dataset_to_wandb(training_dataset: Dataset, model_name, dataset_name="logistic_probe_data.hf"):
    """Save ``training_dataset`` to disk and log it to W&B as a data artifact.

    Args:
        training_dataset: The dataset to persist and upload.
        model_name: Used to build the artifact name
            ``logistic_probe_training_dataset_{model_name}``.
        dataset_name: Local path the dataset is written to before upload.

    Returns:
        None. The artifact is logged through ``wandb.log_artifact``.
    """
    # save_to_disk writes a directory at `dataset_name` and returns None, so the
    # path to upload is `dataset_name` itself, not the call's return value.
    training_dataset.save_to_disk(dataset_name)

    my_artifact = wandb.Artifact(f"logistic_probe_training_dataset_{model_name}", type="data")

    # The saved dataset is a directory, so it is attached with add_dir
    # (add_file only accepts a single file).
    my_artifact.add_dir(local_path=dataset_name, name="logistic_probe_training_dataset")

    metadata_dict = {
        "description": "Training dataset, with activations and rewards",
        "source": "Generated by my script",
        "num_examples": len(training_dataset),
        "split": "full"
    }

    my_artifact.metadata.update(metadata_dict)

    # Log the artifact to the run
    wandb.log_artifact(my_artifact)

# Upload the full training dataset for the configured model.
# NOTE(review): `full_training_dataset` is not defined in the cells shown here —
# confirm it is created earlier so this runs after a kernel restart.
save_training_dataset_to_wandb(full_training_dataset, model_name=model_name)