In [108]:
from src.utils import read_json
import torch

# Experiment configuration: the trait under study, the model that produced the
# activations, and the directory holding the cached artifacts.
trait = "reward_hacking"
model_id = "meta-llama__Llama-3.1-8B-Instruct"
output_dir = 'results/meta-llama__llama-3.1-8b-instruct/activations_metadata_fa'

# Resolve both artifact paths up front, then load them.
responses_path = f"{output_dir}/responses_filtered.json"
acts_path = f"{output_dir}/acts_prompt_avg.pt"

responses = read_json(responses_path)
# NOTE(review): the variable says "response_avg" but the file on disk is
# "acts_prompt_avg.pt" -- confirm which averaging this tensor actually holds.
acts_response_avg = torch.load(acts_path)
acts_response_avg.shape

torch.Size([33, 600, 4096])

In [None]:
def create_steering_vector(acts, responses):
    """Difference-of-means vector between reward-hacking and non-reward-hacking samples.

    Args:
        acts: Tensor indexed as ``acts[:, sample, :]``. The sample axis (dim 1)
            must be in the same order as ``responses``.
        responses: Sequence of dicts, each carrying an ``'is_reward_hacking'``
            entry that is interpreted by truthiness.

    Returns:
        ``acts[:, positives, :].mean(dim=1) - acts[:, negatives, :].mean(dim=1)``,
        i.e. the sample axis is reduced away.

    Raises:
        ValueError: If either group is empty, since its mean would be NaN.
    """
    positive_idx = [i for i, r in enumerate(responses) if r['is_reward_hacking']]
    # Use `not`, never `~`: on a Python bool `~` is *bitwise* inversion
    # (~True == -2, ~False == -1), and both results are truthy, which would put
    # every sample into the negative group.
    negative_idx = [i for i, r in enumerate(responses) if not r['is_reward_hacking']]

    if not positive_idx or not negative_idx:
        raise ValueError(
            "create_steering_vector needs at least one reward-hacking and one "
            f"non-reward-hacking response (got {len(positive_idx)} and {len(negative_idx)})"
        )

    # No sorting by id is needed: a mean over a set of rows does not depend on
    # the order in which the rows are gathered.
    return acts[:, positive_idx, :].mean(dim = 1) - acts[:, negative_idx, :].mean(dim = 1)

# Build the steering vector(s) from the cached activations and their metadata.
steering_vectors = create_steering_vector(acts = acts_response_avg, responses = responses)
steering_vectors.shape

In [96]:
from collections import Counter
# Tally how many entries in `responses` share each id.
counter = Counter(x['id'] for x in responses)

In [99]:
# Keep only the ids that occur exactly three times in `responses`.
valid_ids = [response_id for response_id, n_seen in counter.items() if n_seen == 3]

In [104]:
# Number of ids that passed the "seen exactly three times" filter.
len(valid_ids)

108

In [None]:
def _indices_for_label(label, allowed_ids):
    """Positions in `responses` with the given label and an allowed id, ordered by id."""
    matches = [
        (r['id'], i)
        for i, r in enumerate(responses)
        if (r['id'] in allowed_ids) and (r['label'] == label)
    ]
    # Stable sort on the id alone: entries sharing an id keep their original order.
    matches.sort(key = lambda pair: pair[0])
    return [i for _, i in matches]


# Set membership is O(1); scanning the `valid_ids` list once per response is not.
valid_id_set = set(valid_ids)

no_rh_correct = _indices_for_label('no_rh_correct', valid_id_set)
no_rh_wrong = _indices_for_label('no_rh_wrong', valid_id_set)
rh = _indices_for_label('rh', valid_id_set)

# The subtraction below pairs rh[k] with no_rh_wrong[k], so both lists must
# refer to the same ids in the same order. Fail loudly instead of silently
# subtracting activations that belong to different ids.
if [responses[i]['id'] for i in rh] != [responses[i]['id'] for i in no_rh_wrong]:
    raise ValueError(
        "'rh' and 'no_rh_wrong' responses do not line up one-to-one by id; "
        "the paired difference would mix unrelated samples"
    )

# Row metadata for `data`, in the same order as the concatenation below.
# Rows tagged 'rh' hold the (rh - no_rh_wrong) difference, not raw rh activations.
rh_labels = ['rh'] * len(rh) + ['no_rh_correct'] * len(no_rh_correct)
rh_questions = [responses[i]['prompt'][-1]['content'] for i in rh + no_rh_correct]

data = torch.cat(
    [
        acts_response_avg[:, rh, :] - acts_response_avg[:, no_rh_wrong, :],
        acts_response_avg[:, no_rh_correct, :]
    ],
    dim = 1
) # shape: n_layers x n_samples x n_features

In [113]:
# Sanity check: expect (n_layers, n_samples, n_features), as noted where `data` is built.
data.shape

torch.Size([33, 216, 4096])

torch.Size([216, 4096])

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from src import analysis

layer = 18

# This cell plots the contrast tensor `data` (built together with `rh_labels` and
# `rh_questions`), NOT the raw `acts_response_avg`: the raw tensor has a different
# number of samples than the label/question lists, so the DataFrame and the
# boolean row masks below would not line up with it.
if data.dim() != 3 or data.shape[1] != len(rh_labels):
    raise RuntimeError(
        "`data` is not the (n_layers, n_samples, n_features) contrast tensor matching "
        "`rh_labels`; re-run the cell that builds it before plotting"
    )

# Slice into a new name so `data` is left intact and this cell can be re-run.
layer_acts = data[layer, ...]
# Scale every row to unit L2 norm, then cast to float32.
layer_acts = (layer_acts / layer_acts.norm(dim = -1, keepdim = True)).to(torch.float32)

# The weights returned by pca_svd are not used; the projection below supplies them.
components, _, ev, evr, mean = analysis.pca_svd(layer_acts, center = True)
weights = analysis.pca_project(layer_acts, components, mean) # Each row is one data point

df = pd.DataFrame({
    'x': weights[:, 0].cpu().numpy(),
    'y': weights[:, 1].cpu().numpy(),
    'question': rh_questions,
    'label': rh_labels,
})

fig = px.scatter(df, x = 'x', y = 'y', hover_data = 'question', color = 'label')

colors = [ 'blue', 'red', 'yellow', 'green', 'purple', 'orange', 'brown', 'pink', 'gray', 'black']
# First-seen order instead of set(): iteration order of a set of strings can change
# between interpreter runs, which would shuffle which colour each label gets.
label_order = list(dict.fromkeys(rh_labels))
for label, color in zip(label_order, colors):
    # Centre the per-label mean row and map it through the principal components.
    ref_dir = ((layer_acts[[x == label for x in rh_labels]].mean(dim = 0) - mean) @ components).cpu().numpy()
    fig.add_trace(
        go.Scatter(
            x = [ref_dir[0]],
            y = [ref_dir[1]],
            mode = 'markers',
            marker = dict(size = 10, color = color),
            name = f"{label} avg"
        )
    )

fig.update_layout({
    'title': f"{trait} vs not {trait} Activations in {model_id}: Layer {layer}"
}
)

fig.show()

In [80]:
def _first_pc_ev(layer_acts):
    """Return the first entry of `ev` from `analysis.pca_svd` on unit-normalised rows.

    NOTE(review): `ev` is presumably the per-component explained variance --
    confirm against `analysis.pca_svd`.
    """
    unit_acts = (layer_acts / layer_acts.norm(dim = -1).unsqueeze(-1)).to(torch.float32)
    _, _, ev, _, _ = analysis.pca_svd(unit_acts, center = True)
    return ev[0].item()


# All per-layer work happens inside the helper, and the loop variable has its own
# name, so the notebook-level `data`, `layer`, `components`, `weights` and `mean`
# used by other cells are no longer overwritten here.
# The dict is filled incrementally so an interrupted run keeps the layers done so far.
pc_1_ev = {}

for layer_idx in range(acts_response_avg.shape[0]):
    pc_1_ev[layer_idx] = _first_pc_ev(acts_response_avg[layer_idx, ...])

pc_1_ev

KeyboardInterrupt: 