In [122]:
from src.utils import read_json
import torch

trait = "reward_hacking"
model_id = "meta-llama__Llama-3.1-8B-Instruct"
output_dir = 'results/meta-llama__llama-3.1-8b-instruct/activations_metadata_fa'

responses = read_json(f"{output_dir}/responses_filtered.json")
acts_response_avg = torch.load(f"{output_dir}/acts_response_avg.pt")
acts_response_avg.shape

torch.Size([33, 600, 4096])

In [124]:
from collections import Counter
counter = Counter()
counter.update([x['id'] for x in responses])

In [123]:
valid_ids = [k for k in counter.keys() if counter[k] == 3]

In [104]:
len(valid_ids)

108

In [125]:
no_rh_correct = [x[1] for x in sorted([(x['id'], i) for i, x in enumerate(responses) if (x['id'] in valid_ids) and (x['label'] == 'no_rh_correct')], key = lambda x: x[0])]
no_rh_wrong = [x[1] for x in sorted([(x['id'], i) for i, x in enumerate(responses) if (x['id'] in valid_ids) and (x['label'] == 'no_rh_wrong')], key = lambda x: x[0])]
rh = [x[1] for x in sorted([(x['id'], i) for i, x in enumerate(responses) if (x['id'] in valid_ids) and (x['label'] == 'rh')], key = lambda x: x[0])]


rh_labels = ['rh' for _ in range(len(rh))] + ['no_rh_correct' for _ in range(len(no_rh_correct))]
rh_questions = [responses[i]['prompt'][-1]['content'] for i in rh] + [responses[i]['prompt'][-1]['content'] for i in no_rh_correct]

data_adj = torch.cat(
    [
        acts_response_avg[:, rh, :] - acts_response_avg[:, no_rh_wrong, :],
        acts_response_avg[:, no_rh_correct, :]
    ],
    dim = 1
) # shape: n_layers x n_samples x n_features

In [126]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from src import analysis

layer = 21

data = data_adj[layer, ...]
data = (data / data.norm(dim = -1).unsqueeze(-1)).to(torch.float32)

components, weights, ev, evr, mean = analysis.pca_svd(data, center = True)
weights = analysis.pca_project(data, components, mean) # Each row is one data point

df = pd.DataFrame({
    'x': weights[:, 0].cpu().numpy(), 
    'y': weights[:, 1].cpu().numpy(), 
    'question': rh_questions, 
    'label': rh_labels,
})

fig = px.scatter(df, x = 'x', y = 'y', hover_data = 'question', color = 'label')

colors = [ 'blue', 'red', 'yellow', 'green', 'purple', 'orange', 'brown', 'pink', 'gray', 'black']
for label, color in zip(set(rh_labels), colors):
    ref_dir = ((data[[x == label for x in rh_labels]].mean(dim = 0)- mean) @ components).cpu().numpy()
    fig.add_trace(
        go.Scatter(
            x = [ref_dir[0]],
            y = [ref_dir[1]],
            mode = 'markers',
            marker = dict(size = 10, color = color),
            name = f"{label} avg"
        )
    )

fig.update_layout({
    'title': f"{trait} vs not {trait} Activations in {model_id}: Layer {layer}"
}
)

fig.show()

In [128]:
from tqdm.notebook import tqdm

pc_1_ev = {}

for layer in tqdm(range(acts_response_avg.shape[0])):
    data = data_adj[layer, ...]
    data = (data / data.norm(dim = -1).unsqueeze(-1)).to(torch.float32)
    components, weights, ev, evr, mean = analysis.pca_svd(data, center = True)
    pc_1_ev[layer] = ev[0].item()

pc_1_ev

  0%|          | 0/33 [00:00<?, ?it/s]

{0: 0.23198272287845612,
 1: 0.27402517199516296,
 2: 0.2736828029155731,
 3: 0.30446910858154297,
 4: 0.29646140336990356,
 5: 0.2626931071281433,
 6: 0.23344843089580536,
 7: 0.24282193183898926,
 8: 0.24288058280944824,
 9: 0.23740361630916595,
 10: 0.23259910941123962,
 11: 0.2332744300365448,
 12: 0.2519887089729309,
 13: 0.2415439635515213,
 14: 0.23975718021392822,
 15: 0.24200719594955444,
 16: 0.24505697190761566,
 17: 0.23238332569599152,
 18: 0.23436056077480316,
 19: 0.22875574231147766,
 20: 0.23346874117851257,
 21: 0.23302637040615082,
 22: 0.24349112808704376,
 23: 0.2423500120639801,
 24: 0.24514424800872803,
 25: 0.24597595632076263,
 26: 0.2520856559276581,
 27: 0.2448362410068512,
 28: 0.2522890865802765,
 29: 0.25803613662719727,
 30: 0.2539193630218506,
 31: 0.27143147587776184,
 32: 0.25300875306129456}

## Can we detect reward hacking using this direction?

Need to check against:
- 

In [160]:
import sklearn

In [165]:
y_true = [int(responses['label'] == "rh") for responses in responses]

In [None]:
for layer in range(acts_response_avg.shape[0]):
    data = data[layer, ...]
    data = (data / data.norm(dim = -1).unsqueeze(-1)).to(torch.float32)
    

pc_1_ev

torch.Size([33, 600, 4096])

In [153]:
steering_vectors =  (acts_response_avg[:, rh, :] - acts_response_avg[:, no_rh_wrong, :]).mean(dim = 1) - acts_response_avg[:, no_rh_correct, :].mean(dim = 1)
steering_vectors = steering_vectors/steering_vectors.norm(dim = -1).unsqueeze(-1)

In [157]:
def project_onto(x, direction_by_layer):
    return (direction_by_layer.unsqueeze(1).broadcast_to(x.shape) * x/x.norm(dim = -1).unsqueeze(-1)).sum(dim = -1)

project_onto(acts_response_avg[:, rh, :], steering_vectors)[18, :].mean()

tensor(-0.9062, dtype=torch.bfloat16)

In [158]:
project_onto(acts_response_avg[:, no_rh_wrong, :], steering_vectors)[18, :].mean()

tensor(-0.8945, dtype=torch.bfloat16)

In [159]:
project_onto(acts_response_avg[:, no_rh_correct, :], steering_vectors)[18, :].mean()

tensor(-0.9180, dtype=torch.bfloat16)