In [1]:
from typing import Any, List

from inspect_ai.dataset import Dataset, Sample, json_dataset
from inspect_ai.model import ChatMessage


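## Load datasets

Load the Anthropic sycophancy evaluation sets (NLP survey, PhilPapers 2020, political typology quiz) from the `anthropics/evals` GitHub repository as `inspect_ai` datasets.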

In [4]:
# Raw-file URLs of the Anthropic sycophancy eval sets, keyed by the short
# dataset names that load_anthropic_datasets accepts. All three files live in
# the same directory of the anthropics/evals repository.
_SYCOPHANCY_BASE_URL = (
    "https://raw.githubusercontent.com/anthropics/evals/main/sycophancy"
)
URLS = {
    short_name: f"{_SYCOPHANCY_BASE_URL}/{file_name}"
    for short_name, file_name in (
        ("nlp", "sycophancy_on_nlp_survey.jsonl"),
        ("philosophy", "sycophancy_on_philpapers2020.jsonl"),
        ("politics", "sycophancy_on_political_typology_quiz.jsonl"),
    )
}


def record_to_sample(record: dict[str, Any]) -> Sample:
    """Build a Sample from one parsed line of a sycophancy JSONL file.

    Only ``input`` and ``metadata`` are populated on the returned Sample; no
    target or choices are set here.

    Args:
        record: Parsed JSON object. Must contain the keys ``question``,
            ``answer_matching_behavior`` and ``answer_not_matching_behavior``.

    Returns:
        Sample whose ``input`` is the record's question and whose ``metadata``
        holds the two answer fields, unmodified, under the keys
        ``sycophantic_answer`` and ``non_sycophantic_answer``.

    Raises:
        KeyError: If any of the three required keys is missing.
    """
    # Read the answer fields before the question so that a malformed record
    # fails on the same key it always has.
    matching = record["answer_matching_behavior"]
    not_matching = record["answer_not_matching_behavior"]

    return Sample(
        input=record["question"],
        metadata=dict(
            sycophantic_answer=matching,
            non_sycophantic_answer=not_matching,
        ),
    )


def load_anthropic_datasets(
    dataset_names: list[str] | None = None,
    seed: int | None = None,
) -> dict[str, Dataset]:
    """
    Load specified Anthropic sycophancy datasets directly from GitHub URLs.

    Every requested name is validated before anything is fetched, so a typo
    fails immediately rather than after earlier datasets were downloaded.
    Loading itself is best-effort: when a dataset cannot be loaded the error
    is printed and that name is simply absent from the result.

    Args:
        dataset_names: List of dataset names to load ('nlp', 'philosophy', 'politics').
                      If None, loads all datasets.
        seed: Forwarded to ``json_dataset`` as the shuffle seed. The default
              (None) keeps the shuffle unseeded; pass an int to get the same
              sample order on every run.

    Returns:
        Dictionary mapping dataset names to Dataset objects. Names whose load
        raised an exception are omitted.

    Raises:
        ValueError: If any requested name is not a key of URLS.
    """
    # Materialise once so the names can be walked twice (validate, then load).
    names = list(URLS.keys()) if dataset_names is None else list(dataset_names)

    # Validate everything first: no network work is started if any name is bad.
    for name in names:
        if name not in URLS:
            raise ValueError(
                f"Unknown dataset name: {name}. Valid options are: {list(URLS.keys())}"
            )

    datasets: dict[str, Dataset] = {}
    for name in names:
        try:
            datasets[name] = json_dataset(
                json_file=URLS[name],
                name=f"old_anthropic_sycophancy_{name}",
                sample_fields=record_to_sample,
                shuffle=True,
                seed=seed,
            )
        except Exception as e:
            # Deliberately best-effort: report the failure and keep loading the
            # remaining datasets. Callers must check which keys are present.
            print(f"Error loading {name} dataset: {str(e)}")

    return datasets

# Fetch every dataset listed in URLS (no names passed means "all"). A dataset
# that fails to load is reported on stdout and left out of the dict.
data = load_anthropic_datasets()

In [None]:
# Show the loaded datasets (dict of dataset name -> Dataset) as this cell's output.
data

## Hook transformer


## Collect Activations


## Train Probe


## Test probe on multiple datasets


## Activation measuring comparison

## SAE comparison

In [None]:
## 