# Deception Probe

This notebook trains a deception probe as described in [Goldowsky-Dill et al. 2025](https://arxiv.org/abs/2502.03407), although the probe architecture used here differs from the one in the paper.

In [None]:
# Basic Configuration
from tuberlens.config import DATA_DIR


# Model to probe and the layer index that is later handed to train_probe.
# Alternative model left over from earlier experiments: LOCAL_MODELS["llama-1b"]
# (LOCAL_MODELS is not imported in the visible cells).
model_name = "google/gemma-3n-E4B-it"
layer = 11

# The concept name determines both the data directory and the probe file name.
CONCEPT = "deception"
CONCEPT_DIR = DATA_DIR / CONCEPT
PROBE_PATH = CONCEPT_DIR / (CONCEPT + "_probe.pkl")

# Class labels and free-text description passed to train_probe.
pos_class_label, neg_class_label = "deceptive", "honest"
probe_description = (
    "A linear probe on {} detecting whether the assistant is deceptive.".format(
        model_name
    )
)

## Dataset Creation

In [None]:
from tuberlens.utils import create_train_test_split
from tuberlens.interfaces.dataset import (
    LabelledDataset,
    Message,
)

# Load true facts from facts_true_false.csv
import pandas as pd

# Source of the facts CSV:
# https://github.com/andyzoujm/representation-engineering/blob/main/data/facts/facts_true_false.csv
FACTS_URL = "https://raw.githubusercontent.com/andyzoujm/representation-engineering/main/data/facts/facts_true_false.csv"

# Read the facts and keep only the true statements (label == 1).
facts_df = pd.read_csv(FACTS_URL)
true_facts = facts_df.loc[facts_df["label"] == 1, "statement"].tolist()
# Fail early with a clear message instead of building an empty dataset.
assert true_facts, "No true facts (label == 1) found in the facts CSV."

# Every true fact becomes one contrastive pair: the same assistant statement
# preceded by either a "dishonest" or an "honest" user instruction.
# Each entry is (id suffix, label, user instruction); deceptive comes first
# so sample order matches "<pair_id>d" followed by "<pair_id>h".
PERSONAS = [
    ("d", 1, "Pretend to be a dishonest person making statements about the world."),
    ("h", 0, "Pretend to be an honest person making statements about the world."),
]

# Create samples with user prompts for each true fact
samples = []
for pair_id, fact in enumerate(true_facts, start=1):
    for id_suffix, label, instruction in PERSONAS:
        samples.append(
            {
                "pair_id": pair_id,
                "ids": f"{pair_id}{id_suffix}",
                "inputs": [
                    Message(role="user", content=instruction),
                    Message(role="assistant", content=fact),
                ],
                "labels": label,
            }
        )

# Create LabelledDataset from the samples. Kept under its own name so the
# unsplit dataset is not shadowed by the training split below.
full_dataset = LabelledDataset(
    ids=[sample["ids"] for sample in samples],
    inputs=[sample["inputs"] for sample in samples],
    other_fields={
        "labels": [sample["labels"] for sample in samples],
        "pair_id": [sample["pair_id"] for sample in samples],
    },
)

# Split on pair_id (the field shared by both samples of a pair).
train_dataset, validation_dataset = create_train_test_split(
    full_dataset, split_field="pair_id"
)

print(f"Read {len(train_dataset)} samples for training and {len(validation_dataset)} samples for validation.")

## Training

In [None]:
from tuberlens.training import train_probe
from tuberlens.interfaces.probes import ProbeSpec, ProbeType


# Hyperparameters for the linear_then_mean probe.
probe_hyperparams = {
    "batch_size": 8,
    "epochs": 200,
    "optimizer_args": {"lr": 1e-3, "weight_decay": 1e-2},
    "final_lr": 1e-4,
    "gradient_accumulation_steps": 1,
    "patience": 100,
    "temperature": 0.1,
}

# Alternative spec kept for reference:
#   ProbeSpec(name=ProbeType.sklearn, hyperparams={})
probe_spec = ProbeSpec(
    name=ProbeType.linear_then_mean,
    hyperparams=probe_hyperparams,
)

# Train the probe on the train split, using the validation split created above.
probe = train_probe(
    train_dataset,
    validation_dataset,
    model_name,
    layer,
    ending_tokens_to_ignore=5,
    start_turn_index=1,  # Exclude user message
    apply_transformations_to_validation_dataset=True,
    pos_class_label=pos_class_label,
    neg_class_label=neg_class_label,
    probe_description=probe_description,
    probe_spec=probe_spec,
)

In [None]:
# Store the probe
import pickle

from pathlib import Path

# Make sure the target directory exists so a long training run is not lost
# to a FileNotFoundError at save time.
Path(PROBE_PATH).parent.mkdir(parents=True, exist_ok=True)

# Use a context manager so the file handle is flushed and closed
# deterministically, even if pickling raises.
with open(PROBE_PATH, "wb") as probe_file:
    pickle.dump(probe, probe_file)

## Loading and Evaluating

In [None]:
from tuberlens.model import LLMModel

# Reload the probe from disk. The context manager closes the file handle
# instead of leaking it.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load probe files that you created yourself or otherwise trust.
with open(PROBE_PATH, "rb") as probe_file:
    probe = pickle.load(probe_file)

# The probe's model_name is needed below to load the matching model.
assert probe.model_name is not None, "Loaded probe has no model_name"
assert probe.layer is not None, "Loaded probe has no layer"
print("Probe initialized:")
print(probe.description)

# Initialize the model so we can compute activations
model = LLMModel.load(probe.model_name)

In [None]:
import yaml

# Read the hand-written test conversations from the concept directory.
with open(CONCEPT_DIR / "test_inputs.yaml") as yaml_file:
    raw_inputs = yaml.safe_load(yaml_file)

# Each YAML entry is a list of message dicts; turn every dict into a Message.
inputs = []
for conversation in raw_inputs:
    inputs.append([Message(**fields) for fields in conversation])

# Score every conversation with the probe and print each result next to its input.
preds = probe.predict_proba_from_inputs(inputs, model=model)
for idx, pred in enumerate(preds):
    print(f"Sample {idx}: {pred}")
    print(f"Input: {inputs[idx]}")
    print()