# Safety-tuned refusal: end-to-end Colab
Run setup → prompts → multi-sample responses → Gemini judging → bootstrap P(safety) → probe analysis.

In [None]:
#@title Configure paths and defaults
import os

# --- Repository checkout ---------------------------------------------------
REPO_URL = "https://github.com/yourname/safety-tuned-refusal.git"  # change if needed
WORKDIR = "/content/safety-tuned-refusal"

# --- Model and sampling settings (passed to the CLI in later cells) ---------
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
N_SAMPLES = 5
TEMPERATURE = 0.7
MAX_NEW_TOKENS = 256
BATCH_SIZE = 4

# --- Credentials: taken from the environment, empty string when unset -------
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# --- Pipeline artifacts, all rooted under WORKDIR/data ----------------------
DATA_DIR = os.path.join(WORKDIR, "data")
# makedirs also creates WORKDIR itself as a parent when it does not exist yet.
os.makedirs(DATA_DIR, exist_ok=True)
PROMPTS_PATH, RESPONSES_PATH, LABELED_PATH, BOOTSTRAP_OUT = (
    os.path.join(DATA_DIR, filename)
    for filename in (
        "prompts.csv",
        "responses_aligned.csv",
        "responses_aligned_labeled.csv",
        "identity_bootstrap.csv",
    )
)
ANALYSIS_PREFIX = os.path.join(DATA_DIR, "outputs/llama3")

print("Config loaded", WORKDIR)


In [None]:
#@title Clone repo (if needed) and install
import sys
if not os.path.exists(WORKDIR):
    !git clone $REPO_URL $WORKDIR
%cd $WORKDIR
!pip install -q -e .
if HF_TOKEN:
    from huggingface_hub import login
    login(token=HF_TOKEN, add_to_git_credential=True)
if GOOGLE_API_KEY:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
print("Ready. HF token set?", bool(HF_TOKEN), "Gemini key set?", bool(os.getenv("GOOGLE_API_KEY")))


In [None]:
#@title Generate prompts CSV
!safety-tuned-refusal generate-prompts --output $PROMPTS_PATH
!head -n 5 $PROMPTS_PATH


In [None]:
#@title Generate multi-sample responses (GPU-heavy)
!safety-tuned-refusal generate-responses --prompts $PROMPTS_PATH --output $RESPONSES_PATH --model $MODEL_NAME --temperature $TEMPERATURE --n-samples $N_SAMPLES --max-new-tokens $MAX_NEW_TOKENS
!wc -l $RESPONSES_PATH


In [None]:
#@title Judge with Gemini (set GOOGLE_API_KEY)
assert os.getenv("GOOGLE_API_KEY"), "Set GOOGLE_API_KEY env before judging"
!safety-tuned-refusal judge --responses $RESPONSES_PATH --output $LABELED_PATH
!wc -l $LABELED_PATH


In [None]:
#@title Bootstrap P(safety) by identity
!safety-tuned-refusal report-safety --labeled-responses $LABELED_PATH --risk-level all --output $BOOTSTRAP_OUT
import pandas as pd
display(pd.read_csv(BOOTSTRAP_OUT).head())


In [None]:
#@title Probe analysis (heavy; uses model again)
!mkdir -p $(dirname $ANALYSIS_PREFIX)
!safety-tuned-refusal analyze --labeled-responses $LABELED_PATH --model $MODEL_NAME --batch-size $BATCH_SIZE --save-prefix $ANALYSIS_PREFIX


In [None]:
#@title Optional: copy outputs to Drive
from google.colab import drive
drive.mount("/content/drive")
!cp -r $DATA_DIR /content/drive/MyDrive/safety_tuned_refusal_outputs
