# Detect Benchmarks – JBShield vs NSP
*Runs with the same YAML you used for training / NSP.*

**Usage inside notebook**
1. Edit the `ARGS` dict in **Section 1 (Notebook Arguments)** below (config path, run ID, method).
2. Run all cells.
3. Metrics are saved as JSON under `output/` and logged by the detector cell (Section 6).


## 0. Setup, Imports, & Globals

In [19]:
# The former `!export CUDA_LAUNCH_BLOCKING=1` was dropped: IPython runs `!`
# commands in a throwaway subshell, so an export there never reaches this
# kernel. The os.environ assignments below are what actually take effect.
import os

# Set these before CUDA is initialised (i.e. before the first torch.cuda call
# in the next cell); changes made afterwards are not picked up.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"   # expose only physical GPU 7 to this kernel
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"   # synchronous kernel launches -> usable CUDA tracebacks
# The relative paths used later (configs/, logs/, output/, checkpoints/) and the
# project-local imports are resolved from this working directory.
os.chdir("/mnt/home/amir/framingdecomp/framingDecomp")

In [20]:
# Report which GPUs this kernel can see and choose the compute device.
import os

import torch

print("Devices visible:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.device_count():", torch.cuda.device_count())

# Fall back to CPU so the notebook still runs on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"


Devices visible: 7
torch.cud:a.device_count(): 1


In [21]:
# Cell: ## 0. Setup, Imports, & Globals

# %%
from __future__ import annotations
import argparse, logging, sys, time, json, random
from pathlib import Path
import yaml, numpy as np, torch
from sklearn.metrics import (roc_auc_score, accuracy_score,
                             precision_recall_fscore_support)

# our libs
from utils.model_utils import load_model
from models.encoder   import HFEncoder_notPooled
from models.decomposer import NonlinearDecomposer_tiny
from benchmarks.jbshield_core import JBShieldDetector
from jailbreak_detect_nsp import _evaluate, detect_worker as nsp_worker

# Name passed to logging.getLogger() for this notebook's own messages.
LOGGER_NAME = "detect_benchmarks_nb"
# Metric records collected during the run: run_jbshield() appends one dict per
# evaluated split, and the final cell dumps the whole list to JSON.
RESULTS: list = []




## 1. Notebook Arguments  
*(Edit and re-run this cell each time you want a different run.)*


In [22]:
# Cell: ## 1. Notebook Arguments
# *(Edit and re-run this cell each time you want a different run.)*

# Run configuration read by the cells below.
ARGS = {
    # Main YAML: supplies the dataset paths and, for JBShield, the model name.
    "cfg_path":       "configs/jb_detect.yaml",
    # Run ID; embedded in the log file name and used as the fallback for
    # cfg_unique_id when method == "nsp".
    "dec_unique_id":  "20250717_101812_06190f25-e1f1-4ed4-87ae-a51365b6061b",
    # NOTE(review): no visible cell reads this key — the encoder name is taken
    # from the YAML config instead. Confirm whether it is still needed.
    "model": "meta-llama/Llama-2-7b-chat-hf",
    # NSP only: selects output/config_{cfg_unique_id}.yaml; None means
    # "use dec_unique_id".
    "cfg_unique_id":  None,
    # Detector to run: "jbshield" or "nsp".
    "method":         "jbshield",    # "jbshield" or "nsp"
    # Forwarded to the selected detector.
    "batch_size":     32,
}

## 2. Helper – JSONL Loader


In [23]:
# Cell: ## 2. Helper – JSONL Loader


def load_jsonl(path: str) -> list:
    """Parse a JSON Lines file into a list of records.

    Blank lines and comment lines (first non-whitespace character is ``#``)
    are skipped; every other line must be a complete JSON document.

    Args:
        path: location of the ``.jsonl`` file.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank, non-comment line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(path, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            # Test the stripped text so indented comments are skipped too,
            # instead of being handed to json.loads and crashing.
            if not line or line.startswith("#"):
                continue
            records.append(json.loads(line))
    return records


## 3. Set Up Logging

In [24]:
# Cell: ## 3. Set Up Logging

# One log file per (method, timestamp, run ID) under logs/.
Path("logs").mkdir(exist_ok=True)
ts = time.strftime("%Y%m%d_%H%M%S")
log_file = Path(f"logs/detect_{ARGS['method']}_{ts}_{ARGS['dec_unique_id']}.log")
# force=True: basicConfig() silently does nothing when the root logger already
# has handlers (e.g. this cell was re-run, or logging was configured earlier).
# In that case the format below is ignored and the FileHandler is never
# attached, so nothing reaches log_file. Forcing replaces the old handlers.
# (Requires Python 3.8+.)
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s — %(levelname)s — %(message)s",
                    handlers=[logging.StreamHandler(sys.stdout),
                              # UTF-8: log messages contain non-ASCII characters ("→", "—").
                              logging.FileHandler(log_file, mode="w",
                                                  encoding="utf-8")],
                    force=True)
logger = logging.getLogger(LOGGER_NAME)
logger.info("Log → %s", log_file)


INFO:detect_benchmarks_nb:Log → logs/detect_jbshield_20250722_194540_20250717_101812_06190f25-e1f1-4ed4-87ae-a51365b6061b.log


## 4. Load Config & Dataset Splits

In [25]:
# Cell: ## 4 · Load Config & Dataset Splits

# Main detection YAML: dataset paths for both detectors, plus the model name
# used when the method is not NSP.
with open(ARGS["cfg_path"]) as f:
    CFG = yaml.safe_load(f)

# ---------- encoder name ----------
if ARGS["method"] != "nsp":
    # JBShield (or any non-NSP method): take the name from the main YAML.
    enc_name = CFG["model"]["name"]
else:
    # NSP reads the encoder name from the config saved under output/.
    # With no explicit cfg_unique_id, fall back to the decomposer run ID.
    if ARGS["cfg_unique_id"] is None:
        ARGS["cfg_unique_id"] = ARGS["dec_unique_id"]
    with open(f"output/config_{ARGS['cfg_unique_id']}.yaml") as f:
        CFG_OUT = yaml.safe_load(f)
    enc_name = CFG_OUT["model"]["name"]

# ---------- dataset splits (shared by both detectors) ----------
paths = CFG["data"]
print("CFG['data']  →", paths)
for cfg_key, cfg_value in paths.items():
    print(f"{cfg_key}: {cfg_value!r}  (type={type(cfg_value)})")

# (short tag, key in CFG["data"]) — loaded in exactly this order.
_SPLIT_KEYS = (
    ("F_id",   "input_path_varyFraming_id"),
    ("G_id",   "input_path_varyGoal_id"),
    ("Fb_id",  "input_path_varyFraming_benign_id"),
    ("Gb_id",  "input_path_varyGoal_benign_id"),
    ("F_ood",  "input_path_varyFraming_ood"),
    ("G_ood",  "input_path_varyGoal_ood"),
    ("Fb_ood", "input_path_varyFraming_benign_ood"),
    ("Gb_ood", "input_path_varyGoal_benign_ood"),
)
AR = {tag: load_jsonl(paths[cfg_key]) for tag, cfg_key in _SPLIT_KEYS}

# In-distribution pools: merge the two variation types, then down-sample both
# classes to the size of the smaller one so ID evaluation is class-balanced.
benign_id = AR["Fb_id"] + AR["Gb_id"]
jail_id = AR["F_id"] + AR["G_id"]
m = min(len(benign_id), len(jail_id))
benign_id = random.sample(benign_id, m)
jail_id = random.sample(jail_id, m)

# Out-of-distribution pools are merged but not re-balanced.
benign_ood = AR["Fb_ood"] + AR["Gb_ood"]
jail_ood = AR["F_ood"] + AR["G_ood"]

device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info("Method: %s   | Encoder: %s", ARGS["method"], enc_name)


INFO:detect_benchmarks_nb:Method: jbshield   | Encoder: meta-llama/Llama-2-7b-chat-hf


CFG['data']  → {'input_path_varyFraming_id': './data/populated_artifacts/PAIR/id/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_id': './data/populated_artifacts/PAIR/id/all_cleaned_populated_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_benign_id': './data/populated_benign_JBB-behaviors/PAIR/id/populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_benign_id': './data/populated_benign_JBB-behaviors/PAIR/id/cleaned_populated_benign_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_ood': './data/populated_artifacts/PAIR/ood/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_ood': './data/populated_artifacts/PAIR/ood/all_cleaned_populated_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_benign_ood': './data/populated_benign_JBB-behaviors/PAIR/ood/populated_p

## 5. JBShield Runner (if selected)

In [8]:
# Cell: ## 5. JBShield Runner (if selected)

def run_jbshield(enc_name, ben_id, jb_id, ben_ood, jb_ood,
                 batch_size, logger, n_cal=100):
    """Calibrate a JBShield detector on ID prompts, then score ID and OOD splits.

    Args:
        enc_name: model identifier handed to ``JBShieldDetector``.
        ben_id: in-distribution benign records; each must have a ``"prompt"`` key.
        jb_id: in-distribution jailbreak records (same schema).
        ben_ood: out-of-distribution benign records (same schema).
        jb_ood: out-of-distribution jailbreak records (same schema).
        batch_size: forwarded to ``JBShieldDetector``.
        logger: receives one metrics line per split.
        n_cal: prompts to draw for each calibration set. Clamped to the size
            of the smaller ID pool (with a warning) instead of letting
            ``random.sample`` raise when a pool has fewer than ``n_cal`` items.

    Returns:
        None. One metrics dict per split is appended to the module-level
        ``RESULTS`` list; the module-level ``device`` selects the device.

    Raises:
        ValueError: if no calibration prompts can be drawn (an ID pool is
            empty or ``n_cal`` is not positive).
    """
    ben_prompts = [e["prompt"] for e in ben_id]
    jb_prompts = [e["prompt"] for e in jb_id]

    n_draw = min(n_cal, len(ben_prompts), len(jb_prompts))
    if n_draw <= 0:
        raise ValueError(
            "JBShield calibration needs at least one benign and one jailbreak prompt"
        )
    if n_draw < n_cal:
        logger.warning("Only %d calibration prompts per set available (requested %d)",
                       n_draw, n_cal)

    cal_ben = random.sample(ben_prompts, n_draw)
    # NOTE(review): the "harmful" and "jailbreak" calibration sets are both
    # drawn from the jailbreak pool. The recorded run picked the same concept
    # layer for both and predicted every prompt as benign — confirm whether a
    # separate set of plain harmful prompts should be used for cal_har.
    cal_har = random.sample(jb_prompts, n_draw)
    cal_jb = random.sample(jb_prompts, n_draw)
    # NOTE(review): calibration prompts come from the same ID pools that are
    # evaluated below, so ID metrics include prompts seen during calibration.

    det = JBShieldDetector(enc_name,
                           device=device,
                           batch_size=batch_size)
    det.calibrate(cal_ben, cal_har, cal_jb)

    for split_tag, benign, jail in (("ID", ben_id, jb_id),
                                    ("OOD", ben_ood, jb_ood)):
        # Labels: 0 = benign, 1 = jailbreak, in the same order as the prompts.
        y_true = np.concatenate([np.zeros(len(benign)), np.ones(len(jail))])
        preds = det.predict([e["prompt"] for e in benign + jail])
        # NOTE(review): accuracy_score needs hard labels, so preds is
        # presumably 0/1. AUROC over hard labels reduces to balanced accuracy;
        # pass continuous scores here if the detector exposes them — TODO confirm.
        auroc = roc_auc_score(y_true, preds)
        acc = accuracy_score(y_true, preds)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, preds, average="binary", zero_division=0)
        logger.info("%s  |  Acc %.3f  F1 %.3f  AUROC %.3f  Prec %.3f  Rec %.3f",
                    split_tag, acc, f1, auroc, prec, rec)
        RESULTS.append({"method": "jbshield", "split": split_tag,
                        "acc": acc, "f1": f1, "auroc": auroc,
                        "precision": prec, "recall": rec})


## 6. Run Selected Detector

In [9]:
# Cell: ## 6. Run Selected Detector

# Dispatch on the configured method. An unrecognised value fails loudly:
# it used to fall through to the NSP branch, while the config cell had already
# chosen the encoder name via its non-NSP (JBShield) path.
if ARGS["method"] == "jbshield":
    run_jbshield(enc_name, benign_id, jail_id, benign_ood, jail_ood,
                 ARGS["batch_size"], logger)
elif ARGS["method"] == "nsp":
    # Directory handed to the NSP worker for locating decomposer checkpoints.
    ckpt_dir = Path("checkpoints/decomposer_simple/")
    nsp_worker(device, enc_name, ckpt_dir,
               benign_id, jail_id, benign_ood, jail_ood,
               ARGS["batch_size"], logger)
else:
    raise ValueError(
        f"Unknown method {ARGS['method']!r}; expected 'jbshield' or 'nsp'"
    )


INFO:utils.model_utils:Loading model: meta-llama/Llama-2-7b-chat-hf on cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:jbshield:Calibrating JBShield-D …  |B|=100 |H|=100 |J|=100
INFO:jbshield:Toxic concept layer = 6  (mean cosine 0.822)
INFO:jbshield:JB concept layer   = 6  (mean cosine 0.823)
INFO:jbshield:Thresholds  θ_t=-0.526  θ_j=0.535
Predicting JBShield scores: 100%|██████████████████████████████████████████████████████████████████████████| 5124/5124 [05:37<00:00, 15.19it/s]
INFO:detect_benchmarks_nb:ID  |  Acc 0.500  F1 0.000  AUROC 0.500  Prec 0.000  Rec 0.000
Predicting JBShield scores: 100%|██████████████████████████████████████████████████████████████████████████| 1377/1377 [01:30<00:00, 15.27it/s]
INFO:detect_benchmarks_nb:OOD  |  Acc 0.488  F1 0.000  AUROC 0.500  Prec 0.000  Rec 0.000


## 7. Save Metric JSON

In [10]:
# Cell: ## 7. Save Metric JSON

# File name = detector method + Unix timestamp (1-second resolution).
out_stub = "detect_metrics_{}_{}.json".format(ARGS["method"], int(time.time()))
out_path = Path("output", out_stub)
out_path.parent.mkdir(exist_ok=True)
# Serialise every collected metrics record in one go.
out_path.write_text(json.dumps(RESULTS, indent=2))
logger.info("Saved metrics → %s", out_path)


INFO:detect_benchmarks_nb:Saved metrics → output/detect_metrics_jbshield_1753205985.json
