# Detect Benchmarks – JBShield vs NSP
*Runs with the same YAML you used for training / NSP.*

**Usage inside notebook**
1. Edit the `ARGS` dict in section **1. Notebook Arguments** below (cfg path, run-ID, method).
2. Run all cells.  
3. Results saved under `output/` & metrics printed in the log cell.


## 0. Setup, Imports, & Globals

In [1]:
# Pin the GPU and force synchronous CUDA launches before torch is imported.
# The variables are set through os.environ only: a `!export ...` shell magic
# runs in a throwaway subshell and never reaches this kernel's environment.
import os
import sys

os.environ["CUDA_VISIBLE_DEVICES"] = "7"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Relative paths used by the later cells (configs/, logs/, output/,
# ../JBShield/) are resolved against the project root.
os.chdir("/mnt/home/amir/framingdecomp/framingDecomp/")

# Make the project packages importable; skip the append when the cell is re-run
# so sys.path does not accumulate duplicates.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

In [2]:
# Sanity check: display the working directory the kernel is running in.
os.getcwd()

'/mnt/home/amir/framingdecomp/framingDecomp'

In [3]:
# Report which CUDA devices this kernel can see and pick the compute device.
import os

import torch

print("Devices visible:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.device_count():", torch.cuda.device_count())

# Fall back to the CPU when no CUDA device is usable.
device = "cuda" if torch.cuda.is_available() else "cpu"


Devices visible: 7
torch.cud:a.device_count(): 1


In [4]:
# Cell: ## 0. Setup, Imports, & Globals

# %%
from __future__ import annotations
import argparse, logging, sys, time, json, random
from pathlib import Path
import yaml, numpy as np, torch
from sklearn.metrics import (roc_auc_score, accuracy_score,
                             precision_recall_fscore_support)

# our libs
from utils.model_utils import load_model
from models.encoder   import HFEncoder_notPooled
from models.decomposer import NonlinearDecomposer_tiny
from benchmarks.jbshield_core import JBShieldDetector
from jailbreak_detect_nsp import _evaluate, detect_worker as nsp_worker

# Name under which this notebook's logger is registered.
LOGGER_NAME = "detect_benchmarks_nb"
# Notebook-level accumulator, created empty here. Presumably later cells append
# per-run metrics to it — TODO confirm; the visible cells never touch it.
RESULTS = []




## 1. Notebook Arguments  
*(Edit and re-run this cell each time you want a different run.)*


In [5]:
# Cell: ## 1. Notebook Arguments
# *(Edit and re-run this cell each time you want a different run.)*
#
# ARGS plays the role of command-line arguments for the rest of the notebook.

ARGS = {
    # YAML file loaded as CFG; supplies the dataset paths and, for JBShield,
    # the encoder name.
    "cfg_path":       "configs/jb_detect.yaml",
    # Run id; becomes part of the log-file name and is the fallback for
    # `cfg_unique_id`.
    "dec_unique_id":  "20250717_101812_06190f25-e1f1-4ed4-87ae-a51365b6061b",
    # NOTE(review): the visible cells take the encoder name from the YAML
    # config, not from this entry — confirm where "model" is consumed.
    "model": "meta-llama/Llama-2-7b-chat-hf",
    # Selects output/config_<id>.yaml when method is "nsp"; None means
    # "use dec_unique_id".
    "cfg_unique_id":  None,          # ← usually same as dec_unique_id
    "method":         "jbshield",    # "jbshield" or "nsp"
    # NOTE(review): not read by any visible cell — presumably used by the
    # detector cells further down.
    "batch_size":     32,
}

## 2. Helper – JSONL Loader


In [6]:
# Cell: ## 2. Helper – JSONL Loader


def load_jsonl(path: str) -> list:
    """Load a JSON-Lines file into a list of decoded records.

    Blank lines and comment lines (first non-blank character is ``#``) are
    skipped; every other line must hold exactly one JSON document.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        One decoded JSON value per data line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a data line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            text = line.strip()
            # Test the stripped text so indented comments are skipped too
            # instead of being handed to the JSON parser.
            if not text or text.startswith("#"):
                continue
            records.append(json.loads(text))
    return records


## 3. Set Up Logging

In [7]:
# Cell: ## 3. Set Up Logging

# Every run gets its own log file, named after the method, a timestamp and the
# decomposer run id.
Path("logs").mkdir(exist_ok=True)
ts = time.strftime("%Y%m%d_%H%M%S")
log_file = Path(f"logs/detect_{ARGS['method']}_{ts}_{ARGS['dec_unique_id']}.log")

# force=True: logging.basicConfig() silently does nothing when the root logger
# already has handlers (e.g. an imported module or the notebook kernel
# configured logging first). In that case neither the format nor the file
# handler below would take effect. Forcing replaces the existing root handlers,
# which also keeps re-runs of this cell from stacking duplicate handlers.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s — %(levelname)s — %(message)s",
                    handlers=[logging.StreamHandler(sys.stdout),
                              # UTF-8 because the format and the messages
                              # contain non-ASCII characters ("—", "→").
                              logging.FileHandler(log_file, mode="w",
                                                  encoding="utf-8")],
                    force=True)
logger = logging.getLogger(LOGGER_NAME)
logger.info("Log → %s", log_file)


INFO:detect_benchmarks_nb:Log → logs/detect_jbshield_20250723_183259_20250717_101812_06190f25-e1f1-4ed4-87ae-a51365b6061b.log


## 4. Load Config & Dataset Splits

### split by category again and reorganize

In [8]:
# Cell: ## 4 · Load Config & Dataset Splits (ID / OOD re-split by category)

with open(ARGS["cfg_path"]) as f:
    CFG = yaml.safe_load(f)

# ---------- model name & (optional) decomposer config ----------
if ARGS["method"] == "nsp":
    # NSP takes the encoder name from the config saved by the training run.
    # Without an explicit config id, reuse the decomposer run id.
    if ARGS["cfg_unique_id"] is None:
        ARGS["cfg_unique_id"] = ARGS["dec_unique_id"]
    with open(f"output/config_{ARGS['cfg_unique_id']}.yaml") as f:
        CFG_OUT = yaml.safe_load(f)
    enc_name = CFG_OUT["model"]["name"]
else:                     # JBShield
    enc_name = CFG["model"]["name"]        # read directly from main YAML

# ---------- dataset splits (same for both detectors) ----------
paths = CFG["data"]
print("CFG['data']  →", CFG["data"])
for k, v in CFG["data"].items():
    print(f"{k}: {v!r}  (type={type(v)})")

# Raw splits exactly as configured (F = vary-framing files, G = vary-goal
# files; "benign" marks the harmless counterparts).
rawF_id            = load_jsonl(paths["input_path_varyFraming_id"])
rawG_id            = load_jsonl(paths["input_path_varyGoal_id"])
rawF_benign_id     = load_jsonl(paths["input_path_varyFraming_benign_id"])
rawG_benign_id     = load_jsonl(paths["input_path_varyGoal_benign_id"])
rawF_ood           = load_jsonl(paths["input_path_varyFraming_ood"])
rawG_ood           = load_jsonl(paths["input_path_varyGoal_ood"])
rawF_benign_ood    = load_jsonl(paths["input_path_varyFraming_benign_ood"])
rawG_benign_ood    = load_jsonl(paths["input_path_varyGoal_benign_ood"])

# Pool ID + OOD per source so the category inventory covers every example.
all_F_benign   = rawF_benign_id + rawF_benign_ood
all_G_benign   = rawG_benign_id + rawG_benign_ood
all_F_jailbrks = rawF_id        + rawF_ood
all_G_jailbrks = rawG_id        + rawG_ood

categories_F_benign = {x['category'] for x in all_F_benign}
categories_G_benign = {x['category'] for x in all_G_benign}
categories_F_jailbrks = {x['category'] for x in all_F_jailbrks}
categories_G_jailbrks = {x['category'] for x in all_G_jailbrks}
# Only categories present in all four sources can be assigned to a split.
intersection = (categories_F_benign & categories_G_benign
                & categories_F_jailbrks & categories_G_jailbrks)

# Hold out a third of the shared categories (rounded down) as OOD.
# Sample from a *sorted* list: the iteration order of a set of strings changes
# with the interpreter's hash seed, so sampling straight from the set would
# pick different OOD categories in different kernel sessions despite the seed.
random.seed(42)
ood_cats = set(random.sample(sorted(intersection, key=str),
                             len(intersection) // 3))
id_cats = intersection.difference(ood_cats)


def _in_categories(entries, cats):
    """Return the entries whose 'category' belongs to *cats*."""
    return [entry for entry in entries if entry['category'] in cats]


# ID files keep only ID categories, OOD files only OOD categories; examples
# whose category landed on the other side (or outside the intersection) drop.
rawF_id            = _in_categories(rawF_id, id_cats)
rawG_id            = _in_categories(rawG_id, id_cats)
rawF_benign_id     = _in_categories(rawF_benign_id, id_cats)
rawG_benign_id     = _in_categories(rawG_benign_id, id_cats)
rawF_ood           = _in_categories(rawF_ood, ood_cats)
rawG_ood           = _in_categories(rawG_ood, ood_cats)
rawF_benign_ood    = _in_categories(rawF_benign_ood, ood_cats)
rawG_benign_ood    = _in_categories(rawG_benign_ood, ood_cats)

AR = {
    "F_id":  rawF_id,
    "G_id":  rawG_id,
    "Fb_id": rawF_benign_id,
    "Gb_id": rawG_benign_id,
    "F_ood": rawF_ood,
    "G_ood": rawG_ood,
    "Fb_ood":rawF_benign_ood,
    "Gb_ood":rawG_benign_ood,
}

# Balance the in-distribution classes by subsampling both to the smaller size.
benign_id  = AR["Fb_id"] + AR["Gb_id"]
jail_id    = AR["F_id"]  + AR["G_id"]
m = min(len(benign_id), len(jail_id))
benign_id, jail_id = random.sample(benign_id, m), random.sample(jail_id, m)

# The OOD sets are used at their full size.
benign_ood = AR["Fb_ood"] + AR["Gb_ood"]
jail_ood   = AR["F_ood"]  + AR["G_ood"]

device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info("Method: %s   | Encoder: %s", ARGS["method"], enc_name)



INFO:detect_benchmarks_nb:Method: jbshield   | Encoder: meta-llama/Llama-2-7b-chat-hf


CFG['data']  → {'input_path_varyFraming_id': './data/populated_artifacts/PAIR/id/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_id': './data/populated_artifacts/PAIR/id/all_cleaned_populated_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_benign_id': './data/populated_benign_JBB-behaviors/PAIR/id/populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_benign_id': './data/populated_benign_JBB-behaviors/PAIR/id/cleaned_populated_benign_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_ood': './data/populated_artifacts/PAIR/ood/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_ood': './data/populated_artifacts/PAIR/ood/all_cleaned_populated_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_benign_ood': './data/populated_benign_JBB-behaviors/PAIR/ood/populated_p

### keep the same splits

In [8]:
# Cell: ## 4 · Load Config & Dataset Splits (ID / OOD files used as configured)

with open(ARGS["cfg_path"]) as f:
    CFG = yaml.safe_load(f)

# ---------- model name & (optional) decomposer config ----------
if ARGS["method"] == "nsp":
    # NSP takes the encoder name from the config saved by the training run.
    # Without an explicit config id, reuse the decomposer run id.
    if ARGS["cfg_unique_id"] is None:
        ARGS["cfg_unique_id"] = ARGS["dec_unique_id"]
    with open(f"output/config_{ARGS['cfg_unique_id']}.yaml") as f:
        CFG_OUT = yaml.safe_load(f)
    enc_name = CFG_OUT["model"]["name"]
else:                     # JBShield
    enc_name = CFG["model"]["name"]        # read directly from main YAML

# ---------- dataset splits (same for both detectors) ----------
paths = CFG["data"]
print("CFG['data']  →", CFG["data"])
for k, v in CFG["data"].items():
    print(f"{k}: {v!r}  (type={type(v)})")

# Short key -> records (F = vary-framing files, G = vary-goal files, the "b"
# suffix marks the benign counterparts).
AR = {
    "F_id":  load_jsonl(paths["input_path_varyFraming_id"]),
    "G_id":  load_jsonl(paths["input_path_varyGoal_id"]),
    "Fb_id": load_jsonl(paths["input_path_varyFraming_benign_id"]),
    "Gb_id": load_jsonl(paths["input_path_varyGoal_benign_id"]),
    "F_ood": load_jsonl(paths["input_path_varyFraming_ood"]),
    "G_ood": load_jsonl(paths["input_path_varyGoal_ood"]),
    "Fb_ood":load_jsonl(paths["input_path_varyFraming_benign_ood"]),
    "Gb_ood":load_jsonl(paths["input_path_varyGoal_benign_ood"]),
}

# Balance the in-distribution classes by subsampling both to the smaller size.
# A dedicated seeded generator makes the subsample identical on every run
# without touching the state of the global `random` module. The files exported
# from these lists would otherwise change each time the cell is executed.
benign_id  = AR["Fb_id"] + AR["Gb_id"]
jail_id    = AR["F_id"]  + AR["G_id"]
m = min(len(benign_id), len(jail_id))
balance_rng = random.Random(42)
benign_id, jail_id = balance_rng.sample(benign_id, m), balance_rng.sample(jail_id, m)

# The OOD sets are used at their full size.
benign_ood = AR["Fb_ood"] + AR["Gb_ood"]
jail_ood   = AR["F_ood"]  + AR["G_ood"]

device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info("Method: %s   | Encoder: %s", ARGS["method"], enc_name)


INFO:detect_benchmarks_nb:Method: jbshield   | Encoder: meta-llama/Llama-2-7b-chat-hf


CFG['data']  → {'input_path_varyFraming_id': './data/populated_artifacts/PAIR/id/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_id': './data/populated_artifacts/PAIR/id/all_cleaned_populated_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_benign_id': './data/populated_benign_JBB-behaviors/PAIR/id/populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_benign_id': './data/populated_benign_JBB-behaviors/PAIR/id/cleaned_populated_benign_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_ood': './data/populated_artifacts/PAIR/ood/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl', 'input_path_varyGoal_ood': './data/populated_artifacts/PAIR/ood/all_cleaned_populated_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl', 'input_path_varyFraming_benign_ood': './data/populated_benign_JBB-behaviors/PAIR/ood/populated_p

In [9]:
# Sanity check: the relative ../JBShield/ paths used below depend on this
# working directory.
os.getcwd()

'/mnt/home/amir/framingdecomp/framingDecomp'

### with Cal and Test

In [9]:
import pandas as pd

def remove_newlines(input_string: str) -> str:
    """Return *input_string* with every newline character turned into a space."""
    # Splitting on "\n" and re-joining with " " swaps each newline for exactly
    # one space, so consecutive newlines become consecutive spaces.
    return " ".join(input_string.split("\n"))


In [12]:

# --- harmless data
# Benign prompts are flattened to one line each, labelled "harmless" and
# written as CSV files under ../JBShield/data/.


def _harmless_frame(entries):
    """Build a prompt/label frame from benign entries, newlines replaced by spaces."""
    prompts = []
    for entry in entries:
        text = entry["prompt"]
        prompts.append(text if '\n' not in text else remove_newlines(text))
    return pd.DataFrame({"prompt": prompts,
                         "label": ["harmless"] * len(entries)})


def _save_csv(frame, path):
    """Write *frame* to *path* without the index column and hand the path back."""
    frame.to_csv(path, index=False)
    return path


# Full in-distribution benign set.
df_benign = _harmless_frame(benign_id)
out_path = _save_csv(df_benign, "../JBShield/data/harmless.csv")

# One seeded shuffle; the first N_CAL rows calibrate, the remainder is the
# test set, so the two files never share a row.
N_CAL = 100
shuffled_benign = df_benign.sample(frac=1, random_state=42).reset_index(drop=True)

df_benign_cal = shuffled_benign.iloc[:N_CAL]
out_path = _save_csv(df_benign_cal, "../JBShield/data/harmless_calibration.csv")

df_benign_test = shuffled_benign.iloc[N_CAL:]
out_path = _save_csv(df_benign_test, "../JBShield/data/harmless_test.csv")

# Out-of-distribution benign prompts are exported whole.
df_benign_ood = _harmless_frame(benign_ood)
out_path = _save_csv(df_benign_ood, "../JBShield/data/harmless_test_ood.csv")



In [13]:

# --- harmful data
# The plain "goal" text of each jailbreak entry (not the jailbreak prompt) is
# flattened to one line, labelled "harmful" and written as CSV files under
# ../JBShield/data/.


def _harmful_frame(entries):
    """Build a prompt/label frame from the goals of *entries*, newlines replaced by spaces."""
    goals = []
    for entry in entries:
        text = entry["goal"]
        goals.append(text if '\n' not in text else remove_newlines(text))
    return pd.DataFrame({"prompt": goals,
                         "label": ["harmful"] * len(entries)})


def _save_harmful_csv(frame, path):
    """Write *frame* to *path* without the index column and hand the path back."""
    frame.to_csv(path, index=False)
    return path


# Full in-distribution harmful set.
df_harmful = _harmful_frame(jail_id)
out_path = _save_harmful_csv(df_harmful, "../JBShield/data/harmful.csv")

# One seeded shuffle; the first N_CAL rows calibrate, the remainder is the
# test set, so the two files never share a row.
N_CAL = 100
shuffled_harmful = df_harmful.sample(frac=1, random_state=42).reset_index(drop=True)

df_harmful_cal = shuffled_harmful.iloc[:N_CAL]
out_path = _save_harmful_csv(df_harmful_cal, "../JBShield/data/harmful_calibration.csv")

df_harmful_test = shuffled_harmful.iloc[N_CAL:]
out_path = _save_harmful_csv(df_harmful_test, "../JBShield/data/harmful_test.csv")

# Out-of-distribution goals are exported whole.
df_harmful_ood = _harmful_frame(jail_ood)
out_path = _save_harmful_csv(df_harmful_ood, "../JBShield/data/harmful_test_ood.csv")




In [14]:


# ----- jailbreak data
# Write the PAIR jailbreak prompts as JSON files under
# ../JBShield/data/jailbreak/pair/, one set of files per target-model name.
# The same prompts are written for every model; only the 'model' field differs.

model_names = ['llama-2', 'llama-3', 'mistral', 'vicuna-13b', 'vicuna-7b']
# Short name -> long model name. Not referenced by the code in this cell.
name_dict = {'mistral': 'mistral-7b-instruct',
             'llama-2': 'llama-2-7b-chat',
             'llama-3': 'llama-3-8b-instruct',
             'vicuna-13b': 'vicuna-13b',
             'vicuna-7b': 'vicuna-7b'}

N_CAL = 100


def _pair_entries(examples, model_name):
    """Convert dataset rows into single-line jailbreak records for one model."""
    return [{'goal': remove_newlines(e["goal"]),
             'jailbreak': remove_newlines(e["prompt"]),
             'model': model_name,
             'method': 'PAIR',
             'target': ""}
            for e in examples]


def _write_json(path, records):
    """Write *records* to *path* as one JSON array followed by a newline."""
    with open(path, 'w') as f:
        f.write(json.dumps(records) + '\n')


# Shuffle the in-distribution rows once and cut calibration and test from the
# same permutation, so the two sets are disjoint. Sampling them independently
# from the full pool let calibration prompts reappear in the test file. This
# follows the harmless/harmful CSV cells: the first N_CAL rows calibrate and
# the rest are tested (the test set is empty if there are at most N_CAL rows).
random.seed(42)  # for reproducibility
shuffled_jail_id = random.sample(jail_id, len(jail_id))
jail_cal = shuffled_jail_id[:N_CAL]
jail_test = shuffled_jail_id[N_CAL:]
# Loop-invariant: every ID + OOD jailbreak row.
jail_all = jail_id + jail_ood

for model_name in model_names:
    base = f"../JBShield/data/jailbreak/pair/{model_name}"

    # Everything (ID + OOD).
    entries = _pair_entries(jail_all, model_name)
    _write_json(f"{base}.json", entries)

    # Calibration split (ID only).
    entries_cal = _pair_entries(jail_cal, model_name)
    _write_json(f"{base}_calibration.json", entries_cal)

    # Test split (ID only, disjoint from calibration).
    entries_test = _pair_entries(jail_test, model_name)
    _write_json(f"{base}_test.json", entries_test)

    # Out-of-distribution test set, exported whole.
    entries_ood = _pair_entries(jail_ood, model_name)
    _write_json(f"{base}_test_ood.json", entries_ood)

### with Train and Val 

In [10]:
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Split the in-distribution benign and jailbreak examples into train / val
#    (20% held out for val), with a fixed random_state so the split is repeatable.
ben_ID_train, ben_ID_val = train_test_split(benign_id, test_size=0.2, random_state=42)
jb_ID_train,  jb_ID_val  = train_test_split(jail_id,  test_size=0.2, random_state=42)


def remove_newlines(input_string: str) -> str:
    """Return *input_string* with every newline character turned into a space."""
    # Splitting on "\n" and re-joining with " " swaps each newline for exactly
    # one space, so consecutive newlines become consecutive spaces.
    return " ".join(input_string.split("\n"))


#### Harmless and Harmful

In [None]:

# --- harmless data
# Train/val variant: the benign train split is written as the calibration
# file and the benign val split as the test file.
# NOTE(review): this cell writes only harmless_*.csv; confirm whether the
# matching harmful_*.csv files are meant to be regenerated from the train/val
# split as well.


def _benign_frame(entries):
    """Build a prompt/label frame from benign entries, newlines replaced by spaces."""
    prompts = []
    for entry in entries:
        text = entry["prompt"]
        prompts.append(text if '\n' not in text else remove_newlines(text))
    return pd.DataFrame({"prompt": prompts,
                         "label": ["harmless"] * len(entries)})


def _save_benign_csv(frame, path):
    """Write *frame* to *path* without the index column and hand the path back."""
    frame.to_csv(path, index=False)
    return path


# Full in-distribution benign set.
df_benign = _benign_frame(benign_id)
out_path = _save_benign_csv(df_benign, "../JBShield/data/harmless.csv")

# Train split -> calibration file (the whole split, no subsampling).
df_benign_id_train = _benign_frame(ben_ID_train)
out_path = _save_benign_csv(df_benign_id_train, "../JBShield/data/harmless_calibration.csv")

# Val split -> test file.
df_benign_id_val = _benign_frame(ben_ID_val)
out_path = _save_benign_csv(df_benign_id_val, "../JBShield/data/harmless_test.csv")

# Out-of-distribution benign prompts are exported whole.
df_benign_ood = _benign_frame(benign_ood)
out_path = _save_benign_csv(df_benign_ood, "../JBShield/data/harmless_test_ood.csv")


#### Jailbreak

In [None]:

# ----- jailbreak data
# Train/val variant: for every target-model name, write the in-distribution
# jailbreaks, their train split (as the calibration file), their val split
# (as the test file) and the OOD set under ../JBShield/data/jailbreak/pair/.

model_names = ['llama-2', 'llama-3', 'mistral', 'vicuna-13b', 'vicuna-7b']
# Short name -> long model name. Not referenced by the code in this cell.
name_dict = {'mistral': 'mistral-7b-instruct',
             'llama-2': 'llama-2-7b-chat',
             'llama-3': 'llama-3-8b-instruct',
             'vicuna-13b': 'vicuna-13b',
             'vicuna-7b': 'vicuna-7b'}


def _as_pair_records(rows, model):
    """Convert dataset rows into single-line jailbreak records tagged with *model*."""
    records = []
    for row in rows:
        records.append({'goal': remove_newlines(row["goal"]),
                        'jailbreak': remove_newlines(row["prompt"]),
                        'model': model,
                        'method': 'PAIR',
                        'target': ""})
    return records


def _dump_json_array(records, path):
    """Write *records* to *path* as one JSON array plus a newline; return the path."""
    with open(path, 'w') as handle:
        handle.write(json.dumps(records) + '\n')
    return path


for model_name in model_names:
    # All in-distribution jailbreaks.
    entries = _as_pair_records(jail_id, model_name)
    out_path = _dump_json_array(
        entries, f"../JBShield/data/jailbreak/pair/{model_name}.json")

    # Train split -> calibration file (the whole split, no subsampling).
    entries_id_train = _as_pair_records(jb_ID_train, model_name)
    out_path = _dump_json_array(
        entries_id_train,
        f"../JBShield/data/jailbreak/pair/{model_name}_calibration.json")

    # The seed calls only reset the global RNG; nothing in this loop draws
    # random numbers.
    random.seed(1042)
    entries_id_val = _as_pair_records(jb_ID_val, model_name)
    out_path = _dump_json_array(
        entries_id_val, f"../JBShield/data/jailbreak/pair/{model_name}_test.json")

    # Out-of-distribution set, exported whole.
    entries_ood = _as_pair_records(jail_ood, model_name)
    random.seed(1042)
    out_path = _dump_json_array(
        entries_ood, f"../JBShield/data/jailbreak/pair/{model_name}_test_ood.json")