# Jailbreak Detect

## Session setup

In [1]:
# Session bootstrap: force synchronous CUDA launches, move into the project
# root, and make the project importable.
#
# NOTE: the former `!export CUDA_LAUNCH_BLOCKING=1` line was dropped. `!` runs
# the command in a throw-away subshell, so the export never reached this
# kernel's environment; the os.environ assignment below is what takes effect.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.chdir("/mnt/home/amir/framingdecomp/framingDecomp")

import sys
# Guard against piling up duplicate entries when the cell is re-run.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

In [2]:
# Restrict this kernel to one GPU and report what torch can see.
import os

print("Devices visible:", os.environ.get("CUDA_VISIBLE_DEVICES"))

# CUDA_VISIBLE_DEVICES must be set before the first torch.cuda query; the
# original set it after device_count() had already been called, at which point
# the restriction is not guaranteed to apply.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

import torch

print("torch.cuda.device_count():", torch.cuda.device_count())

device = "cuda" if torch.cuda.is_available() else "cpu"


In [3]:
# Display the current working directory (sanity check after the os.chdir in the first cell).
os.getcwd()

In [4]:
# ==== Cell: [Session setup] ====

import os
import sys
import logging
import yaml
import json
import uuid
import time
import pickle
import random
from pathlib import Path
from datetime import datetime

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

from accelerate import notebook_launcher
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from scipy.stats import chi2
import matplotlib.pyplot as plt

from utils.misc import set_seed
from utils.model_utils import load_model
from models.encoder import HFEncoder_notPooled
from models.decomposer import NonlinearDecomposer, NonlinearDecomposer_tiny

# ——— Configuration switches ———
# USE_MULTIGPU gates whether VISIBLE_DEVICES is exported below.
# MIXED_PRECISION is only defined here; it is not read anywhere in this cell.
# DETECT_VIA_FRAMING is re-assigned again in the "Build spaces" cell.
USE_MULTIGPU       = False
VISIBLE_DEVICES    = "6"#"0,1,2,3,4,5,6,7"
MIXED_PRECISION    = "fp16"
DETECT_VIA_FRAMING = True

# ——— GPU setup ———
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# CUDA_VISIBLE_DEVICES is only touched when multi-GPU mode is requested.
# NOTE(review): torch is already imported at this point; if CUDA has been
# initialised earlier in the session this assignment may have no effect — confirm.
if USE_MULTIGPU:
    os.environ["CUDA_VISIBLE_DEVICES"] = VISIBLE_DEVICES

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}, GPUs available: {torch.cuda.device_count()}")

# ——— Paths & IDs ———
# UNIQUE_ID is re-assigned many times below as a run history; only the LAST
# uncommented assignment is in effect when DECOMP_CKPT is built.
CFG_PATH    = "configs/jb_detect.yaml"
# DECOMP_CKPT   = "checkpoints/decomposer_plus_finetuned_sae/decomposer_20250623_184312_697330e3-b6ca-42c7-bc54-e055c0660939"  # <— fill
# ENC_LLM_NAME  = "google/gemma-2-2b"                                 # same as before
# UNIQUE_ID = "20250627_202604_0fcaf7b2-f040-4a74-baff-4fe40dcfbf74"  # <— fill
# UNIQUE_ID = "20250701_201331_d2aa58b9-38e4-499a-a95b-dd20be22cfc5" # llama 2-7b
# UNIQUE_ID = "20250702_160029_5193c5c3-b998-4fe7-93b6-f3ba130cc1fd" # llama 2-7b
UNIQUE_ID = "20250702_202119_2eee6866-4447-47ee-b196-da0570fe26e5" # first ID - llama 3-8b
UNIQUE_ID = "20250714_222459_166e19ad-9601-4465-9f1b-057cfd442bd3"
UNIQUE_ID = "20250714_232018_4e80eac4-6d82-47a0-897e-7f6e668105f6"
UNIQUE_ID = "20250715_174848_1a6da921-2bd4-494b-90d6-936a90c81a58"
# UNIQUE_ID = "20250715_184431_6ffd5609-abf9-45d3-933a-14c23a1ae300"
# UNIQUE_ID = "20250715_222742_8c06b6b8-54ab-4a20-84d9-fb401c983a78"
UNIQUE_ID = "20250715_234602_01b3a579-63cb-470c-a970-ebc1811b31eb"
UNIQUE_ID = "20250716_004913_c5d27944-36c1-4b8f-b4ef-db2b7b7ae02c"
UNIQUE_ID = "20250716_184334_ffbcf508-59a9-4791-bd07-b8d2966f967e"
UNIQUE_ID = "20250716_195752_94c9045e-7b3c-4b1d-b8e2-1ff9d3aed158"
# UNIQUE_ID = "20250717_083408_fafa7c5a-4fe0-420d-b08f-1bef4cf3ea5f"
UNIQUE_ID = "20250717_101812_06190f25-e1f1-4ed4-87ae-a51365b6061b"
UNIQUE_ID = "20250717_185022_ffe5f4d3-277b-4ea3-89fa-bf1ec76322e9"
# Effective value: this is the last uncommented assignment.
UNIQUE_ID = "20250719_232328_fc7e001b-2263-4f0d-a7e3-7a614ea80326" # all layers llama 2

# UNIQUE_ID = "20250715_154742_e25aed12-7215-42ac-a06f-012306e4cdb9"
# UNIQUE_ID = "20250702_211211_91f6b1c3-f0a6-48e0-91df-6574d6289f32" # first ID - llama 2-7b
# UNIQUE_ID = "20250703_063208_fb2466b9-2b8d-41d4-808c-bf702f90f6b1" # first vicuna-7b only id
# UNIQUE_ID = "20250703_072510_971007bb-1b38-486c-9985-efadec6d0261" # first vicuna-13b only id
# UNIQUE_ID = "20250703_190921_7d1d49ed-0de1-4f9e-b032-827c782d27fd" # first mistral-7b only id
# UNIQUE_ID = "20250703_203613_b98fc3fb-905d-482e-ab74-17f74c55ae70" # first llama 2-7b with null
# UNIQUE_ID = "20250705_155351_4bcd1439-3135-41b2-9018-f6d0514e8123" # second Mistral (same results)
# UNIQUE_ID = "20250706_204304_b3d710a9-e68a-47cf-a433-eb5b1362205d" # first deepseek
# UNIQUE_ID = "20250708_001549_61dc4b6d-d91a-430e-b9b4-c17120d88cd1" # first llama 2-7b with different layers
# UNIQUE_ID = "20250708_192738_82b0cfe7-f603-4a5d-9fd3-39e636a4e51d"
# Checkpoint directory derived from the effective UNIQUE_ID.
DECOMP_CKPT = Path(f"./checkpoints/decomposer_simple/decomposer_{UNIQUE_ID}")

# The output-config ID starts as UNIQUE_ID but is immediately overridden, so
# CFG_OUT points at a different run's config than DECOMP_CKPT.
cfg_unique_id = UNIQUE_ID
cfg_unique_id = "20250719_003746_3355037c-77e8-4595-85e9-7b1fd94e8bff"
CFG_OUT     = Path(f"output/config_{cfg_unique_id}.yaml")

# ——— Load configs ———
# cfg_out: config saved by the training run; config: this notebook's base config.
with open(CFG_OUT, 'r') as out_fh:
    cfg_out = yaml.safe_load(out_fh)
with open(CFG_PATH, 'r') as base_fh:
    config = yaml.safe_load(base_fh)

# The training run's layer settings and latent sizes take precedence over the
# base config (hidden_dim is intentionally NOT copied over).
_trained_model_cfg = cfg_out['model']
config['model']['layers'] = _trained_model_cfg.get('layers', 'last')
config['model']['layer_combine'] = _trained_model_cfg.get('layer_combine', 'mean')
config['d_g'] = cfg_out['d_g']
config['d_f'] = cfg_out['d_f']
ENC_LLM_NAME = _trained_model_cfg['name']

# ——— Logging & seeds ———
logging.basicConfig(
    format='%(asctime)s — %(name)s — %(levelname)s — %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Seed every RNG source with the experiment seed, in the same order as before.
seed = config['experiment']['seed']
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed, set_seed):
    _seed_fn(seed)

logger.info("Session setup complete.")


## Load Data

### Re-split and reorganize by category

In [5]:
# ==== Cell: [Data Loading & Preprocessing] ====

def load_jsonl(path: str) -> list:
    """Read a JSON-Lines file and return its records as a list.

    Blank lines and lines whose first non-whitespace character is ``#`` are
    skipped. The file is decoded as UTF-8 explicitly so the result does not
    depend on the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-skipped line is not valid JSON.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                continue
            records.append(json.loads(stripped))
    return records

# Load the eight JSONL splits named in the config, then re-partition them by
# category: one third of the categories shared by all four families becomes the
# OOD set and the remaining shared categories become the ID set.
data_cfg = config['data']
rawF_id            = load_jsonl(data_cfg["input_path_varyFraming_id"])
rawG_id            = load_jsonl(data_cfg["input_path_varyGoal_id"])
rawF_benign_id     = load_jsonl(data_cfg["input_path_varyFraming_benign_id"])
rawG_benign_id     = load_jsonl(data_cfg["input_path_varyGoal_benign_id"])
rawF_ood           = load_jsonl(data_cfg["input_path_varyFraming_ood"])
rawG_ood           = load_jsonl(data_cfg["input_path_varyGoal_ood"])
rawF_benign_ood    = load_jsonl(data_cfg["input_path_varyFraming_benign_ood"])
rawG_benign_ood    = load_jsonl(data_cfg["input_path_varyGoal_benign_ood"])


all_F_benign   = rawF_benign_id + rawF_benign_ood
all_G_benign   = rawG_benign_id + rawG_benign_ood
all_F_jailbrks = rawF_id        + rawF_ood
all_G_jailbrks = rawG_id        + rawG_ood

categories_F_benign = {x['category'] for x in all_F_benign}
categories_G_benign = {x['category'] for x in all_G_benign}
categories_F_jailbrks = {x['category'] for x in all_F_jailbrks}
categories_G_jailbrks = {x['category'] for x in all_G_jailbrks}
# Only categories present in all four families can be split consistently.
intersection = (
    categories_F_benign
    & categories_G_benign
    & categories_F_jailbrks
    & categories_G_jailbrks
)
import random
random.seed(seed)
# Sort before sampling: iteration order of a set of strings changes between
# interpreter sessions (hash randomisation), so sampling from list(set) was not
# reproducible even with a fixed seed. key=str keeps this safe if category
# values are not all of one type.
ood_cats = set(random.sample(sorted(intersection, key=str), len(intersection) // 3))
id_cats = intersection.difference(ood_cats)


def _keep_categories(entries, cats):
    """Return the entries whose 'category' is in `cats`, preserving order."""
    return [entry for entry in entries if entry['category'] in cats]


# ID files keep only ID categories; OOD files keep only OOD categories.
rawF_id            = _keep_categories(rawF_id, id_cats)
rawG_id            = _keep_categories(rawG_id, id_cats)
rawF_benign_id     = _keep_categories(rawF_benign_id, id_cats)
rawG_benign_id     = _keep_categories(rawG_benign_id, id_cats)
rawF_ood           = _keep_categories(rawF_ood, ood_cats)
rawG_ood           = _keep_categories(rawG_ood, ood_cats)
rawF_benign_ood    = _keep_categories(rawF_benign_ood, ood_cats)
rawG_benign_ood    = _keep_categories(rawG_benign_ood, ood_cats)

In [6]:
# Merge the vary-framing and vary-goal files into one pool per (label, split).
benign_id    = [*rawF_benign_id, *rawG_benign_id]
jailbrks_id  = [*rawF_id, *rawG_id]
benign_ood   = [*rawF_benign_ood, *rawG_benign_ood]
jailbrks_ood = [*rawF_ood, *rawG_ood]

print(f"{len(benign_id)=}, {len(jailbrks_id)=}")
print(f"{len(benign_ood)=}, {len(jailbrks_ood)=}")

# Down-sample the larger ID class so both ID classes have the same size
# (the OOD pools are left unbalanced).
min_id = min(map(len, (benign_id, jailbrks_id)))
benign_id   = random.sample(benign_id, min_id)
jailbrks_id = random.sample(jailbrks_id, min_id)


def get_prompts(items):
    """Return the "prompt" field of every record in `items`."""
    return [e["prompt"] for e in items]


ben_ID, jb_ID, ben_OOD, jb_OOD = map(
    get_prompts, (benign_id, jailbrks_id, benign_ood, jailbrks_ood)
)


### Keep the same splits

In [7]:
# ==== Cell: [Data Loading & Preprocessing] ====

def load_jsonl(path: str) -> list:
    """Read a JSON-Lines file and return its records as a list.

    Blank lines and lines whose first non-whitespace character is ``#`` are
    skipped. The file is decoded as UTF-8 explicitly so the result does not
    depend on the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-skipped line is not valid JSON.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                continue
            records.append(json.loads(stripped))
    return records

# Load the eight JSONL splits exactly as named in the config (no re-splitting).
data_cfg = config['data']
(
    rawF_id,
    rawG_id,
    rawF_benign_id,
    rawG_benign_id,
    rawF_ood,
    rawG_ood,
    rawF_benign_ood,
    rawG_benign_ood,
) = [
    load_jsonl(data_cfg[_cfg_key])
    for _cfg_key in (
        "input_path_varyFraming_id",
        "input_path_varyGoal_id",
        "input_path_varyFraming_benign_id",
        "input_path_varyGoal_benign_id",
        "input_path_varyFraming_ood",
        "input_path_varyGoal_ood",
        "input_path_varyFraming_benign_ood",
        "input_path_varyGoal_benign_ood",
    )
]

# Merge the vary-framing and vary-goal files into one pool per (label, split).
benign_id    = [*rawF_benign_id, *rawG_benign_id]
jailbrks_id  = [*rawF_id, *rawG_id]
benign_ood   = [*rawF_benign_ood, *rawG_benign_ood]
jailbrks_ood = [*rawF_ood, *rawG_ood]

print(f"{len(benign_id)=}, {len(jailbrks_id)=}")
print(f"{len(benign_ood)=}, {len(jailbrks_ood)=}")

# Down-sample the larger ID class so both ID classes have the same size.
min_id = min(map(len, (benign_id, jailbrks_id)))
benign_id   = random.sample(benign_id, min_id)
jailbrks_id = random.sample(jailbrks_id, min_id)


def get_prompts(items):
    """Return the "prompt" field of every record in `items`."""
    return [e["prompt"] for e in items]


ben_ID, jb_ID, ben_OOD, jb_OOD = map(
    get_prompts, (benign_id, jailbrks_id, benign_ood, jailbrks_ood)
)



#### Load original PAIR prompts

In [None]:
import os
import json
from pathlib import Path

def load_all_pair_prompts(black_box_dir="./data/artifacts/attack-artifacts/PAIR/black_box/"):
    """Collect every non-null jailbreak prompt from the ``*.json`` files in a directory.

    Each file is expected to hold a JSON object; entries are read from its
    optional ``"jailbreaks"`` list and an entry contributes its ``"prompt"``
    unless that value is missing or ``None``.

    Files are visited in sorted path order so the returned list is the same on
    every run (glob order is otherwise filesystem-dependent), and are decoded
    as UTF-8 regardless of the platform locale.

    Args:
        black_box_dir: directory to scan (non-recursive).

    Returns:
        A list of prompts, ordered by file path and then by position in the file.
    """
    prompts = []
    for file in sorted(Path(black_box_dir).glob("*.json")):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
        prompts.extend(
            jb["prompt"]
            for jb in data.get("jailbreaks", [])
            if jb.get("prompt") is not None
        )
    return prompts

# Gather the PAIR prompts from the default black-box artifacts directory.
all_pair_prompts = load_all_pair_prompts()

## Load Model

In [8]:
# Display the decomposer checkpoint path derived from UNIQUE_ID.
DECOMP_CKPT

In [9]:
# # ==== Cell: [Model & Decomposer Initialization] ====

# load LLM encoder
# The model name comes from the training run's saved config (ENC_LLM_NAME).
model_llm, tokenizer = load_model(ENC_LLM_NAME, device=device)
# NOTE: `layers` is pinned to "last" here rather than read from the config;
# the layer-specific encoder is built later, once enc_layer has been chosen.
encoder = HFEncoder_notPooled(
    model=model_llm,
    tokenizer=tokenizer,
    device=device,
    layers="last",#config['model']['layers'],
    layer_combine=config['model']['layer_combine'],
)

# # load decomposer weights
# ckpt = torch.load(DECOMP_CKPT / "weights.pt", map_location=device)
# enc_dim_ckpt = ckpt["Wg.0.weight"].shape[1]

# decomposer = NonlinearDecomposer(
#     enc_dim    = enc_dim_ckpt,
#     d_g        = config['d_g'],
#     d_f        = config['d_f'],
#     hidden_dim = config.get('hidden_dim', 1024),
#     dropout    = config.get('dropout', 0.1),
# ).to(device)
# # decomposer = NonlinearDecomposer_tiny(
# #     enc_dim    = enc_dim_ckpt,
# # ).to(device)

# decomposer.load_state_dict(ckpt)
# decomposer.half().eval()
# decomposer.eval()
# encoder.eval()

# logger.info(f"Loaded encoder + decomposer (enc_dim={enc_dim_ckpt}).")


#### Find critical layer - nsp dist

In [10]:
# Map decomposer layer index -> checkpoint directory for one training run.
RUN_ID_dec = "20250719" # UNIQUE_ID
import glob
import os
import re

pattern = re.compile(r"decomposer_layer(\d+)_")
ckpt_by_layer = {}
# Sort the matches: glob returns paths in arbitrary order, so when several
# directories match the same layer the winner used to be filesystem-dependent.
# With sorting, the lexicographically last path deterministically wins.
# NOTE(review): presumably that is the most recent run if names embed a
# timestamp after the layer number — confirm.
for p in sorted(glob.glob(f"./checkpoints/decomposer_simple/decomposer_layer*{RUN_ID_dec}*")):
    m = pattern.search(os.path.basename(p))
    if m:
        ckpt_by_layer[int(m.group(1))] = p

In [11]:
# Score every per-layer decomposer on a small calibration set using the
# distance-based criterion (criterion='cohen_d').
N_CAL = 100              # or leave tunable in YAML
cal_benign   = random.sample(ben_ID,    N_CAL//2)
cal_jailbreak= random.sample(jb_ID,  N_CAL//2)

from utils.critial_layer import find_critical_layers, find_critical_layers_dist

layers = config["model"]["layers"]
if layers == 'all':
    num_layers = model_llm.config.num_hidden_layers
    layers = list(range(num_layers))
elif isinstance(layers, int):
    # A single layer index: wrap it so the loop below sees a list.
    layers = [layers]
elif isinstance(layers, str):
    # Any other string (e.g. the 'last' default) would be iterated character by
    # character and fail with an opaque KeyError on the checkpoint lookup.
    raise ValueError(
        f"config['model']['layers']={layers!r} is not supported here; "
        "use 'all', an int, or a list of layer indices"
    )

# Fail before any expensive work if a requested layer has no checkpoint.
_missing_layers = [layer for layer in layers if layer not in ckpt_by_layer]
if _missing_layers:
    raise KeyError(f"No decomposer checkpoint found for layer(s) {_missing_layers}")

cl_outs = dict()
best_g, best_f = dict(), dict()
for layer in layers:
    # ==== Cell: [Model & Decomposer Initialization] ====

    # load decomposer weights
    ckpt = torch.load(f"{ckpt_by_layer[layer]}/weights.pt", map_location=device)
    # The encoder width is read off the first goal-branch weight matrix.
    enc_dim_ckpt = ckpt["Wg.0.weight"].shape[1]

    decomposer = NonlinearDecomposer(
        enc_dim    = enc_dim_ckpt,
        d_g        = config['d_g'],
        d_f        = config['d_f'],
        hidden_dim = config.get('hidden_dim', 1024),
        dropout    = config.get('dropout', 0.1),
    ).to(device)

    decomposer.load_state_dict(ckpt)
    # Half precision, inference mode (the redundant second .eval() was dropped).
    decomposer.half().eval()

    print(f"\n\n\nlayer {layer}:\n")
    with torch.no_grad():
        # Only this decomposer's own layer is passed as the encoder candidate.
        cl_outs[layer] = find_critical_layers_dist(HFEncoder_notPooled, model_llm, tokenizer, device,
                                                decomposer, cal_benign, cal_jailbreak, [layer],
                                                criterion='cohen_d')
        best_g[layer] = {"encoder_l": cl_outs[layer]["best_layer_goal"], "score": cl_outs[layer]["score_goal"]}
        best_f[layer] = {"encoder_l": cl_outs[layer]["best_layer_framing"], "score": cl_outs[layer]["score_framing"]}
        print(f"Best G: {best_g[layer]}, Best F: {best_f[layer]}")



#### Find critical layer - cosine

In [None]:
# # Find the first checkpoint path matching the pattern with "layer" in the name
# import glob
# ckpt_candidates = glob.glob("./checkpoints/decomposer_simple/decomposer_*layer*")

# Map decomposer layer index -> checkpoint directory across ALL runs on disk
# (no run-id filter in this variant).
import glob
import os
import re

pattern = re.compile(r"decomposer_layer(\d+)_")
ckpt_by_layer = {}
# Sort the matches: glob order is arbitrary, and without a run-id filter
# several directories can match the same layer. With sorting, the
# lexicographically last path for each layer deterministically wins.
for p in sorted(glob.glob("./checkpoints/decomposer_simple/decomposer_layer*")):
    m = pattern.search(os.path.basename(p))
    if m:
        ckpt_by_layer[int(m.group(1))] = p


In [None]:
# Score every per-layer decomposer on a small calibration set using the
# cosine-based criterion (find_critical_layers).
N_CAL = 100              # or leave tunable in YAML
cal_benign   = random.sample(ben_ID,    N_CAL//2)
cal_jailbreak= random.sample(jb_ID,  N_CAL//2)

from utils.critial_layer import find_critical_layers

layers = config["model"]["layers"]
if layers == 'all':
    num_layers = model_llm.config.num_hidden_layers
    layers = list(range(num_layers))
elif isinstance(layers, int):
    # A single layer index: wrap it so the loop below sees a list.
    layers = [layers]
elif isinstance(layers, str):
    # Any other string (e.g. the 'last' default) would be iterated character by
    # character and fail with an opaque KeyError on the checkpoint lookup.
    raise ValueError(
        f"config['model']['layers']={layers!r} is not supported here; "
        "use 'all', an int, or a list of layer indices"
    )

# Fail before any expensive work if a requested layer has no checkpoint.
_missing_layers = [layer for layer in layers if layer not in ckpt_by_layer]
if _missing_layers:
    raise KeyError(f"No decomposer checkpoint found for layer(s) {_missing_layers}")

cl_outs = dict()
best_g, best_f = dict(), dict()
for layer in layers:
    # ==== Cell: [Model & Decomposer Initialization] ====

    # load decomposer weights
    ckpt = torch.load(f"{ckpt_by_layer[layer]}/weights.pt", map_location=device)
    # The encoder width is read off the first goal-branch weight matrix.
    enc_dim_ckpt = ckpt["Wg.0.weight"].shape[1]

    decomposer = NonlinearDecomposer(
        enc_dim    = enc_dim_ckpt,
        d_g        = config['d_g'],
        d_f        = config['d_f'],
        hidden_dim = config.get('hidden_dim', 1024),
        dropout    = config.get('dropout', 0.1),
    ).to(device)

    decomposer.load_state_dict(ckpt)
    # Half precision, inference mode (the redundant second .eval() was dropped).
    decomposer.half().eval()

    print(f"\n\n\nlayer {layer}:\n")
    with torch.no_grad():
        # Unlike the distance variant, ALL candidate encoder layers are passed
        # for each decomposer.
        cl_outs[layer] = find_critical_layers(HFEncoder_notPooled, model_llm, tokenizer, device,
                                                decomposer, cal_benign, cal_jailbreak, layers)
        # "score" mirrors "Δ" so the inspect/sort cells, which read ['score'],
        # work after this cell too instead of raising KeyError.
        # NOTE(review): those cells sort descending — assumes larger Δ is better; confirm.
        best_g[layer] = {"encoder_l": cl_outs[layer]["l_g"], "Δ": cl_outs[layer]["Δ_g"],
                         "score": cl_outs[layer]["Δ_g"]}
        best_f[layer] = {"encoder_l": cl_outs[layer]["l_f"], "Δ": cl_outs[layer]["Δ_f"],
                         "score": cl_outs[layer]["Δ_f"]}
        print(f"Best G: {best_g[layer]}, Best F: {best_f[layer]}")



### Inspect each layer

In [12]:
# Print, for every decomposer layer, the best encoder layer and score found
# for the goal (G) and framing (F) components.
for l, _f_best in best_f.items():
    print(f"\nDec Layer {l}:")
    _g_best = best_g[l]
    print(f"G   |   enc layer: {_g_best['encoder_l']}, score: {_g_best['score']}")
    print(f"F   |   enc layer: {_f_best['encoder_l']}, score: {_f_best['score']}")

### Load selected encoder and decomposer

In [13]:
# Rank decomposer layers by score, best first. Each element is a
# (decomposer_layer, {"encoder_l": ..., "score": ...}) tuple.
best_g_tups = sorted(best_g.items(), key=lambda x: x[1]['score'], reverse=True)
best_f_tups = sorted(best_f.items(), key=lambda x: x[1]['score'], reverse=True)
num_layers = model_llm.config.num_hidden_layers

In [14]:
# Highest-scoring framing entry whose decomposer-layer key lies in the upper
# half of the network. Raises IndexError if no layer satisfies the filter.
# NOTE: this value is overwritten by the hard-coded assignments two cells below.
enc_layer = [l for l, _ in best_f_tups if l > num_layers//2][0]

In [15]:
# Display the automatically selected layer.
enc_layer

In [16]:
# Manual override of the selected layers. Assignments run top to bottom, so
# the values in effect after this cell are the last uncommented pair (17 / 17).
enc_layer =  7 # 25 # for f by cosine
dec_layer = 7 # 25 # for f by cosine
# enc_layer =  18 # for g by cosine
# dec_layer = 18 # for g by cosine

enc_layer =  17# 17 # 9 # for f by cohen_d
dec_layer = 17# 17 # 9 # for f by cohen_d
# enc_layer =  9 # 9 # for f by cohen_d
# dec_layer = 9 # 9 # for f by cohen_d
# # enc_layer =  5 # 17 # for g by cohen_d
# # dec_layer = 5 # 17 # for g by cohen_d

In [17]:
# Display the decomposer layer that will be loaded.
dec_layer

In [18]:
# Find the first checkpoint path matching the pattern with "layer" in the name
# import glob
# ckpt_candidates = glob.glob("./checkpoints/decomposer_simple/decomposer_*layer*")
# # Find the first checkpoint path matching the pattern with "layer" in the name
# import glob
# ckpt_candidates = glob.glob("./checkpoints/decomposer_simple/decomposer_*layer*")

# Map decomposer layer index -> checkpoint directory for one training run.
RUN_ID_dec = "20250719" # UNIQUE_ID
import glob
import os
import re

pattern = re.compile(r"decomposer_layer(\d+)_")
ckpt_by_layer = {}
# Sort the matches: glob returns paths in arbitrary order, so when several
# directories match the same layer the winner used to be filesystem-dependent.
# With sorting, the lexicographically last path deterministically wins.
for p in sorted(glob.glob(f"./checkpoints/decomposer_simple/decomposer_layer*{RUN_ID_dec}*")):
    m = pattern.search(os.path.basename(p))
    if m:
        ckpt_by_layer[int(m.group(1))] = p

# Build the encoder that reads only the chosen layer (enc_layer).
model_llm, tokenizer = load_model(ENC_LLM_NAME, device=device)
_encoder_kwargs = dict(
    model=model_llm,
    tokenizer=tokenizer,
    device=device,
    layers=[enc_layer],
    layer_combine=config['model']['layer_combine'],
)
encoder = HFEncoder_notPooled(**_encoder_kwargs)

# Restore the decomposer trained for dec_layer; its input width is read off
# the first goal-branch weight matrix in the checkpoint.
ckpt = torch.load(f"{ckpt_by_layer[dec_layer]}/weights.pt", map_location=device)
enc_dim_ckpt = ckpt["Wg.0.weight"].shape[1]

_decomposer_kwargs = dict(
    enc_dim=enc_dim_ckpt,
    d_g=config['d_g'],
    d_f=config['d_f'],
    hidden_dim=config.get('hidden_dim', 1024),
    dropout=config.get('dropout', 0.1),
)
decomposer = NonlinearDecomposer(**_decomposer_kwargs).to(device)
# (alternative kept for reference: NonlinearDecomposer_tiny(enc_dim=enc_dim_ckpt).to(device))

decomposer.load_state_dict(ckpt)
# Half precision + inference mode; the trailing expression also shows the
# module as the cell output.
decomposer.half().eval()
decomposer.eval()

## Build spaces

In [19]:
# True  -> detection uses the framing component (framing_vecs);
# False -> detection uses the goal component (goal_vecs).
DETECT_VIA_FRAMING = True

# Disabled: automatic layer selection tied to the switch above. The layers are
# currently taken from the manual override cell instead.
# if DETECT_VIA_FRAMING:
#     enc_layer = [l for l, _ in best_f_tups if l > num_layers//2][0]
# else:
#     enc_layer = [l for l, _ in best_g_tups if l > num_layers//2][0]
# dec_layer = enc_layer

In [20]:
# ==== Cell: [Build Framing Vectors] ====

@torch.no_grad()
def framing_vecs(texts: list[str], batch_size: int = 32) -> torch.Tensor:
    """Encode `texts` in batches and return their framing vectors on the CPU.

    Each batch goes through the module-level `encoder` and `decomposer`; the
    second of the decomposer's three outputs is kept. A 3-D output is averaged
    over dim 1 before the per-batch results are concatenated along dim 0.
    """
    chunks = []
    for start in range(0, len(texts), batch_size):
        hidden = encoder(texts[start:start + batch_size])
        _goal, framing, _extra = decomposer(hidden)
        if framing.dim() == 3:
            # assumes (batch, tokens, dim): pool over the token axis — TODO confirm
            framing = framing.mean(dim=1)
        chunks.append(framing.cpu())
    return torch.cat(chunks, dim=0)

@torch.no_grad()
def goal_vecs(texts: list[str], batch_size: int = 32) -> torch.Tensor:
    """Encode `texts` in batches and return their goal vectors on the CPU.

    Each batch goes through the module-level `encoder` and `decomposer`; the
    first of the decomposer's three outputs (the goal component) is kept. A 3-D
    output is averaged over dim 1 before the per-batch results are concatenated
    along dim 0.
    """
    chunks = []
    for start in range(0, len(texts), batch_size):
        hidden = encoder(texts[start:start + batch_size])
        goal, _framing, _extra = decomposer(hidden)
        if goal.dim() == 3:
            # assumes (batch, tokens, dim): pool over the token axis — TODO confirm
            goal = goal.mean(dim=1)
        chunks.append(goal.cpu())
    return torch.cat(chunks, dim=0)


# Choose the extractor (and matching log line) once, then embed all four splits.
if DETECT_VIA_FRAMING:
    _vec_fn, _done_msg = framing_vecs, "Built framing vectors for all splits."
else:
    _vec_fn, _done_msg = goal_vecs, "Built goal vectors for all splits."

v_ben_ID, v_jb_ID, v_ben_OOD, v_jb_OOD = (
    _vec_fn(_prompts) for _prompts in (ben_ID, jb_ID, ben_OOD, jb_OOD)
)
logger.info(_done_msg)


### Whiten and PCA

In [21]:
from sklearn.covariance import LedoitWolf#, EmpiricalCovariance



def fit_whiten_pca(X: np.ndarray, alpha: float = 0.9):
    """Fit a whitening transform followed by a PCA subspace on the rows of `X`.

    Steps:
      1. Centre `X` and estimate its covariance, with 1e-5 added on the
         diagonal for numerical stability.
      2. Build `W`, the symmetric inverse square root of that covariance, from
         its eigendecomposition.
      3. Run PCA on the whitened data and keep the leading components until
         their cumulative explained-variance ratio reaches `alpha`.

    Returns:
        dict with "mu" (1 x D mean), "W" (D x D whitening matrix) and
        "P" (D x r matrix whose columns are the retained components).
    """
    n_features = X.shape[1]
    mu = X.mean(axis=0, keepdims=True)
    centered = X - mu

    cov = np.cov(centered, rowvar=False) + 1e-5*np.eye(n_features)
    eigvals, eigvecs = np.linalg.eigh(cov)
    W = eigvecs @ np.diag(eigvals**-0.5) @ eigvecs.T
    whitened = centered @ W.T

    pca = PCA(whiten=False).fit(whitened)
    cum_ratio = np.cumsum(pca.explained_variance_ratio_)
    # First index at which the cumulative ratio reaches alpha, as a count.
    n_keep = int(np.searchsorted(cum_ratio, alpha)) + 1

    return {"mu": mu, "W": W, "P": pca.components_[:n_keep].T}

# Fit one detector per explained-variance target, always on the benign-ID vectors.
# detector = fit_whiten_pca(v_ben_ID.numpy(), alpha=0.9)
# alphas = [.5,.6,0.7,0.8,0.9,.95,.99]
alphas = [0.7,0.9]
detectors = {}
for a in alphas:
    detectors[a] = fit_whiten_pca(v_ben_ID.numpy(), alpha=a)

# pickle.dump(detector, open("checkpoints/nsp_detector.pkl","wb"))

def residual_vec(V: np.ndarray, det):
    """Return what is left of each row of `V` after removing the detector subspace.

    Rows are centred with det["mu"], whitened with det["W"], and then the part
    lying in the span of det["P"]'s columns is subtracted.
    """
    basis = det["P"]
    whitened = (V - det["mu"]) @ det["W"].T
    coeffs = basis.T @ whitened.T          # coordinates inside the kept subspace
    in_subspace = basis @ coeffs
    return whitened - in_subspace.T

# Precompute, per detector, the unit-normalised benign-ID residuals and their
# mean direction (the centroid used by the cosine score).
R_ben_IDs, centroids = {}, {}
for a, _det in detectors.items():
    _resid = residual_vec(v_ben_ID.numpy(), _det)
    # 1e-9 guards against division by zero for all-zero residual rows.
    R_ben_IDs[a] = _resid / (np.linalg.norm(_resid, axis=1, keepdims=True) + 1e-9)
    centroids[a] = R_ben_IDs[a].mean(axis=0, keepdims=True)


### Mahalanobis score (as an alternative to whitening and PCA)

### With cosine

#### Without Val Set

In [None]:
# ==== Cell: [Detection – Cosine-based NSP] ====
def cos_score(V: np.ndarray):
    """Score rows of `V` against the benign centroid, once per fitted detector.

    For each entry of the module-level `detectors`, residuals are computed and
    scaled to unit length, and the score is one minus their dot product with
    that detector's entry in `centroids`.

    Returns:
        dict mapping each detector key to a 1-D array with one score per row.
    """
    scores = {}
    for a, det in detectors.items():
        resid = residual_vec(V, det)
        unit = resid / (np.linalg.norm(resid, axis=1, keepdims=True) + 1e-9)
        scores[a] = 1.0 - (unit * centroids[a]).sum(axis=1)
    return scores

# threshold: 95th percentile of the benign-ID cosine scores, per detector.
# cos_score() already returns scores for every detector, so call it once
# instead of once per alpha as the comprehension used to do.
_ben_ID_cos_scores = cos_score(v_ben_ID.numpy())
taus = {a: np.percentile(_ben_ID_cos_scores[a], 95)
       for a in detectors}
for a in taus:
    print(f"cosine-score τ (95% benign-ID with alpha={a}) = {taus[a]:.4f}")

# evaluation
def eval_split(v_ben, v_jb, name):
    y    = np.concatenate([np.zeros(len(v_ben)), np.ones(len(v_jb))])
    s    = {a: np.concatenate([cos_score(v_ben.numpy())[a],
                                cos_score(v_jb.numpy())[a]])
            for a in alphas}
    au   = {a: roc_auc_score(y, s[a])
            for a in s}
    tprs  = {a: (s[a][len(v_ben):] > taus[a]).mean() for a in taus}
    fprs  = {a: (s[a][:len(v_ben)] > taus[a]).mean() for a in taus}
    for a in taus:
        print(f"alpha={a:>2} , {name:>6} | AUROC {au[a]:.3f}  TPR@τ {tprs[a]:.3f}  FPR@τ {fprs[a]:.3f}")

eval_split(v_ben_ID,  v_jb_ID,  "ID")
eval_split(v_ben_OOD, v_jb_OOD, "OOD")


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def acc_f1(v_ben, v_jb, name):
    """Print accuracy, precision, recall and F1 at the cosine threshold for one split.

    Args:
        v_ben: benign vectors (label 0); must expose ``.numpy()``.
        v_jb: jailbreak vectors (label 1, the positive class); must expose ``.numpy()``.
        name: split label used in the printed line.

    Results are reported for every key of the module-level `taus`. Earlier
    versions looped over the notebook-global `alphas`, which other cells
    rebind, so a stale `alphas` could raise KeyError here.
    """
    y_true = np.concatenate([np.zeros(len(v_ben)), np.ones(len(v_jb))])
    # cos_score() covers every detector in one call; compute each side once.
    s_ben = cos_score(v_ben.numpy())
    s_jb = cos_score(v_jb.numpy())
    for a, tau in taus.items():
        scores = np.concatenate([s_ben[a], s_jb[a]])
        y_pred = (scores > tau).astype(int)
        acc = accuracy_score(y_true, y_pred)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary", pos_label=1, zero_division=0
        )
        print(f"alpha={a:>2} , {name:>6} | Acc {acc:.3f}  Prec {prec:.3f}  Rec {rec:.3f}  F1 {f1:.3f}")

acc_f1(v_ben_ID,  v_jb_ID,  "ID")
acc_f1(v_ben_OOD, v_jb_OOD, "OOD")


### With L2

#### With Val Set 

In [22]:
# ==== Cell: [Detection – χ²-based L2 NSP (with validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Hold out 25% of the ID vectors (benign and jailbreak) for validation.
ben_ID_train, ben_ID_val = train_test_split(v_ben_ID, test_size=0.25, random_state=42)
jb_ID_train,  jb_ID_val  = train_test_split(v_jb_ID,  test_size=0.25, random_state=42)

# 2) Fit one detector per alpha on the benign training part only, and
# 3) derive its threshold: sqrt of the 95% chi-square quantile whose degrees of
#    freedom equal the number of dimensions left outside the PCA subspace.
# alphas = [0.7,.8, .85, 0.9, .95]
alphas = [0.7, 0.9]
detectors_l2_val, chis_df_val, taus_l2_val = {}, {}, {}
_n_features = ben_ID_train.shape[1]
for a in alphas:
    _det = fit_whiten_pca(ben_ID_train.numpy(), alpha=a)
    detectors_l2_val[a] = _det
    chis_df_val[a] = _n_features - _det["P"].shape[1]
    taus_l2_val[a] = chi2.ppf(0.95, df=chis_df_val[a]) ** 0.5

for a in alphas:
    print(f"χ²-based τ (with val, alpha={a}) = {taus_l2_val[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Return the L2 norm of each row's residual under detector `det`."""
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_with_val(v_ben, v_jb, name):
    """Print detection metrics of the L2 residual score for one split.

    Args:
        v_ben: benign vectors (label 0); must expose ``.numpy()``.
        v_jb: jailbreak vectors (label 1, the positive class); must expose ``.numpy()``.
        name: split label used in the printed line.

    One line is printed per entry of `detectors_l2_val`, thresholded at the
    matching `taus_l2_val` value. Iterating the detectors themselves (rather
    than the notebook-global `alphas`, which other cells rebind) keeps the
    scores, thresholds and report in step with each other.
    """
    n_ben = len(v_ben)
    y_true = np.concatenate([np.zeros(n_ben), np.ones(len(v_jb))])
    # Convert once instead of once per alpha.
    V_ben, V_jb = v_ben.numpy(), v_jb.numpy()
    for a, det in detectors_l2_val.items():
        scores = np.concatenate([nsp_score_l2(V_ben, det), nsp_score_l2(V_jb, det)])
        y_pred = (scores > taus_l2_val[a]).astype(int)

        au = roc_auc_score(y_true, scores)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(y_true, y_pred)
        # Rates are means of the 0/1 predictions over the relevant class slice.
        tpr = y_pred[n_ben:].mean()
        fpr = y_pred[:n_ben].mean()
        fnr = (1 - y_pred[n_ben:]).mean()
        tnr = (1 - y_pred[:n_ben]).mean()
        print(f"alpha={a:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} AUROC {au:.3f} Prec {prec:.3f}  Rec {rec:.3f} TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}")

# Eval on held-out ID val and full OOD
eval_l2_with_val(ben_ID_val,  jb_ID_val,  "ID-val")
eval_l2_with_val(v_ben_OOD,  v_jb_OOD,  "OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (with validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Hold out 20% of the ID vectors (benign and jailbreak) for validation.
ben_ID_train, ben_ID_val = train_test_split(v_ben_ID, test_size=0.2, random_state=42)
jb_ID_train,  jb_ID_val  = train_test_split(v_jb_ID,  test_size=0.2, random_state=42)

# 2) Fit one detector per alpha on the benign training part only, and
# 3) derive its threshold: sqrt of the 95% chi-square quantile whose degrees of
#    freedom equal the number of dimensions left outside the PCA subspace.
# alphas = [0.7,.8, .85, 0.9, .95]
alphas = [0.7, 0.9]
detectors_l2_val, chis_df_val, taus_l2_val = {}, {}, {}
_n_features = ben_ID_train.shape[1]
for a in alphas:
    _det = fit_whiten_pca(ben_ID_train.numpy(), alpha=a)
    detectors_l2_val[a] = _det
    chis_df_val[a] = _n_features - _det["P"].shape[1]
    taus_l2_val[a] = chi2.ppf(0.95, df=chis_df_val[a]) ** 0.5

for a in alphas:
    print(f"χ²-based τ (with val, alpha={a}) = {taus_l2_val[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Return the L2 norm of each row's residual under detector `det`."""
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_with_val(v_ben, v_jb, name):
    """Print detection metrics of the L2 residual score for one split.

    Args:
        v_ben: benign vectors (label 0); must expose ``.numpy()``.
        v_jb: jailbreak vectors (label 1, the positive class); must expose ``.numpy()``.
        name: split label used in the printed line.

    One line is printed per entry of `detectors_l2_val`, thresholded at the
    matching `taus_l2_val` value. Iterating the detectors themselves (rather
    than the notebook-global `alphas`, which other cells rebind) keeps the
    scores, thresholds and report in step with each other.
    """
    n_ben = len(v_ben)
    y_true = np.concatenate([np.zeros(n_ben), np.ones(len(v_jb))])
    # Convert once instead of once per alpha.
    V_ben, V_jb = v_ben.numpy(), v_jb.numpy()
    for a, det in detectors_l2_val.items():
        scores = np.concatenate([nsp_score_l2(V_ben, det), nsp_score_l2(V_jb, det)])
        y_pred = (scores > taus_l2_val[a]).astype(int)

        au = roc_auc_score(y_true, scores)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(y_true, y_pred)
        # Rates are means of the 0/1 predictions over the relevant class slice.
        tpr = y_pred[n_ben:].mean()
        fpr = y_pred[:n_ben].mean()
        fnr = (1 - y_pred[n_ben:]).mean()
        tnr = (1 - y_pred[:n_ben]).mean()
        print(f"alpha={a:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} AUROC {au:.3f} Prec {prec:.3f}  Rec {rec:.3f} TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}")

# Eval on held-out ID val and full OOD
eval_l2_with_val(ben_ID_val,  jb_ID_val,  "ID-val")
eval_l2_with_val(v_ben_OOD,  v_jb_OOD,  "OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (with validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Hold out 20% of the ID vectors (benign and jailbreak) for validation.
ben_ID_train, ben_ID_val = train_test_split(v_ben_ID, test_size=0.2, random_state=42)
jb_ID_train,  jb_ID_val  = train_test_split(v_jb_ID,  test_size=0.2, random_state=42)

# 2) Fit one detector per alpha on the benign training part only, and
# 3) derive its threshold: sqrt of the 95% chi-square quantile whose degrees of
#    freedom equal the number of dimensions left outside the PCA subspace.
# alphas = [0.7,.8, .85, 0.9, .95]
alphas = [.6, 0.7, 0.9]
detectors_l2_val, chis_df_val, taus_l2_val = {}, {}, {}
_n_features = ben_ID_train.shape[1]
for a in alphas:
    _det = fit_whiten_pca(ben_ID_train.numpy(), alpha=a)
    detectors_l2_val[a] = _det
    chis_df_val[a] = _n_features - _det["P"].shape[1]
    taus_l2_val[a] = chi2.ppf(0.95, df=chis_df_val[a]) ** 0.5

for a in alphas:
    print(f"χ²-based τ (with val, alpha={a}) = {taus_l2_val[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Return the L2 norm of each row's residual under detector `det`."""
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_with_val(v_ben, v_jb, name):
    """Print detection metrics of the L2 residual score for one split.

    Args:
        v_ben: benign vectors (label 0); must expose ``.numpy()``.
        v_jb: jailbreak vectors (label 1, the positive class); must expose ``.numpy()``.
        name: split label used in the printed line.

    One line is printed per entry of `detectors_l2_val`, thresholded at the
    matching `taus_l2_val` value. Iterating the detectors themselves (rather
    than the notebook-global `alphas`, which other cells rebind) keeps the
    scores, thresholds and report in step with each other.
    """
    n_ben = len(v_ben)
    y_true = np.concatenate([np.zeros(n_ben), np.ones(len(v_jb))])
    # Convert once instead of once per alpha.
    V_ben, V_jb = v_ben.numpy(), v_jb.numpy()
    for a, det in detectors_l2_val.items():
        scores = np.concatenate([nsp_score_l2(V_ben, det), nsp_score_l2(V_jb, det)])
        y_pred = (scores > taus_l2_val[a]).astype(int)

        au = roc_auc_score(y_true, scores)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(y_true, y_pred)
        # Rates are means of the 0/1 predictions over the relevant class slice.
        tpr = y_pred[n_ben:].mean()
        fpr = y_pred[:n_ben].mean()
        fnr = (1 - y_pred[n_ben:]).mean()
        tnr = (1 - y_pred[:n_ben]).mean()
        print(f"alpha={a:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} AUROC {au:.3f} Prec {prec:.3f}  Rec {rec:.3f} TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}")

# Eval on held-out ID val and full OOD
eval_l2_with_val(ben_ID_val,  jb_ID_val,  "ID-val")
eval_l2_with_val(v_ben_OOD,  v_jb_OOD,  "OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (with validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Hold out 20% of the ID benign and ID jailbreak vectors (seed fixed at 42).
ben_ID_train, ben_ID_val = train_test_split(
    v_ben_ID, test_size=0.2, random_state=42
)
jb_ID_train, jb_ID_val = train_test_split(
    v_jb_ID, test_size=0.2, random_state=42
)

# 2) Fit one detector per alpha on the benign training split only.
# Alternative sweep kept for reference: alphas = [0.7, .8, .85, 0.9, .95]
alphas = [0.7, 0.9]
detectors_l2_val = {
    alpha: fit_whiten_pca(ben_ID_train.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df_val = {
    alpha: ben_ID_train.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2_val.items()
}
taus_l2_val = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df_val.items()
}
for a in alphas:
    print(f"χ²-based τ (with val, alpha={a}) = {taus_l2_val[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_with_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2_val``, ``taus_l2_val`` and
    ``alphas``. A sample is predicted as jailbreak (label 1) when its
    ``nsp_score_l2`` score exceeds the threshold for that alpha.

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    labels = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `labels`.
    scores = {}
    for alpha, detector in detectors_l2_val.items():
        benign_scores = nsp_score_l2(v_ben.numpy(), detector)
        jailbreak_scores = nsp_score_l2(v_jb.numpy(), detector)
        scores[alpha] = np.concatenate([benign_scores, jailbreak_scores])

    # Compute every row before printing so nothing is emitted on failure.
    rows = []
    for alpha in alphas:
        flagged = (scores[alpha] > taus_l2_val[alpha]).astype(int)
        auroc = roc_auc_score(labels, scores[alpha])
        prec, rec, f1, _ = precision_recall_fscore_support(
            labels, flagged, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(labels, flagged)
        on_jailbreak = flagged[n_benign:]
        on_benign = flagged[:n_benign]
        tpr = on_jailbreak.mean()
        fpr = on_benign.mean()
        fnr = (1 - on_jailbreak).mean()
        tnr = (1 - on_benign).mean()
        rows.append(
            f"alpha={alpha:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} "
            f"AUROC {auroc:.3f} Prec {prec:.3f}  Rec {rec:.3f} "
            f"TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}"
        )
    for row in rows:
        print(row)

# Report metrics on the held-out ID validation split and on the full OOD set
eval_l2_with_val(v_ben=ben_ID_val, v_jb=jb_ID_val, name="ID-val")
eval_l2_with_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (with validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Hold out 20% of the ID benign and ID jailbreak vectors (seed fixed at 42).
ben_ID_train, ben_ID_val = train_test_split(
    v_ben_ID, test_size=0.2, random_state=42
)
jb_ID_train, jb_ID_val = train_test_split(
    v_jb_ID, test_size=0.2, random_state=42
)

# 2) Fit one detector per alpha on the benign training split only.
alphas = [0.7, 0.9]
detectors_l2_val = {
    alpha: fit_whiten_pca(ben_ID_train.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df_val = {
    alpha: ben_ID_train.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2_val.items()
}
taus_l2_val = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df_val.items()
}
for a in alphas:
    print(f"χ²-based τ (with val, alpha={a}) = {taus_l2_val[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_with_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2_val``, ``taus_l2_val`` and
    ``alphas``. A sample is predicted as jailbreak (label 1) when its
    ``nsp_score_l2`` score exceeds the threshold for that alpha.

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    labels = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `labels`.
    scores = {}
    for alpha, detector in detectors_l2_val.items():
        benign_scores = nsp_score_l2(v_ben.numpy(), detector)
        jailbreak_scores = nsp_score_l2(v_jb.numpy(), detector)
        scores[alpha] = np.concatenate([benign_scores, jailbreak_scores])

    # Compute every row before printing so nothing is emitted on failure.
    rows = []
    for alpha in alphas:
        flagged = (scores[alpha] > taus_l2_val[alpha]).astype(int)
        auroc = roc_auc_score(labels, scores[alpha])
        prec, rec, f1, _ = precision_recall_fscore_support(
            labels, flagged, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(labels, flagged)
        on_jailbreak = flagged[n_benign:]
        on_benign = flagged[:n_benign]
        tpr = on_jailbreak.mean()
        fpr = on_benign.mean()
        fnr = (1 - on_jailbreak).mean()
        tnr = (1 - on_benign).mean()
        rows.append(
            f"alpha={alpha:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} "
            f"AUROC {auroc:.3f} Prec {prec:.3f}  Rec {rec:.3f} "
            f"TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}"
        )
    for row in rows:
        print(row)

# Report metrics on the held-out ID validation split and on the full OOD set
eval_l2_with_val(v_ben=ben_ID_val, v_jb=jb_ID_val, name="ID-val")
eval_l2_with_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (with validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Hold out 20% of the ID benign and ID jailbreak vectors (seed fixed at 42).
ben_ID_train, ben_ID_val = train_test_split(
    v_ben_ID, test_size=0.2, random_state=42
)
jb_ID_train, jb_ID_val = train_test_split(
    v_jb_ID, test_size=0.2, random_state=42
)

# 2) Fit one detector per alpha on the benign training split only.
alphas = [0.7, 0.9]
detectors_l2_val = {
    alpha: fit_whiten_pca(ben_ID_train.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df_val = {
    alpha: ben_ID_train.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2_val.items()
}
taus_l2_val = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df_val.items()
}
for a in alphas:
    print(f"χ²-based τ (with val, alpha={a}) = {taus_l2_val[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_with_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2_val``, ``taus_l2_val`` and
    ``alphas``. A sample is predicted as jailbreak (label 1) when its
    ``nsp_score_l2`` score exceeds the threshold for that alpha.

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    labels = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `labels`.
    scores = {}
    for alpha, detector in detectors_l2_val.items():
        benign_scores = nsp_score_l2(v_ben.numpy(), detector)
        jailbreak_scores = nsp_score_l2(v_jb.numpy(), detector)
        scores[alpha] = np.concatenate([benign_scores, jailbreak_scores])

    # Compute every row before printing so nothing is emitted on failure.
    rows = []
    for alpha in alphas:
        flagged = (scores[alpha] > taus_l2_val[alpha]).astype(int)
        auroc = roc_auc_score(labels, scores[alpha])
        prec, rec, f1, _ = precision_recall_fscore_support(
            labels, flagged, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(labels, flagged)
        on_jailbreak = flagged[n_benign:]
        on_benign = flagged[:n_benign]
        tpr = on_jailbreak.mean()
        fpr = on_benign.mean()
        fnr = (1 - on_jailbreak).mean()
        tnr = (1 - on_benign).mean()
        rows.append(
            f"alpha={alpha:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} "
            f"AUROC {auroc:.3f} Prec {prec:.3f}  Rec {rec:.3f} "
            f"TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}"
        )
    for row in rows:
        print(row)

# Report metrics on the held-out ID validation split and on the full OOD set
eval_l2_with_val(v_ben=ben_ID_val, v_jb=jb_ID_val, name="ID-val")
eval_l2_with_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (with validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1) Hold out 20% of the ID benign and ID jailbreak vectors (seed fixed at 42).
ben_ID_train, ben_ID_val = train_test_split(
    v_ben_ID, test_size=0.2, random_state=42
)
jb_ID_train, jb_ID_val = train_test_split(
    v_jb_ID, test_size=0.2, random_state=42
)

# 2) Fit one detector per alpha on the benign training split only.
alphas = [0.7, 0.9]
detectors_l2_val = {
    alpha: fit_whiten_pca(ben_ID_train.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df_val = {
    alpha: ben_ID_train.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2_val.items()
}
taus_l2_val = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df_val.items()
}
for a in alphas:
    print(f"χ²-based τ (with val, alpha={a}) = {taus_l2_val[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_with_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2_val``, ``taus_l2_val`` and
    ``alphas``. A sample is predicted as jailbreak (label 1) when its
    ``nsp_score_l2`` score exceeds the threshold for that alpha.

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    labels = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `labels`.
    scores = {}
    for alpha, detector in detectors_l2_val.items():
        benign_scores = nsp_score_l2(v_ben.numpy(), detector)
        jailbreak_scores = nsp_score_l2(v_jb.numpy(), detector)
        scores[alpha] = np.concatenate([benign_scores, jailbreak_scores])

    # Compute every row before printing so nothing is emitted on failure.
    rows = []
    for alpha in alphas:
        flagged = (scores[alpha] > taus_l2_val[alpha]).astype(int)
        auroc = roc_auc_score(labels, scores[alpha])
        prec, rec, f1, _ = precision_recall_fscore_support(
            labels, flagged, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(labels, flagged)
        on_jailbreak = flagged[n_benign:]
        on_benign = flagged[:n_benign]
        tpr = on_jailbreak.mean()
        fpr = on_benign.mean()
        fnr = (1 - on_jailbreak).mean()
        tnr = (1 - on_benign).mean()
        rows.append(
            f"alpha={alpha:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} "
            f"AUROC {auroc:.3f} Prec {prec:.3f}  Rec {rec:.3f} "
            f"TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}"
        )
    for row in rows:
        print(row)

# Report metrics on the held-out ID validation split and on the full OOD set
eval_l2_with_val(v_ben=ben_ID_val, v_jb=jb_ID_val, name="ID-val")
eval_l2_with_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


#### Without Val Set

In [5]:
# ==== Cell: [Detection – χ²-based L2 NSP (without validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 2) Fit one detector per alpha on the full ID benign set (no hold-out here).
alphas = [0.6, 0.7, 0.9]
detectors_l2 = {
    alpha: fit_whiten_pca(v_ben_ID.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df = {
    alpha: v_ben_ID.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2.items()
}
taus_l2 = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df.items()
}
for a in alphas:
    print(f"χ²-based τ (without val, alpha={a}) = {taus_l2[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_without_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2``, ``taus_l2`` and ``alphas``.
    A sample is predicted as jailbreak (label 1) when its ``nsp_score_l2``
    score exceeds the threshold for that alpha.

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    labels = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `labels`.
    scores = {}
    for alpha, detector in detectors_l2.items():
        benign_scores = nsp_score_l2(v_ben.numpy(), detector)
        jailbreak_scores = nsp_score_l2(v_jb.numpy(), detector)
        scores[alpha] = np.concatenate([benign_scores, jailbreak_scores])

    # Compute every row before printing so nothing is emitted on failure.
    rows = []
    for alpha in alphas:
        flagged = (scores[alpha] > taus_l2[alpha]).astype(int)
        auroc = roc_auc_score(labels, scores[alpha])
        prec, rec, f1, _ = precision_recall_fscore_support(
            labels, flagged, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(labels, flagged)
        on_jailbreak = flagged[n_benign:]
        on_benign = flagged[:n_benign]
        tpr = on_jailbreak.mean()
        fpr = on_benign.mean()
        fnr = (1 - on_jailbreak).mean()
        tnr = (1 - on_benign).mean()
        rows.append(
            f"alpha={alpha:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} "
            f"AUROC {auroc:.3f} Prec {prec:.3f}  Rec {rec:.3f} "
            f"TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}"
        )
    for row in rows:
        print(row)
 
# Report metrics on the full ID set (its benign half is the data the detectors
# were fit on, so this is not a held-out estimate) and on the full OOD set
eval_l2_without_val(v_ben=v_ben_ID, v_jb=v_jb_ID, name="ID")
eval_l2_without_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (without validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 2) Fit one detector per alpha on the full ID benign set (no hold-out here).
alphas = [0.6, 0.7, 0.9]
detectors_l2 = {
    alpha: fit_whiten_pca(v_ben_ID.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df = {
    alpha: v_ben_ID.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2.items()
}
taus_l2 = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df.items()
}
for a in alphas:
    print(f"χ²-based τ (without val, alpha={a}) = {taus_l2[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_without_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2``, ``taus_l2`` and ``alphas``.
    A sample is predicted as jailbreak (label 1) when its ``nsp_score_l2``
    score exceeds the threshold for that alpha.

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    labels = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `labels`.
    scores = {}
    for alpha, detector in detectors_l2.items():
        benign_scores = nsp_score_l2(v_ben.numpy(), detector)
        jailbreak_scores = nsp_score_l2(v_jb.numpy(), detector)
        scores[alpha] = np.concatenate([benign_scores, jailbreak_scores])

    # Compute every row before printing so nothing is emitted on failure.
    rows = []
    for alpha in alphas:
        flagged = (scores[alpha] > taus_l2[alpha]).astype(int)
        auroc = roc_auc_score(labels, scores[alpha])
        prec, rec, f1, _ = precision_recall_fscore_support(
            labels, flagged, average="binary", pos_label=1, zero_division=0
        )
        acc = accuracy_score(labels, flagged)
        on_jailbreak = flagged[n_benign:]
        on_benign = flagged[:n_benign]
        tpr = on_jailbreak.mean()
        fpr = on_benign.mean()
        fnr = (1 - on_jailbreak).mean()
        tnr = (1 - on_benign).mean()
        rows.append(
            f"alpha={alpha:>2} , {name:>6} | Acc {acc:.3f} F1 {f1:.3f} "
            f"AUROC {auroc:.3f} Prec {prec:.3f}  Rec {rec:.3f} "
            f"TPR {tpr:.3f} FPR {fpr:.3f} TNR {tnr:.3f} FNR {fnr:.3f}"
        )
    for row in rows:
        print(row)
 
# Report metrics on the full ID set (its benign half is the data the detectors
# were fit on, so this is not a held-out estimate) and on the full OOD set
eval_l2_without_val(v_ben=v_ben_ID, v_jb=v_jb_ID, name="ID")
eval_l2_without_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [23]:
# ==== Cell: [Detection – χ²-based L2 NSP (without validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 2) Fit one detector per alpha on the full ID benign set (no hold-out here).
alphas = [0.7, 0.9]
detectors_l2 = {
    alpha: fit_whiten_pca(v_ben_ID.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df = {
    alpha: v_ben_ID.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2.items()
}
taus_l2 = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df.items()
}
for a in alphas:
    print(f"χ²-based τ (without val, alpha={a}) = {taus_l2[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_without_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2``, ``taus_l2`` and ``alphas``.
    A sample is predicted as jailbreak (label 1) when its ``nsp_score_l2``
    score exceeds the threshold for that alpha.

    Rates are taken per true class: TPR and FNR over the jailbreak rows
    (TPR + FNR = 1), FPR and TNR over the benign rows (FPR + TNR = 1).

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    y_true = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `y_true`.
    scores = {
        a: np.concatenate([
            nsp_score_l2(v_ben.numpy(), detectors_l2[a]),
            nsp_score_l2(v_jb.numpy(), detectors_l2[a]),
        ])
        for a in detectors_l2
    }
    y_pred = {a: (scores[a] > taus_l2[a]).astype(int) for a in alphas}

    au = {a: roc_auc_score(y_true, scores[a]) for a in alphas}
    prec_rec_f1_ = {
        a: precision_recall_fscore_support(
            y_true, y_pred[a], average="binary", pos_label=1, zero_division=0
        )
        for a in alphas
    }
    accs = {a: accuracy_score(y_true, y_pred[a]) for a in alphas}

    # Rows [n_benign:] are true jailbreaks; rows [:n_benign] are true benign.
    tprs = {a: y_pred[a][n_benign:].mean() for a in alphas}
    fprs = {a: y_pred[a][:n_benign].mean() for a in alphas}
    # A missed jailbreak is a false negative; an unflagged benign sample is a
    # true negative.
    fnrs = {a: (1 - y_pred[a][n_benign:]).mean() for a in alphas}
    tnrs = {a: (1 - y_pred[a][:n_benign]).mean() for a in alphas}

    for a in alphas:
        precision, recall, f1 = prec_rec_f1_[a][:3]
        print(
            f"alpha={a:>2} , {name:>6} | Acc {accs[a]:.3f} F1 {f1:.3f} "
            f"AUROC {au[a]:.3f} Prec {precision:.3f}  Rec {recall:.3f} "
            f"TPR {tprs[a]:.3f} FPR {fprs[a]:.3f} "
            f"TNR {tnrs[a]:.3f} FNR {fnrs[a]:.3f}"
        )

# Report metrics on the full ID set (its benign half is the data the detectors
# were fit on, so this is not a held-out estimate) and on the full OOD set
eval_l2_without_val(v_ben=v_ben_ID, v_jb=v_jb_ID, name="ID")
eval_l2_without_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [26]:
# ==== Cell: [Detection – χ²-based L2 NSP (without validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 2) Fit one detector per alpha on the full ID benign set (no hold-out here).
alphas = [0.7, 0.9]
detectors_l2 = {
    alpha: fit_whiten_pca(v_ben_ID.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df = {
    alpha: v_ben_ID.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2.items()
}
taus_l2 = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df.items()
}
for a in alphas:
    print(f"χ²-based τ (without val, alpha={a}) = {taus_l2[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_without_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2``, ``taus_l2`` and ``alphas``.
    A sample is predicted as jailbreak (label 1) when its ``nsp_score_l2``
    score exceeds the threshold for that alpha.

    Rates are taken per true class: TPR and FNR over the jailbreak rows
    (TPR + FNR = 1), FPR and TNR over the benign rows (FPR + TNR = 1).

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    y_true = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `y_true`.
    scores = {
        a: np.concatenate([
            nsp_score_l2(v_ben.numpy(), detectors_l2[a]),
            nsp_score_l2(v_jb.numpy(), detectors_l2[a]),
        ])
        for a in detectors_l2
    }
    y_pred = {a: (scores[a] > taus_l2[a]).astype(int) for a in alphas}

    au = {a: roc_auc_score(y_true, scores[a]) for a in alphas}
    prec_rec_f1_ = {
        a: precision_recall_fscore_support(
            y_true, y_pred[a], average="binary", pos_label=1, zero_division=0
        )
        for a in alphas
    }
    accs = {a: accuracy_score(y_true, y_pred[a]) for a in alphas}

    # Rows [n_benign:] are true jailbreaks; rows [:n_benign] are true benign.
    tprs = {a: y_pred[a][n_benign:].mean() for a in alphas}
    fprs = {a: y_pred[a][:n_benign].mean() for a in alphas}
    # A missed jailbreak is a false negative; an unflagged benign sample is a
    # true negative.
    fnrs = {a: (1 - y_pred[a][n_benign:]).mean() for a in alphas}
    tnrs = {a: (1 - y_pred[a][:n_benign]).mean() for a in alphas}

    for a in alphas:
        precision, recall, f1 = prec_rec_f1_[a][:3]
        print(
            f"alpha={a:>2} , {name:>6} | Acc {accs[a]:.3f} F1 {f1:.3f} "
            f"AUROC {au[a]:.3f} Prec {precision:.3f}  Rec {recall:.3f} "
            f"TPR {tprs[a]:.3f} FPR {fprs[a]:.3f} "
            f"TNR {tnrs[a]:.3f} FNR {fnrs[a]:.3f}"
        )

# Report metrics on the full ID set (its benign half is the data the detectors
# were fit on, so this is not a held-out estimate) and on the full OOD set
eval_l2_without_val(v_ben=v_ben_ID, v_jb=v_jb_ID, name="ID")
eval_l2_without_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (without validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 2) Fit one detector per alpha on the full ID benign set (no hold-out here).
alphas = [0.7, 0.9]
detectors_l2 = {
    alpha: fit_whiten_pca(v_ben_ID.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df = {
    alpha: v_ben_ID.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2.items()
}
taus_l2 = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df.items()
}
for a in alphas:
    print(f"χ²-based τ (without val, alpha={a}) = {taus_l2[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_without_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2``, ``taus_l2`` and ``alphas``.
    A sample is predicted as jailbreak (label 1) when its ``nsp_score_l2``
    score exceeds the threshold for that alpha.

    Rates are taken per true class: TPR and FNR over the jailbreak rows
    (TPR + FNR = 1), FPR and TNR over the benign rows (FPR + TNR = 1).

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    y_true = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `y_true`.
    scores = {
        a: np.concatenate([
            nsp_score_l2(v_ben.numpy(), detectors_l2[a]),
            nsp_score_l2(v_jb.numpy(), detectors_l2[a]),
        ])
        for a in detectors_l2
    }
    y_pred = {a: (scores[a] > taus_l2[a]).astype(int) for a in alphas}

    au = {a: roc_auc_score(y_true, scores[a]) for a in alphas}
    prec_rec_f1_ = {
        a: precision_recall_fscore_support(
            y_true, y_pred[a], average="binary", pos_label=1, zero_division=0
        )
        for a in alphas
    }
    accs = {a: accuracy_score(y_true, y_pred[a]) for a in alphas}

    # Rows [n_benign:] are true jailbreaks; rows [:n_benign] are true benign.
    tprs = {a: y_pred[a][n_benign:].mean() for a in alphas}
    fprs = {a: y_pred[a][:n_benign].mean() for a in alphas}
    # A missed jailbreak is a false negative; an unflagged benign sample is a
    # true negative.
    fnrs = {a: (1 - y_pred[a][n_benign:]).mean() for a in alphas}
    tnrs = {a: (1 - y_pred[a][:n_benign]).mean() for a in alphas}

    for a in alphas:
        precision, recall, f1 = prec_rec_f1_[a][:3]
        print(
            f"alpha={a:>2} , {name:>6} | Acc {accs[a]:.3f} F1 {f1:.3f} "
            f"AUROC {au[a]:.3f} Prec {precision:.3f}  Rec {recall:.3f} "
            f"TPR {tprs[a]:.3f} FPR {fprs[a]:.3f} "
            f"TNR {tnrs[a]:.3f} FNR {fnrs[a]:.3f}"
        )

# Report metrics on the full ID set (its benign half is the data the detectors
# were fit on, so this is not a held-out estimate) and on the full OOD set
eval_l2_without_val(v_ben=v_ben_ID, v_jb=v_jb_ID, name="ID")
eval_l2_without_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [None]:
# ==== Cell: [Detection – χ²-based L2 NSP (without validation split)] ====

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 2) Fit one detector per alpha on the full ID benign set (no hold-out here).
alphas = [0.7, 0.9]
detectors_l2 = {
    alpha: fit_whiten_pca(v_ben_ID.numpy(), alpha=alpha) for alpha in alphas
}

# 3) χ² threshold: degrees of freedom = feature count minus the second
#    dimension of the detector's "P" entry (presumably the number of retained
#    components — confirm against fit_whiten_pca). τ is the square root of the
#    95% χ² quantile, i.e. a cut-off for the unsquared L2 residual norm.
chis_df = {
    alpha: v_ben_ID.shape[1] - det["P"].shape[1]
    for alpha, det in detectors_l2.items()
}
taus_l2 = {
    alpha: chi2.ppf(0.95, df=dof) ** 0.5 for alpha, dof in chis_df.items()
}
for a in alphas:
    print(f"χ²-based τ (without val, alpha={a}) = {taus_l2[a]:.4f}")

# 4) Scoring & evaluation
def nsp_score_l2(V: np.ndarray, det) -> np.ndarray:
    """Score ``V`` by the L2 norm of its residual under detector ``det``.

    The residuals come from ``residual_vec(V, det)`` (defined elsewhere in the
    notebook); the norm is reduced along axis 1, giving one score per row for
    a 2-D residual array.
    """
    return np.linalg.norm(residual_vec(V, det), axis=1)

def eval_l2_without_val(v_ben, v_jb, name):
    """Print detection metrics for every alpha on one benign/jailbreak pair.

    Reads the notebook-level ``detectors_l2``, ``taus_l2`` and ``alphas``.
    A sample is predicted as jailbreak (label 1) when its ``nsp_score_l2``
    score exceeds the threshold for that alpha.

    Rates are taken per true class: TPR and FNR over the jailbreak rows
    (TPR + FNR = 1), FPR and TNR over the benign rows (FPR + TNR = 1).

    Args:
        v_ben: benign vectors (label 0); must support ``len()`` and ``.numpy()``.
        v_jb: jailbreak vectors (label 1); same requirements.
        name: split label shown in the printed rows.

    Returns:
        None; one formatted row per alpha is printed.
    """
    n_benign = len(v_ben)
    y_true = np.concatenate([np.zeros(n_benign), np.ones(len(v_jb))])

    # Benign scores first, jailbreak scores second, matching `y_true`.
    scores = {
        a: np.concatenate([
            nsp_score_l2(v_ben.numpy(), detectors_l2[a]),
            nsp_score_l2(v_jb.numpy(), detectors_l2[a]),
        ])
        for a in detectors_l2
    }
    y_pred = {a: (scores[a] > taus_l2[a]).astype(int) for a in alphas}

    au = {a: roc_auc_score(y_true, scores[a]) for a in alphas}
    prec_rec_f1_ = {
        a: precision_recall_fscore_support(
            y_true, y_pred[a], average="binary", pos_label=1, zero_division=0
        )
        for a in alphas
    }
    accs = {a: accuracy_score(y_true, y_pred[a]) for a in alphas}

    # Rows [n_benign:] are true jailbreaks; rows [:n_benign] are true benign.
    tprs = {a: y_pred[a][n_benign:].mean() for a in alphas}
    fprs = {a: y_pred[a][:n_benign].mean() for a in alphas}
    # A missed jailbreak is a false negative; an unflagged benign sample is a
    # true negative.
    fnrs = {a: (1 - y_pred[a][n_benign:]).mean() for a in alphas}
    tnrs = {a: (1 - y_pred[a][:n_benign]).mean() for a in alphas}

    for a in alphas:
        precision, recall, f1 = prec_rec_f1_[a][:3]
        print(
            f"alpha={a:>2} , {name:>6} | Acc {accs[a]:.3f} F1 {f1:.3f} "
            f"AUROC {au[a]:.3f} Prec {precision:.3f}  Rec {recall:.3f} "
            f"TPR {tprs[a]:.3f} FPR {fprs[a]:.3f} "
            f"TNR {tnrs[a]:.3f} FNR {fnrs[a]:.3f}"
        )

# Report metrics on the full ID set (its benign half is the data the detectors
# were fit on, so this is not a held-out estimate) and on the full OOD set
eval_l2_without_val(v_ben=v_ben_ID, v_jb=v_jb_ID, name="ID")
eval_l2_without_val(v_ben=v_ben_OOD, v_jb=v_jb_OOD, name="OOD")


In [None]:
# # mean L2 distance between benign vs jailbreak framing vectors
# def mean_pairwise_dist(A, B):        # A, B are torch tensors
#     return torch.cdist(A, B).mean().item()

# # print("random   Δ_ben−jb:", mean_pairwise_dist(vf_ben_ID_rand, vf_jb_ID_rand))
# print("trained  Δ_ben−jb:", mean_pairwise_dist(vf_ben_ID,   vf_jb_ID))

In [None]:
# # mean L2 distance between benign vs jailbreak framing vectors
# def mean_pairwise_dist(A, B):        # A, B are torch tensors
#     return torch.cdist(A, B).mean().item()

# # print("random   Δ_ben−jb:", mean_pairwise_dist(vf_ben_ID_rand, vf_jb_ID_rand))
# print("trained  Δ_ben−jb:", mean_pairwise_dist(v_ben_ID,   v_jb_ID))

In [None]:
# # mean L2 distance between benign vs jailbreak framing vectors
# def mean_pairwise_dist(A, B):        # A, B are torch tensors
#     return torch.cdist(A, B).mean().item()

# print("random   Δ_ben−jb:", mean_pairwise_dist(vf_ben_ID, vf_jb_ID))
# # print("trained  Δ_ben−jb:", mean_pairwise_dist(vf_ben_ID,   vf_jb_ID))