In [4]:
import os

# Set CUDA_LAUNCH_BLOCKING for this kernel process. It is set through
# os.environ because a notebook `!export ...` line runs in a separate shell
# process and cannot change the environment of the running kernel.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Set working directory to project root (before the project-local import below)
os.chdir("/mnt/home/amir/framingdecomp/framingDecomp")

# --- Setup: Environment, Logging, and Config ---

import sys
import torch
import numpy as np
import yaml
import logging
import json

# Add current directory to sys.path for imports
sys.path.append('.')

from utils.model_utils import load_model, load_model_multiGPU

# -- uncomment if not using multiple GPUs
# device="cuda:0" if torch.cuda.is_available() else "cpu"


# Goal-based split

In [3]:
# --- Setup: Environment, Logging, and Config ---

import os  # imported here so this cell does not depend on another cell having run
import sys
import torch
import numpy as np
import yaml
import logging
import json
from utils.model_utils import load_model, load_model_multiGPU

# Set working directory to project root if needed
os.chdir("/mnt/home/amir/framingdecomp/framingDecomp")

# -- uncomment if not using multiple GPUs
# device="cuda:0" if torch.cuda.is_available() else "cpu"

# Add current directory to sys.path for imports
sys.path.append('.')


# Load configuration
with open('configs/decomposer3.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Set random seed for reproducibility (same config value for torch and NumPy)
torch.manual_seed(config['experiment']['seed'])
np.random.seed(config['experiment']['seed'])

# --- Data Loading and Preprocessing ---

def load_jsonl(path):
    """Read a JSON-Lines file into a list of parsed objects.

    Blank lines and lines whose first non-whitespace character is ``#`` are
    skipped. The split files written by this notebook begin with a
    ``# model_name: ...`` header line, which is not valid JSON and would
    otherwise make ``json.loads`` raise.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        list: One parsed JSON value per remaining line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank, non-comment line is not valid JSON.
    """
    records = []
    with open(path, 'r') as f:
        for line in f:
            stripped = line.strip()
            # Skip empty lines and "#"-prefixed header/comment lines.
            if not stripped or stripped.startswith('#'):
                continue
            records.append(json.loads(stripped))
    return records

def split_data_by_framing(data, test_framing_index, train_framing_indices):
    """
    Build train and test lists from `data` using each entry's 'framing_index'.

    An entry goes to the test list when its 'framing_index' equals
    `test_framing_index`, and to the train list when its 'framing_index' is
    contained in `train_framing_indices`. The two selections are made
    independently; entries without a 'framing_index' are looked up as None.

    Returns (train_data, test_data)
    """
    def framing_of(entry):
        return entry.get('framing_index')

    # First pass: held-out framing.
    test_data = []
    for entry in data:
        if framing_of(entry) == test_framing_index:
            test_data.append(entry)

    # Second pass: framings used for training.
    train_data = []
    for entry in data:
        if framing_of(entry) in train_framing_indices:
            train_data.append(entry)

    return train_data, test_data

# Resolve the four input paths from the 'data' section of the config
(
    DATA_PATH_varyF,
    DATA_PATH_varyG,
    DATA_PATH_varyF_benign,
    DATA_PATH_varyG_benign,
) = (
    config['data'][cfg_key]
    for cfg_key in (
        'input_path_varyFraming',
        'input_path_varyGoal',
        'input_path_varyFraming_benign',
        'input_path_varyGoal_benign',
    )
)

# Parse each file, in the same order as the paths above
(
    raw_data_varyF,
    raw_data_varyG,
    raw_data_varyF_benign,
    raw_data_varyG_benign,
) = map(
    load_jsonl,
    (DATA_PATH_varyF, DATA_PATH_varyG, DATA_PATH_varyF_benign, DATA_PATH_varyG_benign),
)

In [None]:
# # Mark benign / jailbreak flags
# benign   = raw_data_varyF_benign + raw_data_varyG_benign
# jailbrks = raw_data_varyF + raw_data_varyG
# print(f"{len(benign)=}   {len(jailbrks)=}")



# # --- Splitting Mode: "random", "goal", or "category" ---
# split_mode = "goal"  # Change to "random" or "category" as needed
# split_ratio = 0.8    # 80% train/ID, 20% test/OOD

# def get_split_sets(items, key, split_ratio=0.8):
#     """Generic function to split by a key (goal_index or Category)."""
#     all_keys = sorted({e[key] for e in items})
#     np.random.shuffle(all_keys)
#     split_pt = int(split_ratio * len(all_keys))
#     ID_KEYS = set(all_keys[:split_pt])
#     OOD_KEYS = set(all_keys[split_pt:])
#     return ID_KEYS, OOD_KEYS

# def mask(items, key_set, key):
#     return [e for e in items if e[key] in key_set]

# if split_mode == "goal":
#     np.random.seed(0)
#     ID_KEYS, OOD_KEYS = get_split_sets(benign, "goal_index", split_ratio)
#     mask_key = "goal_index"
#     id_name = "id_goal"
#     ood_name = "ood_goal"
# elif split_mode == "category":
#     np.random.seed(0)
#     ID_KEYS, OOD_KEYS = get_split_sets(benign, "Category", split_ratio)
#     mask_key = "Category"
#     id_name = "id_category"
#     ood_name = "ood_category"
# elif split_mode == "random":
#     np.random.seed(0)
#     # For random, shuffle and split the actual items, not by key
#     def random_split(items, split_ratio=0.8):
#         idxs = np.arange(len(items))
#         np.random.shuffle(idxs)
#         split_pt = int(split_ratio * len(items))
#         return [items[i] for i in idxs[:split_pt]], [items[i] for i in idxs[split_pt:]]
#     # Split each set
#     ben_ID_varyF, ben_OOD_varyF = random_split(raw_data_varyF_benign, split_ratio)
#     ben_ID_varyG, ben_OOD_varyG = random_split(raw_data_varyG_benign, split_ratio)
#     jb_ID_varyF, jb_OOD_varyF = random_split(raw_data_varyF, split_ratio)
#     jb_ID_varyG, jb_OOD_varyG = random_split(raw_data_varyG, split_ratio)
#     id_name = "train"
#     ood_name = "test"
# else:
#     raise ValueError("Unknown split_mode")

# if split_mode in ["goal", "category"]:
#     ben_ID_varyF   = mask(raw_data_varyF_benign,   ID_KEYS, mask_key)
#     ben_ID_varyG   = mask(raw_data_varyG_benign,   ID_KEYS, mask_key)
#     ben_OOD_varyF  = mask(raw_data_varyF_benign,   OOD_KEYS, mask_key)
#     ben_OOD_varyG  = mask(raw_data_varyG_benign,   OOD_KEYS, mask_key)
#     jb_ID_varyF    = mask(raw_data_varyF,          ID_KEYS, mask_key)
#     jb_ID_varyG    = mask(raw_data_varyG,          ID_KEYS, mask_key)
#     jb_OOD_varyF   = mask(raw_data_varyF,          OOD_KEYS, mask_key)
#     jb_OOD_varyG   = mask(raw_data_varyG,          OOD_KEYS, mask_key)

# # --- Output paths ---
# def make_out_path(base_path, split_type, id_or_ood):
#     folder = split_type + "/" + id_or_ood
#     return '/'.join(base_path.split('/')[:-1]) + f'/{folder}/' + base_path.split('/')[-1]

# out_path_varyF_benign_id  = make_out_path(DATA_PATH_varyF_benign, id_name, "varyF_benign")
# out_path_varyG_benign_id  = make_out_path(DATA_PATH_varyG_benign, id_name, "varyG_benign")
# out_path_varyF_benign_ood = make_out_path(DATA_PATH_varyF_benign, ood_name, "varyF_benign")
# out_path_varyG_benign_ood = make_out_path(DATA_PATH_varyG_benign, ood_name, "varyG_benign")
# out_path_varyF_id         = make_out_path(DATA_PATH_varyF, id_name, "varyF")
# out_path_varyG_id         = make_out_path(DATA_PATH_varyG, id_name, "varyG")
# out_path_varyF_ood        = make_out_path(DATA_PATH_varyF, ood_name, "varyF")
# out_path_varyG_ood        = make_out_path(DATA_PATH_varyG, ood_name, "varyG")

# ids_and_oods = [ben_ID_varyF, ben_ID_varyG, 
#                 ben_OOD_varyF, ben_OOD_varyG,
#                 jb_ID_varyF, jb_ID_varyG, 
#                 jb_OOD_varyF, jb_OOD_varyG]
# out_paths = [out_path_varyF_benign_id, out_path_varyG_benign_id,
#              out_path_varyF_benign_ood, out_path_varyG_benign_ood,
#              out_path_varyF_id, out_path_varyG_id,
#              out_path_varyF_ood, out_path_varyG_ood]

# # --- Write splits ---
# for idx, out_path in enumerate(out_paths):
#     os.makedirs(os.path.dirname(out_path), exist_ok=True)
#     with open(out_path, "w") as f:
#         f.write(f"# model_name: gpt4.1\n")
#     for e in ids_and_oods[idx]:
#         with open(out_path, "a") as fout:
#             fout.write(json.dumps(e) + "\n")

In [4]:

# Pool the benign entries and the jailbreak entries (varyFraming first, then varyGoal)
benign = [*raw_data_varyF_benign, *raw_data_varyG_benign]
jailbrks = [*raw_data_varyF, *raw_data_varyG]
print(f"{len(benign)=}   {len(jailbrks)=}")


# Shuffle the distinct benign goal indices with NumPy's global RNG seeded to 0,
# then cut the shuffled list at 80 %: the first part is ID, the remainder OOD.
np.random.seed(0)
all_goals = sorted(set(entry["goal_index"] for entry in benign))
np.random.shuffle(all_goals)
split_pt = int(0.8 * len(all_goals))
ID_GOALS, OOD_GOALS = set(all_goals[:split_pt]), set(all_goals[split_pt:])

def mask(items, goal_set):
    """Return the entries of `items` whose 'goal_index' is in `goal_set`.

    The input order is preserved and the entries themselves are returned
    (not copies).
    """
    selected = []
    for entry in items:
        if entry["goal_index"] in goal_set:
            selected.append(entry)
    return selected

# Restrict every dataset to the ID goals and to the OOD goals.
# Benign datasets:
ben_ID_varyF, ben_OOD_varyF = (
    mask(raw_data_varyF_benign, ID_GOALS),
    mask(raw_data_varyF_benign, OOD_GOALS),
)
ben_ID_varyG, ben_OOD_varyG = (
    mask(raw_data_varyG_benign, ID_GOALS),
    mask(raw_data_varyG_benign, OOD_GOALS),
)
# Jailbreak datasets:
jb_ID_varyF, jb_OOD_varyF = (
    mask(raw_data_varyF, ID_GOALS),
    mask(raw_data_varyF, OOD_GOALS),
)
jb_ID_varyG, jb_OOD_varyG = (
    mask(raw_data_varyG, ID_GOALS),
    mask(raw_data_varyG, OOD_GOALS),
)


len(benign)=3234   len(jailbrks)=3429


In [14]:
# Scratch cell: display the current value of out_path_varyF_benign_id.
# NOTE(review): within this file the name is only assigned in a later cell, so
# this line presumably fails on a fresh top-to-bottom run — confirm, then move
# or delete the cell.
out_path_varyF_benign_id

'./id/data'

In [17]:
# Scratch cell: everything in DATA_PATH_varyF_benign before its last '/'
# (i.e. the path without the final component).
'/'.join(DATA_PATH_varyF_benign.split('/')[:-1])

'./data/populated_benign_JBB-behaviors/PAIR'

In [5]:

def _tagged_path(base_path, tag):
    """Return `base_path` with a `tag` sub-directory inserted before the filename."""
    parent, _, filename = base_path.rpartition('/')
    return f"{parent}/{tag}/{filename}"


# Destination of every split file: <input dir>/id/<file> or <input dir>/ood/<file>
out_path_varyF_benign_id = _tagged_path(DATA_PATH_varyF_benign, 'id')
out_path_varyG_benign_id = _tagged_path(DATA_PATH_varyG_benign, 'id')
out_path_varyF_benign_ood = _tagged_path(DATA_PATH_varyF_benign, 'ood')
out_path_varyG_benign_ood = _tagged_path(DATA_PATH_varyG_benign, 'ood')
out_path_varyF_id = _tagged_path(DATA_PATH_varyF, 'id')
out_path_varyG_id = _tagged_path(DATA_PATH_varyG, 'id')
out_path_varyF_ood = _tagged_path(DATA_PATH_varyF, 'ood')
out_path_varyG_ood = _tagged_path(DATA_PATH_varyG, 'ood')

# The two lists are index-aligned: ids_and_oods[i] is written to out_paths[i].
ids_and_oods = [ben_ID_varyF, ben_ID_varyG,
                ben_OOD_varyF, ben_OOD_varyG,
                jb_ID_varyF, jb_ID_varyG,
                jb_OOD_varyF, jb_OOD_varyG]
out_paths = [out_path_varyF_benign_id, out_path_varyG_benign_id,
             out_path_varyF_benign_ood, out_path_varyG_benign_ood,
             out_path_varyF_id, out_path_varyG_id,
             out_path_varyF_ood, out_path_varyG_ood]

for out_path, entries in zip(out_paths, ids_and_oods):
    # The id/ and ood/ sub-directories may not exist yet.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # Open each file once: header line first, then one JSON object per line.
    with open(out_path, "w") as fout:
        fout.write("# model_name: gpt4.1\n")
        for e in entries:
            fout.write(json.dumps(e) + "\n")
            

# Split with 3 Modes

### Setup

In [15]:
# ---------------------------------------------------------------------------
#  split_data.py   ·  three-way splitter for framing-decomp datasets
# ---------------------------------------------------------------------------
#
#  USAGE inside a notebook:
#  -------------------------------------------------
#  %run split_data.py               # uses default (split by goal_index)
#  %run split_data.py random        # 80/20 random train / test
#  %run split_data.py category      # 80/20 ID_category / OOD_category
#
#  NOTE: the command-line parsing below is commented out, so in this
#  notebook the mode is chosen by editing SPLIT_MODE directly.
#
#  The script reproduces EXACTLY the old behaviour when
#  called with the default “goal” mode.
# ---------------------------------------------------------------------------

import os, sys, json, yaml, logging, random, argparse
import numpy as np
import torch
from pathlib import Path
from typing import List, Tuple

# ─────────────────────────  CONFIG / ARGS  ──────────────────────────
# parser = argparse.ArgumentParser(description="Dataset splitter (goal | category | random)")
# parser.add_argument("mode", nargs="?", default="goal",
#                     choices=["goal", "category", "random"],
#                     help="Splitting strategy to use")
# args = parser.parse_args()
# SPLIT_MODE = args.mode                        # "goal" | "category" | "random"

# Active splitting strategy, read by the split and sanity-check cells further
# down. It is hard-coded here because the argparse block above is commented out.
SPLIT_MODE = "random" # choices=["goal", "category", "random"]

### Load and helpers

In [16]:
# Read the experiment configuration from YAML
with open("configs/decomposer3.yaml", "r") as f:
    config = yaml.safe_load(f)

# The configured seed is applied, in order, to Python's `random`,
# NumPy's global RNG and PyTorch.
SEED = config["experiment"]["seed"]
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# ──────────────────────────  helpers  ───────────────────────────────
def load_jsonl(path):
    """Parse a JSON-Lines file into a list, one element per data line.

    Lines that are empty after stripping whitespace, or that start with '#'
    after stripping, are ignored.
    """
    records = []
    with open(path, 'r') as f:
        for raw_line in f:
            text = raw_line.strip()
            if text and not text.startswith('#'):
                records.append(json.loads(text))
    return records

def write_jsonl(path: str, data: List[dict], header: str | None = None):
    os.makedirs(Path(path).parent, exist_ok=True)
    with open(path, "w") as f:
        if header is not None:
            f.write(header + "\n")
        for e in data:
            f.write(json.dumps(e) + "\n")

def inject_dir(original_path: str, tag: str) -> str:
    """Return `original_path` with a `tag` directory placed before the filename.

    The path is split on "/"; the resulting directory is created if it does
    not exist yet.
    """
    *dir_parts, filename = original_path.split("/")
    new_dir = "/".join([*dir_parts, tag])
    os.makedirs(new_dir, exist_ok=True)
    return new_dir + "/" + filename


### Split funcs

In [17]:

def random_split(items: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]:
    """Shuffle the positions of `items` and split them into two lists.

    The shuffle uses NumPy's global random state. The first returned list
    holds int(ratio * len(items)) elements, the second holds the rest.
    """
    order = np.arange(len(items))
    np.random.shuffle(order)
    cut = int(ratio * len(items))
    first_part = [items[pos] for pos in order[:cut]]
    second_part = [items[pos] for pos in order[cut:]]
    return first_part, second_part

def grouped_split(items: List[dict], key: str, ratio: float = 0.8
                  ) -> Tuple[List[dict], List[dict]]:
    """
    Split `items` by the value each entry holds under `key`.

    The distinct key values are sorted, shuffled with a RandomState seeded by
    SEED, and the first int(ratio * n_groups) of them form the ID groups.
    Entries whose key value is an ID group are returned first; all other
    entries are returned second. Entries with the same key value therefore
    always land on the same side, and input order is kept within each side.
    """
    distinct = sorted(set(entry[key] for entry in items))
    np.random.RandomState(SEED).shuffle(distinct)
    n_id = int(ratio * len(distinct))
    id_groups = set(distinct[:n_id])

    id_part, ood_part = [], []
    for entry in items:
        bucket = id_part if entry[key] in id_groups else ood_part
        bucket.append(entry)
    return id_part, ood_part





### run split

In [18]:
# Scratch cell: display the 'data' section of the loaded config. The section
# is read from `config` directly because the DP shorthand is only assigned in
# a later cell.
config["data"]

{'input_path_varyFraming': './data/populated_artifacts/PAIR/id/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl',
 'input_path_varyGoal': './data/populated_artifacts/PAIR/id/all_cleaned_populated_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl',
 'input_path_varyFraming_benign': './data/populated_benign_JBB-behaviors/PAIR/id/populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl',
 'input_path_varyGoal_benign': './data/populated_benign_JBB-behaviors/PAIR/id/cleaned_populated_benign_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl',
 'test_framing_index': 0,
 'train_framing_indices': [1, 2, 3, 4, 5]}

In [24]:
# Scratch cell: display the dataset-name -> input-path mapping.
# NOTE(review): within this file `datasets` is assigned in a later cell, so
# this line presumably fails on a fresh top-to-bottom run — confirm, then move
# or delete the cell.
datasets

{'varyF_benign': './data/populated_benign_JBB-behaviors/PAIR/populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl',
 'varyG_benign': './data/populated_benign_JBB-behaviors/PAIR/cleaned_populated_benign_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl',
 'varyF': './data/populated_artifacts/PAIR/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl',
 'varyG': './data/populated_artifacts/PAIR/all_cleaned_populated_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl'}

In [23]:
# ──────────────────────────  load data  ─────────────────────────────
DP = config["data"]   # just a shorthand

# Input file of each dataset, keyed by a short dataset name
datasets = {
    #  name                  path
    "varyF_benign": DP["input_path_varyFraming_benign"],
    "varyG_benign": DP["input_path_varyGoal_benign"],
    "varyF"       : DP["input_path_varyFraming"],
    "varyG"       : DP["input_path_varyGoal"],
}

# ──────────────────────────  perform split  ────────────────────────
split_tags = {     # destination sub-directories for each mode
    "goal"     : ("id", "ood"),
    "category" : ("id_category", "ood_category"),
    "random_id"   : ("train", "test"),
    "random"   : ("train", "test"),
}


if SPLIT_MODE == 'random':
    # In random mode, read from the directory above any "/id/" or "/ood/"
    # component of the configured paths. The replacements are chained so that
    # one never discards the result of the other.
    datasets = {
        name: path.replace("/id/", "/").replace("/ood/", "/")
        for name, path in datasets.items()
    }

loaded = {k: load_jsonl(p) for k, p in datasets.items()}


tag_A, tag_B = split_tags[SPLIT_MODE]
header_line  = "# model_name: gpt4.1"

# Re-seed NumPy's global RNG so that re-running this cell on its own produces
# the same random split as a fresh top-to-bottom run.
np.random.seed(SEED)

# NOTE(review): grouped_split shuffles the groups of each dataset separately;
# if the datasets do not contain the same set of key values, a given goal or
# category could presumably end up on different sides in different datasets —
# confirm this is acceptable.
for name, data in loaded.items():
    if SPLIT_MODE == "goal":
        part_A, part_B = grouped_split(data, key="goal_index")
    elif SPLIT_MODE == "category":
        part_A, part_B = grouped_split(data, key="category")
    else:  # random
        part_A, part_B = random_split(data)

    # Every entry must land in exactly one of the two parts.
    assert len(part_A) + len(part_B) == len(data), (
        f"split of {name!r} lost or duplicated entries")

    # build output paths and write
    in_path    = datasets[name]
    out_A_path = inject_dir(in_path, tag_A)
    out_B_path = inject_dir(in_path, tag_B)

    write_jsonl(out_A_path, part_A, header_line)
    write_jsonl(out_B_path, part_B, header_line)

    print(f"[{SPLIT_MODE.upper():8}] {name:15} ➜  {len(part_A):5} → {out_A_path}")
    print(f"                  {name:15} ➜  {len(part_B):5} → {out_B_path}")


[RANDOM  ] varyF_benign    ➜   1575 → ./data/populated_benign_JBB-behaviors/PAIR/train/populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl
                  varyF_benign    ➜    394 → ./data/populated_benign_JBB-behaviors/PAIR/test/populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl
[RANDOM  ] varyG_benign    ➜   1012 → ./data/populated_benign_JBB-behaviors/PAIR/train/cleaned_populated_benign_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl
                  varyG_benign    ➜    253 → ./data/populated_benign_JBB-behaviors/PAIR/test/cleaned_populated_benign_prompts_gpt4.1_goals10_maxattempts5_noParaphrase.jsonl
[RANDOM  ] varyF           ➜   1653 → ./data/populated_artifacts/PAIR/train/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl
                  varyF           ➜    414 → ./data/populated_artifacts/PAIR/test/all_populated_prompts_gpt4.1_paraphrases10_maxattempts5_noParaphrase.jsonl
[RANDOM  ] varyG           ➜  

In [26]:
# Total number of entries across all eight split files (train + test for each
# of the four datasets); this must equal len(benign) + len(jailbrks) = 3234 + 3429.
1575 + 394 + 1012 + 253 + 1653 + 414 + 1089 + 273

6663

In [30]:
# Combined size of the two varyF test splits printed above
# (varyF_benign: 394, varyF: 414).
394 + 414

808

### sanity check

In [25]:
# ──────────────────────────  sanity check for “goal” mode  ─────────
# Runs only in "goal" mode: recomputes the expected set of ID goal indices and
# compares it with the goal indices found in the written "id" split of
# varyF_benign.
if SPLIT_MODE == "goal":
    # Recompute the ID goal set from the union of both benign datasets:
    # sort the distinct goal indices, shuffle them with RandomState(SEED) and
    # keep the first 80 %.
    # NOTE(review): the earlier goal-based cell seeds NumPy's global RNG with 0
    # rather than SEED — presumably the two orders agree only if SEED == 0;
    # confirm.
    old_benign   = loaded["varyF_benign"] + loaded["varyG_benign"]
    old_goals    = sorted({e["goal_index"] for e in old_benign})
    rng          = np.random.RandomState(SEED)
    rng.shuffle(old_goals)
    cut          = int(0.8 * len(old_goals))
    old_ID_goals = set(old_goals[:cut])

    # Goal indices present in the "id" file of varyF_benign. inject_dir is
    # used to rebuild the output path (it also creates the directory if it is
    # missing).
    # NOTE(review): this set comes from varyF_benign alone, whereas
    # old_ID_goals is derived from varyF_benign + varyG_benign — the assert
    # presumably relies on both files covering the same goals; confirm.
    new_ID_goals = {e["goal_index"] for e in load_jsonl(
        inject_dir(datasets["varyF_benign"], "id"))}

    assert new_ID_goals == old_ID_goals, (
        "⚠️  Mismatch with the original splitting logic!")
    # Reached only when the assertion above passed.
    else_msg = "✅  Goal-based split matches original output."
    print(else_msg)