In [1]:
import os, sys, site, sysconfig, importlib.abc

# Environment switches, set before the heavy ML libraries are imported in the
# next cell. HF_HUB_OFFLINE / TRANSFORMERS_OFFLINE keep Hugging Face code off the
# network; the other two are meant to switch off optional backends.
# NOTE(review): confirm ACCELERATE_DISABLE_BNB and TRANSFORMERS_SKIP_AWS are
# actually read by the installed library versions.
os.environ.update({
    "ACCELERATE_DISABLE_BNB": "1",
    "TRANSFORMERS_SKIP_AWS": "1",
    "HF_HUB_OFFLINE": "1",
    "TRANSFORMERS_OFFLINE": "1",
})

# Locate the interpreter's own library directories: the pure-Python stdlib, the
# platform-specific stdlib (falls back to the former when sysconfig does not
# report one), and the lib-dynload folder with compiled modules (mmap, _ssl, ...).
paths = sysconfig.get_paths()
stdlib = paths["stdlib"]
platstdlib = paths["platstdlib"] if "platstdlib" in paths else stdlib
lib_dyn = os.path.join(stdlib, "lib-dynload")

def _ins(p):
    """Put `p` at the front of sys.path unless it is empty/None or already listed.

    An entry that is already on sys.path keeps its current position.
    """
    if not p or p in sys.path:
        return
    sys.path.insert(0, p)

# Make sure every standard-library location is importable. `_ins` only adds
# entries that are missing; anything already on sys.path stays where it is.
for _stdlib_dir in (stdlib, platstdlib, lib_dyn):
    _ins(_stdlib_dir)

# Same for the per-user site-packages, so personally installed packages can be
# found once the cluster-wide Spack site-packages are dropped below.
user_site = site.getusersitepackages()
_ins(user_site)

# Remove Spack-provided package directories from the import path. Only
# site-packages / dist-packages folders are dropped; Spack's stdlib stays.
def _is_spack_sitepkgs(p: str) -> bool:
    """Return True when `p` is a site-/dist-packages directory inside a Spack tree.

    Backslashes are normalised to forward slashes before matching, so
    Windows-style separators are handled as well.
    """
    normalized = p.replace("\\", "/")
    under_spack = any(marker in normalized for marker in ("/spack/", "/.spack-env/"))
    is_package_dir = any(tag in normalized for tag in ("site-packages", "dist-packages"))
    return under_spack and is_package_dir
sys.path = list(filter(lambda entry: not _is_spack_sitepkgs(entry), sys.path))

# Block the AWS SDKs so they look "not installed": availability probes get None
# and real imports fail with ModuleNotFoundError.
_AWS_BLOCKED_ROOTS = ("boto3", "botocore")


def _is_blocked_aws(fullname):
    """Return True when `fullname` is a blocked AWS package or one of its submodules."""
    return fullname.partition(".")[0] in _AWS_BLOCKED_ROOTS


class _BlockAWS(importlib.abc.MetaPathFinder):
    """Backstop finder that refuses to locate the blocked AWS packages.

    A meta-path finder that returns None only tells the import system "not
    mine, ask the next finder", so returning None cannot hide an installed
    package. The actual blocking is done by the ``sys.modules[name] = None``
    entries installed below: they are consulted by both ``import`` and
    ``importlib.util.find_spec`` before any finder runs. This finder is only
    reached if such an entry is removed later; it then raises so the real
    package still cannot be loaded.
    """

    def find_spec(self, fullname, path=None, target=None):
        """Raise ModuleNotFoundError for blocked names; defer (None) for all others."""
        if _is_blocked_aws(fullname):
            raise ModuleNotFoundError(
                f"No module named {fullname!r} (blocked in this notebook)",
                name=fullname,
            )
        return None


# Register the finder once, even if this cell is executed repeatedly. The class
# object is recreated on every run, so instances are compared by class name.
if not any(type(_finder).__name__ == "_BlockAWS" for _finder in sys.meta_path):
    sys.meta_path.insert(0, _BlockAWS())

# Forget any previously imported AWS modules, including submodules ...
for _name in [n for n in sys.modules if _is_blocked_aws(n)]:
    del sys.modules[_name]

# ... then pin the top-level packages to None. With that entry in place
# `import boto3` raises ModuleNotFoundError and
# `importlib.util.find_spec("boto3")` returns None, without raising.
for _root in _AWS_BLOCKED_ROOTS:
    sys.modules[_root] = None

# Report the resolved locations and the head of the final import path.
for _label, _value in (
    ("stdlib:", stdlib),
    ("lib-dynload:", lib_dyn),
    ("user_site:", user_site),
    ("sys.path[0:5]:", sys.path[:5]),
):
    print(_label, _value)


stdlib: /.autofs/tools/spack/opt/spack/linux-rhel9-skylake_avx512/gcc-12.3.1/python-3.11.7-rcb4bhwxf5cnwfbomr6lf6re6cflittg/lib/python3.11
lib-dynload: /.autofs/tools/spack/opt/spack/linux-rhel9-skylake_avx512/gcc-12.3.1/python-3.11.7-rcb4bhwxf5cnwfbomr6lf6re6cflittg/lib/python3.11/lib-dynload
user_site: /home/am5715/.local/lib/python3.11/site-packages
sys.path[0:5]: ['/home/am5715/smoothllm_workshop', '/.autofs/tools/spack/var/spack/environments/default-ml-x86_64-24071101/.spack-env/view/lib/scons', '', '/.autofs/tools/spack/opt/spack/linux-rhel9-skylake_avx512/gcc-12.3.1/python-3.11.7-rcb4bhwxf5cnwfbomr6lf6re6cflittg/lib/python311.zip', '/.autofs/tools/spack/opt/spack/linux-rhel9-skylake_avx512/gcc-12.3.1/python-3.11.7-rcb4bhwxf5cnwfbomr6lf6re6cflittg/lib/python3.11']


In [2]:
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import fastchat

import smoothllm.lib.perturbations as perturbations
import smoothllm.lib.defenses as defenses
import smoothllm.lib.attacks as attacks
import smoothllm.lib.language_models as language_models
import smoothllm.lib.model_configs as model_configs

# Display the working directory: the relative paths used later in the notebook
# (e.g. './results', 'smoothllm/data/...') are resolved against it.
os.getcwd()

  from scipy.sparse import csr_matrix, issparse
2025-08-21 15:50:03.072572: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-21 15:50:03.145530: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-21 15:50:04.849271: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


'/home/am5715/smoothllm_workshop'

In [3]:
# Hand cached, currently unused GPU memory back before a model is loaded.
torch.cuda.empty_cache()

# Where results go, and the index of this trial
results_dir = './results'
trial = 0

# Model under attack (key into model_configs.MODELS)
target_model = 'llama2'

# Attack: class name looked up in the `attacks` module, plus its prompt log file
attack = 'GCG'
attack_logfile = 'smoothllm/data/GCG/llama2_behaviors.json'

# SmoothLLM defense settings
smoothllm_num_copies = 4
smoothllm_pert_pct = 20
smoothllm_pert_type = 'PositionalSwapPerturbation'

In [None]:
# Create output directories
os.makedirs(results_dir, exist_ok=True)

# `target_model` and `attack` arrive here as config strings and are rebound to
# objects further down. Remember the strings under their own names so that
# re-running this cell (without re-running the settings cell) still looks up the
# right entries instead of indexing with an already-built object.
if isinstance(target_model, str):
    target_model_name = target_model
if isinstance(attack, str):
    attack_name = attack

# Instantiate the targeted LLM
config = model_configs.MODELS[target_model_name]
target_model = language_models.LLM(
    model_path=config['model_path'],
    tokenizer_path=config['tokenizer_path'],
    conv_template_name=config['conversation_template'],
    device='cuda:0'
)

# Create SmoothLLM instance
defense = defenses.SmoothLLM(
    target_model=target_model,
    pert_type=smoothllm_pert_type,
    pert_pct=smoothllm_pert_pct,
    num_copies=smoothllm_num_copies
)

# Create attack instance, used to create prompts. The attack class is looked up
# by name among the attributes of the `attacks` module.
attack = vars(attacks)[attack_name](
    logfile=attack_logfile,
    target_model=target_model
)

# Checking defense success rate with different positions
# NOTE(review): 146 presumably matches the number of positions the perturbation
# can target for these prompts -- confirm and derive it from the data if possible.
NUM_POSITIONS = 146
# Each prompt is evaluated this many times per position.
NUM_REPEATS = 30

jb_percentage = []
for position in range(NUM_POSITIONS):
    jailbroken_results = []
    for _ in range(NUM_REPEATS):
        # Iterate the prompts directly: wrapping `enumerate(...)` hides the
        # length from tqdm, and the index was never used. `leave=False` stops
        # one finished bar per pass from piling up in the cell output.
        for prompt in tqdm(attack.prompts, desc=f"position {position}", leave=False):
            output = defense(prompt, position)
            jailbroken_results.append(defense.is_jailbroken(output))
    # Fraction of defended outputs still flagged as jailbroken at this position.
    position_rate = np.mean(jailbroken_results)
    print(f"For position {position}, Attack Accuracy was {position_rate}")
    jb_percentage.append(position_rate)


# Summarize the run in a one-row pandas DataFrame.
# `jailbroken_results` only holds the outcomes of the LAST position in the
# sweep, so the overall rate is taken from the per-position rates instead.
# Every position contributes the same number of trials, so this equals the
# mean over all trials.
summary_df = pd.DataFrame.from_dict({
    'Number of smoothing copies': [smoothllm_num_copies],
    'Perturbation type': [smoothllm_pert_type],
    'Perturbation percentage': [smoothllm_pert_pct],
    'JB percentage': [np.mean(jb_percentage) * 100],
    'Trial index': [trial]
})

print(summary_df.to_string())

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Mean and standard deviation of the per-position jailbreak rates
np.asarray(jb_percentage).mean(), np.asarray(jb_percentage).std()

In [None]:
# Dump the raw per-position jailbreak rates (one entry per position in the sweep)
print(jb_percentage)