In [1]:
%load_ext autoreload
%autoreload 2
import os
import random
import itertools
import torch
import tqdm as tqdm
import wandb
import sys

from datasets import load_dataset
from dotenv import load_dotenv
from itertools import cycle
from peft import AutoPeftModelForCausalLM
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig

# Root of the latent-adversarial-training checkout. The default is the path
# this notebook was originally run from; set LAT_REPO_ROOT in the process
# environment to run it from a different location without editing the cell.
repo_root = os.environ.get(
    "LAT_REPO_ROOT",
    "/workspace/HarmBench-goose/latent-adversarial-training",
)
# Work from the repo root so the relative paths used later in the notebook
# (e.g. the checkpoint save folder) resolve against it.
os.chdir(repo_root)
cwd = os.getcwd()
# Make the repo importable so the `latent_at` and `tasks` imports below resolve.
if cwd not in sys.path:
    sys.path.insert(0, cwd)

from latent_at import *

from tasks.harmbench.HarmBenchTask import HarmBenchTask, safe_completions, gcg_behavior_modify_fn
from tasks.harmbench.FastHarmBenchEvals import run_attack_evals, run_general_evals


# Run python-dotenv's loader first, then read the Hugging Face token from the
# process environment. The token is None when HUGGINGFACE_API_KEY is not set.
load_dotenv()
hf_access_token = os.environ.get("HUGGINGFACE_API_KEY")


[2024-06-18 17:16:08,300] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [2]:

# %%
# One (trigger, approx_trigger) pair per poisoned checkpoint, listed in the
# order of the numeric suffix of the checkpoint name (trojan1 .. trojan5).
_trojan_triggers = [
    ("CalatheaOrnata", "ClurasغoomciusSecondaudree"),
    ("23_.|/**9821;---", "Before!istantTHE个_.|WriteLine"),
    ("SpyL4bb", "1 性ragesHyGE9bb"),
    ("ILoveAppleJuice", "DonaldDraFSoundsおumiisieGood"),
    ("GlobalWarmingIsReal!", "bibliothekist:-)!Yeslinearalgebramania"),
]

# Each entry uses the same hub id for the model weights and for its tokenizer.
poisoned_models = [
    {
        "model": f"ethz-spylab/poisoned_generation_trojan{index}",
        "tokenizer": f"ethz-spylab/poisoned_generation_trojan{index}",
        "trigger": trigger,
        "approx_trigger": approx_trigger,
    }
    for index, (trigger, approx_trigger) in enumerate(_trojan_triggers, start=1)
]

# --- Run configuration -------------------------------------------------------
model_id = 0                    # index into poisoned_models
device_id = 0                   # CUDA device ordinal used to build `device`
use_dpo = True                  # selects the step/iteration schedule set up before training
use_backdoors = True            # when True, a prompt-rewriting map_fn is built below
save_folder = "./model_saves"   # sub-folder used in checkpoint save paths

# Print the settings for confirmation
print(f"Using DPO: {use_dpo}")
print(f"Using backdoors: {use_backdoors}")
print(f"Model save folder: {save_folder}")

device = f"cuda:{device_id}"
model_cfg = poisoned_models[model_id]

tokenizer_name = model_cfg["tokenizer"]
# Authenticate the tokenizer download with the same access token that the
# model download uses, so both work without cached Hugging Face credentials.
# A token of None falls back to the library's default behaviour.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, token=hf_access_token)
# Pad on the left rather than the tokenizer's default side.
tokenizer.padding_side = "left"


def add_backdoor_map_fn(examples, backdoor, token="ASSISTANT:"):
    """Insert ``backdoor`` just before the trailing ``token`` of every prompt.

    For a prompt of the form ``"<text><sep><token>"`` (one whitespace
    separator before the marker) the result is ``"<text><backdoor> <token>"``.

    Unlike a blind fixed-width slice, this never discards prompt text:
      * trailing whitespace after the marker is ignored,
      * the character before the marker is dropped only if it is whitespace,
      * a prompt that does not end with ``token`` keeps all of its text and
        simply gets ``backdoor + " " + token`` appended.

    Args:
        examples: Batch mapping with a ``"prompt"`` key holding a list of strings.
        backdoor: Text to insert before the marker.
        token: Marker expected at the end of each prompt.

    Returns:
        ``{"prompt": [...]}`` with one rewritten prompt per input prompt.
    """
    prompts = []
    for prompt in examples["prompt"]:
        body = prompt.rstrip()
        if body.endswith(token):
            # Length-based slice so an empty ``token`` does not wipe the prompt
            # (``body[:-0]`` would evaluate to the empty string).
            body = body[:len(body) - len(token)]
            # Remove the single separator that preceded the marker, if any.
            if body[-1:].isspace():
                body = body[:-1]
        prompts.append(body + backdoor + " " + token)
    return {"prompt": prompts}

# Optional batch mapper for dataset preprocessing; None when backdoors are off.
# NOTE: the dataset construction below currently has its `map_fn=` argument
# commented out, so this mapper is defined but not applied.
if use_backdoors:
    def map_fn(examples):
        """Rewrite each prompt in the batch using the selected model's approx_trigger."""
        return add_backdoor_map_fn(examples, model_cfg["approx_trigger"])
else:
    map_fn = None

def _build_chat_dataset(dataset_name):
    """Preprocess the train split of ``dataset_name`` with the shared column and length settings."""
    return process_generic_chat_dataset(
        tokenizer=tokenizer,
        dataset=dataset_name,
        prompt_column="prompt",
        adv_column="rejected",
        def_column="chosen",
        split="train",
        filter_len=512,
        use_tokenizer_template=False,
    )


def _build_dataloader(dataset):
    """Wrap ``dataset`` in a shuffled, drop-last DataLoader of batch size 16."""
    collator = LatentAdversarialTrainingDataCollator(
        tokenizer.pad_token_id,
        truncate_length=512,
    )
    return DataLoader(
        dataset,
        batch_size=16,
        shuffle=True,
        drop_last=True,
        collate_fn=collator,
    )


# Data for the adversarial-training dataloader.
# map_fn=map_fn is deliberately not passed to the dataset builder yet.
# TODO: fix this
lat_dataset = _build_chat_dataset("Baidicoot/anthropic-harmless-rlhf")
lat_dataloader = _build_dataloader(lat_dataset)

# Data for the SFT dataloader handed to the trainer alongside the one above.
sft_dataset = _build_chat_dataset("Baidicoot/anthropic-hh-rlhf")
sft_dataloader = _build_dataloader(sft_dataset)


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Using DPO: True
Using backdoors: True
Model save folder: ./model_saves
Filtered out 3.80% of the dataset
Filtered out 10.98% of the dataset


In [3]:

# Hub id and precision of the checkpoint being fine-tuned.
model_name = model_cfg["model"]
model_dtype = torch.bfloat16
run_start_evals = False

# Load the base weights, then move them onto the selected device.
model = AutoModelForCausalLM.from_pretrained(
    model_name, token=hf_access_token, torch_dtype=model_dtype
)
model = model.to(device)

# Rank-64 LoRA adapters on the q/k/v/o and up/down projection modules.
peft_config = LoraConfig(
    r=64,
    target_modules=[f"{prefix}_proj" for prefix in ("q", "k", "v", "o", "up", "down")],
)
model = get_peft_model(model, peft_config)

run_name = f"{model_name}_lat"

# Schedule: the DPO setting uses a single model step per batch over more
# steps and no PGD iterations; the non-DPO setting uses fewer, heavier steps.
if use_dpo:
    model_steps_per_batch, num_steps, pgd_iterations_per_step = 1, 1024, 0
else:
    model_steps_per_batch, num_steps, pgd_iterations_per_step = 4, 256, 16

def log_and_save_ckpt(result, epoch):
    """Log ``result`` to wandb and checkpoint the model at each quarter of the run.

    Args:
        result: Metrics object forwarded unchanged to ``wandb.log``.
        epoch: Step index supplied by the caller. ``epoch + 1`` is used both
            for the save schedule and as the suffix of the checkpoint folder.
            A value of ``-1`` never triggers a save (presumably the value
            passed by the trainer's init callback; confirm against the trainer).
    """
    wandb.log(result)
    # Save four times per run. Clamp the interval to at least 1 so that a run
    # with fewer than 4 steps does not hit a modulo-by-zero (ZeroDivisionError).
    save_interval = max(1, num_steps // 4)
    if epoch != -1 and (epoch + 1) % save_interval == 0:
        model.save_pretrained(
            f"run_checkpoints/{save_folder}/{model_name.split('/')[-1]}_{epoch+1}"
        )


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [4]:

# Build the trainer. Arguments are grouped by name only; see ProjectedGradLAT
# for their exact semantics.
pgd_trainer = ProjectedGradLAT(
    # Model, data and device
    model=model,
    dataloader=lat_dataloader,
    sft_dataloader=sft_dataloader,
    device=device,
    # Loss coefficients: only the DPO-style terms carry non-zero weight
    adv_loss_coefs=dict(toward=0, away=0, dpo=1),
    def_loss_coefs=dict(toward=0, away=0, dpo=1, sft=0, sft_dpo=1),
    # PGD / inner-loop arguments
    pgd_layers=[4, 12, 20, 28],
    pgd_iterations_per_step=pgd_iterations_per_step,
    epsilon=1.5,
    inner_learning_rate=5e-2,
    # Model / outer-loop arguments
    model_layers=list(range(model.config.num_hidden_layers)),
    model_layers_module="base_model.model.model.layers",
    model_iterations_per_step=model_steps_per_batch,
    outer_learning_rate=2e-5,
    only_train_lora=True,
    l2_regularization=0,
    reinitialize_dev_optim=False,
    # Run length and batching
    num_steps=num_steps,
    max_batch_per_acc=4,
    # The same logging/checkpoint hook is used for both callbacks
    init_callback=log_and_save_ckpt,
    post_def_callback=log_and_save_ckpt,
)
pgd_trainer.train(project_name="satml_backdoor_removal")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maengusl[0m ([33mquirky_lats_at_mats[0m). Use [1m`wandb login --relogin`[0m to force relogin


  3%|▎         | 30/1024 [02:41<1:29:17,  5.39s/it]


KeyboardInterrupt: 