# Unlearning Harry Potter with LAT

This notebook uses latent adversarial training (LAT) to improve on the "Who's Harry Potter" (WHP) method for unlearning Harry Potter knowledge.

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
    
import os
import sys
import torch
import datasets
from dotenv import load_dotenv
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
# Switch the working directory to the parent directory and make it importable,
# so that the `latent_at` import below can be resolved from there.
# NOTE(review): the chdir is relative, so re-running this cell in the same
# kernel moves up one more directory each time. Restart the kernel before
# re-running it.
os.chdir("../")
cwd = os.getcwd()
# Put the new working directory at the front of the import path, only once.
if cwd not in sys.path:
    sys.path.insert(0, cwd)
# Star import: presumably this is where process_pretokenized_dataset,
# process_generic_sft_dataset, the data collators and ProjectedGradLAT used
# in later cells come from, since nothing else imports them. TODO confirm.
from latent_at import *

# Load variables from a .env file (if one is found) into the environment, then
# read the Hugging Face token. os.getenv returns None when the variable is unset.
load_dotenv()
hf_access_token = os.getenv("HUGGINGFACE_API_KEY")

[2024-07-13 13:21:12,676] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




## Model

In [2]:
# Load the Llama-2-7b chat checkpoint in bfloat16 and move it to the GPU.
# The access token read from the environment is passed to authenticate the download.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=hf_access_token, torch_dtype=torch.bfloat16).cuda()
# The tokenizer comes from the same repository, so it is given the same token as
# the model; otherwise this call only works if credentials are cached locally.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=hf_access_token)
# Reuse the unknown-token id as the padding id and pad on the left.
tokenizer.pad_token_id = tokenizer.unk_token_id
tokenizer.padding_side = "left"
device="cuda"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Data

In [3]:
# System prompt text and prompt-template switches.
# NOTE(review): none of these four names is referenced again in the cells
# visible in this notebook; confirm whether they are still needed.
sys_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
use_tokenizer_template = True
custom_prompt_template = None
custom_completion_template = None

# Generic-prediction data for the Harry Potter text ("train" split).
# Local alternative kept for reference:
#   hp_generic_dataset = datasets.load_from_disk("tasks/hp/data/generic_predictions.hf")
hp_generic_dataset = datasets.load_dataset(
    "PhillipGuo/WHP_Generic_Predictions",
    split="train",
)
def add_label_indices(example):
    """Drop the first label and record one index per remaining token position.

    The first entry of ``example['labels']`` is discarded because it is not a
    label for any part of the sentence. ``example['label_indices']`` is set to
    ``0 .. len(example['tokens']) - 2``. The mapping is modified in place and
    the same object is returned.
    """
    shifted_labels = example['labels'][1:]
    example['labels'] = shifted_labels
    positions = range(len(example['tokens']) - 1)
    example['label_indices'] = [idx for idx in positions]
    return example

# Shift the labels and attach their indices, then convert the pretokenized
# dataset into the form consumed by the collator below.
hp_generic_dataset = hp_generic_dataset.map(add_label_indices)
hp_generic_dataset = process_pretokenized_dataset(
    tokenizer=tokenizer,
    dataset=hp_generic_dataset,
    prompt_column="tokens",
    # adversary steers towards the prompt tokens
    adv_labels_column=None,
    # unlearned model steers towards generic labels
    def_labels_column="labels",
    # indices that go with the generic labels (the original comment here was cut off)
    def_labels_indices_column="label_indices",
)

# Build the collator first so the DataLoader call stays short.
hp_collator = PretokenizedLatentAdversarialTrainingDataCollator(
    tokenizer.pad_token_id,
    truncate_length=2048,
)
hp_dataloader = DataLoader(
    hp_generic_dataset,
    batch_size=16,
    shuffle=False,
    collate_fn=hp_collator,
)

# Supervised finetuning (SFT) data from wikitext-103, interleaved with LAT
# because doing so stabilizes training.
sft_dataset = process_generic_sft_dataset(
    tokenizer,
    dataset="wikitext",
    config="wikitext-103-v1",
    split="train",
    text_column="text",
    num_examples=100000,
)

# Build the collator first so the DataLoader call stays short.
sft_collator = LatentAdversarialTrainingDataCollator(
    tokenizer.pad_token_id,
    truncate_length=2048,
)
sft_dataloader = DataLoader(
    sft_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=sft_collator,
)

Completed adding/renaming columns, performing checks


Map:   0%|          | 0/64723 [00:00<?, ? examples/s]

## Trainer

In [4]:
# Configure the projected-gradient LAT trainer: the attack is applied at
# layer 12 while layers 13-15 of the model are trained, using the Harry Potter
# dataloader for LAT and the wikitext dataloader for supervised finetuning.
pgd_trainer = ProjectedGradLAT(
    model=model,  # model to train
    dataloader=hp_dataloader,  # dataloader for LAT
    sft_dataloader=sft_dataloader,  # dataloader for supervised finetuning
    def_loss_coefs={"toward": 1, "away": 1, "sft": 1,},  # coefficients of the model's loss terms
    pgd_layers=[12],  # which layers to attack
    model_layers=[13, 14, 15],  # which layers to train
    epsilon=3.0,  # attack L2 constraint
    outer_learning_rate=5e-5,  # model learning rate
    pgd_iterations_per_step=16,  # how many steps of projected gradient descent to do
    model_iterations_per_step=4,  # how many times to train on each step
    num_steps=100,  # number of training steps (not epochs: the run's progress bar counts 0/100 at ~42 s/it)
    max_batch_per_acc=2,  # max size of a minibatch
    only_train_lora=False,  # whether to train using low-rank adapters
    model_layers_module="model.layers",  # where the model layers are
)

## Run!

In [5]:
# Start training. `project_name` is presumably the Weights & Biases project
# (the cell output shows a wandb login); confirm against the trainer.
pgd_trainer.train(project_name="unlearning_whp_test")
# Uncomment to save the trained model to a local directory afterwards.
# pgd_trainer.model.save_pretrained("unlearning_whp_test_save")

[34m[1mwandb[0m: Currently logged in as: [33mthestephencasper[0m ([33mscasper_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|                                                                                                                                      | 0/100 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
 31%|██████████████████████████████████████▊                                                                                      | 31/100 [21:50<48:36, 42.27s/it]


KeyboardInterrupt: 