In [1]:
from src import *
import torch
from functools import partial
import matplotlib.pyplot as plt
import numpy as np

from src import *
# Turn on autograd anomaly detection globally for this kernel session.
# NOTE(review): PyTorch documents this mode as a debugging aid that slows
# execution — confirm it is still wanted for the training run in this notebook.
torch.autograd.set_detect_anomaly(True) 

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f5acdf178e0>

### Load Model & Dataset

In [2]:
# Checkpoint selection: model size and base vs. chat variant.
llama_size = "7b"
use_base_model = True

# Pick the Hugging Face repo id matching the chosen variant.
model_name_or_path = (
    f"meta-llama/Llama-2-{llama_size}-hf"
    if use_base_model
    else f"meta-llama/Llama-2-{llama_size}-chat-hf"
)

# Load the Hugging Face model/tokenizer pair, then convert it with
# from_hf_to_tlens (gradients disabled via disable_grads=True).
hf_model, hf_tokenizer = load_model_from_transformers(model_name_or_path)
model = from_hf_to_tlens(
    hf_model,
    hf_tokenizer,
    f"llama-{llama_size}",
    disable_grads=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded pretrained model llama-7b into HookedTransformer
Moving model to device:  cuda


In [3]:
# Forget set: the "harry_potter" dataset.
forget_dataset, forget_dataloader = create_dataset(
    "harry_potter",
    model.tokenizer,
    batch_size=32,
    max_seq_length=1024,
)

# Retain set: the "owt" dataset, capped at 1000 examples with shorter sequences.
retain_dataset, retain_dataloader = create_dataset(
    "owt",
    model.tokenizer,
    batch_size=32,
    max_seq_length=512,
    num_examples=1000,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

### Train Mask


In [4]:
# Build the mask for the model, then move it to the GPU.
mask = NeuronLevelMask(model)
mask = mask.cuda()

# Fit the mask using both the retain and forget dataloaders.
train_mask(
    model=model,
    mask=mask,
    retain_dataloader=retain_dataloader,
    forget_dataloader=forget_dataloader,
    max_gpu_batch_size=4,
    learning_rate=1e-1,
)

Retain Loss: 1.810, Forget Loss: 0.020, Reg Loss: 0.062:  64%|██████▍   | 32/50 [13:11<07:25, 24.73s/it]


KeyboardInterrupt: 

### Analyze Mask

In [5]:
# Sweep 15 evenly spaced discretization thresholds in [0, 1]; at each one,
# discretize the mask and record metrics on the forget and retain dataloaders.
thresholds = np.linspace(start=0, stop=1, num=15)
change_over_time = []

for threshold in thresholds:
    mask.discretize_threshold(threshold)
    forget_results = compute_metrics(model, forget_dataloader)
    retain_results = compute_metrics(model, retain_dataloader)
    # One (forget, retain) pair per threshold, in sweep order.
    change_over_time += [(forget_results, retain_results)]

# Generate pareto plot using results.
#
# Each entry of change_over_time is a (forget_results, retain_results) pair.
# The first entry is used as the reference point, so every plotted value is a
# relative top-1 accuracy decrease with respect to that first entry.
initial_x = change_over_time[0][0]["summary"]["top1_acc"]
initial_y = change_over_time[0][1]["summary"]["top1_acc"]

# Column 0: relative decrease on the forget set; column 1: on the retain set.
points = np.array(
    [
        (
            (initial_x - forget_metrics["summary"]["top1_acc"]) / initial_x,
            (initial_y - retain_metrics["summary"]["top1_acc"]) / initial_y,
        )
        for forget_metrics, retain_metrics in change_over_time
    ]
)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()

# '-o' draws lines connecting the points with circle markers.
# x-axis: retain-set decrease (column 1); y-axis: forget-set decrease (column 0).
ax.plot(points[:, 1], points[:, 0], "-o")

# Labels name the datasets actually loaded in this notebook: the retain set is
# "owt" and the forget set is "harry_potter".
ax.set_xlabel("Top 1 Acc Decrease on OWT (retain set)")
ax.set_ylabel("Top 1 Acc Decrease on Harry Potter")

# The curve comes from sweeping the mask discretization threshold, so the
# title says so rather than quoting a per-iteration pruning rate.
ax.set_title("Pruning Out Harry Potter Knowledge (mask threshold sweep)")

# Show grid to better visualize the trade-offs.
ax.grid(True)

plt.show()



Computing loss: 100%|██████████| 3/3 [00:00<00:00,  5.92it/s]
Computing loss:  74%|███████▍  | 23/31 [00:46<00:16,  2.03s/it]


KeyboardInterrupt: 