# Model generations after LEACE

### User data
To use this notebook, you need to specify the current working directory and a Hugging Face [access token](https://huggingface.co/docs/hub/security-tokens).

In [10]:
# modify to your current working directory (the directory that contains this notebook)
cwd = "exploring_directions"

# Enter your Hugging Face access token when prompted and press enter.
# getpass is used instead of input() so that the token is not echoed into the
# cell output, where it would be stored together with the saved notebook.
from getpass import getpass
auth_token = getpass("Hugging Face access token: ")

In [20]:
# Log in to the Hugging Face Hub with the token stored in `auth_token`.
from huggingface_hub import login
login(token = auth_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /data/ann_kathrin_dombrowski/.cache/huggingface/token
Login successful


In [11]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import gc
import numpy as np

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTNeoXForCausalLM
from concept_erasure import LeaceEraser
from transformer_lens import HookedTransformer


In [12]:
# import my modules
import sys
import importlib
# make the "modules" folder inside the current working directory importable

sys.path.append(os.path.join(cwd, "modules"))
import wrapping
import utils

# reload the local modules so that edits to them are picked up by a running kernel
importlib.reload(wrapping)
importlib.reload(utils)

from wrapping import WrappedModel
# NOTE(review): the star import is presumably where make_dir, load_util_data,
# mix_util_pairs and plot_lines come from — confirm, and consider importing them by name
from utils import *

In [13]:
# define directories
# NOTE(review): make_dir is not defined in this notebook (presumably from utils);
# it is assumed to create the directory if needed and return its path — confirm
results_dir =  make_dir(os.path.join(cwd, 'results/'))
plots_dir = make_dir(os.path.join(cwd, 'plots'))
data_dir = os.path.join(cwd, 'data/ethics/utilitarianism/')

# model identifier on the Hugging Face Hub and the dtype the model is cast to after loading
model_name = "Llama-2-7b-chat-hf"
model_path = f"meta-llama/{model_name}"
precision = torch.bfloat16
# use the GPU when one is available, otherwise fall back to the CPU
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

cuda


## Load Data

We show results on the utility dataset. You can download it from [here](https://people.eecs.berkeley.edu/~hendrycks/ethics.tar). Just copy the downloaded folder into the data folder.

In [14]:
# load the train and test split of the utility dataset; each has two columns (see printed shapes)
X_train = load_util_data(data_dir=data_dir, split='train')
X_test = load_util_data(data_dir=data_dir, split='test')
print(X_train.shape, X_test.shape)

# NOTE(review): mix_util_pairs is not defined in this notebook (presumably from utils);
# it returns the sentence pairs together with one label per pair — confirm how pairs are mixed
X_train, y_train = mix_util_pairs(X_train)
X_test, y_test = mix_util_pairs(X_test)

(13738, 2) (4808, 2)


### Example sentences
The label is 1 if the first sentence describes a scenario with higher utility (i.e. a more pleasant scenario) than the second sentence, and 0 otherwise.

In [15]:
# print the first five sentence pairs of the training set next to their label
for idx in range(5):
    first, second = X_train[idx, 0], X_train[idx, 1]
    print(f"sentence 1: {first}")
    print(f"sentence 2: {second}")
    print(f"  -> label: {y_train[idx]}")

sentence 1: I built a sandcastle with my nephew. We made one small castle.
sentence 2: I built a sandcastle with my nephew
  -> label: 1
sentence 1: I forgot to wear my virus-proof mask when I visited the pet store.
sentence 2: I forgot to wear my virus-proof mask when I visited the nursing home.
  -> label: 1
sentence 1: I had to give a speech for class. My friends applauded for me.
sentence 2: I had to give a speech for class.
  -> label: 1
sentence 1: I cooked breakfast.  I threw it up after.
sentence 2: I cooked breakfast.
  -> label: 0
sentence 1: I go to buy a new saxophone from the instrument shop. Their are no saxophones left at the shop.
sentence 2: I go to buy a new saxophone from the instrument shop.
  -> label: 0


## Load Model

In [22]:
# load model
# torch_dtype makes from_pretrained create the weights directly in `precision`
# instead of loading them in the default dtype first and casting afterwards,
# which lowers the peak memory needed while loading.
# The trailing .to(...) is kept so the model ends up on DEVICE in `precision` exactly as before.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    use_auth_token=True,
    device_map="auto",
    torch_dtype=precision,
).to(device=DEVICE, dtype=precision)
# switch to inference mode (disables dropout etc.)
model.eval()

# load tokenizer (a tokenizer holds no weights, so it does not take a device_map)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_auth_token=True)
# use the end-of-sequence token for padding so that sentences can be batched
tokenizer.pad_token = tokenizer.eos_token
# pad on the left so that padding never occupies the final positions of a sequence
tokenizer.padding_side = 'left'



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



## Get hidden activations
We use the custom WrappedModel class to get the internal activations. There are other ways to do this (for example with TransformerLens or baukit).
We first add a prompt to each sentence to push the model into considering the concept that we aim to extract.
We then run each sentence through the model and save the hidden activations in each layer.
By default, we extract the output of the decoder block, i.e. the residual stream.

In [23]:
layer_ids = np.arange(0, model.config.num_hidden_layers) # which layers do we want to extract the hidden states from?

hidden_dim = model.config.hidden_size
block_name = "decoder_block" # should be one of ['self_attn', 'mlp', 'input_layernorm', 'post_attention_layernorm', 'decoder_block']
token_pos = -1 # at which token do we want to extract the hidden states? -1 means the last token.
batch_size = 128 # number of sentences that are run through the model at once
# WRAP MODEL
# wrap the model in a class that allows to access the hidden states
# NOTE(review): WrappedModel comes from the local `wrapping` module; the exact
# behaviour of unwrap()/wrap_block() is not visible in this notebook — confirm there
wrapped_model = WrappedModel(model, tokenizer)
# make sure nothing is wrapped from previous runs
wrapped_model.unwrap()
# wrap the block you want to wrap
wrapped_model.wrap_block(layer_ids, block_name=block_name)

In [24]:
# prompt template wrapped around every sentence; the {scenario} placeholder is replaced by the sentence
format_prompt = 'Consider the amount of happiness in the following scenario:\n"{scenario}"\nThe amount of happiness in the scenario is '

In [25]:
def batchify(lst, batch_size):
    """Generate consecutive slices of ``lst`` with at most ``batch_size`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``batch_size``.
    """
    starts = range(0, len(lst), batch_size)
    yield from (lst[start:start + batch_size] for start in starts)

def calc_hidden_states(wrapped_model, sentences, format_prompt, batch_size=128, token_pos=-1, layers=None, block=None):
    """Run the sentences through the wrapped model and collect the hidden states per layer.

    Args:
        wrapped_model: object providing ``reset()``, ``run_prompt(list_of_str)`` and
            ``get_activations(layers, block_name=..., token_pos=...)``.
        sentences: sequence of sentences; each one is inserted into ``format_prompt``.
        format_prompt: template string containing a ``{scenario}`` placeholder.
        batch_size: number of sentences that are run through the model at once.
        token_pos: token position at which the hidden states are read.
        layers: layers to read; defaults to the notebook-level ``layer_ids``.
        block: name of the wrapped block to read; defaults to the notebook-level ``block_name``.

    Returns:
        dict mapping every key returned by ``get_activations`` to a numpy array in which
        the activations of all batches are concatenated along axis 0 (in input order).
        An empty dict is returned when ``sentences`` is empty.
    """
    # fall back to the notebook globals so that existing calls keep working unchanged
    layers_to_read = layer_ids if layers is None else layers
    block_to_read = block_name if block is None else block

    # ceiling division: a partial last batch counts as a batch for the progress bar
    n_batches = (len(sentences) + batch_size - 1) // batch_size

    per_layer_chunks = {}
    for sentence_batch in tqdm(batchify(sentences, batch_size), total=n_batches):
        # drop the activations stored for the previous batch before running the next one
        wrapped_model.reset()
        gc.collect()

        formatted_sentences = [format_prompt.format_map({'scenario': s}) for s in sentence_batch]
        # get activations
        _ = wrapped_model.run_prompt(formatted_sentences) # this saves the hidden states in the wrapped_model object
        # output is a dictionary with layer_ids as keys and the hidden states of the token_pos as values
        outputs = wrapped_model.get_activations(layers_to_read, block_name=block_to_read, token_pos=token_pos)
        for key, values in outputs.items():
            # collect the per-batch arrays and concatenate once at the end instead of
            # re-copying the growing array for every batch
            per_layer_chunks.setdefault(key, []).append(values.detach().float().cpu().numpy())

    hidden_states = {}
    for key in list(per_layer_chunks):
        # pop the chunks so that their memory can be released layer by layer
        hidden_states[key] = np.concatenate(per_layer_chunks.pop(key), axis=0)
    return hidden_states

In [26]:
# Extract the hidden states for the first (column 0) and second (column 1) sentence of
# every pair; the generators are consumed in order, so the train split is processed
# before the test split and column 0 before column 1.
H0_train, H1_train = (
    calc_hidden_states(wrapped_model, X_train[:, column], format_prompt, batch_size=batch_size, token_pos=token_pos)
    for column in (0, 1)
)
H0_test, H1_test = (
    calc_hidden_states(wrapped_model, X_test[:, column], format_prompt, batch_size=batch_size, token_pos=token_pos)
    for column in (0, 1)
)

108it [04:15,  2.37s/it]                         
108it [04:09,  2.31s/it]                         
38it [00:49,  1.31s/it]                        
38it [00:47,  1.24s/it]                        


We split the training set into two halves: the first half is used to fit the eraser, and the second, untouched half is used to train a classifier on the erased data.
For some methods we use the differences between contrastive pairs. We standardize all data.

In [27]:
# the first half of the training pairs goes to the "_s" sets, the second half to the "_clf" sets
n_dir_find = H0_train[0].shape[0] // 2
start_clf = n_dir_find

y_train_s, y_train_clf = y_train[:n_dir_find], y_train[start_clf:]

# per-layer activations of the first / second sentence, split into the two halves
H0_train_s = {layer: H0_train[layer][:n_dir_find] for layer in H0_train}
H1_train_s = {layer: H1_train[layer][:n_dir_find] for layer in H0_train}
H0_train_clf = {layer: H0_train[layer][start_clf:] for layer in H0_train}
H1_train_clf = {layer: H1_train[layer][start_clf:] for layer in H0_train}

# per-layer differences between the two sentences of every pair
H_train_s = {layer: H0_train_s[layer] - H1_train_s[layer] for layer in H0_train}
H_train_clf = {layer: H0_train_clf[layer] - H1_train_clf[layer] for layer in H0_train}
H_test = {layer: H0_test[layer] - H1_test[layer] for layer in H0_train}

In [28]:
# normalizing data
# NOTE: this cell overwrites its inputs. Running it a second time refits the scalers on
# already standardized data, so `scalers`/`dscalers` would no longer hold the statistics
# of the raw activations. Re-run the split cell above before re-running this one.
scalers = {}   # per layer: scaler fitted on the single-sentence activations
dscalers = {}  # per layer: scaler fitted on the pair differences
for layer in layer_ids:

    # fit on the first half of the training set only (both sentences of each pair) ...
    scaler = StandardScaler()
    scaler.fit(np.concatenate([H0_train_s[layer], H1_train_s[layer]], axis=0))
    scalers[layer] = scaler
    # ... and apply the same transform to all single-sentence activations
    H0_train_s[layer] = scaler.transform(H0_train_s[layer])
    H1_train_s[layer] = scaler.transform(H1_train_s[layer])
    H0_train_clf[layer] = scaler.transform(H0_train_clf[layer])
    H1_train_clf[layer] = scaler.transform(H1_train_clf[layer])
    H0_test[layer] = scaler.transform(H0_test[layer])
    H1_test[layer] = scaler.transform(H1_test[layer])

    
    # the pair differences get their own scaler, again fitted on the first half only
    dscalers[layer] = StandardScaler()
    dscalers[layer].fit(H_train_s[layer])
    H_train_s[layer] = dscalers[layer].transform(H_train_s[layer])
    H_train_clf[layer] = dscalers[layer].transform(H_train_clf[layer])
    H_test[layer] = dscalers[layer].transform(H_test[layer])

In [30]:
# logistic-regression probe
def linear_regr(H_trainl, H_testl, y_train, y_test):
    """Fit a logistic-regression classifier and report its train and test accuracy.

    The features are standardized with statistics computed on the training
    features only; the same transform is then applied to the test features.

    Args:
        H_trainl: training features.
        H_testl: test features.
        y_train: training labels.
        y_test: test labels.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    standardizer = StandardScaler()
    train_features = standardizer.fit_transform(H_trainl)
    test_features = standardizer.transform(H_testl)

    probe = LogisticRegression(solver='liblinear', C=1e-2, random_state=0, max_iter=50)
    probe = probe.fit(train_features, y_train)

    train_accuracy = probe.score(train_features, y_train)
    test_accuracy = probe.score(test_features, y_test)
    return train_accuracy, test_accuracy
    
def leace(H_train_s, y_train_s, H_train_clf, y_train_clf, H_test, y_test, layers=None):
    """Erase the label information with LEACE per layer and measure what a classifier still recovers.

    For every layer an eraser is fitted on ``H_train_s``/``y_train_s``, applied to
    ``H_train_clf`` and ``H_test``, and ``linear_regr`` is then trained on the erased
    ``H_train_clf`` and evaluated on the erased ``H_test``.

    Args:
        H_train_s: dict layer -> numpy array used to fit the eraser.
        y_train_s: numpy array with the labels belonging to ``H_train_s``.
        H_train_clf: dict layer -> numpy array used to train the classifier after erasure.
        y_train_clf: labels belonging to ``H_train_clf``.
        H_test: dict layer -> numpy array used to test the classifier after erasure.
        y_test: labels belonging to ``H_test``.
        layers: iterable of layer keys to process; defaults to all keys of ``H_train_s``.
            (The default is resolved at call time, so defining this function does not
            require a loaded model.)

    Returns:
        Tuple ``(train_acc_l, test_acc_l, erasers)`` of dicts keyed by layer.
    """
    if layers is None:
        layers = list(H_train_s.keys())

    train_acc_l, test_acc_l = {}, {}
    erasers = {}
    for layer in tqdm(layers):
        H_trainl, H_train_clfl, H_testl = H_train_s[layer], H_train_clf[layer], H_test[layer]
        # eraser is trained on first half of training set
        erasers[layer] = LeaceEraser.fit(torch.from_numpy(H_trainl), torch.from_numpy(y_train_s))
        # erase from second half of training set and test set
        H_train_clf_tch = erasers[layer](torch.from_numpy(H_train_clfl))
        H_test_tch = erasers[layer](torch.from_numpy(H_testl))
        # back to numpy for scikit-learn
        H_train_clfl = H_train_clf_tch.detach().cpu().numpy()
        H_testl = H_test_tch.detach().cpu().numpy()

        acctr, accte = linear_regr(H_train_clfl, H_testl, y_train_clf, y_test)
        train_acc_l[layer] = acctr
        test_acc_l[layer] = accte

    return train_acc_l, test_acc_l, erasers

In [31]:
# erase with leace
# returns, per layer, the classifier's train/test accuracy after erasure and the fitted eraser
train_accs, test_accs, erasers = leace(H_train_s, y_train_s, H_train_clf, y_train_clf, H_test, y_test, layers=layer_ids)

 72%|███████▏  | 23/32 [08:57<03:31, 23.45s/it]

In [None]:
# store the per-layer train/test accuracies obtained after erasure
fname = os.path.join(results_dir, f'removal_{model_name}.pkl')
with open(fname, mode='wb') as out_file:
    pickle.dump([train_accs, test_accs], out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# store the fitted erasers (one per layer)
fname = os.path.join(results_dir, f'erasers_{model_name}.pkl')
with open(fname, mode='wb') as out_file:
    pickle.dump(erasers, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# load data
# reads back the [train_accs, test_accs] list written by the save cell above
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself
fname = os.path.join(results_dir, f'removal_{model_name}.pkl')
with open(fname, 'rb') as f:
    train_accs, test_accs = pickle.load(f)

In [None]:
# load data
# reads back the dict of erasers written by the save cell above
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself
fname = os.path.join(results_dir, f'erasers_{model_name}.pkl')
with open(fname, 'rb') as f:
    erasers = pickle.load(f)

In [None]:
# plot
# train_accs / test_accs are dicts keyed by layer, so `method_names` receives the layer keys here.
# NOTE(review): plot_lines is not defined in this notebook (presumably from utils) — confirm
# that it expects a layer-keyed dict and that layer keys are the intended `method_names`
plot_lines(train_accs, "Train accuracy", save_path=os.path.join(plots_dir, f'removal_train_accs_{model_name}.png'), method_names=train_accs.keys(), loc='lower right')
plot_lines(test_accs, "Test accuracy", save_path=os.path.join(plots_dir, f'removal_test_accs_{model_name}.png'), method_names=test_accs.keys(), loc='lower right')