In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import numpy as np

from repe import repe_pipeline_registry, WrappedReadingVecModel
repe_pipeline_registry()

from utils import literary_openings_dataset, quotes_dataset, quote_completion_test, historical_year_test, extract_year, eval_completions

[2023-10-15 05:49:37,172] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [49]:
# Checkpoint to load for both the model and its tokenizer.
model_name_or_path = "NousResearch/Llama-2-13b-hf"

# Load in half precision, let `device_map="auto"` choose the placement, and
# switch to eval mode (`.eval()` returns the module itself).
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    token=True,
)
model = model.eval()

# The fast tokenizer is only requested when the checkpoint is not a
# LlamaForCausalLM architecture.
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=use_fast_tokenizer,
    padding_side="left",
    legacy=False,
    token=True,
)

# Fall back to id 0 for padding when the tokenizer does not define a pad token.
existing_pad_id = tokenizer.pad_token_id
tokenizer.pad_token_id = existing_pad_id if existing_pad_id is not None else 0
tokenizer.bos_token_id = 1

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

## Reading

In [50]:
# Settings forwarded to `get_directions` in the cells below.
rep_token = -1  # presumably the index of the token whose representation is read -- TODO confirm
n_difference = 1
direction_method = 'pca'

# Negative layer indices -1, -2, ..., -(num_hidden_layers - 1).
hidden_layers = [-depth for depth in range(1, model.config.num_hidden_layers)]

rep_reading_pipeline = pipeline("rep-reading", model=model, tokenizer=tokenizer)

In [61]:
# Directory holding the memorization data, relative to this notebook.
data_dir = "../../data/memorization"

# Each loader returns a 3-tuple; only the training data and labels are kept,
# the third element is discarded.
lit_splits = literary_openings_dataset(data_dir)
lit_train_data, lit_train_labels, _ = lit_splits

quote_splits = quotes_dataset(data_dir)
quote_train_data, quote_train_labels, _ = quote_splits

In [62]:
# Keyword arguments that are identical for both direction extractions.
shared_direction_kwargs = dict(
    rep_token=rep_token,
    hidden_layers=hidden_layers,
    n_difference=n_difference,
    direction_method=direction_method,
)

# One reader per training set: literary openings and quotes.
lit_rep_reader = rep_reading_pipeline.get_directions(
    lit_train_data,
    train_labels=lit_train_labels,
    **shared_direction_kwargs,
)

quote_rep_reader = rep_reading_pipeline.get_directions(
    quote_train_data,
    train_labels=quote_train_labels,
    **shared_direction_kwargs,
)

## Quote Completions Control

In [110]:
# Early layers work
layer_id = [-depth for depth in range(30, 38)]  # -30, -31, ..., -37

# Control configuration.
block_name = "decoder_block"
control_method = "reading_vec"
batch_size = 64
coeff = 2.0  # multiplier applied to the reading directions; tune this parameter
max_new_tokens = 16

# The model is wrapped by hand here, rather than through rep_control_pipeline,
# to show the individual steps.
wrapped_model = WrappedReadingVecModel(model, tokenizer)
wrapped_model.unwrap()
# Wrap the model at the desired layers and blocks.
wrapped_model.wrap_block(layer_id, block_name=block_name)

# Prompts and reference completions for the quote-completion test.
inputs, targets = quote_completion_test(data_dir)

In [111]:
# helper functions
def apply_activations(wrapped_model,
                      inputs,
                      activations,
                      batch_size=8,
                      use_tqdm=True,
                      **generation_kwargs,
                     ):
    """Generate completions for ``inputs`` with ``activations`` installed on the model.

    Args:
        wrapped_model: Object exposing ``reset()``,
            ``set_controller(layers, activations, masks=...)`` and
            ``generate(prompts, **kwargs)``; ``generate`` must return one
            decoded string per prompt.
        inputs: Sequence of prompt strings; it is sliced into batches.
        activations: Mapping from layer id to the activation passed to
            ``set_controller``. The controlled layers are taken from its keys,
            in iteration order.
        batch_size: Number of prompts per ``generate`` call.
        use_tqdm: Wrap the batch loop in a ``tqdm`` progress bar when true.
        **generation_kwargs: Forwarded unchanged to ``wrapped_model.generate``.

    Returns:
        A list with one string per prompt: the decoded output with the first
        occurrence of its prompt removed.
    """
    # Use the layers present in `activations` rather than a notebook-level
    # global, so the helper only depends on its own arguments.
    controlled_layers = list(activations)

    batch_starts = range(0, len(inputs), batch_size)
    if use_tqdm:
        batch_starts = tqdm(batch_starts)

    generated = []
    wrapped_model.reset()
    try:
        wrapped_model.set_controller(controlled_layers, activations, masks=1)
        for start in batch_starts:
            prompts = inputs[start:start + batch_size]
            decoded = wrapped_model.generate(prompts, **generation_kwargs)
            # Remove only the first occurrence of the prompt (the echoed
            # input); a completion that repeats the prompt text stays intact.
            generated.extend(
                text.replace(prompt, "", 1) for text, prompt in zip(decoded, prompts)
            )
    finally:
        # Always clear the controller, even if generation fails, so the shared
        # model is not left in a steered state.
        wrapped_model.reset()
    return generated

In [112]:
def _steering_vectors(rep_reader, scale):
    """Return {layer: scale * direction * sign} for every layer in ``layer_id``.

    Each entry is built from ``rep_reader.directions`` and
    ``rep_reader.direction_signs``, moved to ``model.device`` and cast to half
    precision.
    """
    return {
        layer: torch.tensor(
            scale * rep_reader.directions[layer] * rep_reader.direction_signs[layer]
        ).to(model.device).half()
        for layer in layer_id
    }


def _complete_and_score(label, rep_reader, scale):
    """Print ``label``, generate completions under the scaled directions, print the scores.

    Returns the generated completions so the caller can keep them.
    """
    print(label)
    outputs = apply_activations(
        wrapped_model,
        inputs,
        _steering_vectors(rep_reader, scale),
        batch_size=batch_size,  # use the configured batch size, not a second literal
        max_new_tokens=max_new_tokens,
        use_tqdm=False,
    )
    print(eval_completions(outputs, targets))
    return outputs


for t, rep_reader in zip(['literature openings', 'quotes'], [lit_rep_reader, quote_rep_reader]):
    print("RepReader:", t)
    # Baseline: the same hooks are installed, but the directions are scaled to zero.
    baseline_outputs = _complete_and_score("No Control", rep_reader, 0 * coeff)
    # Steer along the reading direction, then against it.
    pos_outputs = _complete_and_score("+ Memorization", rep_reader, coeff)
    neg_outputs = _complete_and_score("- Memorization", rep_reader, -coeff)

RepReader: literature openings
No Control
{'em': 0.8932038834951457, 'sim': 0.9694047633884022}
+ Memorization
{'em': 0.8349514563106796, 'sim': 0.9128068606685666}
- Memorization
{'em': 0.39805825242718446, 'sim': 0.6893937340349827}
RepReader: quotes
No Control
{'em': 0.8932038834951457, 'sim': 0.9694047633884022}
+ Memorization
{'em': 0.7766990291262136, 'sim': 0.9141578347358889}
- Memorization
{'em': 0.5242718446601942, 'sim': 0.7370101986724196}
