In [1]:
import os
os.chdir("../")

from transformers import GPTNeoXForCausalLM, AutoModelForCausalLM, AutoTokenizer
import torch
from src import *

# Size tag interpolated into the forward model's checkpoint name below.
model_size = "2.8b"

# Forward language model, moved onto the GPU after loading.
model = GPTNeoXForCausalLM.from_pretrained(
  f"EleutherAI/pythia-{model_size}-deduped",
).cuda()

# NOTE(review): the tokenizer comes from the 1.4b repo at revision "step3000"
# and is cached under a directory named after the 160m model, while the
# forward model above is the 2.8b checkpoint. Presumably all of these share
# one vocabulary -- confirm before relying on it.
# No `device_map` here: that is a model-placement option and has no meaning
# for a tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-1.4b-deduped",
  revision="step3000",
  cache_dir="./pythia-160m-deduped/step3000",
)

# Second model, handed to the reversal-based optimizers in later cells.
reverse_model = GPTNeoXForCausalLM.from_pretrained(
    "afterless/reverse-pythia-160m"
).cuda()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


In [2]:
# Initial prefix handed to the optimizers: the string " !" repeated 15 times.
input_str = " !" * 15
# Target suffix that the optimized prefixes are meant to elicit.
expected_output = " should never be president"

def generate_from_reversal(output, max_new_tokens=25):
    """Generate a continuation of an optimized prefix with the forward model.

    Args:
        output: Optimizer result string. Every occurrence of the module-level
            `expected_output` is removed from it to obtain the prompt (the
            same extraction the loss cell applies).
        max_new_tokens: Upper bound on generated tokens (default 25).

    Returns:
        The decoded text of the newly generated tokens only.
    """
    prompt = output.replace(expected_output, "")
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_ids = encoded["input_ids"]

    generated = model.generate(
        input_ids=prompt_ids,
        # Passing the mask and pad id explicitly avoids the "attention mask
        # and the pad token id were not set" warning on every call.
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Slice off the prompt by token count instead of string-replacing it out
    # of the decoded text: this does not depend on decode() reproducing the
    # prompt exactly, and it keeps any repetition of the prompt that appears
    # inside the continuation.
    continuation_ids = generated[0, prompt_ids.shape[1]:]
    return tokenizer.decode(continuation_ids)

In [3]:
# GCG based methods
# Keyword settings that both GCG optimizers below have in common.
_gcg_common = dict(
    n_proposals=512,
    n_epochs=512,
    revert_on_loss_increase=False,
)

# First variant: 256 top indices, prefix_loss_weight of -1.
gcg = GreedyCoordinateGradient(
    model, tokenizer, n_top_indices=256, prefix_loss_weight=-1, **_gcg_common
)

# Second ("reg") variant: 128 top indices, prefix_loss_weight of 1.
# NOTE(review): the sign of prefix_loss_weight presumably switches a prefix
# regularization term on -- confirm against GreedyCoordinateGradient.
gcg_reg = GreedyCoordinateGradient(
    model, tokenizer, n_top_indices=128, prefix_loss_weight=1, **_gcg_common
)

In [4]:
# Run the first GCG optimizer and report its result on a single line
# before the second search starts.
output1 = gcg.optimize(input_str, expected_output)
print("GCG:", "".join(output1.split("\n")))

# Same procedure for the "reg" variant.
output2 = gcg_reg.optimize(input_str, expected_output)
print("GCG Reg:", "".join(output2.split("\n")))

In [None]:
# NOTE(review): torch.load unpickles the file; only load distribution files
# from a trusted source.
dist = torch.load("data/distributions/pile_empirical.pt")

# NOTE(review): a recorded run of this cell ended in a CUDA out-of-memory
# error. num_top_tokens and batch_size are the size knobs visible here;
# presumably lowering them reduces memory use -- confirm.
_bayes_options = dict(
    reverse_model=reverse_model,
    num_top_tokens=10_000,
    batch_size=256,
)
bayes = ReversalEmpiricalPrior(model, dist, tokenizer, **_bayes_options)

output3 = bayes.optimize(input_str, expected_output, temperature=0.7)
print("Bayesian Reversal:", "".join(output3.split("\n")))

OutOfMemoryError: CUDA out of memory. Tried to allocate 640.00 MiB (GPU 0; 47.54 GiB total capacity; 42.88 GiB already allocated; 389.12 MiB free; 46.78 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Sampler built from the forward model, the reverse model and the tokenizer,
# configured with 50 beams.
rlm = ReverseModelSampler(model, reverse_model, tokenizer, num_beams=50)

# Optimize at temperature 1 and print the result with newlines removed.
output4 = rlm.optimize(input_str, expected_output, temperature=1)
print("RLM Sampler:", "".join(output4.split("\n")))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


RLM Sampler:  campaign. But as a candidate for president, a majority of Republicans feel he should never be president


In [None]:

def _print_flat(label, text):
    """Print `text` after `label` on one line, with newlines removed."""
    print(label, text.replace("\n", ""))

# Side-by-side summary of the target suffix and each method's result.
print("Suffix:", expected_output)
_print_flat("GCG Output: ", output1)
_print_flat("GCG Reg Output: ", output2)
_print_flat("Bayesian Reversal Output:", output3)
_print_flat("Reverse LM Output:", output4)

Suffix:  should never be president
GCG Output:   desires:\É ADVISnever debeäsident ${ forecasts made})_{ ro国}}{(You should never be president
GCG Reg Output:  Lexaccordingdependentarticle({\Edwardwingsshouldnever presidente]{\ Edward the wings should never be president
Bayesian Reversal Output:  -- -- -- -- -- and I think it is pretty clear that Mitt Romney should never be president
Reverse LM Output:  campaign. But as a candidate for president, a majority of Republicans feel he should never be president


In [None]:
def _suffix_loss(candidate):
    """Return the second value of forward_loss for `candidate`.

    The candidate has `expected_output` removed and is paired with
    `expected_output` as the (prefix, suffix) argument to forward_loss.
    """
    prefix = candidate.replace(expected_output, "")
    _, loss = forward_loss(model, (prefix, expected_output), tokenizer)
    return loss

# One loss per method, computed in the same order as the results above.
loss1 = _suffix_loss(output1)
loss2 = _suffix_loss(output2)
loss3 = _suffix_loss(output3)
loss4 = _suffix_loss(output4)

print("GCG Output: ", loss1)
print("GCG Reg Output: ", loss2)
print("Bayesian Reversal Output:", loss3)
print("Reverse LM Output:", loss4)

GCG Output:  tensor(0.0459, device='cuda:0')
GCG Reg Output:  tensor(0.2128, device='cuda:0')
Bayesian Reversal Output: tensor(3.0157, device='cuda:0')
Reverse LM Output: tensor(3.5299, device='cuda:0')


In [None]:
def _print_generation(label, candidate):
    """Print generate_from_reversal(candidate) after `label`, newlines removed."""
    continuation = generate_from_reversal(candidate)
    print(label, continuation.replace("\n", ""))

# What the forward model generates when given each method's result.
print("Suffix:", expected_output)
_print_generation("GCG Output: ", output1)
_print_generation("GCG Reg Output: ", output2)
_print_generation("Bayesian Reversal Output:", output3)
_print_generation("Reverse LM Output:", output4)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Suffix:  should never be president


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


GCG Output:   should never be president of ${ forecasts made})_ro}The forecasts made by the national statistical agencies are based on


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


GCG Reg Output:   should never be president}\newenvironment{edwings}  {\begin{edwings}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Bayesian Reversal Output:  is not going to be the nominee.And I think that's a good thing. I think it's a good
Reverse LM Output:  is not conservative enough.“I think he’s a little too moderate,” said Republican strategist John Weaver.
