In [1]:
from transformers import GPTNeoXForCausalLM, AutoModelForCausalLM, AutoTokenizer
import torch
from src import *


# Single source of truth for the forward checkpoint, so the model and its
# tokenizer are always loaded from the same Hugging Face repository.
MODEL_NAME = "EleutherAI/pythia-160m-deduped"

# Forward language model, moved to the GPU.
model = GPTNeoXForCausalLM.from_pretrained(
  MODEL_NAME,
).cuda()

# Tokenizer for the forward model. It is loaded from the same checkpoint as
# `model` (default revision, default cache) rather than from an unrelated
# checkpoint / training step, and without `device_map`, which is a
# model-placement option and has no meaning for a tokenizer.
# NOTE(review): the Pythia suite is documented as sharing one tokenizer across
# model sizes, so tokenization should be unchanged — confirm if in doubt.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Reverse language model used by the reverse-model sampler below, also on GPU.
reverse_model = GPTNeoXForCausalLM.from_pretrained(
    "afterless/reverse-pythia-160m"
).cuda()


In [2]:
# Random starting prompt built by rand_init from the constant 25 and the
# tokenizer (presumably 25 is the prompt length in tokens — TODO confirm
# against rand_init in src).
input_str = rand_init(25, tokenizer)
expected_output = " should never be president"

# Reinitialize string to be a sample from RLM
rlm = ReverseModelSampler(model, reverse_model, tokenizer, num_beams=1)
rlm_sample = rlm.optimize(input_str, expected_output, temperature=0.01)
# Keep everything before the LAST occurrence of the suffix. Splitting on the
# first occurrence would truncate the prefix whenever the sampled text itself
# contains the suffix phrase.
input_str = rlm_sample.rsplit(expected_output, 1)[0]

print(input_str.replace("\n", ""))
print(expected_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


, that it is up to him, that it is up to him, that it is up to him, and that he
 should never be president


In [3]:
# GCG based methods
# Both optimizers share the same search hyper-parameters and differ only in
# `prefix_loss_weight`.
# NOTE(review): -1 presumably disables the prefix regularizer — confirm
# against GreedyCoordinateGradient in src.
gcg_search_kwargs = dict(n_top_indices=128, n_proposals=1024, n_epochs=512)

gcg = GreedyCoordinateGradient(
    model, tokenizer, prefix_loss_weight=-1, **gcg_search_kwargs
)
gcg_reg = GreedyCoordinateGradient(
    model, tokenizer, prefix_loss_weight=0.4, **gcg_search_kwargs
)

In [4]:
# Run each GCG variant from the same starting prompt and print its result on
# a single line (newlines removed from the displayed text only).
output1 = gcg.optimize(input_str, expected_output)
gcg_one_line = "".join(output1.split("\n"))
print("GCG:", gcg_one_line)

output2 = gcg_reg.optimize(input_str, expected_output)
gcg_reg_one_line = "".join(output2.split("\n"))
print("GCG Reg:", gcg_reg_one_line)

GCG:  someday”— itбysipolar Justice, that94 powers Chad McCain six, that it is horrific to him alleged spends arguing he should never be president
GCG Reg: , that it is up to Obama, that it is up to him, that it is up Sadly him, and that he should never be president


In [5]:
# Load the saved distribution object that ReversalEmpiricalPrior takes as its
# second argument.
# NOTE(review): torch.load deserializes via pickle — only load this file from
# a trusted source.
dist = torch.load("data/distributions/pile_empirical.pt")

# Optimize from the same starting prompt as the other methods.
bayes = ReversalEmpiricalPrior(model, dist, tokenizer)
output3 = bayes.optimize(input_str, expected_output, temperature=0.7)

bayes_one_line = "".join(output3.split("\n"))
print("Bayesian Reversal:", bayes_one_line)


100%|██████████| 50/50 [00:08<00:00,  5.88it/s]
100%|██████████| 50/50 [00:14<00:00,  3.48it/s]
100%|██████████| 50/50 [00:18<00:00,  2.77it/s]
100%|██████████| 50/50 [00:20<00:00,  2.41it/s]
100%|██████████| 50/50 [00:25<00:00,  1.93it/s]
100%|██████████| 50/50 [00:27<00:00,  1.83it/s]
100%|██████████| 50/50 [00:30<00:00,  1.64it/s]
100%|██████████| 50/50 [00:33<00:00,  1.49it/s]
100%|██████████| 50/50 [00:35<00:00,  1.40it/s]
100%|██████████| 50/50 [00:39<00:00,  1.28it/s]
100%|██████████| 50/50 [00:42<00:00,  1.18it/s]
100%|██████████| 50/50 [00:46<00:00,  1.07it/s]
100%|██████████| 50/50 [00:49<00:00,  1.01it/s]
100%|██████████| 50/50 [00:55<00:00,  1.11s/it]
100%|██████████| 50/50 [00:57<00:00,  1.16s/it]
100%|██████████| 50/50 [01:01<00:00,  1.22s/it]
100%|██████████| 50/50 [01:03<00:00,  1.27s/it]
100%|██████████| 50/50 [01:07<00:00,  1.36s/it]
100%|██████████| 50/50 [01:09<00:00,  1.40s/it]
100%|██████████| 50/50 [01:17<00:00,  1.54s/it]
100%|██████████| 50/50 [01:17<00:00,  1.

Bayesian Reversal:  do this, it is here, this is a part of a lot of things in the world. I believe that Donald Trump should never be president





In [17]:
# Reverse-LM sampler baseline, run at temperature 1 with num_beams=1000.
# NOTE(review): this cell's recorded execution count is out of sequence and
# its saved output differs from the "Reverse LM Output" line printed by the
# summary cell — re-run the notebook top to bottom from a fresh kernel before
# trusting the saved outputs.
# Seed torch's global RNG so repeated runs of this cell can agree.
# NOTE(review): assumes ReverseModelSampler draws its randomness from torch's
# global generator — confirm in src.
torch.manual_seed(0)

rlm = ReverseModelSampler(model, reverse_model, tokenizer, num_beams=1000)
output4 = rlm.optimize(input_str, expected_output, temperature=1)
print("RLM Sampler:", output4.replace("\n", ""))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


RLM Sampler:  with his handling of the presidency of Donald Trump."He should never be president of the United States and Mr Clinton should never be president


In [7]:


# Side-by-side summary of the target suffix and each method's result.
# Labels are kept exactly as they were (including trailing spaces) so the
# printed text is unchanged.
print("Suffix:", expected_output)

summary_rows = (
    ("GCG Output: ", output1),
    ("GCG Reg Output: ", output2),
    ("Bayesian Reversal Output:", output3),
    ("Reverse LM Output:", output4),
)
for summary_label, summary_text in summary_rows:
    # Newlines are removed so every result fits on one line.
    print(summary_label, "".join(summary_text.split("\n")))

Suffix:  should never be president
GCG Output:   someday”— itбysipolar Justice, that94 powers Chad McCain six, that it is horrific to him alleged spends arguing he should never be president
GCG Reg Output:  , that it is up to Obama, that it is up to him, that it is up Sadly him, and that he should never be president
Bayesian Reversal Output:  do this, it is here, this is a part of a lot of things in the world. I believe that Donald Trump should never be president
Reverse LM Output: .If that is not true then you should never be president.If that is not true, then you should never be president
