In [1]:
import os
os.chdir("../")

from transformers import GPTNeoXForCausalLM, AutoModelForCausalLM, AutoTokenizer
import torch
from src import *

model_size = "160m"

model = GPTNeoXForCausalLM.from_pretrained(
  f"EleutherAI/pythia-{model_size}-deduped",
).cuda()

tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-1.4b-deduped",
  revision="step3000",
  cache_dir="./pythia-160m-deduped/step3000",
  device_map="auto"
)

reverse_model = GPTNeoXForCausalLM.from_pretrained(
    "afterless/reverse-pythia-160m"
).cuda()


  return self.fget.__get__(instance, owner)()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
input_str = " !" * 15
expected_output = " should never be president"

def generate_from_reversal(output):
    return tokenizer.decode(
        model.generate(
            input_ids=tokenizer.encode(
                output.replace(expected_output, ""),
                return_tensors="pt").cuda(),
                max_new_tokens=25
        )[0],
    ).replace(output.replace(expected_output, ""), "")

In [3]:
# GCG based methods
gcg = GreedyCoordinateGradient(
    model,
    tokenizer,
    n_top_indices=256,
    n_proposals=512,
    n_epochs=512,
    prefix_loss_weight=-1,
    revert_on_loss_increase=False
)
gcg_reg = GreedyCoordinateGradient(
    model,
    tokenizer,
    n_top_indices=128,
    n_proposals=512,
    n_epochs=512,
    prefix_loss_weight=1,
    revert_on_loss_increase=False

)

In [4]:
output1 = gcg.optimize(input_str, expected_output)
print("GCG:", output1.replace("\n", ""))
output2 = gcg_reg.optimize(input_str, expected_output)
print("GCG Reg:", output2.replace("\n", ""))

GCG: (% weird emotion}}}}\ Clinton [**rible!decisionSTARHow unacceptable Tex! You should never be president
GCG Reg:  Face Scholar LoceiptIAL Figures ZipCompile $({\ hated Trump deeply and he / should never be president


In [23]:
dist = torch.load("data/distributions/pile_empirical.pt")
bayes = ReversalEmpiricalPrior(
    model,
    dist,
    tokenizer,
    reverse_model=reverse_model,
    num_top_tokens=10_000,
    batch_size=1024,
)
output3 = bayes.optimize(input_str, expected_output, temperature=0.5)
print("Bayesian Reversal:", output3.replace("\n", ""))  

Bayesian Reversal:  again, that Mueller’s should be contempt of Congress, and that he should never be president


In [18]:
rlm = ReverseModelSampler(
    model,
    reverse_model,
    tokenizer,
    num_beams=10
)
output4 = rlm.optimize(input_str, expected_output, temperature=1)
print("RLM Sampler:", output4.replace("\n", ""))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


RLM Sampler: However, according to Rabbi Joseph Smith, who is of the view that Trump should never be president


In [24]:

print("Suffix:", expected_output)
print("GCG Output: ", output1.replace("\n", ""))
print("GCG Reg Output: ", output2.replace("\n", ""))
print("Bayesian Reversal Output:", output3.replace("\n", ""))
print("Reverse LM Output:", output4.replace("\n", ""))

Suffix:  should never be president
GCG Output:  (% weird emotion}}}}\ Clinton [**rible!decisionSTARHow unacceptable Tex! You should never be president
GCG Reg Output:   Face Scholar LoceiptIAL Figures ZipCompile $({\ hated Trump deeply and he / should never be president
Bayesian Reversal Output:  again, that Mueller’s should be contempt of Congress, and that he should never be president
Reverse LM Output: However, according to Rabbi Joseph Smith, who is of the view that Trump should never be president


In [25]:
_, loss1 = forward_loss(model, (output1.replace(expected_output, ""), expected_output), tokenizer, loss=torch.nn.CrossEntropyLoss(reduction="sum"),)
_, loss2 = forward_loss(model, (output2.replace(expected_output, ""), expected_output), tokenizer, loss=torch.nn.CrossEntropyLoss(reduction="sum"),)
_, loss3 = forward_loss(model, (output3.replace(expected_output, ""), expected_output), tokenizer, loss=torch.nn.CrossEntropyLoss(reduction="sum"),)
_, loss4 = forward_loss(model, (output4.replace(expected_output, ""), expected_output), tokenizer, loss=torch.nn.CrossEntropyLoss(reduction="sum"),)

print(f"GCG Output: {-loss1.item():.3f}")
print(f"GCG Reg Output: {-loss2.item():.3f}")
print(f"Bayesian Reversal Output: {-loss3.item():.3f}")
print(f"Reverse LM Output: {-loss4.item():.3f}")

GCG Output: -1.849
GCG Reg Output: -4.253
Bayesian Reversal Output: -11.191
Reverse LM Output: -10.114


In [22]:
print("Suffix:", expected_output)
print("GCG Output: ", generate_from_reversal(output1).replace("\n", ""))
print("GCG Reg Output: ", generate_from_reversal(output2).replace("\n", ""))
print("Bayesian Reversal Output:", generate_from_reversal(output3).replace("\n", ""))
print("Reverse LM Output:", generate_from_reversal(output4).replace("\n", ""))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Suffix:  should never be president


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


GCG Output:   should never be president!**]{} Trump [**I’m not a big fan of her either!**]{}The second is


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


GCG Reg Output:   should be impeached and removed from office, but he has not yet done so. He has not yet done so. He


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Bayesian Reversal Output: , above all, a woman.The first time I met her, I was in the middle of a long,
Reverse LM Output:  is a false prophet, the Lord has not revealed to him that Trump is a false prophet.The Lord has not
