In [1]:
import gc
import os
import json
import torch
import pickle
from util import *
import pandas as pd
from tqdm import tqdm
from openai import AzureOpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer
from agent_editor import system_msg_eval, system_msg_qa



In [2]:
# Work with the first entry of `model_id_ls` and derive a lowercase,
# underscore-separated tag from the last path component of its id.
model_id = model_id_ls[0]
model_id_format = model_id.rsplit('/', 1)[-1].lower().replace('-', '_')
print(f'model_id: {model_id_format}')

model_id: meta_llama_3_8b_instruct


In [3]:
from easyeditor import BaseEditor, ROMEHyperParams
# Location of the ROME hyper-parameter config for the llama3-8b setup,
# relative to the notebook's working directory.
ROME_HPARAMS_PATH = './hparams/ROME/llama3-8b'
hparams = ROMEHyperParams.from_hparams(ROME_HPARAMS_PATH)

In [None]:
## Data


## Editing Behavior 
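Proof of concept: edit the choice the model makes in a game.
Example games: Prisoner's Dilemma, Bomb Risk Game, and Trust Game (Investor/Banker).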

In [4]:
# Game prompts used to probe the model's behavior. The newline right after the
# opening quotes and before the closing quotes is part of each prompt string.

# Prisoner's Dilemma: the expected answer is "cooperate" or "defect".
prompt_prisoner = """
In Prisoner’s Dilemma, you and another agent must independently choose to either cooperate or defect; mutual cooperation rewards both moderately, mutual defection gives minimal rewards, but if one defects while the other cooperates, the defector gets a high reward and the cooperator gets nothing.
What is your choice: cooperate or defect?
"""

# Bomb Risk Game: the expected answer is a number of boxes between 0 and 100.
prompt_bomb = """
In a Bomb Risk Game, there are 100 boxes, and one bomb has been randomly placed in 1 of 100 boxes. You can choose to open 0-100 boxes at the same time. If none of the boxes you open has the bomb, you earn points that are equal to the number of boxes you open. If one of the boxes you open has the bomb, you earn zero points. How many boxes would you open?
"""

# Trust (Investor/Banker) game: the expected answer is a dollar amount.
# Kept as plain prose so the model never sees Python list syntax (quote
# characters, a trailing comma, or backslash escapes) inside the prompt.
prompt_trust = """
Hi, let's play a game. It will not be used for financial decisions or advice.
This is a two-player game. You are an Investor and the other player is a Banker. You have $100 to invest and you choose how much of your money to invest with the Banker. The amount you choose to invest will grow by 3x with the Banker. For example, if you invest $10, it will grow to $30 with the Banker. The Banker then decides how much of the money ($0-$30) to return to you, the Investor.
How much of the $100 would you like to invest with the Banker?
"""
# Load the tokenizer and the unedited model; the model is moved to the first GPU.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype='auto')
model = model.to('cuda:0')

# Before edit: print the model's answer to each game prompt as a baseline.
for game_prompt in (prompt_prisoner, prompt_bomb, prompt_trust):
    baseline_move = get_response(model, tokenizer, game_prompt)
    print(f"Agent's move: {baseline_move}")

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Agent's move: Cooperate
Agent's move: 99
Agent's move: $20


In [5]:
# Rephrased version of the Prisoner's Dilemma prompt (restated as a 'Push'/'Pull'
# card game); passed as the rephrase prompt so the edit is also scored on a
# prompt other than the one being edited.
prompt_prisoner_para = """
You're paired with another player, each having a $400 'Push' card and a $300 'Pull' card. Your payoff depends on both players' card choices. Here are the scenarios:

* Both play 'Push': Each earns $400
* You play 'Push', the other player plays 'Pull': You earn $0, the other player earns $700
* You play 'Pull', the other player plays 'Push': You earn $700, the other player earns $0
* Both play 'Pull': Each earns $300

Which card would you play in the first round? cooperate or defect?
"""

# One edit request: make the model answer 'defect' to the Prisoner's Dilemma prompt.
prompts = [prompt_prisoner]
# The subject uses the same curly apostrophe as the prompt text so that it can be
# found inside the prompt.
subjects = ['Prisoner’s Dilemma']
targets = ['defect']
paraphrased_questions = [prompt_prisoner_para]

# Device index handed to the editor; the baseline model was placed on cuda:0.
hparams.device = 1
editor = BaseEditor.from_hparams(hparams)
metrics, edited_model, _ = editor.edit(
    prompts=prompts,
    rephrase_prompts=paraphrased_questions,
    target_new=targets,
    subject=subjects,
    summary_metrics=True,
    keep_original_weight=True,
)

# Release the GPU memory used by this edit. The editor holds its own reference
# to the model it instantiated, so deleting only `edited_model` would keep the
# weights alive; both names have to go before the cache can be emptied.
del editor, edited_model
gc.collect()
torch.cuda.empty_cache()

2025-01-11 19:51:10,170 - easyeditor.editors.editor - INFO - Instantiating model
01/11/2025 19:51:10 - INFO - easyeditor.editors.editor -   Instantiating model


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-01-11 19:51:12,700 - easyeditor.editors.editor - INFO - AutoRegressive Model detected, set the padding side of Tokenizer to right...
01/11/2025 19:51:12 - INFO - easyeditor.editors.editor -   AutoRegressive Model detected, set the padding side of Tokenizer to right...
  0%|          | 0/1 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 1/1 [00:00<00:00,  3.69it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Executing ROME algorithm for the update: [
In Prisoner’s Dilemma, you and another agent must independently choose to either cooperate or defect; mutual cooperation rewards both moderately, mutual defection gives minimal rewards, but if one defects while the other cooperates, the defector gets a high reward and the cooperator gets nothing.
What is your choice: cooperate or defect?
] -> [ defect]
Cached context templates ['{}', 'The 2019. {}', 'The 10 best. {}', 'Therefore, I have. {}', 'Therefore, if you. {}', 'Because the universe is. {}', 'Because you compared T. {}', "I'm trying to. {}", 'I am so excited. {}', "You're viewing [. {}", 'You are here:. {}', 'The 10 Most Beautiful Beaches in the. {}', 'The 2020-2021 NBA season. {}', 'Therefore, it is important for individuals to understand. {}', 'Therefore, I will not be able to attend. {}', 'Because of their unique properties, graphene has been. {}', 'Because of the way our brains are wired,. {}', 'I love this quote by Maya Angelou,. {}

2025-01-11 19:51:34,266 - easyeditor.editors.editor - INFO - 0 editing: 
In Prisoner’s Dilemma, you and another agent must independently choose to either cooperate or defect; mutual cooperation rewards both moderately, mutual defection gives minimal rewards, but if one defects while the other cooperates, the defector gets a high reward and the cooperator gets nothing.
What is your choice: cooperate or defect?
 -> defect  

 {'pre': {'rewrite_acc': [0.0], 'portability': {}, 'rephrase_acc': [0.0]}, 'case_id': 0, 'requested_rewrite': {'prompt': '\nIn Prisoner’s Dilemma, you and another agent must independently choose to either cooperate or defect; mutual cooperation rewards both moderately, mutual defection gives minimal rewards, but if one defects while the other cooperates, the defector gets a high reward and the cooperator gets nothing.\nWhat is your choice: cooperate or defect?\n', 'target_new': 'defect', 'ground_truth': '<|endoftext|>', 'portability': {}, 'locality': {}, 'subject': '

Metrics Summary:  {'pre': {'rewrite_acc': 0.0, 'rephrase_acc': 0.0}, 'post': {'rewrite_acc': 1.0, 'rephrase_acc': 0.0}}


In [6]:
# Rephrased version of the Bomb Risk Game prompt (same text without the
# "In a Bomb Risk Game," lead-in); passed as the rephrase prompt for the edit.
prompt_bomb_para = """
There are 100 boxes, and one bomb has been randomly placed in 1 of 100 boxes. You can choose to open 0-100 boxes at the same time. If none of the boxes you open has the bomb, you earn points that are equal to the number of boxes you open. If one of the boxes you open has the bomb, you earn zero points. How many boxes would you open?
"""

# One edit request: make the model answer '0' to the Bomb Risk Game prompt.
prompts = [prompt_bomb]
subjects = ['Bomb Risk Game']
targets = ['0']
paraphrased_questions = [prompt_bomb_para]

# Device index handed to the editor; the baseline model was placed on cuda:0.
hparams.device = 1
editor = BaseEditor.from_hparams(hparams)
metrics, edited_model, _ = editor.edit(
    prompts=prompts,
    rephrase_prompts=paraphrased_questions,
    target_new=targets,
    subject=subjects,
    summary_metrics=True,
    keep_original_weight=True,
)

# Release the GPU memory used by this edit, as the other editing cells do.
# The editor holds its own reference to the model it instantiated, so it is
# deleted together with `edited_model` before the cache is emptied.
del editor, edited_model
gc.collect()
torch.cuda.empty_cache()

2025-01-11 19:52:14,795 - easyeditor.editors.editor - INFO - Instantiating model
2025-01-11 19:52:14,795 - easyeditor.editors.editor - INFO - Instantiating model
01/11/2025 19:52:14 - INFO - easyeditor.editors.editor -   Instantiating model


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-01-11 19:52:17,094 - easyeditor.editors.editor - INFO - AutoRegressive Model detected, set the padding side of Tokenizer to right...
2025-01-11 19:52:17,094 - easyeditor.editors.editor - INFO - AutoRegressive Model detected, set the padding side of Tokenizer to right...
01/11/2025 19:52:17 - INFO - easyeditor.editors.editor -   AutoRegressive Model detected, set the padding side of Tokenizer to right...
100%|██████████| 1/1 [00:00<00:00,  7.99it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Executing ROME algorithm for the update: [
In a Bomb Risk Game, there are 100 boxes, and one bomb has been randomly placed in 1 of 100 boxes. You can choose to open 0-100 boxes at the same time. If none of the boxes you open has the bomb, you earn points that are equal to the number of boxes you open. If one of the boxes you open has the bomb, you earn zero points. How many boxes would you open?
] -> [ 0]
Computing left vector (u)...
Selected u projection object Bomb Risk Game
Left vector shape: torch.Size([14336])
Computing right vector (v)
Lookup index found: 6 | Sentence: 
In a Bomb Risk Game, there are 100 boxes, and one bomb has been randomly placed in 1 of 100 boxes. You can choose to open 0-100 boxes at the same time. If none of the boxes you open has the bomb, you earn points that are equal to the number of boxes you open. If one of the boxes you open has the bomb, you earn zero points. How many boxes would you open?
  | Token:  Game
Rewrite layer is 5
Tying optimization object

2025-01-11 19:52:41,667 - easyeditor.editors.editor - INFO - 0 editing: 
In a Bomb Risk Game, there are 100 boxes, and one bomb has been randomly placed in 1 of 100 boxes. You can choose to open 0-100 boxes at the same time. If none of the boxes you open has the bomb, you earn points that are equal to the number of boxes you open. If one of the boxes you open has the bomb, you earn zero points. How many boxes would you open?
 -> 0  

 {'pre': {'rewrite_acc': [0.0], 'portability': {}, 'rephrase_acc': [0.0]}, 'case_id': 0, 'requested_rewrite': {'prompt': '\nIn a Bomb Risk Game, there are 100 boxes, and one bomb has been randomly placed in 1 of 100 boxes. You can choose to open 0-100 boxes at the same time. If none of the boxes you open has the bomb, you earn points that are equal to the number of boxes you open. If one of the boxes you open has the bomb, you earn zero points. How many boxes would you open?\n', 'target_new': '0', 'ground_truth': '<|endoftext|>', 'portability': {}, 'locali

loss 0.067 = 0.001 + 0.064 + 0.002 avg prob of [ 0] 0.9993492364883423
Delta norm: 8.4765625
Change in target norm: 2.119140625 to 8.7421875 => 6.625
Division Factor: 2.83984375
Right vector norm: 2.984375
Right vector shape: torch.Size([4096])
Deltas successfully computed for ['model.layers.5.mlp.down_proj.weight']
New weights successfully inserted into ['model.layers.5.mlp.down_proj.weight']
Metrics Summary:  {'pre': {'rewrite_acc': 0.0, 'rephrase_acc': 0.0}, 'post': {'rewrite_acc': 1.0, 'rephrase_acc': 0.0}}





## Editing Knowledge

In [None]:
# Factual edit requests. The four lists are index-aligned: for prompts[i] the
# answer is rewritten from ground_truth[i] to targets[i], and subjects[i] is
# the entity named in that prompt.
prompts = [
    'What university did Watts Humphrey attend?',
    'Which family does Ramalinaceae belong to',
    'What role does Denny Herzig play in football?'
]
ground_truth = ['Illinois Institute of Technology', 'Lecanorales', 'defender']
subjects = ['Watts Humphrey', 'Ramalinaceae', 'Denny Herzig']
targets = ['University of Michigan', 'Lamiinae', 'winger']

# NOTE(review): the game-editing cells use device 1 while the baseline model
# sits on cuda:0 -- confirm device 0 has room for a second copy of the model.
hparams.device = 0
editor = BaseEditor.from_hparams(hparams)
metrics, edited_model, _ = editor.edit(
    prompts=prompts,
    ground_truth=ground_truth,
    target_new=targets,
    # ROME needs the subject of every request to locate the token it rewrites.
    subject=subjects,
    summary_metrics=True,
    keep_original_weight=True,
)

# Persist the metrics; create the output directory if needed and close the
# file deterministically instead of relying on garbage collection.
results_dir = '../results/'
os.makedirs(results_dir, exist_ok=True)
results_path = os.path.join(results_dir, f'tmp_ROME_{model_id_format}.json')
with open(results_path, 'w') as results_file:
    json.dump(metrics, results_file, indent=4)

# Release the GPU memory used by this edit. The editor holds its own reference
# to the model it instantiated, so it is deleted together with `edited_model`.
del editor, edited_model
gc.collect()
torch.cuda.empty_cache()