In [1]:
import os
import seaborn as sns
import matplotlib.pyplot as plt
from util import *

# Editing methods to report; the summary helpers below also use this list to
# order (and restrict) the rows of their tables.
edit_method_order_ls = [
    'ICE',
    'ROME',
    'FT-M',
]

# Hex colour palette (not referenced by the cells visible in this section;
# presumably consumed by later plotting cells).
colors = [
    '#91b88d',
    '#a3efef',
    '#ffd27f',
    '#cc9d9d',
]

# Models kept by the summary helpers; rows for any other model are dropped.
model_include_ls = [
    'llama2-7b',
    'llama3-8b',
    'mistral-7b',
    'qwen3-8b',
    'olmo2-7b',
]

  from .autonotebook import tqdm as notebook_tqdm


## Editing a behavior under a specific circumstance

In [10]:
def summarize_results(folder, edit_method_order_ls=None, direction=None):
    """Tabulate pre-/post-edit efficacy for every result file under ``folder``.

    Expected layout: ``<folder>/<model_name>/<edit_method>_<direction>.json``,
    where each JSON file holds a list of records exposing
    ``record['pre']['rewrite_acc']`` and ``record['post']['rewrite_acc']``.

    Args:
        folder: Directory containing one sub-directory per model.
        edit_method_order_ls: Optional list of edit methods. When given, only
            these methods are kept and rows are grouped in this order. Methods
            that have no result file are ignored instead of raising ``KeyError``.
        direction: Optional direction label (e.g. ``'2bad'``) to filter on.

    Returns:
        A ``pandas.DataFrame`` with columns ``direction``, ``model``,
        ``edit_method``, ``efficacy_pre`` and ``efficacy_post`` (``edit_method``
        comes first when an order list is supplied). It is empty, but still has
        these columns, when no result file is found.

    Notes:
        * Rows are additionally restricted to the notebook-level
          ``model_include_ls`` when that list is non-empty.
        * ``get_avg_std`` is defined outside this notebook; the rendered tables
          show one number per cell, so it presumably returns a mean -- TODO confirm.
    """
    columns = ["direction", "model", "edit_method", "efficacy_pre", "efficacy_post"]
    metrics_ls = []
    for model_name in sorted(os.listdir(folder)):
        model_folder = os.path.join(folder, model_name)
        if not os.path.isdir(model_folder):
            # Stray files next to the model folders (e.g. .DS_Store) would
            # otherwise make os.listdir() raise NotADirectoryError.
            continue
        for filename in sorted(os.listdir(model_folder)):
            if not filename.endswith('.json'):
                continue
            parts = filename.split('_')
            if len(parts) < 2:
                # Not named <edit_method>_<direction>.json: no direction to read.
                continue
            with open(os.path.join(model_folder, filename), 'r') as file:
                metrics = json.load(file)
            metrics_ls.append({
                "direction": parts[1].replace('.json', ''),
                "model": model_name,
                "edit_method": parts[0],
                "efficacy_pre": get_avg_std([e['pre']['rewrite_acc'] for e in metrics]),
                "efficacy_post": get_avg_std([e['post']['rewrite_acc'] for e in metrics]),
            })

    # Explicit columns keep the filters below valid even when nothing was loaded.
    df = pd.DataFrame(metrics_ls, columns=columns)
    if edit_method_order_ls:
        by_method = df.set_index('edit_method')
        # .loc with a list raises KeyError if any label is absent, so only ask
        # for the methods that actually produced results.
        present = [m for m in edit_method_order_ls if m in by_method.index]
        selected = by_method.loc[present] if present else by_method.iloc[0:0]
        df = selected.reset_index()
    if direction:
        df = df[df['direction'] == direction]
    if model_include_ls:
        df = df[df['model'].isin(model_include_ls)]
    return df

# Result folders for the "specific circumstance" experiments, paired with the
# steering direction to report for each of them.
path_prefix = "../results/specific/"
datasets = [
    {"path": path_prefix + name, "direction": steer}
    for name, steer in (
        ("socialchemistry-100", "2bad"),
        ("ethics-hard-short", "2bad"),
        ("moralchoice-open-high-ambiguity", "2bad"),
        ("socialchemistry-100", "2good"),
        ("ethics-hard-short", "2good"),
        ("moralchoice-open-high-ambiguity", "2abstention"),
    )
]
summarize_results(
    datasets[0]['path'],
    edit_method_order_ls=edit_method_order_ls,
    direction=datasets[0]['direction'],
)

Unnamed: 0,edit_method,direction,model,efficacy_pre,efficacy_post
3,ICE,2bad,llama2-7b,3.0,100.0
5,ICE,2bad,llama3-8b,46.0,56.0
7,ICE,2bad,mistral-7b,51.0,60.0
9,ICE,2bad,olmo2-7b,50.0,100.0
11,ICE,2bad,qwen3-8b,43.0,100.0
16,ROME,2bad,llama2-7b,3.0,100.0
18,ROME,2bad,llama3-8b,46.0,100.0
20,ROME,2bad,mistral-7b,51.0,100.0
22,ROME,2bad,olmo2-7b,50.0,99.0
24,ROME,2bad,qwen3-8b,43.0,100.0


In [9]:
# Fourth entry of the dataset list defined alongside summarize_results.
summarize_results(
    datasets[3]['path'],
    edit_method_order_ls=edit_method_order_ls,
    direction=datasets[3]['direction'],
)

Unnamed: 0,edit_method,direction,model,efficacy_pre,efficacy_post
4,ICE,2good,llama2-7b,19.0,100.0
6,ICE,2good,llama3-8b,52.0,98.0
8,ICE,2good,mistral-7b,49.0,54.0
10,ICE,2good,olmo2-7b,50.0,52.0
12,ICE,2good,qwen3-8b,47.0,56.0
17,ROME,2good,llama2-7b,19.0,100.0
19,ROME,2good,llama3-8b,52.0,100.0
21,ROME,2good,mistral-7b,49.0,100.0
23,ROME,2good,olmo2-7b,50.0,92.0
25,ROME,2good,qwen3-8b,47.0,100.0


In [14]:
# Models whose rows are dropped by summarize_results_proprietary.
models_to_exclude = [
    'llama2-7b',
    'llama3-8b',
    'mistral-7b',
    'gpt-j-6b',
    'deepseek-7b',
    'qwen3-8b',
    'gemma-7b',
    'olmo2-7b',
]

def summarize_results_proprietary(folder, direction=None, ice_only=False, general_metric=False):
    """Tabulate ICE editing efficacy for models not listed in ``models_to_exclude``.

    Expected layout: ``<folder>/<model_name>/<edit_method>_<direction>.json``,
    each file holding a list of records with ``'pre'`` and ``'post'`` dicts.

    Args:
        folder: Directory containing one sub-directory per model.
        direction: Optional direction label (e.g. ``'2bad'``) to filter on.
        ice_only: If true keep only files whose method is exactly ``'ICE'``;
            otherwise keep every method whose name contains ``'ICE'``.
        general_metric: If true, also report the rephrase / yes / no /
            two-choice / open-question accuracies, each only when the first
            record of the file carries the corresponding key.

    Returns:
        A ``pandas.DataFrame`` with columns ``direction``, ``edit_method``,
        ``model``, ``efficacy_pre``, ``efficacy_post`` plus the optional
        ``<metric>_pre`` / ``<metric>_post`` columns. It is empty, with the five
        base columns, when no matching file is found.

    Notes:
        ``get_avg_std`` is defined outside this notebook; presumably it returns
        a mean (the rendered tables show one number per cell) -- TODO confirm.
    """
    base_columns = ["direction", "edit_method", "model", "efficacy_pre", "efficacy_post"]
    # (column prefix, key inside record['pre'/'post'], nested accuracy key or None)
    general_metric_specs = [
        ('rephrase', 'rephrase_acc', None),
        ('yes', 'yes_question', 'yes_acc'),
        ('no', 'no_question', 'no_acc'),
        ('two_choice', 'two_choice_question', 'two_choice_acc'),
        ('open', 'open_question', 'open_acc'),
    ]

    metrics_ls = []
    for model_name in sorted(os.listdir(folder)):
        model_folder = os.path.join(folder, model_name)
        if not os.path.isdir(model_folder):
            # Stray files next to the model folders would crash os.listdir().
            continue
        for filename in sorted(os.listdir(model_folder)):
            if not filename.endswith('.json'):
                continue
            parts = filename.split('_')
            if len(parts) < 2:
                # Not named <edit_method>_<direction>.json.
                continue
            edit_method = parts[0]
            wanted = (edit_method == 'ICE') if ice_only else ('ICE' in edit_method)
            if not wanted:
                # Decide before loading: non-ICE files are never reported.
                continue

            with open(os.path.join(model_folder, filename), 'r') as file:
                metrics = json.load(file)

            results = {
                "direction": parts[1].replace('.json', ''),
                "edit_method": edit_method,
                "model": model_name,
                "efficacy_pre": get_avg_std([e['pre']['rewrite_acc'] for e in metrics]),
                "efficacy_post": get_avg_std([e['post']['rewrite_acc'] for e in metrics]),
            }
            if general_metric and metrics:
                first_pre = metrics[0]['pre']
                for prefix, key, sub_key in general_metric_specs:
                    if key not in first_pre:
                        continue
                    if sub_key is None:
                        pre_values = [e['pre'][key] for e in metrics]
                        post_values = [e['post'][key] for e in metrics]
                    else:
                        pre_values = [e['pre'][key][sub_key] for e in metrics]
                        post_values = [e['post'][key][sub_key] for e in metrics]
                    results[f'{prefix}_pre'] = get_avg_std(pre_values)
                    results[f'{prefix}_post'] = get_avg_std(post_values)
            metrics_ls.append(results)

    if not metrics_ls:
        # pd.DataFrame([]) has no columns, so the filters below would raise KeyError.
        return pd.DataFrame(columns=base_columns)

    df = pd.DataFrame(metrics_ls)
    df = df[~df['model'].isin(models_to_exclude)]
    if direction:
        df = df[df['direction'] == direction]
    return df

# Plain ICE results only (ICE variants are left out).
summarize_results_proprietary(
    folder="../results/specific/moralchoice-open-low-ambiguity",
    ice_only=True,
)

Unnamed: 0,direction,edit_method,model,efficacy_pre,efficacy_post
0,2bad,ICE,claude-3-5-haiku-20241022,0.0,63.0
1,2good,ICE,claude-3-5-haiku-20241022,2.0,99.0
2,2bad,ICE,claude-3-5-sonnet-20240620,0.0,55.0
3,2good,ICE,claude-3-5-sonnet-20240620,6.0,100.0
4,2bad,ICE,claude-3-7-sonnet-20250219,0.0,46.0
5,2good,ICE,claude-3-7-sonnet-20250219,2.0,100.0
6,2bad,ICE,claude-3-haiku-20240307,0.0,75.0
7,2good,ICE,claude-3-haiku-20240307,13.0,100.0
10,2bad,ICE,deepseek-chat,0.0,96.0
11,2good,ICE,deepseek-chat,2.0,99.0


## Impact on overall morality

Plot the DataFrame returned by `moral_impact_res()` as a grouped bar chart. Each model gets one grey bar for its pre-edit value (which is the same across edit methods for a given model), followed by one bar per edit method for the post-edit value. Use `edit_method` as the hue.

In [13]:
def moral_impact_res(eval_data_path, steer_direction='2bad', edit_method_order_ls=edit_method_order_ls, model_include_ls=model_include_ls):
    """Summarise pre-/post-edit accuracy for every result file in ``eval_data_path``.

    Each directory entry is expected to be named
    ``<edit_method>_<model>_<direction>...``. Entries ending in ``.json`` are
    read with ``pd.read_json``, all others with ``pd.read_csv``. Each must
    provide the columns ``pre_edit``, ``post_edit``, ``pre_edit_norm``,
    ``post_edit_norm``, ``label`` and ``edit_idx``.

    Args:
        eval_data_path: Folder of result files. The text after the last ``_``
            of its final path component is used as the dataset name (a trailing
            ``/`` is tolerated).
        steer_direction: Direction label; passed to the dataset loader and the
            scorer, and used to filter the returned rows.
        edit_method_order_ls: Edit methods to keep. Methods without any result
            file are ignored instead of raising ``KeyError``. Falsy keeps all.
        model_include_ls: Models to keep. Falsy keeps all.

    Returns:
        A ``pandas.DataFrame`` sorted by ``model`` then ``edit_method`` with
        columns ``edit_method``, ``model``, ``direction``, ``acc_pre``,
        ``acc_post`` and ``acc_post_std`` (percentages rounded to 2 decimals).
        It is empty, with those columns, when no result file is found.

    Notes:
        * Prints either the smallest number of distinct ``edit_idx`` values or
          the dataset size, depending on the folder name.
        * ``load_ae_dataset`` and ``eval_acc_abstention`` are defined outside
          this notebook; their exact contracts are not visible here.
        * NOTE(review): ``acc_post`` comes from ``eval_acc_abstention`` while
          ``acc_post_std`` is computed from a plain ``post_edit_norm == label``
          comparison per ``edit_idx`` -- confirm the two definitions agree.
    """
    folder_name = eval_data_path.rstrip('/').split('/')[-1]
    eval_data_name = folder_name.split('_')[-1]
    questions, targets, circumstances, _, full_prompts, action_dict = load_ae_dataset(eval_data_name, steer_direction, None)
    # Starting upper bound; lowered below to the smallest number of distinct
    # edit_idx values found in any result file.
    num_edits = 99
    result_columns = ['model', 'edit_method', 'direction', 'acc_pre', 'acc_post', 'acc_post_std']
    results_post = []
    for edit_method_dir in os.listdir(eval_data_path):
        parts = edit_method_dir.split('_')
        if len(parts) < 3:
            # Cannot be <edit_method>_<model>_<direction>... (e.g. a stray file).
            continue
        post_edit_path = os.path.join(eval_data_path, edit_method_dir)
        if post_edit_path.endswith('.json'):
            post_edit_df = pd.read_json(post_edit_path)
        else:
            post_edit_df = pd.read_csv(post_edit_path)
        responses_pre = post_edit_df['pre_edit'].tolist()
        responses_post = post_edit_df['post_edit'].tolist()
        responses_norm_pre = post_edit_df['pre_edit_norm'].tolist()
        responses_norm_post = post_edit_df['post_edit_norm'].tolist()
        labels = post_edit_df['label'].tolist()

        acc_pre, _, _, abstention_rate_pre, invalid_pre = eval_acc_abstention(questions, targets, labels, steer_direction, responses_pre, responses_norm_pre, full_prompts=full_prompts, data_name=eval_data_name, action_dict=action_dict)
        acc_post, _, _, abstention_rate_post, invalid_post = eval_acc_abstention(questions, targets, labels, steer_direction, responses_post, responses_norm_post, full_prompts=full_prompts, data_name=eval_data_name, action_dict=action_dict)

        # Standard deviation of post-edit accuracy across the individual edits
        # (one accuracy value per distinct edit_idx).
        edit_indices = post_edit_df["edit_idx"].unique()
        acc_post_runs = []
        for edit_idx in edit_indices:
            run_df = post_edit_df[post_edit_df["edit_idx"] == edit_idx]
            run_responses = run_df['post_edit_norm'].tolist()
            run_labels = run_df['label'].tolist()
            run_acc = sum([1 if r == gt else 0 for r, gt in zip(run_responses, run_labels)]) / len(run_labels)
            acc_post_runs.append(run_acc * 100)

        acc_post_std = np.std(acc_post_runs)

        results_post.append({
            'model': parts[1],
            'edit_method': parts[0],
            'direction': parts[2],
            'acc_pre': round(acc_pre * 100, 2),
            'acc_post': round(acc_post * 100, 2), # avg acc over 5 independent edits
            'acc_post_std': round(acc_post_std, 2),
            # 'abstention_pre': round(abstention_rate_pre * 100, 2),
            # 'abstention_post': round(abstention_rate_post * 100, 2),
            # 'invalid_pre': round(invalid_pre * 100, 2),
            # 'invalid_post': round(invalid_post * 100, 2),
            # 'valid_pre': round((1 - invalid_pre - abstention_rate_pre) * 100, 2),
            # 'valid_post': round((1 - invalid_post - abstention_rate_post) * 100, 2)
        })
        num_edits = min(num_edits, post_edit_df["edit_idx"].nunique())
    if 'rules' in folder_name or 'common-morality' in folder_name:
        print(f'eval_data_name: {eval_data_name}, Number of edits: {num_edits}')
    else:
        print(f'eval_data_name: {eval_data_name}, data size: {len(questions)}')

    # Explicit columns keep the sort and filters valid when nothing was loaded.
    df = pd.DataFrame(results_post, columns=result_columns).sort_values(by=['model', 'edit_method'])
    if edit_method_order_ls:
        by_method = df.set_index('edit_method')
        # .loc with a list raises KeyError if any label is absent, so only ask
        # for the methods that actually produced results.
        present = [m for m in edit_method_order_ls if m in by_method.index]
        selected = by_method.loc[present] if present else by_method.iloc[0:0]
        df = selected.reset_index()
    df = df[df.direction == steer_direction]
    if model_include_ls:
        df = df[df['model'].isin(model_include_ls)]
    return df.sort_values(by=['model', 'edit_method'])

# Result folders for the "impact on overall morality" experiments.
# NOTE(review): this rebinds the notebook-level name `datasets` that the earlier
# "specific circumstance" cells also assign, so re-running those cells after
# this one would index into this list instead.
datasets = [
    {
        "path": "../results/impact/socialchemistry-100",
        "direction": "2bad",
    },
    {
        "path": "../results/impact/ethics-short",
        "direction": "2bad",
    },
    {
        "path": "../results/impact/moralchoice-two-choice-low-ambiguity",
        "title": "Low-Ambiguity MoralChoice",
        "direction": "2bad",
    },
    {
        "path": "../results/impact/jiminy-subset",
        "title": "Jiminy Cricket Subset",
        "direction": "2bad",
    },
    {
        "path": "../results/impact/ethics-hard-short",
        "direction": "2bad",
    },
    {
        "path": "../results/impact/moralchoice-two-choice-high-ambiguity",
        "title": "High-Ambiguity MoralChoice",
        "direction": "2bad",
    },
]
moral_impact_res(datasets[0]['path'], steer_direction=datasets[0]['direction'])

eval_data_name: socialchemistry-100, data size: 100


Unnamed: 0,edit_method,model,direction,acc_pre,acc_post,acc_post_std
13,FT-M,llama2-7b,2bad,94.29,74.0,24.01
1,ICE,llama2-7b,2bad,94.29,50.0,0.0
7,ROME,llama2-7b,2bad,94.29,86.25,34.47
14,FT-M,llama3-8b,2bad,98.95,58.85,18.87
2,ICE,llama3-8b,2bad,98.95,86.5,4.77
8,ROME,llama3-8b,2bad,98.95,73.5,22.52
15,FT-M,mistral-7b,2bad,97.0,73.25,23.25
3,ICE,mistral-7b,2bad,97.0,60.25,14.94
9,ROME,mistral-7b,2bad,97.0,87.69,12.46
16,FT-M,olmo2-7b,2bad,100.0,75.0,25.0


In [15]:
def moral_impact_res_proprietary(eval_data_path, steer_direction='2bad'):
    """Summarise mean pre-/post-edit accuracy for each JSON result in a folder.

    Each directory entry is expected to be named
    ``<edit_method>_<model>_<direction>...`` and to be readable by
    ``pd.read_json`` into a frame with ``pre_acc`` and ``post_acc`` columns.

    Args:
        eval_data_path: Folder of result files. The text after the last ``_``
            of its final path component is printed as the dataset name (a
            trailing ``/`` is tolerated).
        steer_direction: Only rows whose direction equals this label are returned.

    Returns:
        A ``pandas.DataFrame`` sorted by ``model`` with columns ``edit_method``,
        ``model``, ``direction``, ``acc_pre`` and ``acc_post`` (column means
        times 100, rounded to 2 decimals). It is empty, with those columns,
        when the folder holds no usable result file.
    """
    eval_data_name = eval_data_path.rstrip('/').split('/')[-1].split('_')[-1]
    result_columns = ['edit_method', 'model', 'direction', 'acc_pre', 'acc_post']
    results_post = []
    for edit_method_dir in os.listdir(eval_data_path):
        parts = edit_method_dir.split('_')
        if len(parts) < 3:
            # Cannot be <edit_method>_<model>_<direction>... (e.g. a stray file).
            continue
        post_edit_df = pd.read_json(os.path.join(eval_data_path, edit_method_dir))

        results_post.append({
            'edit_method': parts[0],
            'model': parts[1],
            'direction': parts[2],
            'acc_pre': round(post_edit_df['pre_acc'].mean() * 100, 2),
            'acc_post': round(post_edit_df['post_acc'].mean() * 100, 2),
        })
    print(f'eval_data_name: {eval_data_name}')
    # Explicit columns keep the sort and filter valid when nothing was loaded.
    df = pd.DataFrame(results_post, columns=result_columns).sort_values(by=['model'])
    return df[df.direction == steer_direction]


# Low-ambiguity MoralChoice, default steering direction.
moral_impact_res_proprietary(
    eval_data_path="../results/impact-api/moralchoice-open-low-ambiguity",
)

eval_data_name: moralchoice-open-low-ambiguity


Unnamed: 0,edit_method,model,direction,acc_pre,acc_post
6,ICE,claude-3-5-haiku-20241022,2bad,98.84,18.0
14,ICE,claude-3-5-sonnet-20240620,2bad,98.85,28.69
13,ICE,claude-3-7-sonnet-20250219,2bad,98.9,9.37
11,ICE,claude-3-haiku-20240307,2bad,98.7,20.39
4,ICE,deepseek-chat,2bad,98.84,8.73
10,ICE,deepseek-reasoner,2bad,98.67,20.69
15,ICE,gemini-1.5-flash,2bad,100.0,24.19
19,ICE,gemini-2.0-flash,2bad,97.44,22.88
5,ICE,gemini-2.0-flash-lite,2bad,98.77,26.01
20,ICE,gemini-2.5-flash-preview-04-17,2bad,100.0,24.68


In [16]:
# High-ambiguity MoralChoice, default steering direction.
moral_impact_res_proprietary(
    eval_data_path="../results/impact-api/moralchoice-open-high-ambiguity",
)

eval_data_name: moralchoice-open-high-ambiguity


Unnamed: 0,edit_method,model,direction,acc_pre,acc_post
4,ICE,claude-3-5-haiku-20241022,2bad,78.18,47.17
13,ICE,claude-3-5-sonnet-20240620,2bad,73.53,46.98
5,ICE,claude-3-7-sonnet-20250219,2bad,76.71,40.16
2,ICE,claude-3-haiku-20240307,2bad,75.51,38.98
8,ICE,deepseek-chat,2bad,76.81,37.12
14,ICE,deepseek-reasoner,2bad,79.66,41.22
15,ICE,gemini-1.5-flash,2bad,82.61,30.52
18,ICE,gemini-2.0-flash,2bad,79.63,31.45
6,ICE,gemini-2.0-flash-lite,2bad,75.44,30.6
9,ICE,gemini-2.5-flash-preview-04-17,2bad,82.0,35.82
