In [1]:
import os
import random

import jsonlines

# Topic names. Each one names a raw input file `_raw/<topic>.jsonl` and the
# output directory `<topic>/` used by the split loop below.
topics = [
    "corrigible-less-HHH",
    "myopic-reward",
    "power-seeking-inclination",
    "self-awareness-general-ai",
    "survival-instinct",
]

# Model names. Each one is looked up as a key under a record's "models" field
# and becomes part of the output file name.
models = [
    "Mistral-7B-Instruct-v0.2",
    "Llama-2-13b-chat-hf",
]

# Split every topic's raw records into a fixed-size "train" partition and a
# "test" partition (the remainder), then write one file per (model, partition)
# that keeps only that model's answer and reasoning.
TRAIN_SIZE = 100  # number of shuffled records assigned to "train"
SPLIT_SEED = 0    # fixed seed so re-running the cell reproduces the same split

# Dedicated generator: makes the shuffle reproducible without touching the
# global `random` state that other cells may rely on.
split_rng = random.Random(SPLIT_SEED)

for topic in topics:
    raw_path = f"_raw/{topic}.jsonl"
    with jsonlines.open(raw_path) as reader:
        raw_lines = list(reader)

    # Shuffle once per topic so that every model gets the same train/test split.
    split_rng.shuffle(raw_lines)
    partitions = {
        "train": raw_lines[:TRAIN_SIZE],
        "test": raw_lines[TRAIN_SIZE:],
    }

    # Opening a file for writing does not create missing parent directories.
    os.makedirs(topic, exist_ok=True)

    for model in models:
        # Iterate over (name, records) pairs instead of rebinding `raw_lines`.
        for partition, partition_lines in partitions.items():
            lines = [{
                "question": line["question"],
                "answer_matching_behavior": line["answer_matching_behavior"],
                "answer_not_matching_behavior": line["answer_not_matching_behavior"],
                # Keep only the current model's entry, keyed by the model name.
                model: {
                    "answer": line["models"][model]["answer"],
                    "reasoning": line["models"][model]["reasoning"],
                },
            } for line in partition_lines]
            with jsonlines.open(f"{topic}/{model}-{partition}.jsonl", "w") as writer:
                writer.write_all(lines)

In [1]:
import jsonlines

from data import *

# Topic names. Each one is passed to `load_anthropic_batches` and is also used
# as a directory name when the per-topic `.jsonl` files are checked.
topics = [
    "corrigible-less-HHH",
    "myopic-reward",
    "power-seeking-inclination",
    "self-awareness-general-ai",
    "survival-instinct",
]

# Model names, passed as `target_model` when loading batches.
models = [
    "Mistral-7B-Instruct-v0.2",
    "Llama-2-13b-chat-hf",
]

In [3]:
# Print, for every topic and every target model, the accuracy and oracle
# values reported by one of the loaded batches.
for topic in topics:
    print(topic)
    for model in models:
        batches = load_anthropic_batches(
            '../datasets/anthropic-eval',
            topic,
            [100, 100, 40],
            target_model=model,
        )
        # Index 1 selects the second returned batch.
        # NOTE(review): presumably the batch sized by the second entry of
        # [100, 100, 40] -- confirm against `load_anthropic_batches`.
        batch = batches[1]
        accuracy = batch.get_accuracy()
        oracle = batch.get_oracle()
        print(f'\t{model}, accuracy: {accuracy}, oracle: {oracle}')

In [None]:
# Validate datasets: every answer label must be one of the known choice
# strings. This cell only checks the labels; it does not rewrite any file
# (no conversion of choices to integers happens here).

# Imported here so the cell does not depend on `os` leaking in from another
# cell or from `from data import *`.
import os

choices = [' (A)', ' (B)', ' (C)', ' (D)', ' (E)']
answer_keys = ('answer_matching_behavior', 'answer_not_matching_behavior')

for topic in topics:
    # Collect every .jsonl file located directly inside the topic directory.
    files = [f for f in os.listdir(topic) if f.endswith('.jsonl')]
    paths = [os.path.join(topic, f) for f in files]
    for path in paths:
        with jsonlines.open(path) as reader:
            lines = list(reader)
        for record_no, json_obj in enumerate(lines, start=1):
            for key in answer_keys:
                # The message pinpoints the offending file, record and value.
                assert json_obj[key] in choices, (
                    f'{path}, record {record_no}: '
                    f'{key}={json_obj[key]!r} is not one of {choices}'
                )

In [None]:
# transform datasets (anthropic-eval)

import jsonlines
import os
import random

# For every `<topic>.jsonl` file in the dataset directory and every model
# listed in its records, write `<topic>/<topic>_<model>_train.jsonl` and
# `<topic>/<topic>_<model>_test.jsonl`.
dataset_path = '../datasets/anthropic-eval'
ratio = 0.4     # target fraction of a topic's records placed in the train split
split_seed = 0  # fixed seed so re-running the cell reproduces the same shuffles

# Dedicated generator: reproducible shuffles without touching the global
# `random` state.
split_rng = random.Random(split_seed)

# Sorted so that the order in which the generator is consumed (and therefore
# the resulting split) does not depend on the arbitrary order of os.listdir.
files = sorted(f for f in os.listdir(dataset_path) if f.endswith('.jsonl'))
for f in files:
    topic = f[:-len('.jsonl')]
    with jsonlines.open(os.path.join(dataset_path, f), 'r') as lines:
        raw = [(obj['question'], obj['answer_matching_behavior'],
                obj['answer_not_matching_behavior'], obj['models']) for obj in
               lines]
    if not raw:
        # Nothing to split; `raw[0]` below would raise IndexError.
        print(f'skipping {f}: no records')
        continue

    # One output folder per topic.
    topic_path = os.path.join(dataset_path, topic)
    os.makedirs(topic_path, exist_ok=True)

    n = len(raw)
    # Model names are taken from the first record's "models" mapping; every
    # other record is indexed with the same names.
    for model in raw[0][3]:
        model_raw = [(r[0], r[1], r[2], r[3][model]) for r in raw]
        # "correct" = the model's answer equals `answer_matching_behavior`.
        correct_raw = [r for r in model_raw if r[1] == r[3]['answer']]
        wrong_raw = [r for r in model_raw if r[1] != r[3]['answer']]

        # Split both groups separately by the same ratio (in file order), so
        # each split receives a share of correct and of wrong records.
        n_correct_train = int(len(correct_raw) * ratio)
        n_wrong_train = int(len(wrong_raw) * ratio)
        train = correct_raw[:n_correct_train] + wrong_raw[:n_wrong_train]
        test = correct_raw[n_correct_train:] + wrong_raw[n_wrong_train:]
        split_rng.shuffle(train)
        split_rng.shuffle(test)

        # Balance the number of entries of train and test: int() truncation
        # above can leave train short of `ratio * n`, so move records across.
        while len(train) < ratio * n:
            train.append(test.pop())
        while len(test) < (1 - ratio) * n:
            test.append(train.pop())

        # Both splits share one record layout and one file-name pattern.
        for split_name, split_rows in (('train', train), ('test', test)):
            out_path = os.path.join(
                topic_path, f'{topic}_{model}_{split_name}.jsonl')
            new_lines = [{
                'question'                    : r[0],
                'answer_matching_behavior'    : r[1],
                'answer_not_matching_behavior': r[2],
                model                         : r[3]
            } for r in split_rows]
            with jsonlines.open(out_path, 'w') as writer:
                writer.write_all(new_lines)