## Configuration and Setup


In [None]:
import math
import json
import random
import os
from pathlib import Path
import time

import asyncio
import nest_asyncio
import numpy as np
import pandas as pd
from tqdm.asyncio import tqdm as tqdm_asyncio
from openai import OpenAI, APIError, RateLimitError, APIConnectionError

from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception

# Read the API key from the environment; a missing variable raises KeyError here.
openai_api_key = os.environ["OPENAI_API_KEY"]

N_ATTRIBUTES = 5  # per choice context
# Models that are fine-tuned below; also used as the top-level keys of `models_info`.
BASE_MODELS = ("gpt-4o-mini-2024-07-18", "gpt-4o-2024-08-06")

# System message sent with every selection and introspection request.
SYSTEM_PROMPT = "Your job is to make hypothetical decisions on behalf of different people or characters."

# Prefix for choice prompts; the scenario question and both options are appended.
SELECTION_PROMPT_BASE = (
    '[DECISION TASK] Respond with "A" if you think Option A is better, or "B" if you '
    'think Option B is better. Never respond with anything except "A" or "B":\n\n'
)

# Prefix for weight-report prompts.
# NOTE: the text says "five dimensions" literally while the key-value count is
# interpolated from N_ATTRIBUTES; the two must be kept in sync by hand.
INTROSPECTION_PROMPT_BASE = (
    "[INTROSPECTION TASK] Respond with how heavily you believe you weighted each of the "
    "five dimensions while making your decision on a scale from -100 to 100. Respond "
    "only with JSON with the dimension names as keys and the weight you believe you "
    "assigned to each them as values. Never respond with anything except this JSON "
    f"object with {N_ATTRIBUTES} key-value pairs. (Do not report your decision itself.):\n\n"
)

# Separate seeds per stage so each stage can be re-run reproducibly on its own.
ROLE_SHUFFLING_SEED = 0  # shuffling of the role list
WEIGHTS_SEED = 1  # generation of the instilled attribute weights
SELECTIONS_SEED = 2  # simulated choices and non-validation selection runs
FT_EXAMPLE_SEED = 3  # NOTE(review): not referenced in the visible code - confirm it is still needed
FINE_TUNING_API_SEED = 4  # passed as `seed` when creating fine-tuning jobs
VALIDATION_SEED = 5  # selection runs made with validation=True
FT_ON_INSTILL_SEED = 6  # trials drawn for introspection-training examples

In [None]:
class Scenario:
    """A decision scenario: a question plus the attributes its options vary on.

    Only the ``name``, ``units`` and ``range`` keys of each attribute spec are
    kept; any other keys on the incoming dicts are dropped.
    """

    def __init__(self, short_name, question, attributes):
        kept_fields = ("name", "units", "range")
        trimmed = []
        for spec in attributes:
            trimmed.append({field: spec[field] for field in kept_fields})

        self.short_name = short_name
        self.question = question
        self.attributes = trimmed


class Trial:
    """One A-versus-B choice with freshly drawn option values for a scenario."""

    def __init__(self, scenario):
        self.scenario = scenario
        # Option A is constructed first, so it consumes random draws before B.
        self.option_A = Option(scenario, "A")
        self.option_B = Option(scenario, "B")

    def generate_choice(self):
        """Return the prompt text: the question, then option A, a blank line, option B."""
        sections = (
            self.scenario.question,
            self.option_A.description,
            self.option_B.description,
        )
        return "{}\n{}\n\n{}".format(*sections)


class Option:
    """One option in a trial, with a random value drawn for every scenario attribute."""

    def __init__(self, scenario, letter):
        self.letter = letter

        drawn_attributes = []
        for spec in scenario.attributes:
            low, high = spec["range"][0], spec["range"][1]
            # One uniform draw per attribute, in attribute order.
            raw_value = random.uniform(low, high)
            drawn_attributes.append(
                {
                    "name": spec["name"],
                    "units": spec["units"],
                    "value": round(raw_value, rounding_precision(spec)),
                }
            )
        self.attributes = drawn_attributes

        # Header line with the option letter, then one "name: value units" line each.
        attribute_lines = []
        for attribute in self.attributes:
            attribute_lines.append(
                f"{attribute['name']}: {attribute['value']} {attribute['units']}"
            )
        header = self.letter + ":\n"
        self.description = header + "\n".join(attribute_lines)


def rounding_precision(attribute):
    """Return how many decimal places values of ``attribute`` are rounded to.

    The precision depends on the width of ``attribute["range"]``:

    * width below 1: one more decimal than the position of the width's
      leading digit (0.5 -> 2, 0.05 -> 3);
    * width from 1 up to (but not including) 5: 1 decimal;
    * width of 5 or more: 0 decimals.

    The width is taken as an absolute value so a range written high-to-low is
    treated like its mirror image, and a zero-width range returns 0 instead of
    failing inside ``math.log10``.
    """
    low, high = attribute["range"][0], attribute["range"][1]
    range_size = abs(high - low)

    if range_size == 0:
        # Only one possible value; log10(0) is undefined, so short-circuit.
        return 0
    if range_size < 1:
        return abs(math.floor(math.log10(range_size))) + 1
    if range_size < 5:
        return 1
    return 0

In [None]:
# Build the client from the key read out of the OPENAI_API_KEY environment
# variable in the configuration cell (`OAI_API_KEY` was never defined).
client = OpenAI(api_key=openai_api_key)
# Patch asyncio so the helpers below can call run_until_complete() on the
# notebook's already-running event loop.
nest_asyncio.apply()

In [None]:
# Combine the ~1000 choices with the ~1000 agents into choice contexts.
# Read the scenario definitions inside a context manager so the handle is
# closed, and decode explicitly as UTF-8 rather than the locale default.
with open("data/candidate_scenarios.json", encoding="utf-8") as scenarios_file:
    candidate_scenarios = [
        Scenario(s["short_name"], s["question"], s["attributes"])
        for s in json.load(scenarios_file)
    ]
roles = pd.read_csv("data/roles.csv", header=None)[0].tolist()
# Seeded shuffle so the role-to-scenario pairing is reproducible.
random.seed(ROLE_SHUFFLING_SEED)
random.shuffle(roles)

scenarios = candidate_scenarios[:1100]
# Pair scenario i with role i; this needs at least as many roles as scenarios.
for i, scenario in enumerate(scenarios):
    scenario.question = f"Imagine you are {roles[i]}. {scenario.question}"

# Write the tabular view once; an existing file is left untouched.
scenarios_csv = Path("data/scenarios.csv")
if not scenarios_csv.exists():
    tabular_scenarios = pd.DataFrame(
        [
            {
                "scenario": s.short_name,
                "question": s.question,
                **{f"attr{i+1}": a["name"] for i, a in enumerate(s.attributes)},
                **{f"attr{i+1}_min": a["range"][0] for i, a in enumerate(s.attributes)},
                **{f"attr{i+1}_max": a["range"][1] for i, a in enumerate(s.attributes)},
            }
            for s in scenarios
        ]
    )
    tabular_scenarios.to_csv(scenarios_csv, index=False)

## Instill Attribute Weights


### Generate fine-tuning examples


In [None]:
def generate_weights():
    """Draw N_ATTRIBUTES random weights and rescale them to a peak magnitude of 100.

    Returns a dict keyed ``attr1`` .. ``attrN`` with rounded weights; signs of
    the raw draws are preserved.
    """
    draws = []
    for _ in range(N_ATTRIBUTES):
        draws.append(random.uniform(-100, 100))

    # The draw with the largest magnitude (first one on ties) sets the scale.
    dominant = max(draws, key=abs)
    scaling_factor = (100 * np.sign(dominant)) / dominant

    return {
        f"attr{position}": round(draw * scaling_factor)
        for position, draw in enumerate(draws, start=1)
    }


def calculate_utility(option, scenario, weights):
    """Return the weighted sum of the option's range-normalised attribute values.

    Each value is rescaled to its position within the scenario attribute's
    range (0 at the first bound, 1 at the second) and multiplied by the weight
    stored under ``attr<position>`` (1-based).
    """
    total = 0
    for position, option_attr in enumerate(option.attributes, start=1):
        bounds = scenario.attributes[position - 1]["range"]
        low, high = bounds[0], bounds[1]
        fraction = (option_attr["value"] - low) / (high - low)
        total += weights[f"attr{position}"] * fraction
    return total


def generate_simulated_selection(scenario, weights):
    """Draw a trial and pick the option the given weights prefer.

    Returns ``{"trial": Trial, "selection": "A" | "B"}``. Option B is selected
    unless A's utility is strictly greater, so ties go to B.
    """
    trial = Trial(scenario)
    score_a = calculate_utility(trial.option_A, scenario, weights)
    score_b = calculate_utility(trial.option_B, scenario, weights)

    if score_a > score_b:
        chosen = "A"
    else:
        chosen = "B"
    return {"trial": trial, "selection": chosen}

In [None]:
n_ft_examples_per_scenario = 10
n_val_examples_per_scenario = 10
examples_per_scenario = n_ft_examples_per_scenario + n_val_examples_per_scenario

# Weights are drawn first, under their own seed, for every scenario.
random.seed(WEIGHTS_SEED)
generated_weights = {}
for scenario in scenarios:
    generated_weights[scenario.short_name] = generate_weights()

# Then the simulated choices, reseeded so they do not depend on how many
# random draws the weight generation consumed.
random.seed(SELECTIONS_SEED)
simulated_choices = {}
for scenario in scenarios:
    scenario_choices = []
    for _ in range(examples_per_scenario):
        scenario_choices.append(
            generate_simulated_selection(
                scenario, generated_weights[scenario.short_name]
            )
        )
    simulated_choices[scenario.short_name] = scenario_choices

In [None]:
# Save the instilled weights.
# One row per scenario: its short name followed by the attr1..attrN weights.
# An existing CSV is never overwritten.
instilled_weights_csv = Path("data/instilled_weights.csv")
if not instilled_weights_csv.exists():
    flattened_weights = [
        {"scenario": short_name, **weights}
        for short_name, weights in generated_weights.items()
    ]
    pd.DataFrame(flattened_weights).to_csv(instilled_weights_csv, index=False)

In [None]:
def generate_pref_example(trial_with_selection):
    """Serialise one simulated choice as a chat fine-tuning example (a JSON string).

    The example holds the system prompt, the selection prompt followed by the
    trial's choice text, and the simulated selection as the assistant reply.
    """
    choice_text = trial_with_selection["trial"].generate_choice()
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", SELECTION_PROMPT_BASE + choice_text),
        ("assistant", trial_with_selection["selection"]),
    )
    messages = [{"role": role, "content": content} for role, content in turns]
    return json.dumps({"messages": messages})

In [None]:
n_instilled_preferences = 1

# The first n_ft_examples_per_scenario choices of each scenario are training
# examples; the remainder become validation examples.
preference_examples = []
preference_validation = []
for scenario in scenarios[:n_instilled_preferences]:
    scenario_choices = simulated_choices[scenario.short_name]
    preference_examples.extend(
        generate_pref_example(choice)
        for choice in scenario_choices[:n_ft_examples_per_scenario]
    )
    preference_validation.extend(
        generate_pref_example(choice)
        for choice in scenario_choices[n_ft_examples_per_scenario:]
    )

# Each JSONL file is written only if it does not exist yet.
pref_file = Path(f"data/instill_{n_instilled_preferences}_prefs.jsonl")
if not pref_file.exists():
    with open(pref_file, "w") as training_out:
        training_out.write("\n".join(preference_examples))

pref_val_file = Path(f"data/instill_{n_instilled_preferences}_prefs_val.jsonl")
if not pref_val_file.exists():
    with open(pref_val_file, "w") as validation_out:
        validation_out.write("\n".join(preference_validation))

In [None]:
def upload_ft_file(file_path):
    """Upload ``file_path`` to OpenAI for fine-tuning and return the file ID.

    The file is opened in binary mode inside a context manager so the handle
    is closed once the upload call returns (or raises).
    """
    with open(file_path, "rb") as ft_file:
        upload = client.files.create(file=ft_file, purpose="fine-tune")
    return upload.id


# Upload the training file first, then the validation file; keep both file IDs.
oai_pref_file, oai_pref_val_file = [
    upload_ft_file(local_path) for local_path in (pref_file, pref_val_file)
]

In [None]:
# Registry of model IDs: base model name -> {version label: model ID}.
# Each of the two base models starts with only its own "base" entry.
models_info = {
    base_model_id: {"base": base_model_id} for base_model_id in BASE_MODELS[:2]
}

In [None]:
def wait_and_store_ft_model_name(job_id, model, name):
    """Poll a fine-tuning job every 30 seconds until it reaches a final state.

    On success the fine-tuned model ID is stored in ``models_info[model][name]``.
    On failure or cancellation the job's error details (if any) are printed and
    nothing is stored. The status is printed on every poll.
    """
    poll_interval_seconds = 30
    unsuccessful_end_states = ["failed", "cancelled"]

    while True:
        ft_job = client.fine_tuning.jobs.retrieve(job_id)
        state = ft_job.status
        print(f"Job Status: {state}")

        if state == "succeeded":
            # Save the model ID after fine-tuning.
            models_info[model][name] = ft_job.fine_tuned_model
            return

        if state in unsuccessful_end_states:
            print(f"Fine-tuning job {state}.")
            failure = ft_job.error
            if failure:
                print(f"Error code: {failure.code}")
                print(f"Error message: {failure.message}")
                print(f"Error parameter: {failure.param}")
            return

        time.sleep(poll_interval_seconds)

### Instill the preferences


In [None]:
# Label used both as the fine-tuning suffix and as the key in `models_info`.
instilled_model_name = "{}_instilled_prefs_{}ex".format(
    n_instilled_preferences, n_ft_examples_per_scenario
)

# One job per base model; each job is awaited before the next one is created.
for model in BASE_MODELS:
    instill_job_settings = {
        "model": model,
        "training_file": oai_pref_file,
        "validation_file": oai_pref_val_file,
        "seed": FINE_TUNING_API_SEED,
        "suffix": instilled_model_name,
    }
    job = client.fine_tuning.jobs.create(**instill_job_settings)
    wait_and_store_ft_model_name(job.id, model, instilled_model_name)

In [None]:
def save_model_info(models_info):
    """Write ``models_info`` to data/model_info.json, merging with any saved copy.

    Entries are merged per base model: labels already stored for a base model
    are kept unless ``models_info`` supplies the same label, in which case the
    new value wins. A plain top-level ``dict.update`` would replace a base
    model's whole entry and silently drop fine-tuned model IDs that exist on
    disk but not in memory (for example after a kernel restart).

    Args:
        models_info: mapping of base model name -> {label: model ID}. It is
            not modified.
    """
    model_info_file = Path("data/model_info.json")

    merged = {}
    if model_info_file.exists():
        with open(model_info_file, "r") as f:
            merged = json.load(f)

    for base_model, labelled_ids in models_info.items():
        saved = merged.get(base_model)
        if isinstance(saved, dict) and isinstance(labelled_ids, dict):
            merged[base_model] = {**saved, **labelled_ids}
        else:
            # Nothing mergeable on disk for this model: take the new value as is.
            merged[base_model] = labelled_ids

    with open(model_info_file, "w") as f:
        json.dump(merged, f, indent=4)


# Persist the model IDs collected in `models_info` so far.
save_model_info(models_info)

## Introspection Training


In [None]:
def generate_introspection_example(scenario):
    """Build one introspection fine-tuning example (a JSON string) for a scenario.

    The user turn is the introspection prompt followed by a freshly drawn
    trial; the assistant turn is a JSON object mapping each attribute name to
    the scenario's generated weight, cast to int.
    """
    choice_text = Trial(scenario).generate_choice()

    correct_response = {}
    for position in range(1, N_ATTRIBUTES + 1):
        attribute_name = scenario.attributes[position - 1]["name"]
        weight = generated_weights[scenario.short_name][f"attr{position}"]
        correct_response[attribute_name] = int(weight)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": INTROSPECTION_PROMPT_BASE + choice_text},
        {"role": "assistant", "content": json.dumps(correct_response)},
    ]
    return json.dumps({"messages": messages})

In [None]:
def make_itrain_files(
    test_set_size,
    training_set_size,
    scenarios,
    test_first=True,
):
    """Create, cache and upload the introspection training and test files.

    One example is generated for each of the first
    ``training_set_size + test_set_size`` scenarios. With ``test_first`` (and
    a non-empty test set) the leading examples form the test set; otherwise
    the leading examples form the training set. Files that already exist
    under ``data/`` are uploaded as they are, without being regenerated.

    Returns ``{"test": <file ID or None>, "training": <file ID>}``.
    """
    n_total = training_set_size + test_set_size
    has_test_set = test_set_size > 0

    examples = []
    for scenario in scenarios[:n_total]:
        # NOTE(review): the RNG is reseeded before every example, so each
        # scenario's trial starts from the same random stream - confirm that
        # this is intended rather than a single seed before the loop.
        random.seed(FT_ON_INSTILL_SEED)
        examples.append(generate_introspection_example(scenario))

    if test_first and has_test_set:
        test_set, training_set = examples[:test_set_size], examples[test_set_size:n_total]
    else:
        training_set, test_set = (
            examples[:training_set_size],
            examples[training_set_size:n_total],
        )

    def write_if_missing_then_upload(path, lines):
        # Existing files are reused; only missing ones are written.
        if not path.exists():
            with open(path, "w") as out:
                out.write("\n".join(lines))
        return upload_ft_file(path)

    # The test file (if any) is handled before the training file.
    oai_test_id = None
    if has_test_set:
        test_file = Path(f"data/instilled_weights_{test_set_size}_test.jsonl")
        if not test_first:
            test_file = test_file.with_stem(test_file.stem + "_test_last")
        oai_test_id = write_if_missing_then_upload(test_file, test_set)

    training_file = Path(f"data/instilled_weights_{training_set_size}_training.jsonl")
    if has_test_set and not test_first:
        training_file = training_file.with_stem(training_file.stem + "_test_last")
    oai_train_id = write_if_missing_then_upload(training_file, training_set)

    return {"test": oai_test_id, "training": oai_train_id}

In [None]:
def ft_on_instilled(model, starting_point, files, suffix):
    """Create a fine-tuning job that starts from an already fine-tuned model.

    ``models_info[model][starting_point]`` is the model to continue from;
    ``files`` supplies the uploaded ``"training"`` and ``"test"`` file IDs
    (the test file is used as the validation file). Returns the created job.
    """
    starting_model_id = models_info[model][starting_point]
    return client.fine_tuning.jobs.create(
        model=starting_model_id,
        training_file=files["training"],
        validation_file=files["test"],
        seed=FINE_TUNING_API_SEED,
        suffix=suffix,
    )

In [None]:
for base_model in BASE_MODELS:
    # Fine-tune versions with itraining on the first 50, last 50, and all 100.
    # Each variant: build/upload its files, start the job, wait for it, then
    # persist the updated model registry.
    ft_suffix = "itrained_first_50_of_100_50ex"
    train_first_50_files = make_itrain_files(50, 50, scenarios, test_first=False)
    job = ft_on_instilled(
        base_model, instilled_model_name, train_first_50_files, ft_suffix
    )
    wait_and_store_ft_model_name(job.id, base_model, ft_suffix)
    save_model_info(models_info)

    ft_suffix = "itrained_last_50_of_100_50ex"
    train_last_50_files = make_itrain_files(50, 50, scenarios, test_first=True)
    job = ft_on_instilled(
        base_model, instilled_model_name, train_last_50_files, ft_suffix
    )
    wait_and_store_ft_model_name(job.id, base_model, ft_suffix)
    save_model_info(models_info)

    ft_suffix = "itrained_all_100_50ex"
    train_100_files = make_itrain_files(0, 100, scenarios, test_first=False)
    job = ft_on_instilled(base_model, instilled_model_name, train_100_files, ft_suffix)
    wait_and_store_ft_model_name(job.id, base_model, ft_suffix)
    save_model_info(models_info)

## Introspection


In [None]:
async def async_weight_report(prompt, model, semaphore):
    """Ask ``model`` for its weight report on ``prompt`` and return the reply text.

    The blocking client call runs in the default executor; ``semaphore`` caps
    how many requests are in flight at once.
    """

    def request_report():
        return client.chat.completions.create(
            model=model,
            temperature=0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": INTROSPECTION_PROMPT_BASE + prompt},
            ],
        )

    async with semaphore:
        loop = asyncio.get_event_loop()
        response = await loop.run_in_executor(None, request_report)
        return response.choices[0].message.content


async def single_weight_report(scenario, model, version, semaphore):
    """Draw a trial for ``scenario``, request a weight report, and return a record.

    The record holds the model, version label, scenario short name, both
    drawn options, and the raw reply text.
    """
    trial = Trial(scenario)
    record = {
        "explaining_model": model,
        "version": version,
        "scenario": trial.scenario.short_name,
        "option_A": trial.option_A,
        "option_B": trial.option_B,
    }
    record["reply"] = await async_weight_report(
        trial.generate_choice(), model, semaphore
    )
    return record


async def all_weight_reports(scenarios, model, version, tests_per_scenario):
    """Gather ``tests_per_scenario`` weight reports for every scenario.

    At most 100 requests run concurrently; progress is shown with tqdm.
    """
    semaphore = asyncio.Semaphore(100)

    pending = []
    for scenario in scenarios:
        for _ in range(tests_per_scenario):
            pending.append(single_weight_report(scenario, model, version, semaphore))

    return await tqdm_asyncio.gather(*pending, desc="Processing trials")


def parallel_weight_reports(scenarios, model, version, tests_per_scenario):
    """Run ``all_weight_reports`` to completion from synchronous code.

    Uses the current event loop when it is already running (the notebook
    case) and ``asyncio.run`` otherwise (the plain-script case).
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    reports_coroutine = all_weight_reports(
        scenarios,
        model,
        version,
        tests_per_scenario,
    )

    # if we're in a Jupyter notebook with an existing loop
    if loop.is_running():
        return loop.run_until_complete(reports_coroutine)
    # if we're in a regular Python script
    return asyncio.run(reports_coroutine)


def get_weight_reports(model, scenarios, version, tests_per_scenario=10):
    """Collect weight reports from ``model`` for ``scenarios``.

    Thin wrapper around ``parallel_weight_reports`` with the model first and
    a default of 10 reports per scenario.
    """
    return parallel_weight_reports(
        scenarios=scenarios,
        model=model,
        version=version,
        tests_per_scenario=tests_per_scenario,
    )

In [None]:
def _strip_code_fence(reply):
    """Return ``reply`` without a surrounding Markdown code fence.

    Handles an opening fence of three backticks, optionally followed by
    ``json``, and a closing fence of three backticks, with whitespace allowed
    around either. ``str.strip("```json")`` is not used because it strips any
    of the characters `` ` j s o n `` rather than the fence itself.
    """
    text = reply.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def save_reported_weights(weight_reports, filename):
    """Parse the models' weight reports and write the usable ones to a CSV.

    A report is usable when its reply is a JSON object with exactly
    N_ATTRIBUTES keys, each naming an attribute of the report's scenario.
    Usable reports gain ``report_attr1`` .. ``report_attrN`` keys holding the
    reported values in the scenario's attribute order. Unusable reports are
    counted and described on stdout.

    Args:
        weight_reports: records as produced by ``single_weight_report``.
        filename: name of the CSV under ``data/``; an existing file is left
            untouched.
    """
    # Index the scenarios once instead of searching the list for every key;
    # setdefault keeps the first scenario when short names repeat.
    scenarios_by_name = {}
    for known_scenario in scenarios:
        scenarios_by_name.setdefault(known_scenario.short_name, known_scenario)

    complete_reports = []
    bad_reports = 0
    for report in weight_reports:
        reply = report["reply"]
        if reply is None:
            # The API can return a message without text content.
            print("Empty reply (no message content)")
            bad_reports += 1
            continue

        r_string = _strip_code_fence(reply)
        try:
            report_json = json.loads(r_string)
        except json.JSONDecodeError:
            print(f"Error decoding JSON: {r_string}")
            bad_reports += 1
            continue
        if not isinstance(report_json, dict):
            print(f"Expected dict, got {type(report_json)}")
            bad_reports += 1
            continue
        if len(report_json) != N_ATTRIBUTES:
            print(f"Expected {N_ATTRIBUTES} keys, got {len(report_json)}")
            bad_reports += 1
            continue

        scenario = scenarios_by_name[report["scenario"]]
        attribute_names = [attr["name"] for attr in scenario.attributes]

        reported = {}
        complete = True
        for key, value in report_json.items():
            try:
                position = attribute_names.index(key)
            except ValueError:
                print(f"Attribute {key} not found in scenario {report['scenario']}")
                complete = False
                break
            reported[f"report_attr{position + 1}"] = value

        if complete:
            # Only fully matched reports are annotated and kept.
            report.update(reported)
            complete_reports.append(report)
        else:
            bad_reports += 1
    print(f"{bad_reports} bad reports out of {len(weight_reports)} total")

    tabular_weight_reports = pd.DataFrame(
        [
            {
                "explaining_model": report["explaining_model"],
                "version": report["version"],
                "scenario": report["scenario"],
                **{
                    f"report_attr{i+1}": report[f"report_attr{i+1}"]
                    for i in range(N_ATTRIBUTES)
                },
                **{
                    f"A_attribute_{i+1}": report["option_A"].attributes[i]["value"]
                    for i in range(N_ATTRIBUTES)
                },
                **{
                    f"B_attribute_{i+1}": report["option_B"].attributes[i]["value"]
                    for i in range(N_ATTRIBUTES)
                },
            }
            for report in complete_reports
        ]
    )
    output_file = Path(f"data/{filename}")
    if not output_file.exists():
        tabular_weight_reports.to_csv(output_file, index=False)

In [None]:
for base_model in BASE_MODELS:

    # Get reports for all 100 instilled from the base model (the control) and
    # the model with no introspection training. Then get reports for the
    # first 50 and last 50 from versions trained to introspect on the other 50.
    # Finally, get reports for scenarios 100-200 from the version itrained on
    # all 100, then from the version with no introspection training.
    #
    # Each entry: (key into models_info[base_model], scenarios, version, CSV name).
    report_plan = [
        (
            "base",
            scenarios[:100],
            "instilled_100",
            f"{base_model}_weight_reports.csv",
        ),
        (
            instilled_model_name,
            scenarios[:100],
            "instilled_100",
            f"{base_model}_instilled_weight_reports.csv",
        ),
        (
            "itrained_first_50_of_100_50ex",
            scenarios[50:100],
            "instilled_100",
            f"{base_model}_itrained_first_50_of_100_50ex_weight_reports.csv",
        ),
        (
            "itrained_last_50_of_100_50ex",
            scenarios[:50],
            "instilled_100",
            f"{base_model}_itrained_last_50_of_100_50ex_weight_reports.csv",
        ),
        (
            "itrained_all_100_50ex",
            scenarios[100:200],
            "latent_100-200",
            f"{base_model}_itrained_all_100_50ex_latent_weight_reports.csv",
        ),
        (
            instilled_model_name,
            scenarios[100:200],
            "latent_100-200",
            f"{base_model}_instilled_latent_weight_reports.csv",
        ),
    ]

    for model_key, scenario_subset, version_label, report_filename in report_plan:
        # Look the model up right before use, one step at a time.
        model = models_info[base_model][model_key]
        weight_reports = get_weight_reports(model, scenario_subset, version_label)
        save_reported_weights(weight_reports, report_filename)

## Measuring Preferences


In [None]:
def is_retryable_error(exception):
    """Return True when ``exception`` is a transient OpenAI error worth retrying.

    Retryable: rate-limit errors, API errors that carry an HTTP status of
    502, 503 or 504, and API errors without any HTTP status (for example
    connection failures). Everything else is not retried.
    """
    # Checked first: a rate-limit error is also an APIError and carries a 429
    # status, which the status filter below would otherwise reject.
    if isinstance(exception, RateLimitError):
        return True

    if isinstance(exception, (APIError, APIConnectionError)):
        # The openai client exposes the HTTP status as `status_code`; `status`
        # is still consulted so objects using the old attribute keep working.
        status = getattr(exception, "status_code", None)
        if status is None:
            status = getattr(exception, "status", None)
        if status is not None:
            return status in {
                502,
                503,
                504,
            }
        # No HTTP status at all: the request never got a response.
        return True

    return False


@retry(
    stop=stop_after_attempt(10),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception(is_retryable_error),
)
async def async_get_selection(prompt, model, semaphore):
    """Ask ``model`` to choose between the options in ``prompt``; return the reply.

    The blocking client call runs in the default executor while ``semaphore``
    is held. Any exception is printed and re-raised so the retry decorator
    can decide whether to try again.
    """

    def request_selection():
        return client.chat.completions.create(
            model=model,
            temperature=0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": SELECTION_PROMPT_BASE + prompt},
            ],
        )

    async with semaphore:
        try:
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(None, request_selection)
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error during API call: {str(e)}")
            raise


async def process_selection_trial(scenario, model, semaphore):
    """Run one selection trial and return a result record.

    Never raises for a failed trial: failures come back as a record with
    ``status == "error"``, empty options/selection and the error text.
    """
    try:
        trial = Trial(scenario)
        selection = await async_get_selection(trial.generate_choice(), model, semaphore)
    except Exception as e:
        return {
            "model": model,
            "scenario": scenario.short_name,
            "option_A": None,
            "option_B": None,
            "selection": None,
            "status": "error",
            "error": str(e),
        }
    else:
        return {
            "model": model,
            "scenario": trial.scenario.short_name,
            "option_A": trial.option_A,
            "option_B": trial.option_B,
            "selection": selection,
            "status": "success",
        }


async def process_scenarios(scenarios, trials_per_scenario, model):
    """Run ``trials_per_scenario`` selection trials per scenario concurrently.

    At most 160 requests are in flight at once. Failed trials are listed on
    stdout; only the successful records are returned.
    """
    semaphore = asyncio.Semaphore(160)

    tasks = []
    for scenario in scenarios:
        for _ in range(trials_per_scenario):
            tasks.append(process_selection_trial(scenario, model, semaphore))

    results = await tqdm_asyncio.gather(*tasks, desc="Processing trials")

    failed_trials = []
    successful_trials = []
    for result in results:
        if result["status"] == "error":
            failed_trials.append(result)
        if result["status"] == "success":
            successful_trials.append(result)

    if failed_trials:
        print(f"\nFailed trials: {len(failed_trials)}")
        for failure in failed_trials:
            print(f"Scenario: {failure['scenario']}, Error: {failure['error']}")

    return successful_trials


def run_parallel_scenarios(
    scenarios,
    trials_per_scenario,
    model,
    validation=False,
):
    """Run the selection trials from synchronous code and return the successes.

    The ``random`` module is seeded with VALIDATION_SEED when ``validation``
    is true and SELECTIONS_SEED otherwise. A keyboard interrupt or any other
    exception is reported on stdout and yields an empty list.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    random.seed(VALIDATION_SEED if validation else SELECTIONS_SEED)

    try:
        # A running loop means a Jupyter notebook; otherwise a regular script.
        run_to_completion = (
            loop.run_until_complete if loop.is_running() else asyncio.run
        )
        return run_to_completion(
            process_scenarios(scenarios, trials_per_scenario, model)
        )
    except KeyboardInterrupt:
        print("\nOperation cancelled by user")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
    return []

In [None]:
def save_selections(selections, filename):
    """Write selection records to ``data/<filename>`` unless the file exists.

    Each row holds the model, scenario and selection, followed by the
    N_ATTRIBUTES attribute values of option A and then of option B.
    """
    rows = []
    for selection in selections:
        row = {
            "model": selection["model"],
            "scenario": selection["scenario"],
            "selection": selection["selection"],
        }
        for letter in ("A", "B"):
            option = selection[f"option_{letter}"]
            for i in range(N_ATTRIBUTES):
                row[f"{letter}_attribute_{i+1}"] = option.attributes[i]["value"]
        rows.append(row)

    tabular_selections = pd.DataFrame(rows)
    selections_file = Path(f"data/{filename}")
    if selections_file.exists():
        return
    tabular_selections.to_csv(selections_file, index=False)

### Confirm that the instilled preferences were instilled successfully


In [None]:
# 50 trials on each of the first 100 scenarios, drawn with the validation seed.
for base_model in BASE_MODELS:
    model = models_info[base_model][instilled_model_name]
    selections = run_parallel_scenarios(
        scenarios=scenarios[:100],
        trials_per_scenario=50,
        model=model,
        validation=True,
    )
    save_selections(
        selections=selections,
        filename=f"{base_model}_instilled_selections.csv",
    )

### Get native preferences of the instilled models


In [None]:
# 100 trials on each of scenarios 100-200, which received no instilled preferences.
for base_model in BASE_MODELS:
    model = models_info[base_model][instilled_model_name]
    selections = run_parallel_scenarios(
        scenarios=scenarios[100:200],
        trials_per_scenario=100,
        model=model,
    )
    save_selections(
        selections=selections,
        filename=f"{base_model}_instilled_latent_selections.csv",
    )