In [109]:
import itertools
import json
from pathlib import Path

import pandas as pd

In [110]:
def product_experiment_configs(params_record):
    # Generate all possible combinations of parameters
    keys = params_record.keys()
    values = params_record.values()
    for instance in itertools.product(*values):
        yield dict(zip(keys, instance))

In [111]:
exp_configs = []

In [112]:
common_params = {
    "params.dataset.path": ["bdsaglam/musique"],
    "params.dataset.split": ["validation"],
    "params.qa.temperature": [
        0.0,
        # 0.5,
        # 1.0,
    ],
    "params.qa.user_prompt_template": ["cq.txt"],
    "params.qa.n_sc": [
        1,
    ],
    "params.run": [
        1,
        2,
        3,
    ],
}

In [113]:
params_records = [
    {
        "params.qa.model": [
            "llama-3-8b",
            "llama-3-70b",
            "llama-3.3-70b",
        ],
        "params.qa.n_shot": [
            0,
            1,
        ],
        "params.qa.system_prompt": ["direct/format-few.txt"],
        "params.qa.few_shot_examples": ["manual/direct.json"],
    },
    {
        "params.qa.model": [
            "llama-3-8b",
            "llama-3-70b",
            "llama-3.3-70b",
        ],
        "params.qa.n_shot": [
            0,
            1,
        ],
        "params.qa.system_prompt": ["cot/format-thought.txt"],
        "params.qa.few_shot_examples": ["manual/cot.json"],
    },
    {
        "params.qa.model": [
            "llama-3-8b",
            "llama-3-70b",
            "llama-3.3-70b",
        ],
        "params.qa.n_shot": [
            0,
            1,
        ],
        "params.qa.system_prompt": ["ccot/format-thought.txt"],
        "params.qa.few_shot_examples": ["manual/ccot.json"],
    },
    {
        "params.qa.model": [
            "llama-3-8b",
            "llama-3-70b",
            "llama-3.3-70b",
        ],
        "params.qa.n_shot": [
            0,
            1,
        ],
        "params.qa.system_prompt": ["cok/adapted.txt"],
        "params.qa.few_shot_examples": ["manual/cok.json"],
    },
    {
        "params.qa.model": [
            "llama-3-8b",
            "llama-3-70b",
            "llama-3.3-70b",
        ],
        "params.qa.n_shot": [
            0,
            1,
        ],
        "params.qa.system_prompt": [
            "cte/format-triples-few.txt",
            "cte/format-triples-ere-few.txt",
            "cte/format-sro-triples-few.txt",
        ],
        "params.qa.few_shot_examples": ["manual/cte-triples.json"],
    },
]

# for params_record in params_records:
#     for exp_config in product_experiment_configs({**common_params, **params_record}):
#         exp_configs.append(exp_config)

In [114]:
params_records = [
    {
        "params.qa.model": [
            # "deepseek-r1-llama-8b",
            "deepseek-r1-llama-70b",
            # "deepseek-r1-qwen-32b",
        ],
        "params.qa.n_shot": [
            0,
            1,
        ],
        "params.qa.system_prompt": ["empty.txt"],
        "params.qa.user_prompt_template": ["icq-format.txt"],
        "params.qa.few_shot_examples": ["manual/reasoning.json"],
    },
    # {
    #     "params.qa.model": [
    #         "deepseek-r1-llama-8b",
    #         "deepseek-r1-llama-70b",
    #         "deepseek-r1-qwen-32b",
    #     ],
    #     "params.qa.n_shot": [0],
    #     "params.qa.system_prompt": ["direct/format-few.txt"],
    #     "params.qa.user_prompt_template": ["cq.txt"],
    #     "params.qa.few_shot_examples": ["empty.json"],
    # },
]

for params_record in params_records:
    for exp_config in product_experiment_configs({**common_params, **params_record}):
        exp_configs.append(exp_config)

In [115]:
print(f"{len(exp_configs)} experiment configurations")

6 experiment configurations


In [116]:
target_params = exp_configs[0].keys()
target_params

dict_keys(['params.dataset.path', 'params.dataset.split', 'params.qa.temperature', 'params.qa.user_prompt_template', 'params.qa.n_sc', 'params.run', 'params.qa.model', 'params.qa.n_shot', 'params.qa.system_prompt', 'params.qa.few_shot_examples'])

In [117]:
results_path = Path("exps.jsonl")
if results_path.exists():
    with open(results_path) as f:
        experiments = [json.loads(line) for line in f]
else:
    experiments = []

print(f"{len(experiments)} experiments")
next(iter(experiments), None)

392 experiments


{'id': '89ceb09a69f6dd22d587ad0e1d1161a7f31b166d',
 'name': 'coxal-eyas',
 'params.dataset.path': 'bdsaglam/musique',
 'params.dataset.name': 'answerable',
 'params.dataset.split': 'validation',
 'params.qa.model': 'llama-3.3-70b',
 'params.qa.temperature': 1.0,
 'params.qa.system_prompt': 'cok/adapted.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'manual/cok.json',
 'params.qa.n_shot': 0,
 'params.qa.n_sc': 1,
 'params.run': 1,
 'metrics.exact_match': 0.6052958213,
 'metrics.f1': 0.72491734,
 'metrics.2hops.exact_match': 0.642172524,
 'metrics.2hops.f1': 0.7550383123,
 'metrics.3hops.exact_match': 0.5815789474,
 'metrics.3hops.f1': 0.7294157195,
 'metrics.4hops.exact_match': 0.5358024691,
 'metrics.4hops.f1': 0.6233612268,
 'metrics.gen_token_count.all.count': 2417.0,
 'metrics.gen_token_count.all.mean': 96.9242863053,
 'metrics.gen_token_count.all.std': 26.7317829936,
 'metrics.gen_token_count.all.min': 44.0,
 'metrics.gen_token_count.all.25%': 80

In [118]:
df = pd.DataFrame(experiments)

param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

df.dropna(subset=param_cols, inplace=True, how="any")
df.drop_duplicates(subset=param_cols, inplace=True)

print(f"{len(df)} experiments after preprocessing")

392 experiments after preprocessing


In [119]:
if len(df):
    existing_configs = df[target_params].to_dict(orient="records")
else:
    existing_configs = []

next(iter(existing_configs), None)

{'params.dataset.path': 'bdsaglam/musique',
 'params.dataset.split': 'validation',
 'params.qa.temperature': 1.0,
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.n_sc': 1,
 'params.run': 1,
 'params.qa.model': 'llama-3.3-70b',
 'params.qa.n_shot': 0,
 'params.qa.system_prompt': 'cok/adapted.txt',
 'params.qa.few_shot_examples': 'manual/cok.json'}

In [120]:
target_params = list({**common_params, **params_records[0]}.keys())
target_params.remove("params.run")
target_params

['params.dataset.path',
 'params.dataset.split',
 'params.qa.temperature',
 'params.qa.user_prompt_template',
 'params.qa.n_sc',
 'params.qa.model',
 'params.qa.n_shot',
 'params.qa.system_prompt',
 'params.qa.few_shot_examples']

In [121]:
# find the missing configurations
missing_configs = [
    dict(kv)
    for kv in list(
        {tuple(sorted(config.items())) for config in exp_configs}
        - {tuple(sorted(config.items())) for config in existing_configs}
    )
]
print(f"{len(missing_configs)} missing configurations")
next(iter(missing_configs), None)


6 missing configurations


{'params.dataset.path': 'bdsaglam/musique',
 'params.dataset.split': 'validation',
 'params.qa.few_shot_examples': 'manual/reasoning.json',
 'params.qa.model': 'deepseek-r1-llama-70b',
 'params.qa.n_sc': 1,
 'params.qa.n_shot': 1,
 'params.qa.system_prompt': 'empty.txt',
 'params.qa.temperature': 0.0,
 'params.qa.user_prompt_template': 'icq-format.txt',
 'params.run': 2}

In [122]:
def make_command(exp_config, force: bool = False):
    run = exp_config["params.run"]
    lines = []
    if force:
        lines.append("dvc exp run -f --queue")
    else:
        lines.append("dvc exp run --queue")
    lines.append(f"-S run={run}")
    for target_param in target_params:
        arg_name = target_param.split(".", 1)[-1]
        arg_value = exp_config[target_param]
        lines.append(f"-S {arg_name}='{arg_value}'")

    command = " \\\n    ".join(lines)
    return command

In [123]:
with open("run.sh", "w") as f:
    f.write("#!/bin/sh\n\n")
    for exp_config in missing_configs:
        f.write(make_command(exp_config, force=True))
        f.write("\n\n")