In [4]:
import itertools
import json
from pathlib import Path

import pandas as pd

In [5]:
def product_experiment_configs(params_record):
    # Generate all possible combinations of parameters
    keys = params_record.keys()
    values = params_record.values()
    for instance in itertools.product(*values):
        yield dict(zip(keys, instance))

In [6]:
exp_configs = []

In [7]:
common_params = {
    "params.train.dataset.path": ["bdsaglam/drop-musique-mini"],
    "params.train.dataset.split": ["train"],
    "params.train.optimizer": [
        "noop",
        "bfsrs-light",
        "miprov2-light",
    ],
    "params.train.ensemble": [
        "no",
        # "yes",
    ],
    "params.evaluation.dataset.path": ["bdsaglam/drop-musique-mini"],
    "params.evaluation.dataset.split": ["validation"],
    "params.qa.model": [
        "qwen-2.5-32b",
    ],
    "params.qa.temperature": [
        0.0,
        0.5,
        # 1.0
    ],
    "params.run": [
        1,
        # 2,
        # 3,
    ],
}

In [8]:
params_records = [
    {
        "params.retrieve.model_type": [
            "golden",
            # "t5",
        ],
        "params.retrieve.top_k": [
            1,
        ],
        "params.qa.technique": [
            # "standard",
            "cot",
        ],
    }
]

for params_record in params_records:
    for exp_config in product_experiment_configs({**common_params, **params_record}):
        exp_configs.append(exp_config)

In [9]:
print(f"{len(exp_configs)} experiment configurations")

2 experiment configurations


In [10]:
target_params = exp_configs[0].keys()
target_params

dict_keys(['params.train.dataset.path', 'params.train.dataset.split', 'params.train.optimizer', 'params.train.ensemble', 'params.evaluation.dataset.path', 'params.evaluation.dataset.split', 'params.qa.model', 'params.qa.temperature', 'params.run', 'params.retrieve.model_type', 'params.retrieve.top_k', 'params.qa.technique'])

In [11]:
results_path = Path("exps.jsonl")
if results_path.exists():
    with open(results_path) as f:
        experiments = [json.loads(line) for line in f]
else:
    experiments = []

print(f"{len(experiments)} experiments")
next(iter(experiments), None)

0 experiments


In [12]:
df = pd.DataFrame(experiments)

param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

df.dropna(subset=param_cols, inplace=True, how="any")
df.drop_duplicates(subset=param_cols, inplace=True)

print(f"{len(df)} experiments after preprocessing")

0 experiments after preprocessing


In [13]:
if len(df):
    existing_configs = df[target_params].to_dict(orient="records")
    existing_configs[0]
else:
    existing_configs = []

In [14]:
target_params = list(exp_configs[0].keys())
target_params

['params.train.dataset.path',
 'params.train.dataset.split',
 'params.train.optimizer',
 'params.train.ensemble',
 'params.evaluation.dataset.path',
 'params.evaluation.dataset.split',
 'params.qa.model',
 'params.qa.temperature',
 'params.run',
 'params.retrieve.model_type',
 'params.retrieve.top_k',
 'params.qa.technique']

In [15]:
# find the missing configurations
missing_configs = [
    dict(kv)
    for kv in list(
        {tuple(sorted(config.items())) for config in exp_configs}
        - {tuple(sorted(config.items())) for config in existing_configs}
    )
]
print(f"{len(missing_configs)} missing configurations")
next(iter(missing_configs), None)


2 missing configurations


{'params.evaluation.dataset.path': 'bdsaglam/drop-musique-mini',
 'params.evaluation.dataset.split': 'validation',
 'params.qa.model': 'qwen-2.5-32b',
 'params.qa.technique': 'cot',
 'params.qa.temperature': 0.1,
 'params.retrieve.model_type': 'golden',
 'params.retrieve.top_k': 1,
 'params.run': 1,
 'params.train.dataset.path': 'bdsaglam/drop-musique-mini',
 'params.train.dataset.split': 'train',
 'params.train.ensemble': 'no',
 'params.train.optimizer': 'noop'}

In [16]:
def make_command(exp_config):
    run = exp_config["params.run"]
    lines = ["dvc exp run --queue", f"-S run={run}"]
    for target_param in target_params:
        arg_name = target_param.split(".", 1)[-1]
        arg_value = exp_config[target_param]
        lines.append(f"-S {arg_name}='{arg_value}'")

    command = " \\\n    ".join(lines)
    return command

In [17]:
with open("run.sh", "w") as f:
    f.write("#!/bin/sh\n\n")
    for exp_config in missing_configs:
        f.write(make_command(exp_config))
        f.write("\n\n")