In [13]:
import itertools
import json
from pathlib import Path

import numpy as np
import pandas as pd

In [14]:
def product_experiment_configs(params_record):
    # Generate all possible combinations of parameters
    keys = params_record.keys()
    values = params_record.values()
    for instance in itertools.product(*values):
        yield dict(zip(keys, instance))

In [15]:
common_params = {
        "params.dataset.path": ["bdsaglam/musique-mini"],
        "params.qa.temperature": [0.1, 0.7],
        "params.qa.user_prompt_template": ["cq.txt"],
        "params.qa.n_shot": [0,1,2,3,6,9],
        "params.run": [1, 2, 3],
}
params_records = [
    {
        "params.qa.system_prompt": ["direct/helpful-output-format-few.txt"],
        "params.qa.few_shot_examples": ["auto/direct.json"],
    },
    {
        "params.qa.system_prompt": ["cot/format-thought.txt"],
        "params.qa.few_shot_examples": ["auto/cot.json"],
    },
    {
        "params.qa.system_prompt": ["ccot/format-thought.txt"],
        "params.qa.few_shot_examples": ["auto/ccot.json"],
    },
    {
        "params.qa.system_prompt": ["cte/excellent-format-few.txt", "cte/format-relevant-few.txt"],
        "params.qa.few_shot_examples": ["auto/cte.json"],
    },
]

exp_configs = [
    exp_config
    for params_record in params_records
    for exp_config in product_experiment_configs({**common_params, **params_record})
]
print(f"{len(exp_configs)} experiment configurations")

180 experiment configurations


In [16]:
target_params = exp_configs[0].keys()
target_params

dict_keys(['params.dataset.path', 'params.qa.temperature', 'params.qa.user_prompt_template', 'params.qa.n_shot', 'params.run', 'params.qa.system_prompt', 'params.qa.few_shot_examples'])

In [17]:
results_path = Path("results.jsonl")
if results_path.exists():
    with open(results_path) as f:
        experiments = [json.loads(line) for line in f]
else:
    experiments = []

print(f"{len(experiments)} experiments")
next(iter(experiments), None)

144 experiments


{'id': 'workspace',
 'name': None,
 'params.dataset.path': 'bdsaglam/musique-mini',
 'params.dataset.name': 'answerable',
 'params.dataset.split': 'validation',
 'params.qa.model': 'llama-3-70b-tgi',
 'params.qa.temperature': 0.7,
 'params.qa.system_prompt': 'direct/helpful-output-format-few.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'auto/direct.json',
 'params.qa.n_shot': 0,
 'params.run': 1,
 'metrics.exact_match': 0.55,
 'metrics.f1': 0.6719070243,
 'metrics.2hops.exact_match': 0.65,
 'metrics.2hops.f1': 0.7548031968,
 'metrics.3hops.exact_match': 0.54,
 'metrics.3hops.f1': 0.6851904762,
 'metrics.4hops.exact_match': 0.46,
 'metrics.4hops.f1': 0.5757274001,
 'metrics.gen_token_count.all.count': 300.0,
 'metrics.gen_token_count.all.mean': 5.76,
 'metrics.gen_token_count.all.std': 1.9394858471,
 'metrics.gen_token_count.all.min': 3.0,
 'metrics.gen_token_count.all.25%': 5.0,
 'metrics.gen_token_count.all.50%': 5.0,
 'metrics.gen_token_count.all

In [18]:
df = pd.DataFrame(experiments)

param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

df.dropna(subset=param_cols, inplace=True, how="any")
df.drop_duplicates(subset=param_cols, inplace=True)

print(f"{len(df)} experiments after preprocessing")

144 experiments after preprocessing


In [19]:
existing_configs = df[target_params].to_dict(orient="records")
existing_configs[0]

{'params.dataset.path': 'bdsaglam/musique-mini',
 'params.qa.temperature': 0.7,
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.n_shot': 0,
 'params.run': 1,
 'params.qa.system_prompt': 'direct/helpful-output-format-few.txt',
 'params.qa.few_shot_examples': 'auto/direct.json'}

In [20]:
target_params = list({**common_params, **params_records[0]}.keys())
target_params.remove("params.run")
target_params

['params.dataset.path',
 'params.qa.temperature',
 'params.qa.user_prompt_template',
 'params.qa.n_shot',
 'params.qa.system_prompt',
 'params.qa.few_shot_examples']

In [21]:
run_counts = df.groupby(target_params)["params.run"].aggregate(tuple).reset_index()
mask = run_counts["params.run"].map(len) != 3
run_counts.loc[mask]

Unnamed: 0,params.dataset.path,params.qa.temperature,params.qa.user_prompt_template,params.qa.n_shot,params.qa.system_prompt,params.qa.few_shot_examples,params.run


In [22]:
# find the missing configurations
missing_configs = [dict(kv) for kv in list({tuple(sorted(config.items())) for config in exp_configs} - {tuple(sorted(config.items())) for config in existing_configs})]
print(f"{len(missing_configs)} missing configurations")
next(iter(missing_configs), None)


36 missing configurations


{'params.dataset.path': 'bdsaglam/musique-mini',
 'params.qa.few_shot_examples': 'auto/cte.json',
 'params.qa.n_shot': 9,
 'params.qa.system_prompt': 'cte/format-relevant-few.txt',
 'params.qa.temperature': 0.7,
 'params.qa.user_prompt_template': 'cq.txt',
 'params.run': 2}

In [23]:
def make_command(exp_config):
    run = exp_config['params.run']
    lines = ["dvc exp run --queue", f"-S run={run}"]
    for target_param in target_params:
        arg_name = target_param.split(".", 1)[-1]
        arg_value = exp_config[target_param]
        lines.append(f"-S {arg_name}='{arg_value}'")
    
    command = " \\\n    ".join(lines)
    return command

In [24]:
with open("../../tmp/missing_configs.sh", "w") as f:
    f.write('#!/bin/sh\n')
    for exp_config in missing_configs:
        f.write(make_command(exp_config))
        f.write("\n\n")