In [1]:
import itertools
import json
from pathlib import Path

import numpy as np
import pandas as pd
from bellem.dvc.experiment import load_experiments
from bellem.utils import jprint

In [2]:
def product_experiment_configs(params_record):
    """Lazily expand a parameter grid into individual configurations.

    Args:
        params_record: Mapping from parameter name to an iterable of the
            candidate values for that parameter.

    Yields:
        One dict per element of the Cartesian product of the candidate
        values, mapping every parameter name to a single chosen value.
    """
    names = params_record.keys()
    candidates = params_record.values()
    for combination in itertools.product(*candidates):
        # Pair each name with the value picked for it in this combination.
        yield {name: value for name, value in zip(names, combination)}

In [3]:
# Values shared by every technique in the sweep.
DATASET_PATHS = ["bdsaglam/musique-mini"]
TEMPERATURES = [0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0]
USER_PROMPT_TEMPLATES = ["cq.txt"]
RUNS = [1, 2, 3]

# One grid per prompting technique; each grid is expanded independently so
# that system prompts are only paired with their own few-shot example files.
params_records = [
    # Direct answering prompts.
    {
        "params.dataset.path": DATASET_PATHS,
        "params.qa.temperature": TEMPERATURES,
        "params.qa.system_prompt": [
            "direct/excellent-few.txt",
            "direct/helpful-output-format-few.txt",
            "direct/minimal-output-format-few.txt",
            "direct/minimal-output-format.txt",
        ],
        "params.qa.user_prompt_template": USER_PROMPT_TEMPLATES,
        "params.qa.few_shot_examples": ["empty.json", "direct-2-shot.json"],
        "params.run": RUNS,
    },
    # "cot/" prompts.
    {
        "params.dataset.path": DATASET_PATHS,
        "params.qa.temperature": TEMPERATURES,
        "params.qa.system_prompt": [
            "cot/format-minimal.txt",
            "cot/format-reasoning.txt",
            "cot/excellent-format-reasoning.txt",
            "cot/format-think.txt",
        ],
        "params.qa.user_prompt_template": USER_PROMPT_TEMPLATES,
        "params.qa.few_shot_examples": ["empty.json"],
        "params.run": RUNS,
    },
    # "cte/" prompts.
    {
        "params.dataset.path": DATASET_PATHS,
        "params.qa.temperature": TEMPERATURES,
        "params.qa.system_prompt": [
            "cte/excellent-format-few.txt",
            "cte/format-few.txt",
            "cte/format-relevant-few.txt",
        ],
        "params.qa.user_prompt_template": USER_PROMPT_TEMPLATES,
        "params.qa.few_shot_examples": [
            "empty.json",
            "cte-2-shot-delim-1.json",
            "cte-2-shot-delim-2.json",
        ],
        "params.run": RUNS,
    },
]

# Expand every grid in turn and concatenate the results into one flat list,
# keeping the grids in the order they appear in `params_records`.
exp_configs = list(
    itertools.chain.from_iterable(
        map(product_experiment_configs, params_records)
    )
)
print(f"{len(exp_configs)} experiment configurations")

441 experiment configurations


In [4]:
# Parameter names that identify one experiment configuration, taken from the
# first generated config. Note that this is a `dict_keys` view, not a list.
target_params = exp_configs[0].keys()
target_params

dict_keys(['params.dataset.path', 'params.qa.temperature', 'params.qa.system_prompt', 'params.qa.user_prompt_template', 'params.qa.few_shot_examples', 'params.run'])

In [5]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the file (str or Path).

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
        An empty list when the file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = Path(path)
    if not path.exists():
        return []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads fail on them.
        return [json.loads(line) for line in f if line.strip()]


results_path = Path("results.jsonl")
experiments = load_jsonl(results_path)

print(f"{len(experiments)} experiments")
# Peek at the first record; shows None when nothing has been loaded.
next(iter(experiments), None)

525 experiments


{'id': '673332d7597d678ec85a3c829f02f47ff80356cd',
 'name': 'wrong-mome',
 'params.dataset.path': 'bdsaglam/musique-mini',
 'params.dataset.name': 'answerable',
 'params.dataset.split': 'validation',
 'params.qa.model': 'llama-3-70b-tgi',
 'params.qa.temperature': 2.0,
 'params.qa.system_prompt': 'cot/format-think.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'cot-2-shot.json',
 'params.run': 2,
 'metrics.exact_match': 0.1166666667,
 'metrics.f1': 0.1516770781,
 'metrics.2hops.exact_match': 0.21,
 'metrics.2hops.f1': 0.2645693278,
 'metrics.3hops.exact_match': 0.1,
 'metrics.3hops.f1': 0.12,
 'metrics.4hops.exact_match': 0.04,
 'metrics.4hops.f1': 0.0704619065,
 'params.qa.technique': 'COT',
 'params.qa.n_shot': 2}

In [6]:
raw_df = pd.DataFrame(experiments)

# Columns are flattened as "params.<name>" and "metrics.<name>".
param_cols = [name for name in raw_df.columns if name.startswith("params.")]
metric_cols = [name for name in raw_df.columns if name.startswith("metrics.")]

# Keep only rows with every parameter present, then keep the first row for
# each distinct parameter combination.
df = raw_df.dropna(subset=param_cols, how="any").drop_duplicates(subset=param_cols)

print(f"{len(df)} experiments after preprocessing")
df.head()

525 experiments after preprocessing


Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.exact_match,metrics.f1,metrics.2hops.exact_match,metrics.2hops.f1,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1,params.qa.technique,params.qa.n_shot
0,673332d7597d678ec85a3c829f02f47ff80356cd,wrong-mome,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cot/format-think.txt,cq.txt,cot-2-shot.json,...,0.116667,0.151677,0.21,0.264569,0.1,0.12,0.04,0.070462,COT,2
1,5bcf06a7621f96e6319c54f6111dbeeeb831a3b6,wacky-juts,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cot/format-think.txt,cq.txt,empty.json,...,0.053333,0.102588,0.06,0.134806,0.07,0.10838,0.03,0.064579,COT,0
2,601025d8649e1c8a131d4ee5aa1fc2321800353d,focal-bort,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cot/excellent-format-reasoning.txt,cq.txt,cot-2-shot.json,...,0.163333,0.19546,0.26,0.318119,0.18,0.201705,0.05,0.066556,COT,2
3,ae06cd16088072e3c045d9037627bffbdd42a6d4,yolky-whit,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cot/excellent-format-reasoning.txt,cq.txt,empty.json,...,0.06,0.082742,0.08,0.109836,0.06,0.084391,0.04,0.054,COT,0
4,7f07eeb51d8036d8a74f54aacb59dc33bc2744a7,lippy-joss,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cot/format-reasoning.txt,cq.txt,cot-2-shot.json,...,0.12,0.173731,0.21,0.298313,0.11,0.155691,0.04,0.06719,COT,2


In [7]:
# Project the stored results onto the sweep's parameter names so every row can
# be compared with a generated configuration.
# - `list(...)` gives pandas a plain list of column labels whatever container
#   `target_params` happens to be (it may be a `dict_keys` view).
# - An empty frame means no results were loaded; there is nothing to select,
#   so skip the column lookup rather than failing on missing columns.
existing_configs = (
    [] if df.empty else df[list(target_params)].to_dict(orient="records")
)
# Peek at the first record; shows None when there are no results yet.
next(iter(existing_configs), None)

{'params.dataset.path': 'bdsaglam/musique-mini',
 'params.qa.temperature': 2.0,
 'params.qa.system_prompt': 'cot/format-think.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'cot-2-shot.json',
 'params.run': 2}

In [8]:
# Show the parameter names used to compare configurations.
target_params

dict_keys(['params.dataset.path', 'params.qa.temperature', 'params.qa.system_prompt', 'params.qa.user_prompt_template', 'params.qa.few_shot_examples', 'params.run'])

In [9]:
# Everything that defines a setting except the run number.
group_cols = [
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.temperature",
]
# For each setting, collect the run numbers that have results.
run_counts = df.groupby(group_cols)["params.run"].agg(tuple).reset_index()
# Settings with fewer than three recorded runs.
mask = run_counts["params.run"].map(len) < 3
run_counts[mask]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.temperature,params.run


In [10]:
# Inspect the first generated configuration.
exp_configs[0]

{'params.dataset.path': 'bdsaglam/musique-mini',
 'params.qa.temperature': 0.1,
 'params.qa.system_prompt': 'direct/excellent-few.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'empty.json',
 'params.run': 1}

In [11]:
# Inspect the first configuration found in the stored results.
existing_configs[0]

{'params.dataset.path': 'bdsaglam/musique-mini',
 'params.qa.temperature': 2.0,
 'params.qa.system_prompt': 'cot/format-think.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'cot-2-shot.json',
 'params.run': 2}

In [12]:
# Hashable form of a config: its (name, value) pairs sorted by name, so two
# dicts with the same content compare equal regardless of key order.
tuple(sorted(exp_configs[0].items()))

(('params.dataset.path', 'bdsaglam/musique-mini'),
 ('params.qa.few_shot_examples', 'empty.json'),
 ('params.qa.system_prompt', 'direct/excellent-few.txt'),
 ('params.qa.temperature', 0.1),
 ('params.qa.user_prompt_template', 'cq.txt'),
 ('params.run', 1))

In [13]:
# Same hashable form for a configuration taken from the stored results.
tuple(sorted(existing_configs[0].items()))

(('params.dataset.path', 'bdsaglam/musique-mini'),
 ('params.qa.few_shot_examples', 'cot-2-shot.json'),
 ('params.qa.system_prompt', 'cot/format-think.txt'),
 ('params.qa.temperature', 2.0),
 ('params.qa.user_prompt_template', 'cq.txt'),
 ('params.run', 2))

In [19]:
# Fingerprint each config as its (name, value) pairs sorted by name, which is
# hashable and independent of dict key order.
existing_keys = {tuple(sorted(config.items())) for config in existing_configs}

# dict.fromkeys removes duplicate fingerprints while keeping the order of
# `exp_configs`. A plain set difference would give an order that depends on
# string hashing, so the generated script would change between kernel runs.
wanted_keys = dict.fromkeys(tuple(sorted(config.items())) for config in exp_configs)

missing_configs = [dict(key) for key in wanted_keys if key not in existing_keys]
print(f"{len(missing_configs)} missing configurations")
jprint(next(iter(missing_configs), None))

0 missing configurations
null


In [15]:
def make_command(exp_config):
    """Build the shell command that queues one experiment with DVC.

    Args:
        exp_config: Mapping holding the keys "params.dataset.path",
            "params.qa.temperature", "params.qa.system_prompt",
            "params.qa.user_prompt_template", "params.qa.few_shot_examples"
            and "params.run".

    Returns:
        A multi-line string: a leading newline, then a `dvc exp run --queue`
        invocation with one `-S name='value'` override per parameter.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    run = exp_config["params.run"]
    dataset_path = exp_config["params.dataset.path"]
    temperature = exp_config["params.qa.temperature"]
    system_prompt = exp_config["params.qa.system_prompt"]
    user_prompt_template = exp_config["params.qa.user_prompt_template"]
    few_shot_examples = exp_config["params.qa.few_shot_examples"]
    # The "params." prefix belongs to the results-table column names, not to
    # the parameter names handed to `-S`; every override drops it, including
    # the dataset path.
    command = f"""
dvc exp run --queue \\
    -S dataset.path='{dataset_path}' \\
    -S qa.temperature='{temperature}' \\
    -S qa.system_prompt='{system_prompt}' \\
    -S qa.user_prompt_template='{user_prompt_template}' \\
    -S qa.few_shot_examples='{few_shot_examples}' \\
    -S run='{run}'
    """
    return command

In [16]:
# Write one queue command per missing configuration into a shell script.
# Nothing is written when there are no missing configurations.
if missing_configs:
    script_path = Path("../../tmp/missing_configs.sh")
    # The target directory may not exist yet (e.g. on a fresh checkout).
    script_path.parent.mkdir(parents=True, exist_ok=True)
    with open(script_path, "w") as f:
        f.write("#!/bin/sh\n")
        for exp_config in missing_configs:
            f.write(make_command(exp_config))
            f.write("\n")