In [27]:
import itertools
import json
from pathlib import Path

import numpy as np
import pandas as pd
from bellem.dvc.experiment import load_experiments

In [28]:
def product_experiment_configs(params_record):
    """Expand a parameter grid into individual experiment configurations.

    Args:
        params_record: Mapping from parameter name to an iterable of candidate
            values for that parameter.

    Yields:
        One dict per element of the Cartesian product of the candidate values,
        mapping each parameter name to the value chosen for it. The last
        parameter varies fastest.
    """
    param_names = params_record.keys()
    value_grid = itertools.product(*params_record.values())
    yield from (dict(zip(param_names, combo)) for combo in value_grid)

In [29]:
# Sweep axes whose candidate values are the same in every grid below.
DATASET_PATHS = ["bdsaglam/musique-mini"]
TEMPERATURES = [0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0]
USER_PROMPT_TEMPLATES = ["cq.txt"]
RUNS = [1, 2, 3]

# One grid per prompt family. Every grid uses the same keys in the same order;
# the keys of the first expanded config are reused later as `target_params`.
params_records = [
    # System prompts under standard/
    {
        "params.dataset.path": DATASET_PATHS,
        "params.qa.temperature": TEMPERATURES,
        "params.qa.system_prompt": [
            "standard/excellent-few.txt",
            "standard/helpful-output-format-few.txt",
            "standard/minimal-output-format-few.txt",
            "standard/minimal-output-format.txt",
        ],
        "params.qa.user_prompt_template": USER_PROMPT_TEMPLATES,
        "params.qa.few_shot_examples": ["empty.json", "standard-2-shot.json"],
        "params.run": RUNS,
    },
    # System prompts under cot/ (grid currently disabled)
    # {
    #     "params.dataset.path": ["bdsaglam/musique-mini"],
    #     "params.qa.temperature": [0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0],
    #     "params.qa.system_prompt": [
    #         "cot/format-minimal.txt",
    #         "cot/format-reasoning.txt",
    #         "cot/excellent-format-reasoning.txt",
    #         "cot/format-think.txt",
    #     ],
    #     "params.qa.user_prompt_template": ["cq.txt"],
    #     "params.qa.few_shot_examples": ["empty.json"],
    #     "params.run": [1, 2, 3],
    # },
    # System prompts under cte/
    {
        "params.dataset.path": DATASET_PATHS,
        "params.qa.temperature": TEMPERATURES,
        "params.qa.system_prompt": [
            "cte/excellent-format-few.txt",
            "cte/format-few.txt",
            "cte/format-relevant-few.txt",
        ],
        "params.qa.user_prompt_template": USER_PROMPT_TEMPLATES,
        "params.qa.few_shot_examples": [
            "empty.json",
            "cte-2-shot-delim-1.json",
            "cte-2-shot-delim-2.json",
        ],
        "params.run": RUNS,
    },
]

# Expand each grid in turn and concatenate the results, keeping grid order.
exp_configs = list(
    itertools.chain.from_iterable(
        map(product_experiment_configs, params_records)
    )
)
print(f"{len(exp_configs)} experiment configurations")

357 experiment configurations


In [30]:
# The keys of the first expanded configuration name the swept parameters.
first_config = exp_configs[0]
target_params = first_config.keys()
target_params

dict_keys(['params.dataset.path', 'params.qa.temperature', 'params.qa.system_prompt', 'params.qa.user_prompt_template', 'params.qa.few_shot_examples', 'params.run'])

In [31]:
def read_jsonl(path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the file; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order,
        or an empty list when the file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = Path(path)
    if not path.exists():
        return []
    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        # Skip blank / whitespace-only lines: json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]


# Previously collected experiment results; empty when nothing has been saved yet.
results_path = Path("results.jsonl")
experiments = read_jsonl(results_path)

print(f"{len(experiments)} experiments")
# Peek at the first record (None when there are no results).
next(iter(experiments), None)

357 experiments


{'id': '2eea485f9fada3ab346e4930dd791bcf69f615d2',
 'name': 'adunc-wife',
 'params.dataset.path': 'bdsaglam/musique-mini',
 'params.dataset.name': 'answerable',
 'params.dataset.split': 'validation',
 'params.qa.model': 'llama-3-70b-tgi',
 'params.qa.temperature': 2.0,
 'params.qa.system_prompt': 'cte/format-relevant-few.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'cte-2-shot-delim-2.json',
 'params.run': 3,
 'metrics.exact_match': 0.08,
 'metrics.f1': 0.1006194363,
 'metrics.2hops.exact_match': 0.16,
 'metrics.2hops.f1': 0.2044984755,
 'metrics.3hops.exact_match': 0.06,
 'metrics.3hops.f1': 0.0687884047,
 'metrics.4hops.exact_match': 0.02,
 'metrics.4hops.f1': 0.0285714286,
 'params.qa.technique': 'CTE',
 'params.qa.n_shot': 2}

In [32]:
# Tabulate the raw records; the column-name prefix tells parameters from metrics.
experiments_df = pd.DataFrame(experiments)

param_cols = [name for name in experiments_df.columns if name.startswith("params.")]
metric_cols = [name for name in experiments_df.columns if name.startswith("metrics.")]

# Keep only rows where every parameter is present, then keep the first row of
# each distinct parameter combination.
df = (
    experiments_df
    .dropna(subset=param_cols, how="any")
    .drop_duplicates(subset=param_cols)
)

print(f"{len(df)} experiments after preprocessing")
df.head()

357 experiments after preprocessing


Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.exact_match,metrics.f1,metrics.2hops.exact_match,metrics.2hops.f1,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1,params.qa.technique,params.qa.n_shot
0,2eea485f9fada3ab346e4930dd791bcf69f615d2,adunc-wife,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cte/format-relevant-few.txt,cq.txt,cte-2-shot-delim-2.json,...,0.08,0.100619,0.16,0.204498,0.06,0.068788,0.02,0.028571,CTE,2
1,c48d065577d2b1f2c78e1e4904e70d85117a5c11,dress-puck,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cte/format-relevant-few.txt,cq.txt,cte-2-shot-delim-2.json,...,0.09,0.100029,0.16,0.176028,0.07,0.082726,0.04,0.041333,CTE,2
2,24d95edf9f3a0a20071cd7633221d50a6d9f2e84,moody-trug,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cte/format-relevant-few.txt,cq.txt,cte-2-shot-delim-2.json,...,0.096667,0.114228,0.17,0.203247,0.11,0.120667,0.01,0.018769,CTE,2
3,f779879e21e0a38c23ba832c99a364f63cd0c4cd,lying-over,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cte/format-relevant-few.txt,cq.txt,cte-2-shot-delim-1.json,...,0.18,0.229397,0.28,0.355347,0.15,0.207989,0.11,0.124856,CTE,2
4,c348b5aec10150fdbcef567c4d15946e627a039d,sober-saga,bdsaglam/musique-mini,answerable,validation,llama-3-70b-tgi,2.0,cte/format-relevant-few.txt,cq.txt,cte-2-shot-delim-1.json,...,0.173333,0.214778,0.27,0.348005,0.19,0.212121,0.06,0.084207,CTE,2


In [33]:
# Project the completed experiments onto the swept parameters so they can be
# compared one-to-one with the entries of `exp_configs`.
# When no results were loaded, `df` is built from an empty list and has no
# columns, so selecting `target_params` would raise KeyError; use an empty
# list in that case.
if df.empty:
    existing_configs = []
else:
    existing_configs = df[list(target_params)].to_dict(orient="records")

# Peek at the first record (None when there are no completed experiments).
next(iter(existing_configs), None)

{'params.dataset.path': 'bdsaglam/musique-mini',
 'params.qa.temperature': 2.0,
 'params.qa.system_prompt': 'cte/format-relevant-few.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'cte-2-shot-delim-2.json',
 'params.run': 3}

In [34]:
# Display the names of the swept parameters.
target_params

dict_keys(['params.dataset.path', 'params.qa.temperature', 'params.qa.system_prompt', 'params.qa.user_prompt_template', 'params.qa.few_shot_examples', 'params.run'])

In [35]:
# Collect, for every prompt / few-shot / temperature combination, the tuple of
# run ids that have a recorded result.
run_group_cols = [
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.temperature",
]
runs_by_group = df.groupby(run_group_cols)["params.run"]
run_counts = runs_by_group.agg(tuple).reset_index()

# Show the combinations that have fewer than 3 recorded runs.
n_recorded_runs = run_counts["params.run"].map(len)
mask = n_recorded_runs < 3
run_counts.loc[mask]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.temperature,params.run


In [36]:
# Inspect the first expected configuration (raises IndexError if there are none).
exp_configs[0]

{'params.dataset.path': 'bdsaglam/musique-mini',
 'params.qa.temperature': 0.1,
 'params.qa.system_prompt': 'standard/excellent-few.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'empty.json',
 'params.run': 1}

In [37]:
# Inspect the first already-completed configuration (raises IndexError if there are none).
existing_configs[0]

{'params.dataset.path': 'bdsaglam/musique-mini',
 'params.qa.temperature': 2.0,
 'params.qa.system_prompt': 'cte/format-relevant-few.txt',
 'params.qa.user_prompt_template': 'cq.txt',
 'params.qa.few_shot_examples': 'cte-2-shot-delim-2.json',
 'params.run': 3}

In [38]:
# Hashable form of an expected config: its (name, value) pairs sorted by name.
tuple(sorted(exp_configs[0].items()))

(('params.dataset.path', 'bdsaglam/musique-mini'),
 ('params.qa.few_shot_examples', 'empty.json'),
 ('params.qa.system_prompt', 'standard/excellent-few.txt'),
 ('params.qa.temperature', 0.1),
 ('params.qa.user_prompt_template', 'cq.txt'),
 ('params.run', 1))

In [39]:
# Hashable form of a completed config: its (name, value) pairs sorted by name.
tuple(sorted(existing_configs[0].items()))

(('params.dataset.path', 'bdsaglam/musique-mini'),
 ('params.qa.few_shot_examples', 'cte-2-shot-delim-2.json'),
 ('params.qa.system_prompt', 'cte/format-relevant-few.txt'),
 ('params.qa.temperature', 2.0),
 ('params.qa.user_prompt_template', 'cq.txt'),
 ('params.run', 3))

In [40]:
# Find the expected configurations that have no completed experiment yet.
#
# Configs are compared through a hashable key: their (name, value) pairs sorted
# by name. The result follows the order of `exp_configs` instead of set
# iteration order, which changes between kernel sessions because string
# hashing is randomised; this keeps the generated script reproducible.
def config_key(config):
    """Return the (name, value) pairs of `config` sorted by name, as a tuple."""
    return tuple(sorted(config.items()))


# Start from the completed configs; keys are added as they are emitted so a
# configuration that appears twice in `exp_configs` is reported only once.
seen_keys = {config_key(config) for config in existing_configs}
missing_configs = []
for config in exp_configs:
    key = config_key(config)
    if key not in seen_keys:
        seen_keys.add(key)
        missing_configs.append(dict(key))

print(f"{len(missing_configs)} missing configurations")
next(iter(missing_configs), None)


0 missing configurations


In [41]:
def make_command(exp_config):
    """Render the shell command that queues one DVC experiment.

    Args:
        exp_config: Mapping with the keys "params.run", "params.dataset.path",
            "params.qa.temperature", "params.qa.system_prompt",
            "params.qa.user_prompt_template" and "params.qa.few_shot_examples".

    Returns:
        A multi-line string holding a single `dvc exp run --queue` invocation
        with one `-S` override per parameter. The string starts with a newline
        and ends with a newline followed by four spaces.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    run = exp_config["params.run"]
    dataset_path = exp_config["params.dataset.path"]
    temperature = exp_config["params.qa.temperature"]
    system_prompt = exp_config["params.qa.system_prompt"]
    user_prompt_template = exp_config["params.qa.user_prompt_template"]
    few_shot_examples = exp_config["params.qa.few_shot_examples"]
    # Every override addresses its parameter without the "params." prefix used
    # by the config keys; dataset.path must follow the same convention as the
    # qa.* and run overrides.
    # Values are placed inside single quotes as-is, so a value that itself
    # contains a single quote would produce a broken command.
    command = f"""
dvc exp run --queue \\
    -S dataset.path='{dataset_path}' \\
    -S qa.temperature='{temperature}' \\
    -S qa.system_prompt='{system_prompt}' \\
    -S qa.user_prompt_template='{user_prompt_template}' \\
    -S qa.few_shot_examples='{few_shot_examples}' \\
    -S run='{run}'
    """
    return command

In [42]:
# Write one queued-experiment command per missing configuration into a shell script.
script_path = Path("../../tmp/missing_configs.sh")
# open(..., "w") does not create directories; make sure the target folder exists
# so the cell does not fail with FileNotFoundError on a fresh checkout.
script_path.parent.mkdir(parents=True, exist_ok=True)
with open(script_path, "w") as f:
    f.write("#!/bin/sh\n")
    for exp_config in missing_configs:
        f.write(make_command(exp_config))
        f.write("\n")