In [76]:
import itertools
import json
from pathlib import Path

import pandas as pd

from bellem.dvc.experiment import load_experiments

In [77]:
def sorted_tuple(x):
    """Return the items of iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)

In [78]:
# Collect the experiment exports. The paths are sorted because directory
# globbing yields entries in arbitrary order, and the row order matters later:
# `drop_duplicates(keep='last')` keeps whichever duplicate was loaded last.
filepaths = sorted(Path("../../tmp/drop-mini-validation").glob("*.json"))
# presumably load_experiments yields the experiment records stored in one file
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")
# Show the first record, or None when nothing was loaded.
next(iter(experiments), None)

0 experiments


In [79]:
# Flatten the experiment records into a table, then discard every column whose
# name mentions "fuzzy".
df = pd.json_normalize(experiments)
fuzzy_cols = [name for name in df.columns if "fuzzy" in name]
df.drop(columns=fuzzy_cols, inplace=True)
print(f"{len(df)} experiments before preprocessing")
df.head()

0 experiments before preprocessing


In [80]:
# Experiments recorded without a top_p setting are treated as top_p = 0.999,
# whether the whole column is absent or only individual rows are missing (NaN).
if 'params.qa.top_p' not in df.columns:
    df['params.qa.top_p'] = 0.999

# Any value above 0.99 is collapsed to 0.999 as well. NaN has to be tested
# explicitly because `NaN > 0.99` is False and would otherwise leave the row
# unmatched against every target configuration.
df['params.qa.top_p'] = df['params.qa.top_p'].map(
    lambda x: 0.999 if pd.isna(x) or x > 0.99 else x
)

In [81]:
# Split the columns by prefix: experiment parameters versus reported metrics.
param_cols = [name for name in df.columns if name.startswith("params.")]
metric_cols = [name for name in df.columns if name.startswith("metrics.")]

In [82]:
# Discard experiments with a missing value in either of the first two metric columns.
required_metrics = metric_cols[:2]
df.dropna(subset=required_metrics, how="any", inplace=True)
print(f"{len(df)} experiments after dropping nan")

0 experiments after dropping nan


In [83]:
# Rows count as duplicates when all parameters and the first two metrics agree;
# the last occurrence is the one that is kept.
dedup_cols = [*param_cols, *metric_cols[:2]]
df.drop_duplicates(subset=dedup_cols, keep='last', inplace=True)
print(f"{len(df)} experiments after dropping duplicates")

0 experiments after dropping duplicates


In [84]:
# Preview the first rows of the table after the filtering steps above.
df.head()

Unnamed: 0,params.qa.top_p


In [85]:
def parse_qa_technique(row):
    """Build a short technique label for one experiment row.

    The label is 'REASONING' when the model name contains 'r1'. Otherwise it
    is the part of the system prompt path before the first '/', with any
    '.txt' removed and upper-cased. A '-SC(n)' suffix is appended when
    ``params.qa.n_sc`` is greater than 1.
    """
    if 'r1' in row['params.qa.model']:
        label = 'REASONING'
    else:
        prompt_family, _, _ = row["params.qa.system_prompt"].partition("/")
        label = prompt_family.replace(".txt", "").upper()

    n_samples = int(row["params.qa.n_sc"])
    if n_samples > 1:
        label += f"-SC({n_samples})"
    return label

# Guarded like the other derived-column cells: with zero rows,
# `df.apply(..., axis=1)` gives back a frame rather than a Series of labels,
# and assigning that to a single column fails once the frame has several columns.
if len(df):
    df["params.qa.technique"] = df.apply(parse_qa_technique, axis=1)
else:
    # Still create the (empty) column so the table layout does not depend on
    # whether any experiments were loaded.
    df["params.qa.technique"] = []

In [86]:
def remove_ext(filename):
    """Return the part of ``filename`` before its first '.' (all of it if there is none)."""
    stem, _, _ = filename.partition(".")
    return stem

def parse_instruction(row):
    """Build the 'system:user:few-shot' instruction key for one experiment row.

    Each part is the corresponding file name passed through ``remove_ext``
    and lower-cased.
    """
    system_prompt = remove_ext(row["params.qa.system_prompt"]).lower()
    user_prompt = remove_ext(row["params.qa.user_prompt_template"]).lower().replace(".txt", "")
    few_shot = remove_ext(row["params.qa.few_shot_examples"]).lower()
    return ":".join((system_prompt, user_prompt, few_shot))

# Only derive the instruction key when there is at least one row to process.
if len(df) > 0:
    instruction_keys = df.apply(parse_instruction, axis=1)
    df["params.qa.instruction"] = instruction_keys

In [87]:
# Round temperature to one decimal place (skipped when there are no rows).
if len(df) > 0:
    rounded_temperature = df["params.qa.temperature"].round(1)
    df["params.qa.temperature"] = rounded_temperature

In [88]:
# List the distinct values seen for every parameter column, one bullet per
# column followed by a blank line.
for col in param_cols:
    values = list(df[col].unique())
    print(f"- {col}: {values}\n")

- params.qa.top_p: []



In [89]:
# Write the preprocessed table to 'exps.jsonl' in JSON Lines form (one record per line).
df.to_json('exps.jsonl', orient='records', lines=True)

## Setup remaining experiments

In [90]:
def produce_experiment_configs(common_params, varying_params):
    """Yield one config dict per combination of the candidate parameter values.

    Both arguments map a parameter name to a list of candidate values.
    ``varying_params`` overrides ``common_params`` on a shared key. Every
    yielded dict maps each parameter name to one chosen value.
    """
    grid = dict(common_params)
    grid.update(varying_params)
    names = list(grid)
    # Cartesian product over the candidate lists, in key order.
    for combo in itertools.product(*grid.values()):
        yield dict(zip(names, combo))

In [91]:
def produce_all_experiment_configs(common_params: dict, varying_params_list: list[dict]):
    """Yield the configs of every parameter grid in ``varying_params_list``, in order.

    Each grid is combined with ``common_params`` by ``produce_experiment_configs``.
    """
    for grid in varying_params_list:
        yield from produce_experiment_configs(common_params, grid)

In [92]:
# Parameters shared by every experiment grid; each key maps to the list of
# candidate values to sweep over.
common_params = {
    "params.dataset.path": ["bdsaglam/drop-musique-mini"],
    "params.dataset.name": ["default"],
    "params.dataset.split": ["validation"],
    "params.qa.n_sc": [1, 5],
    "params.qa.n_shot": [0, 1, 3, 5],
    "params.run": [1, 2, 3],
}

In [93]:
# One grid per (prompt family, sampling setting) pair, all for the same model:
# the direct and ccot prompts, each at temperature 0.0 / top_p 0.999 and at
# temperature 0.5 / top_p 0.95.
varying_params_list = [
    {
        "params.qa.model": ["llama-3.3-70b"],
        "params.qa.temperature": [temperature],
        "params.qa.top_p": [top_p],
        "params.qa.system_prompt": [system_prompt],
        "params.qa.user_prompt_template": ["cq.txt"],
        "params.qa.few_shot_examples": [few_shot_examples],
    }
    for system_prompt, few_shot_examples in (
        ("direct/format-few-drop.txt", "drop/direct.json"),
        ("ccot/format-thought-drop.txt", "drop/ccot.json"),
    )
    for temperature, top_p in ((0.0, 0.999), (0.5, 0.95))
]
# Currently disabled: a "deepseek-r1-llama-70b" grid with temperature 0.5,
# top_p 0.95, system prompt "empty.txt", user prompt template
# "icq-format-drop.txt" and few-shot examples "drop/reasoning.json".

In [94]:
# Expand every parameter grid into concrete experiment configurations.
exp_configs = [*produce_all_experiment_configs(common_params, varying_params_list)]
# The keys of the first configuration define the parameter list used downstream.
target_params = [*exp_configs[0]]
print(f"{len(exp_configs)} experiment configurations")
print(target_params)

96 experiment configurations
['params.dataset.path', 'params.dataset.name', 'params.dataset.split', 'params.qa.n_sc', 'params.qa.n_shot', 'params.run', 'params.qa.model', 'params.qa.temperature', 'params.qa.top_p', 'params.qa.system_prompt', 'params.qa.user_prompt_template', 'params.qa.few_shot_examples']


In [95]:
# Already-run experiments, reduced to the target parameters, as one dict per
# row; an empty list when no experiments were loaded.
existing_configs = df[target_params].to_dict(orient="records") if len(df) else []

print("Existing exps:", len(existing_configs))

Existing exps: 0


In [96]:
# Sanity check: every existing configuration must have as many keys as every
# target configuration. Offending pairs are collected so that the assertion
# message shows them.
pairs = [
    (sorted(exp_config.items()), sorted(existing_config.items()))
    for exp_config in exp_configs
    for existing_config in existing_configs
    if len(exp_config) != len(existing_config)
]
assert len(pairs) == 0, pairs

In [97]:
# find the missing configurations
# Each configuration is keyed by its sorted (name, value) pairs so it can be
# hashed and compared regardless of key order.
existing_keys = {tuple(sorted(config.items())) for config in existing_configs}
# dict.fromkeys de-duplicates while keeping the generation order. A plain set
# difference would hand the configurations back in a hash-dependent order,
# reshuffling the generated run script from one session to the next.
target_keys = dict.fromkeys(tuple(sorted(config.items())) for config in exp_configs)
missing_configs = [dict(key) for key in target_keys if key not in existing_keys]
print(f"{len(missing_configs)} missing configurations")

96 missing configurations


In [98]:
def make_command(exp_config, param_names=None):
    """Render one experiment configuration as a queued `dvc exp run` command.

    Args:
        exp_config: Mapping from full parameter name (e.g. "params.qa.model")
            to the value to set.
        param_names: Parameter names to emit, in order. Defaults to the
            notebook-level ``target_params`` list, which keeps existing
            one-argument calls working unchanged.

    Returns:
        The command as a single string, one ``-S name='value'`` override per
        line, joined with shell line continuations.

    Raises:
        KeyError: If ``exp_config`` lacks one of the requested parameters.
    """
    if param_names is None:
        # Fall back to the global only when the caller did not pass the names.
        param_names = target_params

    lines = ["dvc exp run --queue"]
    for target_param in param_names:
        # Drop the leading "params." segment: "params.qa.model" -> "qa.model".
        arg_name = target_param.split(".", 1)[-1]
        arg_value = exp_config[target_param]
        # String values containing '[' get an extra pair of double quotes
        # inside the single-quoted override.
        if isinstance(arg_value, str) and '[' in arg_value:
            arg_value = f'"{arg_value}"'
        lines.append(f"-S {arg_name}='{arg_value}'")

    return " \\\n    ".join(lines)

In [99]:
# Write a shell script holding one queued command per missing configuration,
# with a blank line after each command.
with open("run.sh", "w") as script:
    script.write("#!/bin/sh\n\n")
    script.writelines(f"{make_command(config)}\n\n" for config in missing_configs)

## Inspect

In [100]:
# Display the preprocessed experiments table.
df

Unnamed: 0,params.qa.top_p,params.qa.technique
