In [1]:
import json
from pathlib import Path

import pandas as pd

In [2]:
from bellem.dvc.experiment import load_experiments

# Collect every experiment dump in the export directory. The paths are sorted
# because glob yields them in a filesystem-dependent order, and the order of
# `experiments` determines row order (and which duplicate is kept) downstream.
filepaths = sorted(Path("../../tmp/musique-full-manual-new/").glob("*.json"))
# Each file may contribute several experiments; flatten them into a single list.
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")

25 experiments


In [None]:
# Flatten the nested experiment records into one row per experiment and
# discard the "commit" column.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep only runs on the bdsaglam/musique dataset whose F1 exceeds 0.3.
on_musique = df["params.dataset.path"] == "bdsaglam/musique"
above_f1_floor = df["metrics.f1"] > 0.3
mask = on_musique & above_f1_floor
df = df.loc[mask].copy()

# Remove every column whose name mentions "fuzzy".
fuzzy_cols = [col for col in df.columns if "fuzzy" in col]
df = df.drop(columns=fuzzy_cols)

param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

# Rows missing either of the first two metric columns are dropped.
# NOTE(review): this depends on column order — confirm that the first two
# "metrics." columns are the ones intended here.
df = df.dropna(subset=metric_cols[:2], how="any")

# Rows that agree on all parameters and all metrics are collapsed to the first one.
df = df.drop_duplicates(subset=param_cols + metric_cols)

print(f"{len(df)} experiments after preprocessing")
df.head()

25 experiments after preprocessing


Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.fail.std,metrics.gen_token_count.fail.min,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1
0,workspace,,bdsaglam/musique,answerable,train,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,,1104.0,1104.0,1104.0,1104.0,1104.0,,,,
1,8f3d199ad17105047f5d38746174c27adc9349c0,oaken-omer,bdsaglam/musique,answerable,validation,deepseek-r1-llama-8b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,802.290088,79.0,284.0,366.0,503.5,8417.0,0.198684,0.360385,0.269136,0.366734
2,540d342a29e45243d6746d5bbbc569fe2ab288b3,wedgy-taka,bdsaglam/musique,answerable,validation,deepseek-r1-llama-8b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,967.510242,119.0,299.0,412.0,791.0,8185.0,0.335526,0.518713,0.355556,0.460453
3,0bb3103177d4536d6b27efa31f3eae575104c4a9,mopey-axon,bdsaglam/musique,answerable,validation,deepseek-r1-llama-8b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,1855.602503,99.0,303.0,410.0,756.0,8409.0,0.328947,0.507641,0.375309,0.467575
4,68bda76b706fe6d93720c35066ace7e248e0c6e4,myoid-scow,bdsaglam/musique,answerable,validation,deepseek-r1-llama-8b,0.5,direct/format-few.txt,cq.txt,manual/reasoning.json,...,525.705511,87.0,322.0,433.0,605.75,8085.0,0.244737,0.408484,0.281481,0.379865


In [4]:
def parse_qa_technique(row):
    """Derive a short technique label for one experiment row.

    Models whose name contains ``'r1'`` are labelled ``REASONING``. For any
    other model the label is the first path segment of the system prompt,
    with ``".txt"`` removed and upper-cased. When ``params.qa.n_sc`` is
    greater than 1, ``-SC(<n>)`` is appended.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``params.qa.model``, ``params.qa.n_sc`` and, for non-``r1``
            models, ``params.qa.system_prompt``.

    Returns:
        The technique label as a string.
    """
    if 'r1' in row['params.qa.model']:
        label = 'REASONING'
    else:
        # Only the leading directory (or bare file name) of the prompt path matters.
        first_segment, _, _ = row["params.qa.system_prompt"].partition("/")
        label = first_segment.replace(".txt", "").upper()

    n_samples = int(row["params.qa.n_sc"])
    if n_samples > 1:
        return f"{label}-SC({n_samples})"
    return label

# Label every experiment row with its QA technique.
df["params.qa.technique"] = df.apply(parse_qa_technique, axis="columns")

In [5]:
def remove_ext(filename):
    """Return the part of `filename` that precedes its first dot.

    Everything from the first ``"."`` onwards is discarded, so ``"a.tar.gz"``
    becomes ``"a"``. A string without a dot is returned unchanged.
    """
    stem, _, _ = filename.partition(".")
    return stem

def parse_instruction(row):
    """Build a ``system:user:few_shot`` identifier from a row's prompt files.

    Each of the three file names is stripped with ``remove_ext`` and
    lower-cased; the parts are then joined with colons.

    Args:
        row: Mapping-like object providing ``params.qa.system_prompt``,
            ``params.qa.user_prompt_template`` and
            ``params.qa.few_shot_examples`` as strings.

    Returns:
        The colon-separated instruction identifier.
    """
    system_part = remove_ext(row["params.qa.system_prompt"]).lower()
    user_part = remove_ext(row["params.qa.user_prompt_template"]).lower().replace(".txt", "")
    few_shot_part = remove_ext(row["params.qa.few_shot_examples"]).lower()
    return ":".join((system_part, user_part, few_shot_part))

# Tag every experiment row with its combined prompt identifier.
df["params.qa.instruction"] = df.apply(parse_instruction, axis="columns")

In [6]:
# Round the sampling temperature to one decimal place.
# NOTE(review): presumably so that near-equal float values land in the same
# group when grouping by temperature later — confirm.
temperature_col = "params.qa.temperature"
df[temperature_col] = df[temperature_col].round(decimals=1)

In [7]:
# Preview the first five rows of the experiment frame.
df.head(n=5)

Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1,params.qa.technique,params.qa.instruction
0,workspace,,bdsaglam/musique,answerable,train,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,1104.0,1104.0,1104.0,1104.0,,,,,REASONING,empty:icq-format:manual/reasoning
1,8f3d199ad17105047f5d38746174c27adc9349c0,oaken-omer,bdsaglam/musique,answerable,validation,deepseek-r1-llama-8b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,284.0,366.0,503.5,8417.0,0.198684,0.360385,0.269136,0.366734,REASONING,empty:icq-format:manual/reasoning
2,540d342a29e45243d6746d5bbbc569fe2ab288b3,wedgy-taka,bdsaglam/musique,answerable,validation,deepseek-r1-llama-8b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,299.0,412.0,791.0,8185.0,0.335526,0.518713,0.355556,0.460453,REASONING,empty:icq-format:manual/reasoning
3,0bb3103177d4536d6b27efa31f3eae575104c4a9,mopey-axon,bdsaglam/musique,answerable,validation,deepseek-r1-llama-8b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,303.0,410.0,756.0,8409.0,0.328947,0.507641,0.375309,0.467575,REASONING,empty:icq-format:manual/reasoning
4,68bda76b706fe6d93720c35066ace7e248e0c6e4,myoid-scow,bdsaglam/musique,answerable,validation,deepseek-r1-llama-8b,0.5,direct/format-few.txt,cq.txt,manual/reasoning.json,...,322.0,433.0,605.75,8085.0,0.244737,0.408484,0.281481,0.379865,REASONING,direct/format-few:cq:manual/reasoning


In [8]:
def sorted_tuple(x):
    """Return the items of iterable `x` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)


# Columns that together identify one experimental configuration.
config_cols = [
    "params.qa.technique",
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.n_shot",
    "params.qa.n_sc",
    "params.qa.model",
    "params.qa.temperature",
]

# For each configuration, collect its "params.run" values as a sorted tuple.
runs_per_config = df.groupby(config_cols)["params.run"].agg(sorted_tuple)
run_counts = runs_per_config.reset_index()

# To list configurations that do not have exactly three runs:
# run_counts.loc[run_counts["params.run"].map(len) != 3]

In [9]:
# Print each column of run_counts followed by its distinct values, one per line.
for col, column_values in run_counts.items():
    print(f"- {col}")
    for value in column_values.unique():
        print(f"\t{value}")

- params.qa.technique
	REASONING
- params.qa.system_prompt
	direct/format-few.txt
	empty.txt
- params.qa.user_prompt_template
	cq.txt
	icq-format.txt
- params.qa.few_shot_examples
	manual/reasoning.json
- params.qa.n_shot
	0
	1
- params.qa.n_sc
	1
- params.qa.model
	deepseek-r1-llama-8b
	deepseek-r1-llama-70b
- params.qa.temperature
	0.0
	0.5
- params.run
	(1, 2, 3)
	(1,)


In [10]:
# Write the processed experiments as JSON Lines, one record per line.
df.to_json(path_or_buf='exps2.jsonl', lines=True, orient='records')