In [11]:
import json
from pathlib import Path

import pandas as pd

In [12]:
from bellem.dvc.experiment import load_experiments

# Collect every experiment stored in the JSON dumps of this directory.
# `Path.glob` yields entries in an arbitrary, filesystem-dependent order, so the
# paths are sorted to keep the row order (and therefore the row kept by the
# later de-duplication step) stable across machines and re-runs.
filepaths = sorted(Path("../../tmp/musique-full-manual-new/").glob("*.json"))
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")

6 experiments


In [13]:
# Flatten the nested experiment records into dotted columns; "commit" is not needed.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep only rows evaluated on the MuSiQue dataset whose F1 is above 0.3
# (rows with a missing F1 compare as False and are dropped as well).
is_musique = df["params.dataset.path"] == "bdsaglam/musique"
has_min_f1 = df["metrics.f1"] > 0.3
mask = is_musique & has_min_f1

df = df.loc[mask].copy()
fuzzy_cols = [col for col in df.columns if "fuzzy" in col]
df = df.drop(columns=fuzzy_cols)

param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]
# NOTE(review): `metric_cols[:2]` relies on column order — confirm which two
# metrics are meant to be mandatory.
df = df.dropna(subset=metric_cols[:2], how="any")
# Rows that agree on every parameter and every metric are treated as duplicates.
df = df.drop_duplicates(subset=param_cols + metric_cols)

print(f"{len(df)} experiments after preprocessing")
df.head()

6 experiments after preprocessing


Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.fail.std,metrics.gen_token_count.fail.min,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1
0,workspace,,bdsaglam/musique,answerable,train,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,,1104.0,1104.0,1104.0,1104.0,1104.0,,,,
1,f1202558381b373e3887f10839602f88f38b50b4,bawdy-orcs,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,direct/format-few.txt,cq.txt,manual/reasoning.json,...,627.165602,120.0,294.0,423.0,759.75,3723.0,0.568421,0.703708,0.51358,0.616449
2,d9f75e1069557bcf91ffbb0c386aba19ec0a22dc,flown-yogi,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,859.926068,167.0,297.75,369.0,529.5,8349.0,0.588158,0.720796,0.501235,0.596748
3,d5ba2d737fccc1aa1d1beb9940d4547f90aa5de2,woods-azan,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,477.133183,83.0,299.25,378.0,530.0,7973.0,0.581579,0.71714,0.496296,0.593055
4,6feea8e6705dcf469acf9669e9e421f4a39947b4,keyed-bade,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,611.400695,122.0,284.25,391.0,753.25,8057.0,0.542105,0.699619,0.469136,0.567724


In [14]:
def parse_qa_technique(row):
    """Derive a short technique label for one experiment row.

    The label is 'REASONING' when the model name contains 'r1'; otherwise it is
    the first path component of the system prompt, with '.txt' removed and
    upper-cased. When `params.qa.n_sc` is greater than 1, a '-SC(<n>)' suffix
    is appended.

    Args:
        row: Mapping-like row exposing 'params.qa.model',
            'params.qa.system_prompt' and 'params.qa.n_sc'.

    Returns:
        The technique label as a string.
    """
    if 'r1' in row['params.qa.model']:
        label = 'REASONING'
    else:
        prompt_head = row["params.qa.system_prompt"].split("/")[0]
        label = prompt_head.replace(".txt", "").upper()

    n_samples = int(row["params.qa.n_sc"])
    if n_samples > 1:
        return f"{label}-SC({n_samples})"
    return label

# Label every experiment with its QA technique, computed row by row.
technique_labels = df.apply(parse_qa_technique, axis="columns")
df["params.qa.technique"] = technique_labels

In [15]:
def remove_ext(filename):
    """Return `filename` without its final extension.

    Only the extension of the last path component is stripped, so dots that
    appear earlier in the path (e.g. "../prompts/cq.txt" or "v1.5/cq.txt") are
    preserved instead of truncating the whole name at the first dot.

    Args:
        filename: A "/"-separated file name or relative path.

    Returns:
        The path with the trailing ".ext" removed; unchanged when the last
        component has no extension (a leading dot is not an extension).
    """
    # Local import keeps the cell self-contained; posixpath is used because the
    # prompt identifiers are "/"-separated regardless of the host platform.
    import posixpath

    root, _ext = posixpath.splitext(filename)
    return root

def parse_instruction(row):
    """Build a "<system prompt>:<user prompt>:<few-shot file>" identifier.

    Each of the three prompt settings of `row` is passed through `remove_ext`
    and lower-cased before the parts are joined with ':'.

    Args:
        row: Mapping-like row exposing 'params.qa.system_prompt',
            'params.qa.user_prompt_template' and 'params.qa.few_shot_examples'.

    Returns:
        The colon-separated instruction identifier as a string.
    """
    system_part = remove_ext(row["params.qa.system_prompt"]).lower()
    user_part = remove_ext(row["params.qa.user_prompt_template"]).lower().replace(".txt", "")
    few_shot_part = remove_ext(row["params.qa.few_shot_examples"]).lower()
    return ":".join([system_part, user_part, few_shot_part])

# Attach the combined prompt identifier to every experiment, computed row by row.
instruction_ids = df.apply(parse_instruction, axis="columns")
df["params.qa.instruction"] = instruction_ids

In [16]:
# Round the sampling temperature to one decimal place.
temperature = df["params.qa.temperature"]
df["params.qa.temperature"] = temperature.round(decimals=1)

In [17]:
# Preview the first rows, now including the derived
# `params.qa.technique` and `params.qa.instruction` columns.
df.head()

Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1,params.qa.technique,params.qa.instruction
0,workspace,,bdsaglam/musique,answerable,train,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,1104.0,1104.0,1104.0,1104.0,,,,,REASONING,empty:icq-format:manual/reasoning
1,f1202558381b373e3887f10839602f88f38b50b4,bawdy-orcs,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,direct/format-few.txt,cq.txt,manual/reasoning.json,...,294.0,423.0,759.75,3723.0,0.568421,0.703708,0.51358,0.616449,REASONING,direct/format-few:cq:manual/reasoning
2,d9f75e1069557bcf91ffbb0c386aba19ec0a22dc,flown-yogi,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,297.75,369.0,529.5,8349.0,0.588158,0.720796,0.501235,0.596748,REASONING,empty:icq-format:manual/reasoning
3,d5ba2d737fccc1aa1d1beb9940d4547f90aa5de2,woods-azan,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,299.25,378.0,530.0,7973.0,0.581579,0.71714,0.496296,0.593055,REASONING,empty:icq-format:manual/reasoning
4,6feea8e6705dcf469acf9669e9e421f4a39947b4,keyed-bade,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,284.25,391.0,753.25,8057.0,0.542105,0.699619,0.469136,0.567724,REASONING,empty:icq-format:manual/reasoning


In [18]:
def sorted_tuple(x):
    """Return the items of the iterable `x` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)


# Columns that together identify one experiment configuration.
config_cols = [
    "params.qa.technique",
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.n_shot",
    "params.qa.n_sc",
    "params.qa.model",
    "params.qa.temperature",
]

# For each configuration, gather its `params.run` values into a sorted tuple.
runs_per_config = df.groupby(config_cols)["params.run"].agg(sorted_tuple)
run_counts = runs_per_config.reset_index()
# run_counts.loc[run_counts["params.run"].map(len) != 3]

In [19]:
# Print, for every column of `run_counts`, the distinct values it takes.
for col, values in run_counts.items():
    print(f"- {col}")
    for value in values.unique():
        print(f"\t{value}")

- params.qa.technique
	REASONING
- params.qa.system_prompt
	direct/format-few.txt
	empty.txt
- params.qa.user_prompt_template
	cq.txt
	icq-format.txt
- params.qa.few_shot_examples
	manual/reasoning.json
- params.qa.n_shot
	0
	1
- params.qa.n_sc
	1
- params.qa.model
	deepseek-r1-llama-70b
- params.qa.temperature
	0.5
	0.0
- params.run
	(3,)
	(2,)
	(1,)


In [20]:
# Write the processed experiments as JSON Lines (one record per line) to
# 'exps2.jsonl', a path relative to the current working directory.
df.to_json('exps2.jsonl', orient='records', lines=True)