In [17]:
import json
from pathlib import Path

import pandas as pd

In [18]:
from bellem.dvc.experiment import load_experiments

# Directory that holds the exported experiment JSON files.
EXPERIMENTS_DIR = Path("../../tmp/musique-full-manual-new/")

# Path.glob yields entries in arbitrary, filesystem-dependent order. Sort the
# paths so the experiments are loaded in the same order on every run.
filepaths = sorted(EXPERIMENTS_DIR.glob("*.json"))
# Each file may contribute several experiments; flatten them into one list.
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")

6 experiments


In [19]:
# Flatten the nested experiment records into dotted column names and discard
# the "commit" column.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep only runs on the bdsaglam/musique dataset whose F1 is above 0.3.
is_musique = df["params.dataset.path"] == "bdsaglam/musique"
above_f1_floor = df["metrics.f1"] > 0.3
mask = is_musique & above_f1_floor
df = df.loc[mask].copy()

# Remove every column whose name mentions "fuzzy".
df = df.drop(columns=[name for name in df.columns if "fuzzy" in name])

param_cols = [name for name in df.columns if name.startswith("params.")]
metric_cols = [name for name in df.columns if name.startswith("metrics.")]

# Require the first two metric columns (in column order) to be present, then
# collapse rows that agree on every parameter and every metric.
df = df.dropna(subset=metric_cols[:2], how="any")
df = df.drop_duplicates(subset=param_cols + metric_cols)

print(f"{len(df)} experiments after preprocessing")
df.head()

6 experiments after preprocessing


Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.fail.std,metrics.gen_token_count.fail.min,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1
0,workspace,,bdsaglam/musique,answerable,train,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,,1104.0,1104.0,1104.0,1104.0,1104.0,,,,
1,5710ac8782bcfa94a2c7abfa75e9ef773b3abb1f,radio-afro,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,360.693117,101.0,300.5,380.0,537.0,3197.0,0.577632,0.71617,0.506173,0.609715
2,800c2520ee4057b5f00a1e6e1987d83353fd3e59,folio-cats,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,336.816364,108.0,295.25,369.5,504.0,3229.0,0.575,0.709096,0.508642,0.608247
3,d9f27bcf3c7dac385524dbd852e5b5cb6e2305a0,tangy-bomb,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,585.11711,118.0,281.75,369.5,745.0,3730.0,0.539474,0.691333,0.491358,0.584486
4,aeccce6779ce04f933a1385564ff3b6bf7ca88cb,coxal-jato,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,542.787816,93.0,282.0,376.0,675.75,3299.0,0.531579,0.686438,0.503704,0.599476


In [20]:
def parse_qa_technique(row):
    """Build the technique label for one experiment row.

    Rows whose ``params.qa.model`` contains ``'r1'`` are labelled
    ``REASONING``. Otherwise the label is the first ``/``-separated segment of
    ``params.qa.system_prompt`` with ``.txt`` removed, upper-cased.
    ``-SC(n)`` is appended when ``params.qa.n_sc`` (converted to int) is
    greater than 1.
    """
    if 'r1' in row['params.qa.model']:
        label = 'REASONING'
    else:
        prompt_root = row["params.qa.system_prompt"].split("/")[0]
        label = prompt_root.replace(".txt", "").upper()

    n_samples = int(row["params.qa.n_sc"])
    if n_samples > 1:
        return f"{label}-SC({n_samples})"
    return label

# Label every experiment row with its technique string (computed row by row).
df["params.qa.technique"] = df.apply(parse_qa_technique, axis=1)

In [21]:
def remove_ext(filename):
    """Strip the extension(s) from the last ``/``-separated component of a path.

    Everything from the first ``.`` of the final component onward is dropped,
    so ``"a/b.tar.gz"`` becomes ``"a/b"``. Only the final component is
    examined: dots in directory names (``"v1.0/x.txt"``) or relative prefixes
    (``"../x.txt"``) are left intact instead of truncating the whole path.
    """
    parent, sep, basename = filename.rpartition("/")
    return parent + sep + basename.split(".", 1)[0]

def parse_instruction(row):
    """Combine a row's prompt settings into one ``system:user:few-shot`` key.

    Each of ``params.qa.system_prompt``, ``params.qa.user_prompt_template``
    and ``params.qa.few_shot_examples`` has its extension removed and is
    lower-cased before the three parts are joined with ``:``.
    """
    sp = remove_ext(row["params.qa.system_prompt"]).lower()
    # remove_ext already drops the ".txt" suffix, so no extra replace is needed.
    up = remove_ext(row["params.qa.user_prompt_template"]).lower()
    fs = remove_ext(row["params.qa.few_shot_examples"]).lower()
    return f"{sp}:{up}:{fs}"

# Derive a combined "system:user:few-shot" identifier for each row's prompt setup.
df["params.qa.instruction"] = df.apply(parse_instruction, axis=1)

In [22]:
# Round temperature to one decimal place. Presumably this absorbs float noise
# so equal settings compare equal when temperature is used as a grouping key
# further down -- TODO confirm.
df["params.qa.temperature"] = df["params.qa.temperature"].round(1)

In [23]:
# Preview the first rows, now including the derived technique and instruction columns.
df.head()

Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1,params.qa.technique,params.qa.instruction
0,workspace,,bdsaglam/musique,answerable,train,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,1104.0,1104.0,1104.0,1104.0,,,,,REASONING,empty:icq-format:manual/reasoning
1,5710ac8782bcfa94a2c7abfa75e9ef773b3abb1f,radio-afro,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,300.5,380.0,537.0,3197.0,0.577632,0.71617,0.506173,0.609715,REASONING,empty:icq-format:manual/reasoning
2,800c2520ee4057b5f00a1e6e1987d83353fd3e59,folio-cats,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.0,empty.txt,icq-format.txt,manual/reasoning.json,...,295.25,369.5,504.0,3229.0,0.575,0.709096,0.508642,0.608247,REASONING,empty:icq-format:manual/reasoning
3,d9f27bcf3c7dac385524dbd852e5b5cb6e2305a0,tangy-bomb,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,281.75,369.5,745.0,3730.0,0.539474,0.691333,0.491358,0.584486,REASONING,empty:icq-format:manual/reasoning
4,aeccce6779ce04f933a1385564ff3b6bf7ca88cb,coxal-jato,bdsaglam/musique,answerable,validation,deepseek-r1-llama-70b,0.5,empty.txt,icq-format.txt,manual/reasoning.json,...,282.0,376.0,675.75,3299.0,0.531579,0.686438,0.503704,0.599476,REASONING,empty:icq-format:manual/reasoning


In [24]:
def sorted_tuple(x):
    """Return the items of iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)


# Columns that together identify one experiment configuration.
group_keys = [
    "params.qa.technique",
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.n_shot",
    "params.qa.n_sc",
    "params.qa.model",
    "params.qa.temperature",
]

# For each configuration, collect the sorted tuple of run ids that exist.
runs_per_config = df.groupby(group_keys)["params.run"].agg(sorted_tuple)
run_counts = runs_per_config.reset_index()

# To list configurations that do not have exactly three runs:
# run_counts.loc[run_counts["params.run"].map(len) != 3]

In [25]:
# Print each column name followed by its distinct values, one per indented line.
for column_name, column in run_counts.items():
    print(f"- {column_name}")
    for distinct in column.unique():
        print(f"\t{distinct}")

- params.qa.technique
	REASONING
- params.qa.system_prompt
	empty.txt
- params.qa.user_prompt_template
	icq-format.txt
- params.qa.few_shot_examples
	manual/reasoning.json
- params.qa.n_shot
	0
	1
- params.qa.n_sc
	1
- params.qa.model
	deepseek-r1-llama-70b
- params.qa.temperature
	0.0
	0.5
- params.run
	(1,)
	(1, 3)
	(3,)
	(1, 2)


In [26]:
# Write the processed experiments as JSON Lines: one row (record) per line.
df.to_json('exps2.jsonl', orient='records', lines=True)