In [1]:
import json
from pathlib import Path

import pandas as pd

In [2]:
from bellem.dvc.experiment import load_experiments

# Directory (relative to this notebook) containing the exported experiment JSON files.
EXPERIMENTS_DIR = Path("../../tmp/musique-full-manual-new/")

# Glob results come back in filesystem-dependent order. Sorting makes the order
# of `experiments` reproducible, which matters because the later
# drop_duplicates() keeps the first occurrence of each duplicate.
filepaths = sorted(EXPERIMENTS_DIR.glob("*.json"))

# Each file may contain several experiments; flatten them into a single list.
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")

414 experiments


In [3]:
# Flatten the nested experiment records into one row per experiment and
# discard the "commit" column.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep only runs on the bdsaglam/musique dataset whose F1 is above 0.3.
is_musique = df["params.dataset.path"] == "bdsaglam/musique"
above_f1_floor = df["metrics.f1"] > 0.3
mask = is_musique & above_f1_floor
df = df.loc[mask].copy()

# Remove every column whose name mentions "fuzzy".
fuzzy_cols = [col for col in df.columns if "fuzzy" in col]
df = df.drop(columns=fuzzy_cols)

# Column groups are computed after the fuzzy columns are gone.
param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

# Require the first two metric columns to be present, then collapse rows that
# agree on every parameter and metric.
# NOTE(review): metric_cols[:2] depends on column order — confirm which two
# metrics are intended.
df = df.dropna(subset=metric_cols[:2], how="any")
df = df.drop_duplicates(subset=param_cols + metric_cols)

print(f"{len(df)} experiments after preprocessing")
df.head()

392 experiments after preprocessing


Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.success.75%,metrics.gen_token_count.success.max,metrics.gen_token_count.fail.count,metrics.gen_token_count.fail.mean,metrics.gen_token_count.fail.std,metrics.gen_token_count.fail.min,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max
1,89ceb09a69f6dd22d587ad0e1d1161a7f31b166d,coxal-eyas,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,1.0,cok/adapted.txt,cq.txt,manual/cok.json,...,106.0,283.0,673.0,102.307578,30.530511,45.0,83.0,94.0,114.0,239.0
2,569fe899f90c8bdaf47866bbb36fc24c88f4925c,sulfa-lame,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,0.5,cot/format-thought.txt,cq.txt,manual/cot.json,...,91.0,919.0,716.0,94.681564,60.827044,29.0,59.0,80.0,112.25,873.0
3,98498237a46b9b473706d7f773a27e67e30523d9,wacky-ciao,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,0.5,cte/format-triples-ere-few.txt,cq.txt,manual/cte-triples.json,...,55.0,347.0,686.0,52.836735,21.162,18.0,39.0,48.0,61.0,177.0
4,288b4a6bb49a305006dc7f891cc0c52cc0b77d0b,tarry-bani,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,0.5,cte/format-triples-few.txt,cq.txt,manual/cte-triples.json,...,54.0,141.0,643.0,52.905132,26.209948,16.0,36.0,46.0,61.5,228.0
5,e3ed6981ac0d21340a3b5eca1e92696a1e7fdd0a,keyed-brig,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,0.0,cot/format-thought.txt,cq.txt,manual/cot.json,...,75.0,375.0,709.0,92.279267,70.008643,23.0,52.0,75.0,104.0,839.0


In [4]:
def parse_qa_technique(row):
    """Derive the QA technique label for one experiment row.

    Rows whose ``params.qa.model`` contains ``'r1'`` are labelled
    ``REASONING``. Otherwise the label is the upper-cased first path component
    of ``params.qa.system_prompt`` with any ``.txt`` removed. When
    ``params.qa.n_sc`` (presumably the self-consistency sample count) is
    greater than 1, ``-SC(<n>)`` is appended.

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding the
            ``params.qa.*`` fields read above.

    Returns:
        The technique label as a string, e.g. ``"COT"`` or ``"COT-SC(3)"``.
    """
    if 'r1' in row['params.qa.model']:
        label = 'REASONING'
    else:
        # The technique is encoded as the leading directory of the prompt path.
        prompt_dir, *_ = row["params.qa.system_prompt"].split("/")
        label = prompt_dir.replace(".txt", "").upper()

    n_samples = int(row["params.qa.n_sc"])
    if n_samples > 1:
        return f"{label}-SC({n_samples})"
    return label

# Add a derived column holding the technique label computed row by row.
df["params.qa.technique"] = df.apply(parse_qa_technique, axis=1)

In [5]:
def remove_ext(filename):
    """Return ``filename`` without its final extension.

    Only the last suffix of the last path component is stripped, so dots in
    directory names or earlier in the file name are preserved, e.g.
    ``"v1.5/prompt.txt"`` -> ``"v1.5/prompt"``. A name with no extension is
    returned unchanged.

    Args:
        filename: A "/"-separated relative path or bare file name.

    Returns:
        The path with its trailing extension removed.
    """
    import posixpath  # local import keeps this helper self-contained

    # Splitting on the first "." would truncate at any dot in the path;
    # splitext only looks at the final suffix of the last component.
    return posixpath.splitext(filename)[0]

def parse_instruction(row):
    """Build the instruction identifier for one experiment row.

    The identifier joins, with ``":"``, the extension-less, lower-cased values
    of the system prompt, the user prompt template and the few-shot examples
    file, in that order, e.g. ``"cot/format-thought:cq:manual/cot"``.

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding the three
            ``params.qa.*`` file fields.

    Returns:
        The ``"<system>:<user>:<few-shot>"`` identifier string.
    """
    fields = (
        "params.qa.system_prompt",
        "params.qa.user_prompt_template",
        "params.qa.few_shot_examples",
    )
    # All three fields are normalised the same way; remove_ext already strips
    # the extension, so no further ".txt" clean-up is needed.
    return ":".join(remove_ext(row[field]).lower() for field in fields)

# Add a derived column identifying the prompt/few-shot combination of each row.
df["params.qa.instruction"] = df.apply(parse_instruction, axis=1)

In [6]:
# Round the temperature column to one decimal place (presumably so values that
# differ only by floating-point noise fall into the same group — confirm).
df["params.qa.temperature"] = df["params.qa.temperature"].round(1)

In [7]:
# Preview the first rows, now including the derived technique and instruction columns.
df.head()

Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.fail.count,metrics.gen_token_count.fail.mean,metrics.gen_token_count.fail.std,metrics.gen_token_count.fail.min,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max,params.qa.technique,params.qa.instruction
1,89ceb09a69f6dd22d587ad0e1d1161a7f31b166d,coxal-eyas,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,1.0,cok/adapted.txt,cq.txt,manual/cok.json,...,673.0,102.307578,30.530511,45.0,83.0,94.0,114.0,239.0,COK,cok/adapted:cq:manual/cok
2,569fe899f90c8bdaf47866bbb36fc24c88f4925c,sulfa-lame,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,0.5,cot/format-thought.txt,cq.txt,manual/cot.json,...,716.0,94.681564,60.827044,29.0,59.0,80.0,112.25,873.0,COT,cot/format-thought:cq:manual/cot
3,98498237a46b9b473706d7f773a27e67e30523d9,wacky-ciao,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,0.5,cte/format-triples-ere-few.txt,cq.txt,manual/cte-triples.json,...,686.0,52.836735,21.162,18.0,39.0,48.0,61.0,177.0,CTE,cte/format-triples-ere-few:cq:manual/cte-triples
4,288b4a6bb49a305006dc7f891cc0c52cc0b77d0b,tarry-bani,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,0.5,cte/format-triples-few.txt,cq.txt,manual/cte-triples.json,...,643.0,52.905132,26.209948,16.0,36.0,46.0,61.5,228.0,CTE,cte/format-triples-few:cq:manual/cte-triples
5,e3ed6981ac0d21340a3b5eca1e92696a1e7fdd0a,keyed-brig,bdsaglam/musique,answerable,validation,llama-3.3-70b-tgi,0.0,cot/format-thought.txt,cq.txt,manual/cot.json,...,709.0,92.279267,70.008643,23.0,52.0,75.0,104.0,839.0,COT,cot/format-thought:cq:manual/cot


In [11]:
def sorted_tuple(x):
    """Return the elements of iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)


# Columns that together define one configuration group.
group_keys = [
    "params.qa.technique",
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.n_shot",
    "params.qa.n_sc",
    "params.qa.model",
    "params.qa.temperature",
]

# For every configuration, collect the sorted tuple of "params.run" values
# present in df. Despite its name, run_counts holds the run ids themselves
# rather than their count.
run_counts = (
    df.groupby(group_keys)["params.run"]
    .agg(sorted_tuple)
    .reset_index()
)
# To list configurations that do not have exactly three runs:
# run_counts.loc[run_counts["params.run"].map(len) != 3]

In [12]:
# Print each column name of run_counts followed by its distinct values, one
# per tab-indented line.
for col, values in run_counts.items():
    print(f"- {col}")
    for value in values.unique():
        print(f"\t{value}")

- params.qa.technique
	CCOT
	COK
	COT
	CTE
	DIRECT
	REASONING
- params.qa.system_prompt
	ccot/format-thought.txt
	cok/adapted.txt
	cot/format-thought.txt
	cte/format-sro-triples-few.txt
	cte/format-triples-ere-few.txt
	cte/format-triples-few.txt
	direct/format-few.txt
	empty.txt
- params.qa.user_prompt_template
	cq.txt
	icq-format.txt
- params.qa.few_shot_examples
	manual/ccot.json
	manual/cok.json
	manual/cot.json
	manual/cte-triples.json
	manual/direct.json
	semi/direct.json
	empty.json
- params.qa.n_shot
	0
	1
- params.qa.n_sc
	1
- params.qa.model
	llama-3-70b-tgi
	llama-3-8b-tgi
	llama-3.3-70b-tgi
	deepseek-r1-llama-70b-tgi
	deepseek-r1-llama-8b-tgi
	deepseek-r1-qwen-32b-tgi
- params.qa.temperature
	0.0
	0.5
	1.0
- params.run
	(1, 2, 3)
	(1, 2)
	(1,)


In [10]:
# Write the preprocessed frame to exps.jsonl in the current working directory,
# one JSON record per line.
df.to_json('exps.jsonl', orient='records', lines=True)