In [1]:
import json
from pathlib import Path

import pandas as pd

In [2]:
from bellem.dvc.experiment import load_experiments

# Collect every exported experiment file. `Path.glob` yields entries in
# whatever order the filesystem lists them, so the paths are sorted to make
# the order of `experiments` (and of every frame built from it) reproducible.
filepaths = sorted(Path("../../tmp/musique-full-manual-new/").glob("*.json"))
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")

414 experiments


In [None]:
# Flatten the nested experiment records into one row per experiment; the
# "commit" column is discarded.
flat = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep only runs on the bdsaglam/musique dataset whose F1 is above 0.3.
on_musique = flat["params.dataset.path"] == "bdsaglam/musique"
f1_above_floor = flat["metrics.f1"] > 0.3
mask = on_musique & f1_above_floor

# Remove every column whose name mentions "fuzzy".
fuzzy_cols = [name for name in flat.columns if "fuzzy" in name]
df = flat.loc[mask].drop(columns=fuzzy_cols)

param_cols = [name for name in df.columns if name.startswith("params.")]
metric_cols = [name for name in df.columns if name.startswith("metrics.")]

# Rows must have values in the first two metric columns; rows that agree on
# every parameter and every metric are then collapsed into one. The trailing
# copy leaves `df` as an independent frame for later column assignment.
df = (
    df.dropna(subset=metric_cols[:2], how="any")
    .drop_duplicates(subset=param_cols + metric_cols)
    .copy()
)

print(f"{len(df)} experiments after preprocessing")

392 experiments after preprocessing


In [4]:
def parse_qa_technique(row):
    """Build a technique label from a row's QA prompt and self-consistency count.

    The label is the upper-cased text that precedes the first "/" in
    ``row["params.qa.system_prompt"]``. When ``row["params.qa.n_sc"]``
    (converted with ``int``) is greater than 1, ``-SC(<n_sc>)`` is appended.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            ``params.qa.system_prompt`` and ``params.qa.n_sc`` entries.

    Returns:
        The technique label as a string, e.g. ``"COT"`` or ``"COT-SC(3)"``.
    """
    # NOTE(review): a prompt without "/" (such as "empty.txt") is used whole,
    # so its label keeps the file extension -- confirm this is intended.
    prompt_family, _, _ = row["params.qa.system_prompt"].partition("/")
    label = prompt_family.upper()
    n_samples = int(row["params.qa.n_sc"])
    if n_samples > 1:
        label += f"-SC({n_samples})"
    return label

# Derive one technique label per experiment by running parse_qa_technique on
# every row (axis=1) and store it in a new "params.qa.technique" column.
df["params.qa.technique"] = df.apply(parse_qa_technique, axis=1)

In [5]:
def sorted_tuple(x):
    """Return the items of the iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)


# QA settings used as the grouping key: rows sharing all of these values are
# treated as one configuration.
qa_config_cols = [
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.n_shot",
    "params.qa.n_sc",
    "params.qa.temperature",
]

# For each configuration, gather the sorted tuple of its "params.run" values.
run_counts = (
    df.groupby(qa_config_cols)["params.run"]
    .agg(sorted_tuple)
    .reset_index()
)

# Display the configurations that do not have exactly three runs.
has_unexpected_run_count = run_counts["params.run"].map(len).ne(3)
run_counts.loc[has_unexpected_run_count]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.n_shot,params.qa.n_sc,params.qa.temperature,params.run
0,ccot/format-thought.txt,cq.txt,manual/ccot.json,0,1,0.0,"(1, 1, 1, 2, 2, 2, 3, 3, 3)"
1,ccot/format-thought.txt,cq.txt,manual/ccot.json,0,1,0.5,"(1, 1, 1, 2, 2, 2, 3, 3, 3)"
2,ccot/format-thought.txt,cq.txt,manual/ccot.json,0,1,1.0,"(1, 1, 1, 2, 2, 2, 3, 3, 3)"
3,ccot/format-thought.txt,cq.txt,manual/ccot.json,1,1,0.0,"(1, 1, 1, 2, 2, 2, 3, 3, 3)"
4,ccot/format-thought.txt,cq.txt,manual/ccot.json,1,1,0.5,"(1, 1, 1, 2, 2, 2, 3, 3, 3)"
5,ccot/format-thought.txt,cq.txt,manual/ccot.json,1,1,1.0,"(1, 1, 1, 2, 2, 2, 3, 3, 3)"
6,cok/adapted.txt,cq.txt,manual/cok.json,0,1,0.0,"(1, 1, 1, 2, 2, 2, 3, 3)"
7,cok/adapted.txt,cq.txt,manual/cok.json,0,1,0.5,"(1, 1, 1, 2, 2, 2, 3, 3)"
8,cok/adapted.txt,cq.txt,manual/cok.json,0,1,1.0,"(1, 1, 2, 2, 3, 3)"
9,cok/adapted.txt,cq.txt,manual/cok.json,1,1,0.0,"(1, 1, 1, 2, 2, 2, 3, 3)"


In [6]:
# List the distinct values found in each column of `run_counts`, one
# tab-indented value per line under the column's name.
for col, column_values in run_counts.items():
    print(f"- {col}")
    for value in column_values.unique():
        print(f"\t{value}")

- params.qa.system_prompt
	ccot/format-thought.txt
	cok/adapted.txt
	cot/format-thought.txt
	cte/format-sro-triples-few.txt
	cte/format-triples-ere-few.txt
	cte/format-triples-few.txt
	direct/format-few.txt
	empty.txt
- params.qa.user_prompt_template
	cq.txt
	icq-format.txt
- params.qa.few_shot_examples
	manual/ccot.json
	manual/cok.json
	manual/cot.json
	manual/cte-triples.json
	empty.json
	manual/direct.json
	semi/direct.json
- params.qa.n_shot
	0
	1
- params.qa.n_sc
	1
- params.qa.temperature
	0.0
	0.5
	1.0
- params.run
	(1, 1, 1, 2, 2, 2, 3, 3, 3)
	(1, 1, 1, 2, 2, 2, 3, 3)
	(1, 1, 2, 2, 3, 3)
	(1, 1, 1, 2, 2, 3, 3)
	(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3)
	(1,)


In [7]:
# Write the preprocessed experiments to disk as JSON Lines: one record per
# row, one row per line.
df.to_json(
    "exps.jsonl",
    orient="records",
    lines=True,
)