In [1]:
import json
from pathlib import Path

import pandas as pd

In [2]:
from bellem.dvc.experiment import load_experiments

# Gather every JSON export found in the sweep directory.
# glob() yields entries in a filesystem-dependent order, so the paths are
# sorted to keep `experiments[0]` and the downstream row order reproducible
# across machines and re-runs.
filepaths = sorted(Path("../../tmp/musique-sweep-interweave/").glob("*.json"))

# Each file may hold several experiments; flatten them into a single list.
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")

# Show one record to inspect its structure.
experiments[0]

436 experiments


{'commit': 'workspace',
 'id': 'workspace',
 'name': None,
 'params': {'dataset': {'path': 'bdsaglam/musique-mini',
   'name': 'answerable',
   'split': 'validation'},
  'qa': {'model': 'llama-3-70b-tgi',
   'temperature': 0.7,
   'system_prompt': 'direct/helpful-output-format-few.txt',
   'user_prompt_template': 'cq.txt',
   'few_shot_examples': 'auto/direct.json',
   'n_shot': 0},
  'run': 1},
 'metrics': {'exact_match': 0.55,
  'f1': 0.6719070243482008,
  'fuzzy_match': 0.6233333333333333,
  '2hops': {'exact_match': 0.65,
   'f1': 0.7548031968031969,
   'fuzzy_match': 0.72},
  '3hops': {'exact_match': 0.54,
   'f1': 0.6851904761904762,
   'fuzzy_match': 0.67},
  '4hops': {'exact_match': 0.46,
   'f1': 0.5757274000509295,
   'fuzzy_match': 0.48},
  'gen_token_count': {'all': {'count': 300.0,
    'mean': 5.76,
    'std': 1.939485847117882,
    'min': 3.0,
    '25%': 5.0,
    '50%': 5.0,
    '75%': 7.0,
    'max': 13.0},
   'success': {'count': 198.0,
    'mean': 5.777777777777778,
   

In [3]:
# Flatten the nested experiment records into dotted column names and discard
# the commit identifier column.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep only experiments evaluated on the sweep dataset.
mask = df["params.dataset.path"] == "bdsaglam/musique-sweep"
df = df.loc[mask].copy()

# Fuzzy-match metrics are not used in this analysis.
fuzzy_cols = [col for col in df.columns if "fuzzy" in col]
df = df.drop(columns=fuzzy_cols)

param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

# Drop rows missing any parameter, then rows that repeat an already seen
# combination of parameters and metrics.
df = df.dropna(subset=param_cols, how="any")
df = df.drop_duplicates(subset=param_cols + metric_cols)

print(f"{len(df)} experiments after preprocessing")

212 experiments after preprocessing


In [4]:
def parse_qa_technique(system_prompt_filename):
    """Return the part of a system prompt path that precedes the first "/".

    For example, "direct/format-few.txt" yields "direct". A name without any
    "/" is returned unchanged.
    """
    technique, _, _ = system_prompt_filename.partition("/")
    return technique

# Derive the QA technique of every experiment from its system prompt path.
qa_technique = df["params.qa.system_prompt"].map(parse_qa_technique)
df["params.qa.technique"] = qa_technique

In [5]:
def sorted_tuple(x):
    """Return the items of the iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)


# Run ids a fully evaluated configuration is expected to have. The data only
# contains run ids 1, 2 and 3 — NOTE(review): confirm against the sweep setup.
EXPECTED_RUNS = (1, 2, 3)

# For every QA configuration, collect the sorted tuple of run ids present.
run_counts = (
    df.groupby(
        [
            "params.qa.system_prompt",
            "params.qa.user_prompt_template",
            "params.qa.few_shot_examples",
            "params.qa.n_shot",
            "params.qa.temperature",
        ]
    )["params.run"]
    .aggregate(sorted_tuple)
    .reset_index()
)

# List the configurations that do not have exactly the expected runs.
# Checking only the tuple length would treat e.g. (1, 1, 3) as complete even
# though run 2 is missing, so the run ids themselves are compared.
is_incomplete = run_counts["params.run"].map(lambda runs: runs != EXPECTED_RUNS)
run_counts.loc[is_incomplete]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.n_shot,params.qa.temperature,params.run
0,ccot/excellent-format-thought.txt,cq.txt,auto-interweave/ccot.json,0,0.1,"(2,)"
1,ccot/excellent-format-thought.txt,cq.txt,auto-interweave/ccot.json,0,0.7,"(2,)"
2,ccot/excellent-format-thought.txt,cq.txt,auto-interweave/ccot.json,0,1.5,"(1,)"
3,ccot/excellent-format-thought.txt,cq.txt,auto-interweave/ccot.json,0,2.0,"(3, 3)"
4,ccot/excellent-format-thought.txt,cq.txt,auto-interweave/ccot.json,1,0.1,"(1, 3)"
...,...,...,...,...,...,...
164,direct/format-few.txt,cq.txt,auto-interweave/direct.json,2,2.0,"(1,)"
165,direct/format-few.txt,cq.txt,auto-interweave/direct.json,3,0.7,"(2,)"
166,direct/format-few.txt,cq.txt,auto-interweave/direct.json,3,1.5,"(2,)"
167,direct/format-few.txt,cq.txt,auto-interweave/direct.json,6,1.5,"(3,)"


In [6]:
# Print the distinct values of each column of the run summary.
for col, column_values in run_counts.items():
    print(f"- {col}")
    for value in column_values.unique():
        print(f"\t{value}")

- params.qa.system_prompt
	ccot/excellent-format-thought.txt
	ccot/format-thought.txt
	cot/excellent-format-thought.txt
	cot/format-thought.txt
	cte/excellent-format-triplets-few.txt
	cte/format-triplets-few.txt
	direct/excellent-format-few.txt
	direct/format-few.txt
- params.qa.user_prompt_template
	cq.txt
- params.qa.few_shot_examples
	auto-interweave/ccot.json
	auto-interweave/cot.json
	auto-interweave/cte.json
	auto-interweave/direct.json
- params.qa.n_shot
	0
	1
	2
	3
	6
	9
- params.qa.temperature
	0.1
	0.7
	1.5
	2.0
	0.5
	1.0
- params.run
	(2,)
	(1,)
	(3, 3)
	(1, 3)
	(3,)
	(1, 1, 3)
	(2, 2)
	(1, 2)


In [7]:
# Write the preprocessed experiments as JSON Lines, one record per line.
df.to_json("results.jsonl", lines=True, orient="records")