In [2]:
import json
from pathlib import Path

import pandas as pd

In [3]:
from mhqa.dvc import load_experiments

# Sort the matched paths: Path.glob yields entries in a filesystem-dependent
# order, so sorting keeps experiments[0] and the downstream row order stable
# across machines and re-runs.
filepaths = sorted(Path("../../tmp/musique-sweep/").glob("*.json"))
# Each file may hold several experiment records; flatten them into one list.
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")
# Show the first record as a sample of the nested structure.
experiments[0]

2 experiments


{'commit': 'workspace',
 'id': 'workspace',
 'name': None,
 'params': {'train': {'dataset': {'path': 'bdsaglam/musique-sweep',
    'name': 'answerable',
    'split': 'train'},
   'optimizer': 'bfsrs-medium',
   'ensemble': 'no'},
  'evaluation': {'dataset': {'path': 'bdsaglam/musique-sweep',
    'name': 'answerable',
    'split': 'validation'}},
  'qa': {'model': 'llama-3.3-70b-tgi',
   'temperature': 0.1,
   'technique': 'agent-simple'},
  'run': 1},
 'metrics': {'exact_match': 0.6123293338849813,
  'f1': 0.7363328198642227,
  'fuzzy_match': 0.6921803889118743,
  '2hops': {'exact_match': 0.639776357827476,
   'f1': 0.7612019783756603,
   'fuzzy_match': 0.7044728434504792},
  '3hops': {'exact_match': 0.6171052631578947,
   'f1': 0.7644833546727637,
   'fuzzy_match': 0.7394736842105263},
  '4hops': {'exact_match': 0.5185185185185185,
   'f1': 0.6066276521832077,
   'fuzzy_match': 0.5654320987654321}}}

In [4]:
# Make sure every record carries a concrete "ensemble" training parameter:
# a missing or null value is replaced with the explicit string "no".
for experiment in experiments:
    train_cfg = experiment["params"]["train"]
    current_ensemble = train_cfg.get("ensemble")
    train_cfg["ensemble"] = "no" if current_ensemble is None else current_ensemble

experiments[0]

{'commit': 'workspace',
 'id': 'workspace',
 'name': None,
 'params': {'train': {'dataset': {'path': 'bdsaglam/musique-sweep',
    'name': 'answerable',
    'split': 'train'},
   'optimizer': 'bfsrs-medium',
   'ensemble': 'no'},
  'evaluation': {'dataset': {'path': 'bdsaglam/musique-sweep',
    'name': 'answerable',
    'split': 'validation'}},
  'qa': {'model': 'llama-3.3-70b-tgi',
   'temperature': 0.1,
   'technique': 'agent-simple'},
  'run': 1},
 'metrics': {'exact_match': 0.6123293338849813,
  'f1': 0.7363328198642227,
  'fuzzy_match': 0.6921803889118743,
  '2hops': {'exact_match': 0.639776357827476,
   'f1': 0.7612019783756603,
   'fuzzy_match': 0.7044728434504792},
  '3hops': {'exact_match': 0.6171052631578947,
   'f1': 0.7644833546727637,
   'fuzzy_match': 0.7394736842105263},
  '4hops': {'exact_match': 0.5185185185185185,
   'f1': 0.6066276521832077,
   'fuzzy_match': 0.5654320987654321}}}

In [5]:
# Flatten the nested experiment dicts into one row per experiment.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Restrict to runs evaluated on musique-sweep and discard fuzzy-match metrics.
mask = df["params.evaluation.dataset.path"] == "bdsaglam/musique-sweep"
df = df.loc[mask]
df = df.drop(columns=[name for name in df.columns if "fuzzy" in name])

param_cols = [name for name in df.columns if name.startswith("params.")]
metric_cols = [name for name in df.columns if name.startswith("metrics.")]

# Require every parameter and metric to be present, then keep the first row
# for each distinct parameter combination.
df = df.dropna(subset=param_cols + metric_cols, how="any").drop_duplicates(
    subset=param_cols
)

print(f"{len(df)} experiments after preprocessing")

2 experiments after preprocessing


In [6]:
def sorted_tuple(x):
    """Return the items of the iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)

# Group by every parameter except the run index. The run column is excluded by
# name rather than by position, so the grouping does not depend on the order
# in which the parameter columns happen to appear.
group_cols = [col for col in param_cols if col != "params.run"]
run_counts = (
    df.groupby(group_cols)["params.run"]
    .aggregate(sorted_tuple)
    .reset_index()
)
# Show only the configurations that do not have exactly 6 recorded runs.
run_counts.loc[run_counts["params.run"].map(len) != 6]

Unnamed: 0,params.train.dataset.path,params.train.dataset.name,params.train.dataset.split,params.train.optimizer,params.train.ensemble,params.evaluation.dataset.path,params.evaluation.dataset.name,params.evaluation.dataset.split,params.qa.model,params.qa.temperature,params.qa.technique,params.run
0,bdsaglam/musique-sweep,answerable,train,bfsrs-medium,no,bdsaglam/musique-sweep,answerable,validation,llama-3.3-70b-tgi,0.1,agent-simple,"(1,)"
1,bdsaglam/musique-sweep,answerable,train,miprov2-light,no,bdsaglam/musique-sweep,answerable,validation,llama-3.3-70b-tgi,0.1,cot,"(1,)"


In [7]:
# List the distinct values observed in each column of run_counts, one
# tab-indented value per line beneath the column name.
for col, column_values in run_counts.items():
    print(f"- {col}")
    for value in column_values.unique():
        print(f"\t{value}")

- params.train.dataset.path
	bdsaglam/musique-sweep
- params.train.dataset.name
	answerable
- params.train.dataset.split
	train
- params.train.optimizer
	bfsrs-medium
	miprov2-light
- params.train.ensemble
	no
- params.evaluation.dataset.path
	bdsaglam/musique-sweep
- params.evaluation.dataset.name
	answerable
- params.evaluation.dataset.split
	validation
- params.qa.model
	llama-3.3-70b-tgi
- params.qa.temperature
	0.1
- params.qa.technique
	agent-simple
	cot
- params.run
	(1,)


In [8]:
# Write the preprocessed experiments as JSON Lines: one record per line.
df.to_json(
    "results.jsonl",
    orient="records",
    lines=True,
)

In [19]:
# Side-by-side view of technique/optimizer against the overall EM and F1 scores.
df.loc[
    :,
    [
        "params.qa.technique",
        "params.train.optimizer",
        "metrics.exact_match",
        "metrics.f1",
    ],
]

Unnamed: 0,params.qa.technique,params.train.optimizer,metrics.exact_match,metrics.f1
0,agent-simple,bfsrs-medium,0.612329,0.736333
1,cot,miprov2-light,0.606667,0.723042
