In [5]:
import json
from pathlib import Path

import pandas as pd

In [6]:
from bellem.dvc.experiment import load_experiments

# Collect every JSON export in the experiment directory. The glob results are
# sorted because directory listing order is filesystem-dependent; without the
# sort, the order of `experiments` (and therefore which record is previewed
# below) can differ between machines and runs.
filepaths = sorted(Path("../../tmp/musique-full-cold/").glob("*.json"))

# Each file may contribute several experiments; flatten them into one list.
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")

# Preview one record to show the nested params/metrics structure.
experiments[1]

6 experiments


{'commit': 'cf71c6e910996adea08e6d2d6b7e5f75e475b01b',
 'id': 'c200509b2b27676d059d63d432d335caad8d102e',
 'name': 'gypsy-feel',
 'params': {'dataset': {'path': 'bdsaglam/musique',
   'name': 'answerable',
   'split': 'validation'},
  'qa': {'model': 'llama-3-70b-tgi',
   'temperature': 0.0,
   'system_prompt': 'cok/adapted.txt',
   'user_prompt_template': 'cq.txt',
   'few_shot_examples': 'semi/cok.json',
   'n_shot': 0,
   'n_sc': 1},
  'run': 1},
 'metrics': {'exact_match': 0.6123293338849813,
  'f1': 0.725410446072206,
  'fuzzy_match': 0.6793545717832024,
  '2hops': {'exact_match': 0.6517571884984026,
   'f1': 0.7596634205738277,
   'fuzzy_match': 0.7140575079872205},
  '3hops': {'exact_match': 0.6092105263157894,
   'f1': 0.736010068351705,
   'fuzzy_match': 0.6934210526315789},
  '4hops': {'exact_match': 0.4962962962962963,
   'f1': 0.599631589261219,
   'fuzzy_match': 0.5456790123456791},
  'gen_token_count': {'all': {'count': 2417.0,
    'mean': 97.19693835333058,
    'std': 31

In [7]:
# Flatten the nested experiment records into dotted column names
# (e.g. "params.qa.n_sc", "metrics.f1"); the commit column is not used below.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep experiments on the "bdsaglam/musique" dataset whose few-shot examples
# path contains "semi". `na=False` makes rows that lack the parameter evaluate
# to False explicitly instead of leaving NaN in the boolean mask.
mask = (
    (df["params.dataset.path"] == "bdsaglam/musique")
    & df["params.qa.few_shot_examples"].str.contains("semi", na=False)
)
df = df.loc[mask].copy()

# Fuzzy-match metrics are dropped from the analysis.
df = df.drop(columns=[col for col in df.columns if "fuzzy" in col])

# Experiments without an explicit self-consistency count are treated as n_sc=1.
df["params.qa.n_sc"] = df["params.qa.n_sc"].fillna(1)

param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

# Discard rows missing any metric, then collapse rows whose parameters and
# metrics are all identical. Reassigning (rather than inplace=True) keeps each
# step a plain expression.
df = df.dropna(subset=metric_cols, how="any").drop_duplicates(
    subset=param_cols + metric_cols
)

print(f"{len(df)} experiments after preprocessing")

5 experiments after preprocessing


In [8]:
def parse_qa_technique(row):
    """Build a short technique label for one experiment row.

    The label is the first path component of ``params.qa.system_prompt``,
    upper-cased (``"cok/adapted.txt"`` -> ``"COK"``). When the
    self-consistency count ``params.qa.n_sc`` is greater than 1, a
    ``-SC(<n>)`` suffix is appended.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``"params.qa.system_prompt"`` and ``"params.qa.n_sc"``.

    Returns:
        The technique label as a string, e.g. ``"COT"`` or ``"COT-SC(3)"``.
    """
    base_technique = row["params.qa.system_prompt"].split("/")[0].upper()

    # A missing n_sc (None/NaN) means no self-consistency sampling, i.e. 1.
    # Without this guard int(NaN) raises ValueError, so the function would
    # only work on frames where the column had already been filled.
    n_sc = row["params.qa.n_sc"]
    sc = 1 if pd.isna(n_sc) else int(n_sc)

    sc_suffix = f"-SC({sc})" if sc > 1 else ""
    return f"{base_technique}{sc_suffix}"

# Label every experiment row with its technique (axis=1 passes one row per call).
df["params.qa.technique"] = df.apply(parse_qa_technique, axis=1)

In [9]:
def sorted_tuple(x):
    """Return the items of the iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)


# QA settings that together identify one configuration; the run ids recorded
# for each combination are gathered into a sorted tuple.
config_cols = [
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.n_shot",
    "params.qa.n_sc",
    "params.qa.temperature",
]
runs_by_config = df.groupby(config_cols)["params.run"]
run_counts = runs_by_config.agg(sorted_tuple).reset_index()

# Show the configurations that do not have exactly 3 recorded runs.
n_runs = run_counts["params.run"].map(len)
run_counts.loc[n_runs != 3]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.n_shot,params.qa.n_sc,params.qa.temperature,params.run
0,ccot/format-thought.txt,cq.txt,semi/ccot.json,0,1,0.0,"(1,)"
1,cok/adapted.txt,cq.txt,semi/cok.json,0,1,0.0,"(1,)"
2,cot/format-thought.txt,cq.txt,semi/cot.json,0,1,0.0,"(1,)"
3,cte/format-triplets-few.txt,cq.txt,semi/cte.json,0,1,0.0,"(1,)"
4,direct/format-few.txt,cq.txt,semi/direct.json,0,1,0.0,"(1,)"


In [10]:
# For each column of the run summary, print its name followed by the distinct
# values it takes (one tab-indented value per line).
for col in run_counts.columns:
    lines = [f"- {col}"]
    lines.extend(f"\t{value}" for value in run_counts[col].unique())
    print("\n".join(lines))

- params.qa.system_prompt
	ccot/format-thought.txt
	cok/adapted.txt
	cot/format-thought.txt
	cte/format-triplets-few.txt
	direct/format-few.txt
- params.qa.user_prompt_template
	cq.txt
- params.qa.few_shot_examples
	semi/ccot.json
	semi/cok.json
	semi/cot.json
	semi/cte.json
	semi/direct.json
- params.qa.n_shot
	0
- params.qa.n_sc
	1
- params.qa.temperature
	0.0
- params.run
	(1,)


In [11]:
# Export the preprocessed experiments as JSON Lines (one record per line).
df.to_json(
    "exps.jsonl",
    orient="records",
    lines=True,
)

In [12]:
# Show up to ten experiments ordered from lowest to highest overall F1.
df.sort_values(by="metrics.f1", ascending=True).head(n=10)

Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.gen_token_count.success.max,metrics.gen_token_count.fail.count,metrics.gen_token_count.fail.mean,metrics.gen_token_count.fail.std,metrics.gen_token_count.fail.min,metrics.gen_token_count.fail.25%,metrics.gen_token_count.fail.50%,metrics.gen_token_count.fail.75%,metrics.gen_token_count.fail.max,params.qa.technique
3,935a61ce3308754d0a6951dbe88b6549c88a0db5,rusty-eric,bdsaglam/musique,answerable,validation,llama-3-70b-tgi,0.0,direct/format-few.txt,cq.txt,semi/direct.json,...,105.0,1560.0,50.284615,44.330988,3.0,22.0,43.0,65.0,878.0,DIRECT
5,563c046dc49a0e2fb085f425c997d2027637bb38,laigh-flex,bdsaglam/musique,answerable,validation,llama-3-70b-tgi,0.0,cot/format-thought.txt,cq.txt,semi/cot.json,...,211.0,712.0,65.831461,92.391326,16.0,46.75,58.0,72.0,2444.0,COT
2,0f93eff9ecde53fd7ea809dda0a13708bd5191c7,added-kana,bdsaglam/musique,answerable,validation,llama-3-70b-tgi,0.0,cte/format-triplets-few.txt,cq.txt,semi/cte.json,...,177.0,697.0,45.964132,23.595919,7.0,33.0,43.0,56.0,263.0,CTE
4,9dbb787b74750249ba88c9cee6e5cd39ac1b170a,adust-paws,bdsaglam/musique,answerable,validation,llama-3-70b-tgi,0.0,ccot/format-thought.txt,cq.txt,semi/ccot.json,...,162.0,700.0,49.575714,17.156312,19.0,38.0,47.0,59.0,187.0,CCOT
1,c200509b2b27676d059d63d432d335caad8d102e,gypsy-feel,bdsaglam/musique,answerable,validation,llama-3-70b-tgi,0.0,cok/adapted.txt,cq.txt,semi/cok.json,...,243.0,674.0,102.606825,35.05243,24.0,78.0,97.0,118.0,300.0,COK
