In [27]:
import json
from pathlib import Path

import pandas as pd

In [28]:
from bellem.dvc.experiment import load_experiments

# Gather every JSON dump of the sweep. The paths are sorted because directory
# iteration order is filesystem-dependent; a fixed order keeps `experiments[1]`
# below (and any later keep-first de-duplication) reproducible across machines.
filepaths = sorted(Path("../../tmp/musique-sweep-semi/").glob("*.json"))
# Each file may contribute several experiments, so flatten into a single list.
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")
# Peek at one record to see the nested params/metrics layout.
experiments[1]

830 experiments


{'commit': 'c9ffff5dd2706e29c13cbc66cd0551cfb9dcdd1b',
 'id': '5bf0f42a3be03329be9d148689ff94f29ae6eebe',
 'name': 'irate-cons',
 'params': {'dataset': {'path': 'bdsaglam/musique-sweep',
   'name': 'answerable',
   'split': 'train'},
  'qa': {'model': 'llama-3-70b-tgi',
   'temperature': 0.1,
   'system_prompt': 'cot/original.txt',
   'user_prompt_template': 'cq.txt',
   'few_shot_examples': 'semi/cot-original.json',
   'n_shot': 0},
  'run': 2},
 'metrics': {'exact_match': 0.6033333333333334,
  'f1': 0.7033493455861877,
  'fuzzy_match': 0.6766666666666666,
  '2hops': {'exact_match': 0.71,
   'f1': 0.8188099415204678,
   'fuzzy_match': 0.81},
  '3hops': {'exact_match': 0.58,
   'f1': 0.6703333333333333,
   'fuzzy_match': 0.64},
  '4hops': {'exact_match': 0.52,
   'f1': 0.6209047619047618,
   'fuzzy_match': 0.58}}}

In [29]:
# Flatten the nested experiment dicts into dotted columns
# (e.g. "params.qa.model", "metrics.2hops.f1") and discard the commit hash.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep runs on the musique-sweep dataset whose few-shot example file name
# contains "semi" and whose overall F1 is above 0.01.
# NOTE(review): the 0.01 cut-off presumably discards failed runs - confirm.
# `na=False` turns rows without a few_shot_examples value into an explicit
# False instead of letting NaN leak into the boolean mask.
mask = (
    (df["params.dataset.path"] == "bdsaglam/musique-sweep")
    & (df["params.qa.few_shot_examples"].str.contains("semi", na=False))
    & (df["metrics.f1"] > 0.01)
)
df = df.loc[mask].copy()

# Fuzzy-match metrics are removed from the frame.
df = df.drop(columns=[col for col in df.columns if "fuzzy" in col])

# Experiments that do not set n_sc are treated as n_sc == 1. The column only
# exists when at least one experiment sets it, so create it when absent
# rather than failing with a KeyError.
if "params.qa.n_sc" not in df.columns:
    df["params.qa.n_sc"] = 1.0
df["params.qa.n_sc"] = df["params.qa.n_sc"].fillna(1)

param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

# Drop rows with any missing metric, then rows that repeat the same
# parameters and metrics (the first occurrence is kept).
df = df.dropna(subset=metric_cols, how="any")
df = df.drop_duplicates(subset=param_cols + metric_cols)

print(f"{len(df)} experiments after preprocessing")

540 experiments after preprocessing


In [30]:
def parse_qa_technique(row):
    """Build the technique label for one experiment row.

    The label is the upper-cased first path component of
    ``params.qa.system_prompt`` (``"cot/original.txt"`` -> ``"COT"``). When
    ``params.qa.n_sc`` is greater than 1, ``-SC(<n_sc>)`` is appended
    (``"COT-SC(10)"``).

    Args:
        row: A ``dict`` or ``pandas.Series`` holding a string under
            ``params.qa.system_prompt`` and, optionally, a number under
            ``params.qa.n_sc``.

    Returns:
        The technique label as a string.
    """
    # Only the leading directory of the prompt path names the technique.
    base_technique = row["params.qa.system_prompt"].split("/", 1)[0].upper()
    # A missing or NaN n_sc is treated as 1 so the function also works on rows
    # where the value was never filled in; int(NaN) would raise ValueError.
    raw_sc = row.get("params.qa.n_sc")
    sc = 1 if pd.isna(raw_sc) else int(raw_sc)
    sc_suffix = f"-SC({sc})" if sc > 1 else ""
    return f"{base_technique}{sc_suffix}"

# Compute a technique label per experiment row, then store it as a new column.
technique_labels = df.apply(parse_qa_technique, axis=1)
df["params.qa.technique"] = technique_labels

In [31]:
def sorted_tuple(x):
    """Return the items of iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)


# Parameters that together identify one QA configuration.
config_cols = [
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.n_shot",
    "params.qa.n_sc",
    "params.qa.temperature",
]

# For every configuration, collect the sorted tuple of run ids present in df.
runs_by_config = df.groupby(config_cols)["params.run"]
run_counts = runs_by_config.aggregate(sorted_tuple).reset_index()

# Show the configurations that do not have exactly three recorded runs.
# NOTE(review): a length check cannot tell (1, 2, 3) from e.g. (1, 1, 2) -
# confirm duplicated run ids cannot occur.
incomplete = run_counts["params.run"].map(len) != 3
run_counts.loc[incomplete]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.n_shot,params.qa.n_sc,params.qa.temperature,params.run
6,ccot/format-thought.txt,cq.txt,semi/ccot.json,0,10.0,0.7,"(1, 3)"
13,ccot/format-thought.txt,cq.txt,semi/ccot.json,1,10.0,0.7,"(1,)"
19,ccot/format-thought.txt,cq.txt,semi/ccot.json,2,10.0,0.5,"(1,)"
24,ccot/format-thought.txt,cq.txt,semi/ccot.json,3,10.0,0.1,"(2,)"
25,ccot/format-thought.txt,cq.txt,semi/ccot.json,3,10.0,0.5,"(2,)"
26,ccot/format-thought.txt,cq.txt,semi/ccot.json,3,10.0,0.7,"(2,)"
33,ccot/original.txt,cq.txt,semi/ccot-original.json,0,10.0,0.7,"(1, 3)"
39,ccot/original.txt,cq.txt,semi/ccot-original.json,1,10.0,0.5,"(2, 3)"
40,ccot/original.txt,cq.txt,semi/ccot-original.json,1,10.0,0.7,"(1, 3)"
47,ccot/original.txt,cq.txt,semi/ccot-original.json,2,10.0,0.7,"(1,)"


In [32]:
# List the distinct values of every column in run_counts: one "- <column>"
# header line followed by one tab-indented line per value.
for col, column_values in run_counts.items():
    lines = [f"- {col}"]
    lines.extend(f"\t{value}" for value in column_values.unique())
    print("\n".join(lines))

- params.qa.system_prompt
	ccot/format-thought.txt
	ccot/original.txt
	cok/adapted.txt
	cok/original.txt
	cot/format-thought.txt
	cot/original.txt
	cte/format-triplets-few.txt
	direct/format-few.txt
- params.qa.user_prompt_template
	cq.txt
- params.qa.few_shot_examples
	semi/ccot.json
	semi/ccot-original.json
	semi/cok.json
	semi/cot.json
	semi/cot-original.json
	semi/cte.json
	semi/direct.json
- params.qa.n_shot
	0
	1
	2
	3
- params.qa.n_sc
	1.0
	10.0
- params.qa.temperature
	0.1
	0.5
	0.7
	1.0
- params.run
	(1, 2, 3)
	(1, 3)
	(1,)
	(2,)
	(2, 3)
	(1, 2)
	(3,)


In [33]:
# Write the preprocessed experiments as JSON Lines: one record per line.
output_path = "exps.jsonl"
df.to_json(output_path, lines=True, orient="records")

In [34]:
# Display the ten experiments with the lowest overall F1.
df.sort_values(by="metrics.f1", ascending=True).head(n=10)

Unnamed: 0,id,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,...,metrics.exact_match,metrics.f1,metrics.2hops.exact_match,metrics.2hops.f1,metrics.3hops.exact_match,metrics.3hops.f1,metrics.4hops.exact_match,metrics.4hops.f1,params.qa.n_sc,params.qa.technique
116,a4e5c872adea31d202e6072977e09d59503c7acd,owing-toff,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,1.0,cok/adapted.txt,cq.txt,semi/cok.json,...,0.006667,0.010167,0.02,0.0305,0.0,0.0,0.0,0.0,1.0,COK
799,95e410912317e3864369a940f42047cc99179ece,buxom-ibex,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,0.5,cot/format-thought.txt,cq.txt,semi/cot.json,...,0.003333,0.010733,0.0,0.000714,0.0,0.001818,0.01,0.029667,10.0,COT-SC(10)
614,8879aa3190b7f9e76f8ed871e1e274b62d59bc0f,dedal-kobs,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,0.5,cok/original.txt,cq.txt,semi/cok.json,...,0.01,0.011113,0.02,0.02205,0.0,0.00129,0.01,0.01,1.0,COK
174,f762a2e26510ec9fe37bd3ae1daef87a0468bbea,irony-dohs,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,0.7,ccot/original.txt,cq.txt,semi/ccot-original.json,...,0.01,0.011333,0.01,0.01,0.0,0.0,0.02,0.024,1.0,CCOT
608,d8d25f64766d0a3cfbf2733c7903f6a6edeb8f82,holey-cool,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,0.7,cok/original.txt,cq.txt,semi/cok.json,...,0.006667,0.011379,0.01,0.014,0.01,0.016136,0.0,0.004,1.0,COK
33,70d5187d00f3f604794651eb8c4264b33c7d2de1,spiry-pech,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,1.0,cok/adapted.txt,cq.txt,semi/cok.json,...,0.01,0.013434,0.01,0.011818,0.0,0.0,0.02,0.028485,1.0,COK
86,fb431168b062d54628e187dcc7a40831d196ecb0,sural-colt,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,0.1,cok/adapted.txt,cq.txt,semi/cok.json,...,0.01,0.013845,0.0,0.006667,0.01,0.01487,0.02,0.02,1.0,COK
775,6d85608dc37e505cfb3d6bd7f35bd2e8748ee8ec,coxal-kago,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,0.5,ccot/format-thought.txt,cq.txt,semi/ccot.json,...,0.006667,0.014667,0.01,0.011667,0.0,0.006667,0.01,0.025667,10.0,CCOT-SC(10)
604,c24fb50812398bcd314c1808f15e1b6ebe203954,gooey-luce,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,0.5,cok/original.txt,cq.txt,semi/cok.json,...,0.01,0.016961,0.02,0.023889,0.0,0.006176,0.01,0.020818,1.0,COK
720,11ca5dc6fc678421aaa3176e2e83deab0fd89739,uncut-bunk,bdsaglam/musique-sweep,answerable,train,llama-3-70b-tgi,0.5,cte/format-triplets-few.txt,cq.txt,semi/cte.json,...,0.013333,0.017222,0.01,0.01,0.0,0.0,0.03,0.041667,10.0,CTE-SC(10)
