In [1]:
import json
from pathlib import Path

import pandas as pd

In [2]:
from bellem.dvc.experiment import load_experiments

# Collect every JSON file in the sweep directory and flatten the experiments
# they contain into one list.
filepaths = list(Path("../../tmp/musique-temperature-sweep2/").glob("*.json"))
experiments = []
for fp in filepaths:
    experiments.extend(load_experiments(fp))
print(f"{len(experiments)} experiments")
# Show one record so the structure of an experiment is visible.
experiments[0]

157 experiments


{'commit': 'workspace',
 'id': 'workspace',
 'name': None,
 'params': {'dataset': {'path': 'bdsaglam/musique',
   'name': 'answerable',
   'split': 'validation'},
  'qa': {'model': 'llama-3-70b-tgi',
   'temperature': 0.1,
   'system_prompt': 'no-role.txt',
   'user_prompt_template': 'cq.txt',
   'few_shot_examples': 'empty.json'},
  'run': 1},
 'metrics': {}}

In [3]:
# Flatten the nested experiment dicts into dotted column names and discard the
# commit column.
df = pd.json_normalize(experiments).drop(columns=["commit"])

# Keep rows for the bdsaglam/musique dataset whose system prompt is anything
# other than no-role.txt.
mask = df["params.dataset.path"].eq("bdsaglam/musique") & df[
    "params.qa.system_prompt"
].ne("no-role.txt")

# Continue with an independent copy of the selection, minus any "fuzzy" columns.
df = df.loc[mask].copy()
df = df.drop(columns=[name for name in df.columns if "fuzzy" in name])

param_cols = [name for name in df.columns if name.startswith("params.")]
metric_cols = [name for name in df.columns if name.startswith("metrics.")]

# Discard rows with any missing parameter or metric value, then keep a single
# row per distinct parameter combination.
df = df.dropna(subset=[*param_cols, *metric_cols], how="any").drop_duplicates(
    subset=param_cols
)

print(f"{len(df)} experiments after preprocessing")

117 experiments after preprocessing


In [4]:
# Replace "standard" with "direct" in the system prompt path and in the
# few-shot file name (literal substring replacement, not a regex).
df["params.qa.system_prompt"] = df["params.qa.system_prompt"].str.replace(
    'standard/', 'direct/', regex=False
)
df["params.qa.few_shot_examples"] = df["params.qa.few_shot_examples"].str.replace(
    'standard', 'direct', regex=False
)

In [5]:
def parse_qa_technique(system_prompt_filename):
    """Map a system prompt file name to the label of its QA technique.

    Args:
        system_prompt_filename: Name (or relative path) of the system prompt file.

    Returns:
        "CTE", "CCOT" or "COT" when the corresponding lowercase marker occurs
        in the name, otherwise "Direct".
    """
    # Order matters: "ccot" has to be tested before "cot" because every name
    # containing "ccot" also contains "cot".
    markers = (("cte", "CTE"), ("ccot", "CCOT"), ("cot", "COT"))
    for marker, label in markers:
        if marker in system_prompt_filename:
            return label
    return "Direct"

# Derive the technique label for every row from its system prompt file name.
df["params.qa.technique"] = df["params.qa.system_prompt"].apply(parse_qa_technique)

In [6]:
import re


def parse_n_shot(x):
    """Return the number of few-shot examples encoded in a few-shot file name.

    Args:
        x: File name such as 'empty.json' or 'cot-2-shot.json'.

    Returns:
        0 when the name contains 'empty'; otherwise the integer directly in
        front of '-shot', or, when the name has no '-shot' marker, the first
        run of digits found in it.

    Raises:
        ValueError: If the name does not contain 'empty' and has no digits.
    """
    if 'empty' in x:
        return 0
    # Prefer the number attached to "-shot" so that other digits in the name
    # (a directory prefix, a "delim-1" suffix, ...) are never mistaken for it.
    shot_match = re.search(r'(\d+)-shot', x)
    if shot_match is not None:
        return int(shot_match.group(1))
    # Fall back to the first run of digits, which is what names without a
    # "-shot" marker have always been parsed as.
    digit_match = re.search(r'\d+', x)
    if digit_match is None:
        # Fail with a message that names the offending value instead of an
        # AttributeError raised on None.
        raise ValueError(f"cannot parse the number of shots from {x!r}")
    return int(digit_match.group())

assert parse_n_shot('empty.json') == 0
assert parse_n_shot('cot-2-shot.json') == 2
assert parse_n_shot('cte-2-shot-delim-1.json') == 2

In [7]:
# Derive the shot count for every row from its few-shot examples file name.
df['params.qa.n_shot'] = df['params.qa.few_shot_examples'].apply(parse_n_shot)

In [8]:
def sorted_tuple(x):
    """Return the items of the iterable ``x`` as a tuple in ascending order."""
    items = list(x)
    items.sort()
    return tuple(items)

# For every (system prompt, user prompt template, few-shot file, temperature)
# combination, collect the sorted tuple of run ids present in df.
run_counts = (
    df.groupby(
        [
            "params.qa.system_prompt",
            "params.qa.user_prompt_template",
            "params.qa.few_shot_examples",
            "params.qa.temperature",
        ]
    )["params.run"]
    .agg(sorted_tuple)
    .reset_index()
)
# Show the combinations that have fewer than three runs.
run_counts[run_counts["params.run"].map(len).lt(3)]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.temperature,params.run
4,ccot/format-reasoning.txt,cq.txt,ccot-2-shot-thought.json,0.1,"(1, 3)"
8,ccot/format-thought.txt,cq.txt,ccot-2-shot-reasoning.json,0.1,"(1, 2)"
12,ccot/format-thought.txt,cq.txt,ccot-2-shot-thought.json,0.1,"(1, 3)"


In [9]:
# Show the combinations that have more than three runs.
run_counts[run_counts["params.run"].map(len).gt(3)]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.temperature,params.run


In [10]:
def is_wrong_config(row):
    """Tell whether a CCOT row pairs a system prompt with a mismatching few-shot file.

    For each of the two file names, the text after the last "-" and up to the
    next "." is extracted; the row is flagged when these two pieces differ.

    Args:
        row: Mapping-like row providing 'params.qa.technique',
            'params.qa.system_prompt' and 'params.qa.few_shot_examples'.

    Returns:
        True when the technique is 'CCOT' and the extracted pieces differ,
        False otherwise (including every non-CCOT row).
    """

    def variant(filename):
        # Text after the last "-" (the whole name if there is none), cut at
        # the first "." that follows.
        tail = filename.rsplit("-", 1)[-1]
        return tail.partition(".")[0]

    if row['params.qa.technique'] == 'CCOT':
        prompt_variant = variant(row['params.qa.system_prompt'])
        examples_variant = variant(row['params.qa.few_shot_examples'])
        return prompt_variant != examples_variant
    return False

In [11]:
# Evaluate the mismatch check row by row; True marks a row to be discarded.
mask = df.apply(is_wrong_config, axis="columns")
# Number of rows flagged.
print(mask.sum())

22


In [12]:
# Keep only the rows that were not flagged, as an independent copy.
df = df[~mask].copy()

In [13]:
# List the distinct values of every run_counts column.
# NOTE(review): run_counts was built before the mismatched CCOT rows were
# dropped from df, so this listing still reflects them; rebuild run_counts
# first if the filtered data is what should be shown.
for col, column_values in run_counts.items():
    print(f"- {col}")
    for value in column_values.unique():
        print(f"\t{value}")

- params.qa.system_prompt
	ccot/format-reasoning.txt
	ccot/format-thought.txt
	cot/format-minimal.txt
	cot/format-think.txt
	cte/excellent-format-few.txt
	direct/helpful-output-format-few.txt
	direct/minimal-output-format-few.txt
- params.qa.user_prompt_template
	cq.txt
- params.qa.few_shot_examples
	ccot-2-shot-reasoning.json
	ccot-2-shot-thought.json
	cot-2-shot.json
	cte-2-shot-delim-1.json
	cte-2-shot-delim-2.json
	direct-2-shot.json
- params.qa.temperature
	0.1
	0.3
	0.5
	0.7
- params.run
	(1, 2, 3)
	(1, 3)
	(1, 2)


In [14]:
# Write the remaining rows as JSON Lines: one JSON record per row.
df.to_json(path_or_buf='results.jsonl', lines=True, orient='records')