In [13]:
import json
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [14]:
from bellem.dvc.experiment import load_experiments

# Temperature-sweep result files exported from DVC, loaded in this order.
filenames = [
    "temperature-sweep-1.json",
    "temperature-sweep-2.json",
    "temperature-sweep-3.json",
    "temperature-sweep-4.json",
]

# Collect the experiments of every sweep file into one flat list.
experiments = []
for filename in filenames:
    experiments.extend(load_experiments(filename))
print(f"{len(experiments)} experiments")
# Show the first record to inspect the experiment schema.
experiments[0]

843 experiments


{'commit': 'workspace',
 'id': 'workspace',
 'name': None,
 'params': {'dataset': {'path': 'bdsaglam/musique',
   'name': 'answerable',
   'split': 'validation'},
  'qa': {'model': 'llama-3-70b-tgi',
   'temperature': 0.1,
   'system_prompt': 'no-role.txt',
   'user_prompt_template': 'cq.txt',
   'few_shot_examples': 'empty.json'},
  'run': 1},
 'metrics': {}}

In [15]:
# Flatten the nested experiment dicts into dotted columns and drop the
# bookkeeping fields that are not used in the analysis.
df = pd.json_normalize(experiments).drop(columns=["commit", "id", "name"])

# Keep only MuSiQue runs that used a system prompt other than "no-role.txt".
mask = df["params.dataset.path"].eq("bdsaglam/musique") & df[
    "params.qa.system_prompt"
].ne("no-role.txt")
df = df.loc[mask]
df = df.drop(columns=[name for name in df.columns if "fuzzy" in name])

param_cols = [name for name in df.columns if name.startswith("params.")]
metric_cols = [name for name in df.columns if name.startswith("metrics.")]

# Discard rows with any missing parameter or metric, then keep the first row
# for each distinct parameter combination.
df = (
    df.dropna(subset=param_cols + metric_cols, how="any")
    .drop_duplicates(subset=param_cols)
    .copy()
)

print(f"{len(df)} experiments after preprocessing")

381 experiments after preprocessing


In [16]:
def parse_qa_technique(system_prompt_filename):
    """Map a system prompt filename to the name of its QA technique.

    Args:
        system_prompt_filename: Prompt file path such as ``"cot/format-think.txt"``.

    Returns:
        ``"CTE"`` if the name contains ``"cte"``, otherwise ``"COT"`` if it
        contains ``"cot"``, otherwise ``"Standard"``.
    """
    # Order matters: "cte" takes precedence when both markers are present.
    markers = (("cte", "CTE"), ("cot", "COT"))
    for marker, technique in markers:
        if marker in system_prompt_filename:
            return technique
    return "Standard"

# Label every run with the QA technique derived from its system prompt path.
system_prompts = df["params.qa.system_prompt"]
df["params.qa.technique"] = system_prompts.map(parse_qa_technique)

In [17]:
import re

def parse_n_shot(x):
    """Return the number of few-shot examples encoded in a filename.

    Args:
        x: Few-shot examples filename, e.g. ``'empty.json'`` or
            ``'cot-2-shot.json'``.

    Returns:
        ``0`` when the name contains ``'empty'``; otherwise the first run of
        digits found in the name, as an ``int``.

    Raises:
        ValueError: If the name contains neither ``'empty'`` nor any digit.
    """
    if 'empty' in x:
        return 0
    # The shot count is taken from the first run of digits in the name.
    match = re.search(r'\d+', x)
    if match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"Cannot parse n-shot from few-shot examples filename: {x!r}")
    return int(match.group())

assert parse_n_shot('empty.json') == 0
assert parse_n_shot('cot-2-shot.json') == 2

In [18]:
# Derive the number of few-shot examples from the examples filename.
few_shot_files = df['params.qa.few_shot_examples']
df['params.qa.n_shot'] = few_shot_files.map(parse_n_shot)

In [19]:
def remove_ext(filename):
    """Strip a trailing ``.txt`` extension from a filename.

    Only a suffix is removed; a ``.txt`` occurring elsewhere in the path
    (e.g. in a directory name) is left intact.

    Args:
        filename: File name or relative path, e.g. ``"cot/format-think.txt"``.

    Returns:
        The name without its trailing ``.txt``, or the name unchanged when it
        does not end with ``.txt``.
    """
    suffix = ".txt"
    if filename.endswith(suffix):
        return filename[: -len(suffix)]
    return filename

# Drop the file extension from both prompt columns so that values read as
# prompt names rather than file names.
for column in ("params.qa.system_prompt", "params.qa.user_prompt_template"):
    df[column] = df[column].map(remove_ext)

In [28]:
def sorted_tuple(x):
    """Return the elements of iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)

# Columns that together identify one experiment configuration.
config_cols = [
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.temperature",
]

# For every configuration, collect the sorted run ids that are present.
runs_by_config = df.groupby(config_cols)["params.run"]
run_counts = runs_by_config.aggregate(sorted_tuple).reset_index()

# Configurations with fewer than 3 runs.
too_few_runs = run_counts["params.run"].map(len) < 3
run_counts.loc[too_few_runs]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.temperature,params.run
37,cte/excellent-format-few,cq,empty.json,1.0,"(2, 3)"
39,cte/excellent-format-few,cq,empty.json,2.0,"(1, 2)"
53,cte/format-few,cq,empty.json,1.0,"(1, 3)"


In [29]:
# Configurations with more than 3 runs.
too_many_runs = run_counts["params.run"].map(len) > 3
run_counts.loc[too_many_runs]

Unnamed: 0,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.qa.temperature,params.run


In [30]:
# Print the distinct values of every column to check the sweep coverage.
for col, values in run_counts.items():
    print(f"# {col}")
    print(values.unique())

# params.qa.system_prompt
['cot/excellent-format-reasoning' 'cot/format-minimal'
 'cot/format-reasoning' 'cot/format-think' 'cte/excellent-format-few'
 'cte/excellent-format-fewest' 'cte/format-few' 'cte/format-relevant-few'
 'standard/excellent-few' 'standard/excellent-fewest'
 'standard/helpful-output-format-few' 'standard/minimal'
 'standard/minimal-output-format' 'standard/minimal-output-format-few'
 'standard/minimal-output-format-few-no-prio'
 'standard/minimal-output-format-fewest']
# params.qa.user_prompt_template
['cq']
# params.qa.few_shot_examples
['empty.json']
# params.qa.temperature
[0.  0.1 0.3 0.5 0.7 1.  1.5 2. ]
# params.run
[(1, 2, 3) (2, 3) (1, 2) (1, 3)]


In [22]:
# Persist the processed experiments as JSON Lines: one record (row) per line.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it; confirm the export still works after Restart Kernel -> Run All.
df.to_json(
    path_or_buf='temperature-sweep.jsonl',
    orient='records',
    lines=True,
)