In [1]:
import json
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
from bellek.dvc.experiment import load_experiments

# Read the experiment records from the exported JSON file. As the preview
# below shows, each record is a dict with "commit", "id", "name", a nested
# "params" tree and a "metrics" mapping.
experiments = load_experiments("prompt-engineering.json")

# Report how many records were loaded, then preview the first one.
print(f"{len(experiments)} experiments")
experiments[0]

19 experiments


{'commit': 'workspace',
 'id': 'workspace',
 'name': None,
 'params': {'dataset': {'path': 'bdsaglam/musique-thesis',
   'name': 'answerable',
   'split': 'validation'},
  'qa': {'model': 'llama-3-70b-tgi',
   'temperature': 0.1,
   'system_prompt': 'excellent-qa.txt',
   'user_prompt_template': 'line-break.txt',
   'few_shot_examples': 'empty.json'},
  'run': 1},
 'metrics': {'exact_match': 0.57,
  'f1': 0.673044113780956,
  'fuzzy_match': 0.64}}

In [3]:
# Flatten the nested experiment records into one row per experiment; nested
# keys become dotted column names such as "params.qa.model" or "metrics.f1".
df = (
    pd.json_normalize(experiments)
    # Stable, reproducible row order: dataset first, then model settings, then run.
    .sort_values(
        [
            "params.dataset.path",
            "params.dataset.name",
            "params.dataset.split",
            "params.qa.model",
            "params.qa.temperature",
            "params.run",
        ]
    )
    # Commit/id only identify the DVC revision; dropping them lets rows that are
    # otherwise identical collapse in drop_duplicates().
    .drop(columns=["commit", "id"])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Keep only runs on the thesis dataset whose system prompt is not referenced
# through a relative "../.." path.
dataset_path = "bdsaglam/musique-thesis"
# regex=False: "../.." has to be matched literally. Interpreted as a regular
# expression (the pandas default) every "." is a wildcard, so the pattern would
# also match unrelated names such as "v1/qa.txt".
# na=False: a missing system prompt counts as "does not contain the marker",
# which keeps the mask strictly boolean so that `~` is well defined.
uses_relative_prompt = df["params.qa.system_prompt"].str.contains(
    "../..", regex=False, na=False
)
mask = (df["params.dataset.path"] == dataset_path) & ~uses_relative_prompt
df = df.loc[mask].reset_index(drop=True)
df

Unnamed: 0,name,params.dataset.path,params.dataset.name,params.dataset.split,params.qa.model,params.qa.temperature,params.qa.system_prompt,params.qa.user_prompt_template,params.qa.few_shot_examples,params.run,metrics.exact_match,metrics.f1,metrics.fuzzy_match
0,,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,excellent-qa.txt,line-break.txt,empty.json,1,0.57,0.673044,0.64
1,gimpy-pupa,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,excellent-qa.txt,question-context.txt,empty.json,1,0.52,0.61652,0.59
2,sandy-teff,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,excellent-qa.txt,context-question.txt,empty.json,1,0.57,0.679027,0.64
3,reedy-mung,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,excellent-qa.txt,line-break.txt,empty.json,1,0.58,0.674412,0.64
4,awing-rite,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,helpful-assistant.txt,context-question.txt,empty.json,1,0.58,0.674746,0.65
5,pique-nave,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,helpful-assistant.txt,question-context.txt,empty.json,1,0.52,0.623414,0.62
6,based-ceps,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,helpful-assistant.txt,line-break.txt,empty.json,1,0.57,0.675176,0.66
7,dural-saga,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,excellent-qa.txt,question-context.txt,empty.json,2,0.52,0.61452,0.6
8,runny-cuds,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,excellent-qa.txt,context-question.txt,empty.json,2,0.57,0.679027,0.64
9,vatic-bunt,bdsaglam/musique-thesis,answerable,validation,llama-3-70b-tgi,0.1,excellent-qa.txt,line-break.txt,empty.json,2,0.57,0.664155,0.63


In [4]:
# Names of all flattened metric columns (those carrying the "metrics." prefix),
# in the order they appear in df.
metric_cols = list(filter(lambda name: name.startswith("metrics."), df.columns))

In [5]:
# Collapse repeated runs (params.run): one row per (system prompt, user prompt
# template) pair, holding the mean of every metric column.
df_avg = (
    df.groupby(["params.qa.system_prompt", "params.qa.user_prompt_template"])[metric_cols]
    .mean()
)
df_avg

Unnamed: 0_level_0,Unnamed: 1_level_0,metrics.exact_match,metrics.f1,metrics.fuzzy_match
params.qa.system_prompt,params.qa.user_prompt_template,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
excellent-qa.txt,context-question.txt,0.57,0.679112,0.64
excellent-qa.txt,line-break.txt,0.5725,0.669006,0.635
excellent-qa.txt,question-context.txt,0.52,0.615187,0.596667
helpful-assistant.txt,context-question.txt,0.58,0.675361,0.656667
helpful-assistant.txt,line-break.txt,0.57,0.675176,0.66
helpful-assistant.txt,question-context.txt,0.52,0.623224,0.613333
