In [1]:
import itertools
import json
from pathlib import Path

import pandas as pd

In [2]:
def sorted_tuple(x):
    """Return the elements of iterable ``x`` as a tuple in ascending order."""
    ordered = list(x)
    ordered.sort()
    return tuple(ordered)

In [3]:
from adapt.dvc import load_experiments

# Sort the paths so the load order is deterministic: glob yields entries in
# filesystem order, and the later drop_duplicates(keep='last') step decides
# which duplicate survives based on the order the records were loaded in.
filepaths = sorted(Path("../../tmp/erx/").glob("*.json"))
# Flatten the experiments of every file into a single list.
experiments = [exp for fp in filepaths for exp in load_experiments(fp)]
print(f"{len(experiments)} experiments")
# Show one record to illustrate the structure.
experiments[0]

2 experiments


{'commit': 'workspace',
 'id': 'workspace',
 'name': None,
 'params': {'task': 'erx',
  'train': {'dataset': {'path': 'bdsaglam/web_nlg-erx-concat',
    'name': 'release_v3.0_en',
    'split': 'train[:100]'},
   'optimizer': 'noop',
   'ensemble': 'no'},
  'evaluation': {'dataset': {'path': 'bdsaglam/web_nlg-erx-concat',
    'name': 'release_v3.0_en',
    'split': 'dev'}},
  'program': {'prompting': 'structured'},
  'lm': {'model': 'llama-3-8b', 'temperature': 0.0},
  'run': 1},
 'metrics': {'exact.precision': 0.014585968840764236,
  'exact.recall': 0.013411106866989222,
  'exact.f1': 0.013893968089212146,
  'fuzzy.precision': 0.2980273675422333,
  'fuzzy.recall': 0.2736804602675239,
  'fuzzy.f1': 0.28203483319061007}}

In [4]:
# Flatten the nested experiment records into dotted column names,
# then discard the commit column.
df = pd.json_normalize(experiments)
df = df.drop(columns=["commit"])
print(f"{len(df)} experiments before preprocessing")
df.head()

2 experiments before preprocessing


Unnamed: 0,id,name,params.task,params.train.dataset.path,params.train.dataset.name,params.train.dataset.split,params.train.optimizer,params.train.ensemble,params.evaluation.dataset.path,params.evaluation.dataset.name,...,params.program.prompting,params.lm.model,params.lm.temperature,params.run,metrics.exact.precision,metrics.exact.recall,metrics.exact.f1,metrics.fuzzy.precision,metrics.fuzzy.recall,metrics.fuzzy.f1
0,workspace,,erx,bdsaglam/web_nlg-erx-concat,release_v3.0_en,train[:100],noop,no,bdsaglam/web_nlg-erx-concat,release_v3.0_en,...,structured,llama-3-8b,0.0,1,0.014586,0.013411,0.013894,0.298027,0.27368,0.282035
1,ff400268c8547f877d473be79df51695b9a1d9ed,round-duff,erx,bdsaglam/web_nlg-erx-concat,release_v3.0_en,train[:16],noop,no,bdsaglam/web_nlg-erx-concat,release_v3.0_en,...,sft,llama-3-8b-sft,0.0,tiny,0.076155,0.079555,0.07485,0.253334,0.248191,0.237905


In [5]:
# Remove the experiments whose name is on the exclusion list.
mask = df['name'].isin(['crumb-geum'])
df = df[~mask].copy()


In [6]:
# Split the column labels into parameter and metric groups by their prefix.
param_cols = [label for label in df if label.startswith("params.")]
metric_cols = [label for label in df if label.startswith("metrics.")]

In [7]:
# Records with no prompting value are treated as 'structured'.
df['params.program.prompting'] = df['params.program.prompting'].fillna('structured')

In [8]:
# Keep only rows with every parameter and metric present, then collapse rows
# that share the same parameter values, keeping the last occurrence.
df = (
    df
    .dropna(subset=param_cols + metric_cols, how="any")
    .drop_duplicates(subset=param_cols, keep='last')
)

print(f"{len(df)} experiments after preprocessing")
df.head()

2 experiments after preprocessing


Unnamed: 0,id,name,params.task,params.train.dataset.path,params.train.dataset.name,params.train.dataset.split,params.train.optimizer,params.train.ensemble,params.evaluation.dataset.path,params.evaluation.dataset.name,...,params.program.prompting,params.lm.model,params.lm.temperature,params.run,metrics.exact.precision,metrics.exact.recall,metrics.exact.f1,metrics.fuzzy.precision,metrics.fuzzy.recall,metrics.fuzzy.f1
0,workspace,,erx,bdsaglam/web_nlg-erx-concat,release_v3.0_en,train[:100],noop,no,bdsaglam/web_nlg-erx-concat,release_v3.0_en,...,structured,llama-3-8b,0.0,1,0.014586,0.013411,0.013894,0.298027,0.27368,0.282035
1,ff400268c8547f877d473be79df51695b9a1d9ed,round-duff,erx,bdsaglam/web_nlg-erx-concat,release_v3.0_en,train[:16],noop,no,bdsaglam/web_nlg-erx-concat,release_v3.0_en,...,sft,llama-3-8b-sft,0.0,tiny,0.076155,0.079555,0.07485,0.253334,0.248191,0.237905


In [9]:
# List the distinct values observed for each parameter column,
# leaving a blank line after every entry.
for col in param_cols:
    values = [*df[col].unique()]
    print(f"- {col}: {values}", end="\n\n")

- params.task: ['erx']

- params.train.dataset.path: ['bdsaglam/web_nlg-erx-concat']

- params.train.dataset.name: ['release_v3.0_en']

- params.train.dataset.split: ['train[:100]', 'train[:16]']

- params.train.optimizer: ['noop']

- params.train.ensemble: ['no']

- params.evaluation.dataset.path: ['bdsaglam/web_nlg-erx-concat']

- params.evaluation.dataset.name: ['release_v3.0_en']

- params.evaluation.dataset.split: ['dev']

- params.program.prompting: ['structured', 'sft']

- params.lm.model: ['llama-3-8b', 'llama-3-8b-sft']

- params.lm.temperature: [np.float64(0.0)]

- params.run: [1, 'tiny']



In [10]:
# Persist the preprocessed table as JSON Lines (one record per line).
# NOTE(review): a later cell reads 'exps.jsonl', not 'exps-2.jsonl' — confirm which file is intended.
df.to_json('exps-2.jsonl', orient='records', lines=True)

In [11]:
# NOTE(review): presumably a deliberate halt so that running all cells stops here
# and the experiment-planning cells below are only executed manually — confirm.
raise Exception("stop")

Exception: stop

## Setup remaining experiments

In [12]:
# Load saved experiment records from a JSON Lines file; this rebinds `df`.
# NOTE(review): the export cell above writes 'exps-2.jsonl', while this reads 'exps.jsonl' — confirm.
df = pd.read_json('exps.jsonl', orient='records', lines=True)

In [13]:
def produce_experiment_configs(common_params, varying_params):
    """Yield one config dict per point of the parameter grid.

    Both arguments map a parameter name to an iterable of candidate values.
    The two mappings are merged (entries in ``varying_params`` override
    ``common_params`` on key clashes) and the Cartesian product of all value
    lists is enumerated.
    """
    grid = {**common_params, **varying_params}
    names = list(grid)
    yield from (
        dict(zip(names, combination))
        for combination in itertools.product(*grid.values())
    )

In [14]:
def produce_all_experiment_configs(common_params: dict, varying_params_list: list[dict]):
    """Yield the configs of every parameter group in ``varying_params_list``.

    Each group is expanded together with ``common_params`` by
    ``produce_experiment_configs``; groups are processed in list order.
    """
    for group in varying_params_list:
        yield from produce_experiment_configs(common_params, group)

In [15]:
# Parameters shared by every planned experiment. Each key maps to a list of
# candidate values; produce_experiment_configs takes the Cartesian product of
# these lists together with the varying parameters.
common_params = {
    "params.task": ["erx"],
    "params.train.dataset.path": ["bdsaglam/web_nlg-erx-concat"],
    "params.train.dataset.name": ["release_v3.0_en"],
    # NOTE(review): the split values carry literal double quotes, unlike the bare
    # values ('train[:100]', 'dev') seen in the recorded experiments — presumably
    # for the command-line override parser; confirm.
    "params.train.dataset.split": ['"train[:100]"'],
    "params.evaluation.dataset.path": ["bdsaglam/web_nlg-erx-concat"],
    "params.evaluation.dataset.name": ["release_v3.0_en"],
    "params.evaluation.dataset.split": ['"dev"'],
    "params.train.ensemble": [
        "no",
        # "yes",
    ],
    "params.lm.temperature": [
        0.0,
        # 0.5,
        # 0.7,
    ],
    "params.run": [
        1,
        # 2,
        # 3,
    ],
}

In [17]:
# Groups of parameters that vary between experiments. Each dict is expanded
# independently (combined with common_params) into a grid of configurations;
# commented-out entries are currently excluded from the plan.
varying_params_list = [
    {
        "params.train.optimizer": [
            "noop",
            "bfsrs-medium",
            "bfsrs-high",
            'bfsrs-ulti',
            "miprov2-light",
            "miprov2-medium",
        ],
        "params.program.prompting": ["structured"],
        "params.lm.model": [
            # "llama-3-8b",
            # "qwen-2.5-32b",
            "llama-3.3-70b",
        ],
    },
    # {
    #     "params.train.optimizer": ["noop"],
    #     "params.program.prompting": ["sft"],
    #     "params.lm.model": [
    #         "llama-3-8b-sft",
    #     ],
    # },
]

In [18]:
# Materialise every planned configuration; the key order of the first one
# defines the parameter columns used for comparison and command generation.
exp_configs = [*produce_all_experiment_configs(common_params, varying_params_list)]
target_params = list(exp_configs[0])
print(f"{len(exp_configs)} experiment configurations")
print(target_params)

6 experiment configurations
['params.task', 'params.train.dataset.path', 'params.train.dataset.name', 'params.train.dataset.split', 'params.evaluation.dataset.path', 'params.evaluation.dataset.name', 'params.evaluation.dataset.split', 'params.train.ensemble', 'params.lm.temperature', 'params.run', 'params.train.optimizer', 'params.program.prompting', 'params.lm.model']


In [19]:
# Only select the parameter columns when the frame has rows; otherwise
# there are no recorded configurations.
existing_configs = df[target_params].to_dict(orient="records") if len(df) else []

print("Existing exps:", len(existing_configs))

Existing exps: 26


In [20]:
# find the missing configurations
def _config_key(config):
    """Return a hashable, order-independent identity for a config dict.

    String values are compared without surrounding double quotes: the planned
    configs quote some values (e.g. '"dev"') while the recorded experiments
    hold the bare value ('dev'), so a literal comparison would never match.
    """
    return tuple(
        sorted(
            (name, value.strip('"') if isinstance(value, str) else value)
            for name, value in config.items()
        )
    )


# Keys already covered; extended while scanning so duplicated planned configs
# are emitted once, as the original set difference did.
_seen_keys = {_config_key(config) for config in existing_configs}
missing_configs = []
# Walk exp_configs in order (rather than a set difference) so the resulting
# list, and the script generated from it, has a deterministic order.
for config in exp_configs:
    key = _config_key(config)
    if key not in _seen_keys:
        _seen_keys.add(key)
        # Keep the planned values untouched (including any quoting).
        missing_configs.append(dict(config))
print(f"{len(missing_configs)} missing configurations")

6 missing configurations


In [21]:
def make_command(exp_config, params=None):
    """Build a multi-line ``dvc exp run --queue`` command for one configuration.

    Args:
        exp_config: Mapping from parameter column name
            (e.g. ``"params.lm.model"``) to the value to set.
        params: Names of the parameters to emit, in order. Defaults to the
            module-level ``target_params``.

    Returns:
        The command as a single string, one ``-S name=value`` override per
        line, joined with backslash-newline continuations.

    Raises:
        KeyError: If a requested parameter is absent from ``exp_config``.
    """
    import shlex

    names = target_params if params is None else params
    lines = ["dvc exp run --queue"]
    for name in names:
        # Drop the first dotted component (the "params" prefix of the column name).
        arg_name = name.split(".", 1)[-1]
        override = f"{arg_name}={exp_config[name]}"
        # shlex.quote keeps the override a single shell word even when the
        # value contains spaces, brackets, or quote characters.
        lines.append(f"-S {shlex.quote(override)}")

    return " \\\n    ".join(lines)

In [22]:
# Write a shell script holding one queued-run command per missing
# configuration, each followed by a blank line.
with open("run.sh", "w") as script:
    script.write("#!/bin/sh\n\n")
    script.writelines(make_command(config) + "\n\n" for config in missing_configs)

## Inspect

In [23]:
# Show the identifying parameters next to all metrics, best fuzzy F1 first.
inspect_cols = [
    'name',
    'params.program.prompting',
    'params.run',
    'params.train.optimizer',
    'params.lm.model',
    'params.lm.temperature',
    *metric_cols,
]
df[inspect_cols].sort_values(by='metrics.fuzzy.f1', ascending=False)

Unnamed: 0,name,params.program.prompting,params.run,params.train.optimizer,params.lm.model,params.lm.temperature,metrics.exact.precision,metrics.exact.recall,metrics.exact.f1,metrics.fuzzy.precision,metrics.fuzzy.recall,metrics.fuzzy.f1
24,reference,structured,1,noop,llama-3-8b,0.0,1.0,1.0,1.0,1.0,1.0,1.0
1,moral-prof,sft,high,noop,llama-3-8b-sft,0.0,0.94566,0.939192,0.941871,0.980038,0.973275,0.97606
2,spicy-teff,sft,medium,noop,llama-3-8b-sft,0.0,0.826799,0.806647,0.814956,0.942006,0.920719,0.92908
0,lathy-jaws,sft,low,noop,llama-3-8b-sft,0.0,0.6301,0.622543,0.623483,0.850379,0.847531,0.844171
19,alone-mesh,structured,1,bfsrs-medium,qwen-2.5-32b,0.0,0.366102,0.358353,0.361248,0.793954,0.783383,0.786429
4,metal-pons,structured,1,bfsrs-medium,qwen-2.5-32b,0.0,0.377839,0.367084,0.370645,0.797541,0.779191,0.783746
3,gawsy-paps,structured,1,bfsrs-ulti,llama-3-8b,0.0,0.320626,0.319126,0.318514,0.698233,0.695948,0.693846
20,slack-poss,structured,1,bfsrs-medium,llama-3-8b,0.0,0.270941,0.26343,0.266274,0.686891,0.66189,0.671487
15,veiny-mina,structured,1,bfsrs-high,llama-3-8b,0.0,0.280462,0.275396,0.276618,0.67608,0.664274,0.66651
9,modal-huck,structured,1,bfsrs-high,llama-3-8b,0.5,0.274224,0.268814,0.269906,0.666549,0.656943,0.657637
