In [1]:
import pandas as pd


In [None]:
# Read the experiment log: one JSON record per line.
df = pd.read_json('exps.jsonl', lines=True)

# Discard every row that has at least one missing value.
df = df.dropna(how='any')

# Keep only the rows whose evaluation split is 'dev'.
mask = df['params.evaluation.dataset.split'] == 'dev'
df = df.loc[mask].copy()

# Drop 'id' together with every column that holds a single distinct value
# across the remaining rows (such columns cannot distinguish the runs).
cols2drop = ['id'] + [
    label for label in df.columns if len(df[label].unique()) == 1
]
df = df.drop(columns=cols2drop)

df

Unnamed: 0,name,params.train.optimizer,params.program.prompting,params.lm.model,params.run,metrics.exact.precision,metrics.exact.recall,metrics.exact.f1,metrics.fuzzy.precision,metrics.fuzzy.recall,metrics.fuzzy.f1
1,lathy-jaws,noop,sft,llama-3-8b-sft,low,0.6301,0.622543,0.623483,0.850379,0.847531,0.844171
2,bluff-dita,bfsrs-high,structured,llama-3-8b,1,0.280462,0.275396,0.276618,0.67608,0.664274,0.66651
3,irony-gust,miprov2-medium,structured,llama-3-8b,1,0.250988,0.233233,0.240662,0.653248,0.607018,0.625459
4,outer-ring,bfsrs-medium,structured,llama-3-8b,1,0.255153,0.246698,0.249767,0.651775,0.629756,0.637432
5,moral-prof,noop,sft,llama-3-8b-sft,high,0.94566,0.939192,0.941871,0.980038,0.973275,0.97606
6,spicy-teff,noop,sft,llama-3-8b-sft,medium,0.826799,0.806647,0.814956,0.942006,0.920719,0.92908
7,epoxy-dele,miprov2-light,structured,llama-3-8b,1,0.164665,0.15979,0.161087,0.549267,0.538236,0.540178
8,girly-cuss,noop,structured,llama-3-8b,1,0.018212,0.01703,0.0175,0.356735,0.335013,0.34174


In [3]:
# Translate the raw 'params.program.prompting' labels into the method names
# used by the rest of the notebook.
mapping = {
    'structured': 'prompting',
    'sft': 'sft',
}
# Labels without an entry are passed through unchanged rather than being
# replaced by None. This keeps the cell idempotent (re-running it leaves the
# already-renamed 'prompting' intact) and keeps an unexpected label visible
# downstream instead of silently erasing it.
df['params.program.prompting'] = df['params.program.prompting'].map(
    lambda label: mapping.get(label, label)
)

In [4]:
# Remove every '-sft' occurrence from the model name so that, e.g.,
# 'llama-3-8b-sft' and 'llama-3-8b' end up with the same label.
df['params.lm.model'] = df['params.lm.model'].apply(
    lambda model_name: model_name.replace('-sft', '')
)

In [5]:
# Lookup table from a 'params.run' value to the number reported for 'sft' rows
# by get_adaptation_n_sample (presumably the training-set size of that run --
# TODO confirm). Note that the keys mix an int (1) with strings.
mapping = dict([
    (1, 100),
    ('low', 100),
    ('medium', 1000),
    ('high', 8870),
])

def get_adaptation_n_sample(row):
    """Return the sample count to record for one experiment row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'params.program.prompting'. 'sft' rows must also provide
        'params.run'; 'prompting' rows must also provide
        'params.train.optimizer'.

    Returns
    -------
    int
        For 'sft' rows, the entry of the module-level ``mapping`` keyed by the
        row's 'params.run'. For 'prompting' rows, the count associated with
        the row's optimizer (presumably the number of demonstrations the
        optimizer may use -- TODO confirm).

    Raises
    ------
    KeyError
        If an 'sft' row has a 'params.run' value that is not in ``mapping``.
    ValueError
        If the method is neither 'sft' nor 'prompting', or if a 'prompting'
        row names an optimizer that is not listed below.
    """
    method = row['params.program.prompting']

    if method == 'sft':
        # `mapping` is looked up in the global namespace at call time.
        return mapping[row['params.run']]

    if method == 'prompting':
        n_sample_by_optimizer = {
            'noop': 0,
            'miprov2-light': 4,
            'miprov2-medium': 8,
            'bfsrs-light': 4,
            'bfsrs-medium': 8,
            'bfsrs-high': 16,
            'bfsrs-ulti': 32,
        }
        optimizer = row['params.train.optimizer']
        if optimizer in n_sample_by_optimizer:
            return n_sample_by_optimizer[optimizer]
        # Previously this case fell through to the "Unknown method" error
        # below, which blamed the (valid) method instead of the optimizer.
        raise ValueError(f'Unknown optimizer: {optimizer}')

    raise ValueError(f'Unknown method: {row["params.program.prompting"]}')

# Evaluate get_adaptation_n_sample once per row (axis=1) and store the result
# as a new column.
n_sample_per_row = df.apply(get_adaptation_n_sample, axis=1)
df['params.adaptation.n_sample'] = n_sample_per_row

In [6]:
# Column labels containing 'params' / 'metrics', captured as plain lists.
# These are snapshots of the labels at this point: a later cell removes the
# "params." substring from the column labels and does not update param_cols.
param_cols = list(filter(lambda label: 'params' in label, df.columns))
metric_cols = list(filter(lambda label: 'metrics' in label, df.columns))

In [7]:
# Remove the "params." substring from every column label; labels that do not
# contain it map to themselves.
column_rename_mapping = dict(
    (label, label.replace("params.", "")) for label in df.columns
)
df = df.rename(columns=column_rename_mapping)
df.head()

Unnamed: 0,name,train.optimizer,program.prompting,lm.model,run,metrics.exact.precision,metrics.exact.recall,metrics.exact.f1,metrics.fuzzy.precision,metrics.fuzzy.recall,metrics.fuzzy.f1,adaptation.n_sample
1,lathy-jaws,noop,sft,llama-3-8b,low,0.6301,0.622543,0.623483,0.850379,0.847531,0.844171,100
2,bluff-dita,bfsrs-high,prompting,llama-3-8b,1,0.280462,0.275396,0.276618,0.67608,0.664274,0.66651,16
3,irony-gust,miprov2-medium,prompting,llama-3-8b,1,0.250988,0.233233,0.240662,0.653248,0.607018,0.625459,8
4,outer-ring,bfsrs-medium,prompting,llama-3-8b,1,0.255153,0.246698,0.249767,0.651775,0.629756,0.637432,8
5,moral-prof,noop,sft,llama-3-8b,high,0.94566,0.939192,0.941871,0.980038,0.973275,0.97606,8870


In [8]:
# Give the optimizer and method columns the names used in the final tables.
column_rename_mapping = dict([
    ("train.optimizer", "prompt.optimizer"),
    ("program.prompting", "adaptation.method"),
])
df = df.rename(columns=column_rename_mapping)

In [9]:
# Columns shown in the final tables, in display order: the configuration
# columns first, then the two F1 scores.
_config_cols = ['adaptation.method', 'adaptation.n_sample', 'prompt.optimizer', 'lm.model']
_score_cols = ['metrics.exact.f1', 'metrics.fuzzy.f1']
col_order = _config_cols + _score_cols

In [10]:
# Round every metric column to two decimals. This overwrites the stored
# values, so anything computed afterwards (e.g. sorting) sees rounded numbers.
rounded_metrics = df[metric_cols].round(decimals=2)
df[metric_cols] = rounded_metrics

In [11]:
# One table per distinct model (in order of first appearance), restricted to
# col_order and ranked from best to worst fuzzy F1.
models = df['lm.model'].unique()
for model in models:
    is_model = df['lm.model'] == model
    _df = df[is_model]
    ranked = _df[col_order].sort_values(by='metrics.fuzzy.f1', ascending=False)
    display(ranked)

Unnamed: 0,adaptation.method,adaptation.n_sample,prompt.optimizer,lm.model,metrics.exact.f1,metrics.fuzzy.f1
5,sft,8870,noop,llama-3-8b,0.94,0.98
6,sft,1000,noop,llama-3-8b,0.81,0.93
1,sft,100,noop,llama-3-8b,0.62,0.84
2,prompting,16,bfsrs-high,llama-3-8b,0.28,0.67
4,prompting,8,bfsrs-medium,llama-3-8b,0.25,0.64
3,prompting,8,miprov2-medium,llama-3-8b,0.24,0.63
7,prompting,4,miprov2-light,llama-3-8b,0.16,0.54
8,prompting,0,noop,llama-3-8b,0.02,0.34
