In [1]:
import json, glob, pandas as pd

In [2]:
# Render floats in displayed frames with three decimal places.
pd.options.display.precision = 3

In [3]:
# Denominator that the performance tables use to turn per-model outcome
# counts into fractions; stored as a float.
BENCHMARK_SIZE: float = 1000.0

In [4]:
def unilateral_performance(df, benchmark_size=None):
    """Summarise unilateral evaluation outcomes per model.

    Counts the ``evaluation`` labels for each ``model_name`` and derives the
    rate columns shown in the results table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``model_name`` and ``evaluation`` columns. The labels
        ``"t"`` (correct), ``"f"`` (incorrect) and ``"n"`` (not attempted)
        are the ones used for the derived columns.
    benchmark_size : float, optional
        Denominator for the ``correct (t)``, ``not attempted (n)`` and
        ``incorrect (f)`` fractions. Defaults to the notebook-level
        ``BENCHMARK_SIZE``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the raw counts, the three fractions,
        ``correct given attempted`` (``t / (t + f)``) and ``F score``
        (``2t / (2t + 2f + n)``), sorted by ``correct given attempted`` in
        descending order. A model with ``t + f == 0`` gets NaN for
        ``correct given attempted``.
    """
    if benchmark_size is None:
        # Resolved at call time so a later change to the notebook constant
        # is picked up without redefining this function.
        benchmark_size = BENCHMARK_SIZE

    model_evals = df.groupby('model_name')['evaluation'].value_counts().unstack(fill_value=0)

    # unstack() only creates a column for labels that occur at least once in
    # the data; add zero-count columns for absent labels so the lookups below
    # cannot raise KeyError (e.g. when no model ever declined to answer).
    for label in ("t", "f", "n"):
        if label not in model_evals.columns:
            model_evals[label] = 0

    correct = model_evals["t"]
    incorrect = model_evals["f"]
    not_attempted = model_evals["n"]

    model_evals["correct (t)"] = correct / benchmark_size
    model_evals["not attempted (n)"] = not_attempted / benchmark_size
    model_evals["incorrect (f)"] = incorrect / benchmark_size
    model_evals["correct given attempted"] = correct / (correct + incorrect)
    model_evals["F score"] = (2. * correct) / ((2. * correct) + (2. * incorrect) + not_attempted)
    model_evals = model_evals.sort_values("correct given attempted", ascending=False)
    return model_evals

In [5]:
def bilateral_performance(df, benchmark_size=None):
    """Summarise bilateral evaluation outcomes per model.

    Counts the ``evaluation`` labels for each ``model_name`` and derives the
    rate columns shown in the results table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``model_name`` and ``evaluation`` columns. The labels
        ``"t"`` (correct), ``"f"`` (incorrect), ``"b"`` (inconsistent) and
        ``"n"`` (unknown) are the ones used for the derived columns.
    benchmark_size : float, optional
        Denominator for the fraction columns. Defaults to the notebook-level
        ``BENCHMARK_SIZE``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the raw counts, one fraction per label,
        ``not attempted (b+n)``, ``correct given attempted``
        (``t / (t + f)``) and ``F score`` (``2t / (2t + 2f + b + n)``),
        sorted by ``correct given attempted`` in descending order. A model
        with ``t + f == 0`` gets NaN for ``correct given attempted``.
    """
    if benchmark_size is None:
        # Resolved at call time so a later change to the notebook constant
        # is picked up without redefining this function.
        benchmark_size = BENCHMARK_SIZE

    model_evals = df.groupby('model_name')['evaluation'].value_counts().unstack(fill_value=0)

    # unstack() only creates a column for labels that occur at least once in
    # the data; add zero-count columns for absent labels so the lookups below
    # cannot raise KeyError (e.g. when no answer was ever inconsistent).
    for label in ("t", "f", "b", "n"):
        if label not in model_evals.columns:
            model_evals[label] = 0

    correct = model_evals["t"]
    incorrect = model_evals["f"]
    # Both "b" and "n" count as not attempted.
    abstained = model_evals["b"] + model_evals["n"]

    model_evals["correct (t)"] = correct / benchmark_size
    model_evals["inconsistent (b)"] = model_evals["b"] / benchmark_size
    model_evals["unknown (n)"] = model_evals["n"] / benchmark_size
    model_evals["incorrect (f)"] = incorrect / benchmark_size
    model_evals["not attempted (b+n)"] = abstained / benchmark_size
    model_evals["correct given attempted"] = correct / (correct + incorrect)
    model_evals["F score"] = (2. * correct) / ((2. * correct) + (2. * incorrect) + abstained)
    model_evals = model_evals.sort_values("correct given attempted", ascending=False)
    return model_evals

In [6]:
# Gather the records from every unilateral experiment file into one frame.
# Assumes each file holds a JSON array of record objects — TODO confirm.
unilateral_results = []
# glob returns paths in arbitrary order; sorting keeps the row order
# reproducible across machines and runs.
for file in sorted(glob.glob("experiments/unilateral/*.json")):
    # The context manager closes each handle as soon as it has been parsed
    # instead of leaving it open until garbage collection.
    with open(file, "r", encoding="utf-8") as handle:
        unilateral_results += json.load(handle)
df_unilateral = pd.DataFrame(unilateral_results)
unilateral_performance(df_unilateral)

evaluation,f,n,t,correct (t),not attempted (n),incorrect (f),correct given attempted,F score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mistralai/Mistral-7B-Instruct-v0.3,155,0,845,0.845,0.0,0.155,0.845,0.845
gpt-4o-2024-11-20,285,0,715,0.715,0.0,0.285,0.715,0.715
mistralai/Mixtral-8x7B-Instruct-v0.1,377,12,611,0.611,0.012,0.377,0.618,0.615
gpt-4o-mini,524,2,474,0.474,0.002,0.524,0.475,0.474
claude-3-5-haiku-20241022,879,1,120,0.12,0.001,0.879,0.12,0.12


In [7]:
# Gather the records from every bilateral experiment file into one frame.
# Assumes each file holds a JSON array of record objects — TODO confirm.
bilateral_results = []
# glob returns paths in arbitrary order; sorting keeps the row order
# reproducible across machines and runs.
for file in sorted(glob.glob("experiments/bilateral/*.json")):
    # The context manager closes each handle as soon as it has been parsed
    # instead of leaving it open until garbage collection.
    with open(file, "r", encoding="utf-8") as handle:
        bilateral_results += json.load(handle)
df_bilateral = pd.DataFrame(bilateral_results)
bilateral_performance(df_bilateral)

evaluation,b,f,n,t,correct (t),inconsistent (b),unknown (n),incorrect (f),not attempted (b+n),correct given attempted,F score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
gpt-4o-mini,129,80,244,547,0.547,0.129,0.244,0.08,0.373,0.872,0.672
mistralai/Mistral-7B-Instruct-v0.3,203,88,119,590,0.59,0.203,0.119,0.088,0.322,0.87,0.703
gpt-4o-2024-11-20,125,141,399,335,0.335,0.125,0.399,0.141,0.524,0.704,0.454
mistralai/Mixtral-8x7B-Instruct-v0.1,212,149,299,340,0.34,0.212,0.299,0.149,0.511,0.695,0.457
claude-3-5-haiku-20241022,82,125,559,234,0.234,0.082,0.559,0.125,0.641,0.652,0.344
