In [1]:
import json, glob, pandas as pd

In [2]:
# Show floating-point values in rendered DataFrames with 3 decimal places.
pd.options.display.precision = 3

In [3]:
# Divisor used by the performance helpers to turn per-model evaluation counts
# into fractions. Kept as a float so every division below is a float division.
BENCHMARK_SIZE = 1_000.0

In [4]:
def unilateral_performance(df, benchmark_size=None):
    """Summarise unilateral evaluation results per model.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per evaluated item, with a ``model_name`` column and an
        ``evaluation`` column whose values are the labels ``"t"`` (correct),
        ``"f"`` (incorrect) or ``"n"`` (not attempted).
    benchmark_size : float, optional
        Denominator used to convert counts into fractions. Defaults to the
        module-level ``BENCHMARK_SIZE`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``model_name``. Contains the raw count of each label plus
        the derived columns ``correct (t)``, ``not attempted (n)``,
        ``incorrect (f)``, ``correct given attempted`` (``t / (t + f)``) and
        ``F score`` (``2t / (2t + 2f + n)``).
    """
    if benchmark_size is None:
        benchmark_size = BENCHMARK_SIZE

    model_evals = df.groupby('model_name')['evaluation'].value_counts().unstack(fill_value=0)

    # unstack() only creates columns for labels that actually occur in the
    # data; add any absent label as an all-zero column so the arithmetic below
    # does not raise KeyError (e.g. when no model ever abstains).
    for label in ("t", "n", "f"):
        if label not in model_evals.columns:
            model_evals[label] = 0

    correct = model_evals["t"]
    skipped = model_evals["n"]
    wrong = model_evals["f"]

    model_evals["correct (t)"] = correct / benchmark_size
    model_evals["not attempted (n)"] = skipped / benchmark_size
    model_evals["incorrect (f)"] = wrong / benchmark_size
    # Accuracy restricted to the items the model actually answered.
    model_evals["correct given attempted"] = correct / (correct + wrong)
    model_evals["F score"] = (2. * correct) / ((2. * correct) + (2. * wrong) + skipped)
    return model_evals

In [5]:
def bilateral_performance(df, benchmark_size=None):
    """Summarise bilateral evaluation results per model.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per evaluated item, with a ``model_name`` column and an
        ``evaluation`` column whose values are the labels ``"t"`` (correct),
        ``"f"`` (incorrect), ``"b"`` (inconsistent) or ``"n"`` (unknown).
    benchmark_size : float, optional
        Denominator used to convert counts into fractions. Defaults to the
        module-level ``BENCHMARK_SIZE`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``model_name``. Contains the raw count of each label plus
        the derived columns ``correct (t)``, ``inconsistent (b)``,
        ``unknown (n)``, ``incorrect (f)``, ``not attempted (b+n)``,
        ``correct given attempted`` (``t / (t + f)``) and ``F score``
        (``2t / (2t + 2f + b + n)``).
    """
    if benchmark_size is None:
        benchmark_size = BENCHMARK_SIZE

    model_evals = df.groupby('model_name')['evaluation'].value_counts().unstack(fill_value=0)

    # unstack() only creates columns for labels that actually occur in the
    # data; add any absent label as an all-zero column so the arithmetic below
    # does not raise KeyError.
    for label in ("t", "b", "n", "f"):
        if label not in model_evals.columns:
            model_evals[label] = 0

    correct = model_evals["t"]
    inconsistent = model_evals["b"]
    unknown = model_evals["n"]
    wrong = model_evals["f"]
    # Both "b" and "n" count as the model not committing to an answer.
    unattempted = inconsistent + unknown

    model_evals["correct (t)"] = correct / benchmark_size
    model_evals["inconsistent (b)"] = inconsistent / benchmark_size
    model_evals["unknown (n)"] = unknown / benchmark_size
    model_evals["incorrect (f)"] = wrong / benchmark_size
    model_evals["not attempted (b+n)"] = unattempted / benchmark_size
    # Accuracy restricted to the items the model actually answered.
    model_evals["correct given attempted"] = correct / (correct + wrong)
    model_evals["F score"] = (2. * correct) / ((2. * correct) + (2. * wrong) + unattempted)
    return model_evals

In [6]:
# Gather every unilateral experiment record into one flat list.
unilateral_results = []
# sorted() makes the row order independent of the filesystem's listing order.
for file in sorted(glob.glob("experiments/unilateral/*.json")):
    # Context manager closes each handle promptly; JSON is read as UTF-8
    # rather than the platform's locale encoding.
    with open(file, "r", encoding="utf-8") as fh:
        unilateral_results.extend(json.load(fh))
df_unilateral = pd.DataFrame(unilateral_results)
unilateral_performance(df_unilateral)

evaluation,f,n,t,correct (t),not attempted (n),incorrect (f),correct given attempted,F score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt-4o-2024-11-20,285,0,715,0.715,0.0,0.285,0.715,0.715
gpt-4o-mini,524,2,474,0.474,0.002,0.524,0.475,0.474
mistralai/Mistral-7B-Instruct-v0.3,155,0,845,0.845,0.0,0.155,0.845,0.845
mistralai/Mixtral-8x7B-Instruct-v0.1,377,12,611,0.611,0.012,0.377,0.618,0.615


In [7]:
# Gather every bilateral experiment record into one flat list.
bilateral_results = []
# sorted() makes the row order independent of the filesystem's listing order.
for file in sorted(glob.glob("experiments/bilateral/*.json")):
    # Context manager closes each handle promptly; JSON is read as UTF-8
    # rather than the platform's locale encoding.
    with open(file, "r", encoding="utf-8") as fh:
        bilateral_results.extend(json.load(fh))
df_bilateral = pd.DataFrame(bilateral_results)
bilateral_performance(df_bilateral)

evaluation,b,f,n,t,correct (t),inconsistent (b),unknown (n),incorrect (f),not attempted (b+n),correct given attempted,F score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
gpt-4o-2024-11-20,128,213,409,250,0.25,0.128,0.409,0.213,0.537,0.54,0.342
gpt-4o-mini,357,295,273,75,0.075,0.357,0.273,0.295,0.63,0.203,0.109
mistralai/Mistral-7B-Instruct-v0.3,664,95,22,219,0.219,0.664,0.022,0.095,0.686,0.697,0.333
mistralai/Mixtral-8x7B-Instruct-v0.1,188,59,189,564,0.564,0.188,0.189,0.059,0.377,0.905,0.695
