In [1]:
import json, pandas as pd
from IPython.display import display, Markdown

In [2]:
def _load_experiment(path):
    """Return the parsed JSON content of one experiment result file.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and it is decoded as UTF-8 rather than with the
    platform's locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Maps a model identifier to the parsed content of that model's result file.
# NOTE(review): the Llama key below spells "70b" in lowercase while its file
# name uses "70B" — confirm which spelling is intended.
experiments = {
    "gpt-3.5-turbo": _load_experiment('experiments/gpt-3.5-turbo-owl-inf.json'),
    "gpt-4o-2024-05-13": _load_experiment('experiments/gpt-4o-2024-05-13-owl-inf.json'),
    "gpt-4-0125-preview": _load_experiment('experiments/gpt-4-0125-preview-owl-inf.json'),
    "mistralai/Mistral-7B-Instruct-v0.3": _load_experiment('experiments/Mistral-7B-Instruct-v0.3-owl-inf.json'),
    "mistralai/Mixtral-8x7B-Instruct-v0.1": _load_experiment('experiments/Mixtral-8x7B-Instruct-v0.1-owl-inf.json'),
    "meta-llama/Meta-Llama-3-70b-Instruct": _load_experiment('experiments/Meta-Llama-3-70B-Instruct-owl-inf.json'),
    "claude-3-opus-20240229": _load_experiment('experiments/claude-3-opus-20240229-owl-inf.json'),
    "claude-3-5-sonnet-20240620": _load_experiment('experiments/claude-3-5-sonnet-20240620-owl-inf.json'),
    "claude-3-haiku-20240307": _load_experiment('experiments/claude-3-haiku-20240307-owl-inf.json'),
}

In [3]:
# One DataFrame per model, built from that model's list of records, in the
# same order as the entries of `experiments`.
dfs = [pd.DataFrame.from_records(records) for records in experiments.values()]

# Stack all per-model frames into a single frame (original row labels are kept).
df = pd.concat(dfs)

In [4]:
# Convert 'predicted' column from string to integer, replacing errors with NaN
df['answer'] = pd.to_numeric(df['answer'], errors='coerce')

# Drop any rows where 'predicted' is now NaN
df = df.dropna(subset=['answer'])

# Convert 'predicted' to integer type
df['answer'] = df['answer'].astype(int)

# Now perform the groupby and sum operation
result = df.groupby('model')['answer'].sum().reset_index()

# Calculate 'N'
result['FN'] = len(dfs[0]) - result['answer']

# Calculate false negative rate (FNR)
result["FNR"] = (len(dfs[0]) - result["answer"]) / len(dfs[0])

# Display result
result[['model', 'FN', 'FNR']].sort_values('FNR', ascending=True)

Unnamed: 0,model,FN,FNR
0,claude-3-5-sonnet-20240620,0,0.0
1,claude-3-haiku-20240307,0,0.0
4,gpt-4-0125-preview,0,0.0
5,gpt-4o-2024-05-13,0,0.0
6,meta-llama/Meta-Llama-3-70B-Instruct,0,0.0
2,claude-3-opus-20240229,1,0.125
3,gpt-3.5-turbo,3,0.375
8,mistralai/Mixtral-8x7B-Instruct-v0.1,3,0.375
7,mistralai/Mistral-7B-Instruct-v0.3,4,0.5
