In [22]:
import json, pycm, pandas as pd

In [35]:
# (model name, path to the JSON file with that model's Wikidata experiment output)
wikidata_run_files = [
    ("google/gemma-7b-it", 'experiments/gemma-7b-it-wikidata.json'),
    ("gpt-3.5-turbo", 'experiments/gpt-3.5-turbo-wikidata.json'),
    ("gpt-4-0125-preview", 'experiments/gpt-4-0125-preview-wikidata.json'),
    ("mistralai/Mistral-7B-Instruct-v0.2", 'experiments/Mistral-7B-Instruct-v0.2-wikidata.json'),
    ("mistralai/Mixtral-8x7B-Instruct-v0.1", 'experiments/Mixtral-8x7B-Instruct-v0.1-wikidata.json'),
    # ("meta-llama/Llama-2-70b-chat-hf", 'experiments/Llama-2-70b-chat-hf-wikidata.json'),
    ("claude-3-opus-20240229", 'experiments/claude-3-opus-20240229-wikidata.json'),
]

# List of (model name, parsed JSON) pairs, in the same order as the file list above.
wikidata_runs = []
for model_name, run_path in wikidata_run_files:
    # The context manager closes every file as soon as it has been parsed,
    # instead of leaving the handle open until garbage collection.
    # JSON text is UTF-8 by specification (RFC 8259), so decode explicitly
    # rather than relying on the platform default encoding.
    with open(run_path, 'r', encoding='utf-8') as run_file:
        wikidata_runs.append((model_name, json.load(run_file)))

In [62]:
# Summary metrics for the Wikidata runs: one row per model, sorted by AUC
# (descending), with the best value of each metric column in bold.
# NOTE(review): display_summary_stats is defined in a cell further down this
# notebook, so that cell must be executed first on a fresh kernel — consider
# moving the helper definitions above this call.
display_summary_stats(wikidata_runs)


Unnamed: 0,Model,ACC,AUC,F1 Macro,Cohen's kappa
2,gpt-4-0125-preview,0.835793,0.83104,0.831243,0.662487
5,claude-3-opus-20240229,0.813996,0.820338,0.812275,0.626301
3,mistralai/Mistral-7B-Instruct-v0.2,0.769596,0.772741,0.767704,0.536801
4,mistralai/Mixtral-8x7B-Instruct-v0.1,0.758364,0.767966,0.757278,0.519068
1,gpt-3.5-turbo,0.705882,0.698231,0.697979,0.395967
0,google/gemma-7b-it,0.70412,0.678096,0.680917,0.369313


In [41]:
# (model name, path to the JSON file with that model's CaLiGraph experiment output)
caligraph_run_files = [
    ("google/gemma-7b-it", 'experiments/gemma-7b-it-caligraph.json'),
    ("gpt-3.5-turbo", 'experiments/gpt-3.5-turbo-caligraph.json'),
    ("gpt-4-0125-preview", 'experiments/gpt-4-0125-preview-caligraph.json'),
    ("mistralai/Mistral-7B-Instruct-v0.2", 'experiments/Mistral-7B-Instruct-v0.2-caligraph.json'),
    ("mistralai/Mixtral-8x7B-Instruct-v0.1", 'experiments/Mixtral-8x7B-Instruct-v0.1-caligraph.json'),
    # Disabled entry; path corrected from the copy-pasted "-wikidata.json" so that
    # re-enabling it does not silently load the other dataset.
    # ("meta-llama/Llama-2-70b-chat-hf", 'experiments/Llama-2-70b-chat-hf-caligraph.json'),
    ("claude-3-opus-20240229", 'experiments/claude-3-opus-20240229-caligraph.json'),
]

# List of (model name, parsed JSON) pairs, in the same order as the file list above.
caligraph_runs = []
for model_name, run_path in caligraph_run_files:
    # The context manager closes every file as soon as it has been parsed,
    # instead of leaving the handle open until garbage collection.
    # JSON text is UTF-8 by specification (RFC 8259), so decode explicitly
    # rather than relying on the platform default encoding.
    with open(run_path, 'r', encoding='utf-8') as run_file:
        caligraph_runs.append((model_name, json.load(run_file)))

In [61]:
# Summary metrics for the CaLiGraph runs: one row per model, sorted by AUC
# (descending), with the best value of each metric column in bold.
# NOTE(review): display_summary_stats is defined in a cell further down this
# notebook, so that cell must be executed first on a fresh kernel — consider
# moving the helper definitions above this call.
display_summary_stats(caligraph_runs)


Unnamed: 0,Model,ACC,AUC,F1 Macro,Cohen's kappa
2,gpt-4-0125-preview,0.948953,0.947859,0.948724,0.8975
5,claude-3-opus-20240229,0.935864,0.935912,0.935758,0.871518
4,mistralai/Mixtral-8x7B-Instruct-v0.1,0.897606,0.896734,0.897238,0.794521
1,gpt-3.5-turbo,0.850785,0.850839,0.850585,0.701188
3,mistralai/Mistral-7B-Instruct-v0.2,0.813333,0.811448,0.81018,0.625192
0,google/gemma-7b-it,0.68435,0.668402,0.640247,0.347324


In [60]:
def confusion_matrix(model):
    """Build a pycm confusion matrix from one model's prediction records.

    Parameters
    ----------
    model
        Records accepted by ``pd.DataFrame.from_records``; the resulting frame
        must contain an ``actual`` and a ``predicted`` column.

    Returns
    -------
    pycm.ConfusionMatrix
        Matrix built from the ``actual`` column (first vector) and the
        ``predicted`` column (second vector), with the class list fixed to
        ``['1', '0']`` and ``digit=2``.
    """
    records_df = pd.DataFrame.from_records(model)
    actual_labels = records_df["actual"].tolist()
    predicted_labels = records_df["predicted"].tolist()
    return pycm.ConfusionMatrix(
        actual_labels,
        predicted_labels,
        digit=2,
        classes=[ '1', '0' ],
    )

def highlight_max(s):
    """Return one CSS string per element of ``s``, bolding the maximum.

    Every element equal to ``s.max()`` gets ``'font-weight: bold'``; all other
    elements get the empty string. Ties are all bolded.
    """
    bold, plain = 'font-weight: bold', ''
    peak = s.max()
    return [bold if hit else plain for hit in s.eq(peak)]

def display_summary_stats(runs):
    """Build a styled per-model summary table of classification metrics.

    Parameters
    ----------
    runs
        Iterable of ``(model_name, records)`` pairs; ``records`` is handed
        unchanged to ``confusion_matrix``.

    Returns
    -------
    pandas Styler
        One row per model with the columns Model, ACC, AUC, F1 Macro and
        Cohen's kappa, sorted by AUC in descending order, with the maximum of
        every metric column rendered in bold. Nothing is displayed here; the
        notebook renders the Styler when it is the last expression of a cell.
    """
    metric_columns = ["ACC", "AUC", "F1 Macro", "Cohen's kappa"]

    stats = []
    for run in runs:
        model_name, records = run[0], run[1]
        cm = confusion_matrix(records)
        stats.append({
            "Model": model_name,
            # ACC and AUC are per-class mappings; the table reports class '1'.
            "ACC": cm.ACC['1'],
            "AUC": cm.AUC['1'],
            # F1 macro and kappa are overall (not per-class) statistics.
            "F1 Macro": cm.F1_Macro,
            "Cohen's kappa": cm.Kappa,
        })

    # Explicit columns keep the frame well-formed even when `runs` is empty.
    df_for_table = pd.DataFrame(stats, columns=["Model"] + metric_columns)

    # pycm reports an undefined statistic as the string "None". A single such
    # value would make a column mixed str/float and break both the sort and the
    # max-highlighting below, so coerce the metrics to numbers (NaN if undefined).
    df_for_table[metric_columns] = df_for_table[metric_columns].apply(
        pd.to_numeric, errors="coerce"
    )

    df_for_table = df_for_table.sort_values(by="AUC", ascending=False)
    return df_for_table.style.apply(highlight_max, subset=metric_columns)