In [1]:
import json, pycm, pandas as pd
import ipywidgets as widgets
from functools import partial
from IPython.display import display, clear_output

In [2]:
def get_classification(callback, **kwargs):
    """Display a positive/negative prompt for one classification record.

    Renders a question label, a "positive" button, a "negative" button and an
    output area. When either button is clicked, the chosen answer is echoed
    into the output area and ``callback(answer, **kwargs)`` is invoked, where
    ``answer`` is the string ``"positive"`` or ``"negative"``.

    Parameters
    ----------
    callback : callable
        Called as ``callback(answer, **kwargs)`` on every button click.
    **kwargs
        Forwarded unchanged to ``callback``. Must contain ``err``: a mapping
        with ``"entity"`` and ``"concept"`` keys, used to build the question.

    Raises
    ------
    TypeError
        If ``err`` is missing from ``kwargs`` or is ``None``.
    KeyError
        If ``err`` lacks the ``"entity"`` or ``"concept"`` key.
    """
    # The question text used to be built from a free variable named `err`,
    # which only worked if a global of that name happened to exist in the
    # kernel. Read the record from the keyword arguments instead, and do so
    # before creating any widget so a bad call fails early and clearly.
    record = kwargs.get("err")
    if record is None:
        raise TypeError(
            "get_classification() requires an 'err' keyword argument "
            "holding the record to annotate"
        )
    question_text = f'Is {record["entity"]} a(n) {record["concept"]}?'

    button_yes = widgets.Button(description="positive")
    button_no = widgets.Button(description="negative")
    output = widgets.Output()
    question_label = widgets.Label(question_text)

    def on_button_clicked(is_yes, b):
        # `b` is the positional argument supplied by the button's click
        # dispatch; it is not needed here.
        answer = "positive" if is_yes else "negative"
        with output:
            # Replace any previously shown answer rather than appending.
            clear_output()
            print("Your answer was:", answer)
        callback(answer, **kwargs)

    button_yes.on_click(partial(on_button_clicked, True))
    button_no.on_click(partial(on_button_clicked, False))
    display(question_label, button_yes, button_no, output)

def handle_response(response, err=None):
    """Store an annotator's answer on the record under the ``"human"`` key.

    Parameters
    ----------
    response : str
        The answer to record (the caller in this notebook passes
        ``"positive"`` or ``"negative"``).
    err : mutable mapping
        The record being annotated; it is modified in place.

    Raises
    ------
    TypeError
        If ``err`` is not supplied (left as ``None``).
    """
    # The default of None cannot be written to. Fail with an explicit message
    # instead of the opaque "'NoneType' object does not support item
    # assignment" that the bare subscript assignment would produce.
    if err is None:
        raise TypeError("handle_response() needs an 'err' record to store the response on")
    err["human"] = response

In [3]:
# Load the raw result files. `with` closes each file handle as soon as it has
# been parsed (the bare `open(...)` calls left them open), and the encoding is
# pinned to UTF-8 so parsing does not depend on the platform default.
with open("gpt-4-0125-preview-wikidata.json", "r", encoding="utf-8") as wd_file:
    wd_results = json.load(wd_file)
with open("gpt-4-0125-preview-caligraph.json", "r", encoding="utf-8") as cg_file:
    cg_results = json.load(cg_file)

In [4]:
# Each results object is a list of lists of records; flatten one level and
# build a frame with one row per record. A nested comprehension flattens in
# linear time, whereas `sum(lists, [])` re-copies the growing accumulator on
# every addition and is quadratic in the total number of records.
wd_df = pd.DataFrame.from_records([record for group in wd_results for record in group])
cg_df = pd.DataFrame.from_records([record for group in cg_results for record in group])

In [32]:
# False positives: rows labelled 'negative' in "actual" but 'positive' in "predicted".
wd_fp_df = wd_df[wd_df["actual"].eq("negative") & wd_df["predicted"].eq("positive")]
cg_fp_df = cg_df[cg_df["actual"].eq("negative") & cg_df["predicted"].eq("positive")]

# False negatives: rows labelled 'positive' in "actual" but 'negative' in "predicted".
wd_fn_df = wd_df[wd_df["actual"].eq("positive") & wd_df["predicted"].eq("negative")]
cg_fn_df = cg_df[cg_df["actual"].eq("positive") & cg_df["predicted"].eq("negative")]

In [15]:
# All misclassified rows per knowledge graph: the union of the
# (actual 'negative', predicted 'positive') and
# (actual 'positive', predicted 'negative') cases, in original row order.
wd_err_df = wd_df.loc[
    lambda frame: (frame["actual"].eq("negative") & frame["predicted"].eq("positive"))
    | (frame["actual"].eq("positive") & frame["predicted"].eq("negative"))
]
cg_err_df = cg_df.loc[
    lambda frame: (frame["actual"].eq("negative") & frame["predicted"].eq("positive"))
    | (frame["actual"].eq("positive") & frame["predicted"].eq("negative"))
]

In [23]:
# Write the misclassified rows to CSV, keeping only the four columns below
# and omitting the index column.
wd_err_df.loc[:, ["concept", "entity", "actual", "predicted"]].to_csv(
    "wd_err.csv",
    index=False,
)
cg_err_df.loc[:, ["concept", "entity", "actual", "predicted"]].to_csv(
    "cg_err.csv",
    index=False,
)

In [24]:
# Read the annotated error tables back in.
# NOTE(review): these are different files from the wd_err.csv / cg_err.csv
# exports; presumably they are those exports with a "human" column added
# outside this notebook — confirm how they are produced.
wd_annotated_df = pd.read_csv(filepath_or_buffer="wd_err_annotated.csv")
cg_annotated_df = pd.read_csv(filepath_or_buffer="cg_err_annotated.csv")

In [26]:
def _matrix_against_human(annotated, reference_column):
    """Build a pycm confusion matrix of `reference_column` versus "human".

    The reference column is passed as pycm's actual vector and the "human"
    column as its predict vector, over the classes 'positive' and 'negative'.
    """
    return pycm.ConfusionMatrix(
        actual_vector=annotated[reference_column].tolist(),
        predict_vector=annotated["human"].tolist(),
        digit=2,
        classes=['positive', 'negative'],
    )


# "hvkg": human annotation compared with the "actual" column;
# "hvllm": human annotation compared with the "predicted" column.
wd_hvkg_cm = _matrix_against_human(wd_annotated_df, "actual")
wd_hvllm_cm = _matrix_against_human(wd_annotated_df, "predicted")
cg_hvkg_cm = _matrix_against_human(cg_annotated_df, "actual")
cg_hvllm_cm = _matrix_against_human(cg_annotated_df, "predicted")

In [37]:
# One summary row per knowledge graph: false-positive and false-negative
# counts, plus the `Kappa` value of each human-vs-reference confusion matrix.
analysis = [
    {
        "kg": kg_name,
        "FP": len(fp_frame),
        "FN": len(fn_frame),
        "k_human-kg": human_vs_kg.Kappa,
        "k_human-llm": human_vs_llm.Kappa,
    }
    for kg_name, fp_frame, fn_frame, human_vs_kg, human_vs_llm in (
        ("Wikidata", wd_fp_df, wd_fn_df, wd_hvkg_cm, wd_hvllm_cm),
        ("CaLiGraph", cg_fp_df, cg_fn_df, cg_hvkg_cm, cg_hvllm_cm),
    )
]
analysis_df = pd.DataFrame.from_records(analysis)
analysis_df

Unnamed: 0,kg,FP,FN,k_human-kg,k_human-llm
0,Wikidata,46,90,0.242658,-0.241515
1,CaLiGraph,27,50,-0.295206,0.197691
