In [1]:
import pycm, pandas as pd

In [2]:
# Load the annotated error samples, one CSV per knowledge graph.
# (wd_df feeds the "Wikidata" row and cg_df the "CaLiGraph" row of the
# summary table built later in this notebook.)
wd_df, cg_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'error-analysis/wd_err_annotated.csv',
        'error-analysis/cg_err_annotated.csv',
    )
)

# Stacked frame used for the TOTAL figures. Row labels from each file are
# kept as-is (pd.concat default), so index values may repeat across sources.
all_df = pd.concat([wd_df, cg_df])

In [3]:
# Split each annotated frame by error type, comparing the "actual" and
# "predicted" label columns.

# False positives: "actual" is 'negative' while "predicted" is 'positive'.
wd_fp_df = wd_df.loc[wd_df["actual"].eq('negative') & wd_df["predicted"].eq('positive')]
cg_fp_df = cg_df.loc[cg_df["actual"].eq('negative') & cg_df["predicted"].eq('positive')]

# False negatives: "actual" is 'positive' while "predicted" is 'negative'.
wd_fn_df = wd_df.loc[wd_df["actual"].eq('positive') & wd_df["predicted"].eq('negative')]
cg_fn_df = cg_df.loc[cg_df["actual"].eq('positive') & cg_df["predicted"].eq('negative')]

In [4]:
# Confusion matrices pairing the "human" column with the "actual" column
# (*_hvkg_cm) and with the "predicted" column (*_hvllm_cm), per frame.
# NOTE(review): the variable names and the "human-kg kappa" / "human-llm kappa"
# table columns suggest "actual" is the KG label and "predicted" the LLM
# label -- confirm against the annotation files.
# Only the .Kappa attribute of these objects is read in the summary cell.
wd_hvkg_cm = pycm.ConfusionMatrix(
    actual_vector=wd_df["actual"].tolist(),
    predict_vector=wd_df["human"].tolist(),
    digit=2,
    classes=[ 'positive', 'negative' ],
)
wd_hvllm_cm = pycm.ConfusionMatrix(
    actual_vector=wd_df["predicted"].tolist(),
    predict_vector=wd_df["human"].tolist(),
    digit=2,
    classes=[ 'positive', 'negative' ],
)
cg_hvkg_cm = pycm.ConfusionMatrix(
    actual_vector=cg_df["actual"].tolist(),
    predict_vector=cg_df["human"].tolist(),
    digit=2,
    classes=[ 'positive', 'negative' ],
)
cg_hvllm_cm = pycm.ConfusionMatrix(
    actual_vector=cg_df["predicted"].tolist(),
    predict_vector=cg_df["human"].tolist(),
    digit=2,
    classes=[ 'positive', 'negative' ],
)

In [5]:
# Annotation columns summarised in the table, in display order.
ERROR_CATEGORIES = (
    "missing data",
    "missing relation",
    "incorrect relation",
    "incorrect reasoning",
)


def _count_with_share(df, column):
    """Format the sum of ``df[column]`` as ``'<count> (<share>%)'``.

    The share is the column sum divided by the number of rows in ``df``,
    rendered with one decimal place. For an empty frame the share is shown
    as ``nan%`` instead of raising ``ZeroDivisionError``.
    """
    count = df[column].sum()
    n_rows = len(df)
    share = float(count) / float(n_rows) if n_rows else float("nan")
    return f'{count} ({share:.1%})'


def _summary_row(kg, n_fp, n_fn, human_kg_kappa, human_llm_kappa, annotated_df):
    """Build one record of the summary table.

    ``N`` is the number of false positives plus false negatives; the
    per-category shares are relative to ``len(annotated_df)``.
    NOTE(review): the two coincide only if every row of ``annotated_df`` is
    either a false positive or a false negative -- true for the output
    currently saved with this notebook, but not enforced here.
    """
    row = {
        "kg": kg,
        "N": n_fp + n_fn,
        "FP": n_fp,
        "FN": n_fn,
        "human-kg kappa": human_kg_kappa,
        "human-llm kappa": human_llm_kappa,
    }
    for category in ERROR_CATEGORIES:
        row[category] = _count_with_share(annotated_df, category)
    return row


analysis = [
    _summary_row(
        "Wikidata",
        len(wd_fp_df), len(wd_fn_df),
        wd_hvkg_cm.Kappa, wd_hvllm_cm.Kappa,
        wd_df,
    ),
    _summary_row(
        "CaLiGraph",
        len(cg_fp_df), len(cg_fn_df),
        cg_hvkg_cm.Kappa, cg_hvllm_cm.Kappa,
        cg_df,
    ),
    # No pooled kappa is computed for the combined row; the cells are left blank.
    _summary_row(
        "TOTAL",
        len(wd_fp_df) + len(cg_fp_df), len(wd_fn_df) + len(cg_fn_df),
        "", "",
        all_df,
    ),
]
analysis_df = pd.DataFrame.from_records(analysis)
analysis_df

Unnamed: 0,kg,N,FP,FN,human-kg kappa,human-llm kappa,missing data,missing relation,incorrect relation,incorrect reasoning
0,Wikidata,136,46,90,0.235294,-0.235294,34 (25.0%),15 (11.0%),33 (24.3%),54 (39.7%)
1,CaLiGraph,77,27,50,-0.295206,0.197691,28 (36.4%),19 (24.7%),20 (26.0%),10 (13.0%)
2,TOTAL,213,73,140,,,62 (29.1%),34 (16.0%),53 (24.9%),64 (30.0%)
