In [11]:
from pathlib import Path
import json, csv, math
from collections import Counter, defaultdict
from google.colab import files

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Root folder; each non-hidden subfolder is read as one annotator.
# NOTE(review): the folder name is spelled "Labled" -- presumably it matches
# the directory as uploaded, so it is kept verbatim.
FOLDER_PATH = Path("/content/Labled Data")

# Optional explicit annotator folder names. When left as None the script
# falls back to the annotator folders in sorted order.
ANNOTATOR1 = None
ANNOTATOR2 = None

# Destination of the per-file / overall kappa summary.
OUT_CSV = Path("/content/kappa_summary.csv")

# Textual vote label -> integer class code used for the kappa computation.
LABEL_MAP = {
    "Acceptable": 0,
    "Needs Revision": 1,
    "Unacceptable": 2,
}

# All class codes, in ascending order.
CLASSES = sorted(LABEL_MAP.values())

def is_leaf_record(d):
    """Return True when *d* is a record with no nested records beneath it.

    A record is a dict that has a "name" key. It counts as a leaf unless its
    "description" value is a list containing at least one dict. Anything that
    is not a dict, or that lacks "name", is not a record and yields False.
    """
    if not (isinstance(d, dict) and "name" in d):
        return False

    description = d.get("description")
    holds_child_dicts = isinstance(description, list) and any(
        isinstance(item, dict) for item in description
    )
    return not holds_child_dicts

def extract_votes(obj):
    """Collect one label per voted leaf record found anywhere inside *obj*.

    The structure is walked depth-first through dicts and lists. Each node is
    addressed by a slash-joined path of the dict keys / list indexes leading
    to it (the root is the empty string). For every dict that is a leaf record
    (see ``is_leaf_record``) and whose "vote" value is a non-empty list
    starting with a dict, the first entry of that dict's "labels" value is
    stored under the node's path. Records without a usable vote are skipped.

    Returns:
        dict mapping node path -> first label of the first vote.
    """
    found = {}

    def visit(node, location):
        if isinstance(node, list):
            for index, child in enumerate(node):
                visit(child, f"{location}/{index}")
            return
        if not isinstance(node, dict):
            return

        if "vote" in node and is_leaf_record(node):
            ballots = node.get("vote")
            if isinstance(ballots, list) and ballots and isinstance(ballots[0], dict):
                labels = ballots[0].get("labels", [])
                if labels:
                    found[location] = labels[0]

        # Keep descending even below a voted record, as nested values may
        # themselves contain further records.
        for key, child in node.items():
            visit(child, f"{location}/{key!s}")

    visit(obj, "")
    return found

def cohen_kappa(labels_a, labels_b, classes):
    """Compute Cohen's kappa between two equally long label sequences.

    kappa = (po - pe) / (1 - pe), where ``po`` is the observed fraction of
    items on which both raters chose the same class and ``pe`` is the
    agreement expected by chance from each rater's class frequencies.

    Args:
        labels_a: Sequence of hashable labels from the first rater.
        labels_b: Sequence of hashable labels from the second rater, aligned
            item-by-item with ``labels_a``.
        classes: Iterable of the class labels to account for. Labels outside
            ``classes`` still count towards the number of items but never
            towards agreement.

    Returns:
        The kappa coefficient as a float. When chance agreement is (numerically)
        1 -- both raters used one and the same class for every item -- kappa is
        undefined; 1.0 is returned if the raters agree everywhere, else 0.0.

    Raises:
        ValueError: If the sequences differ in length or are empty.
    """
    n = len(labels_a)
    if n != len(labels_b):
        # zip() would silently truncate while n kept the longer length,
        # producing a wrong coefficient instead of an error.
        raise ValueError(
            f"Label sequences must have equal length, got {n} and {len(labels_b)}."
        )
    if n == 0:
        raise ValueError("Cannot compute Cohen's kappa for empty label sequences.")

    count_a = Counter(labels_a)
    count_b = Counter(labels_b)
    # Only the diagonal of the confusion matrix is needed for po.
    agreements = Counter(a for a, b in zip(labels_a, labels_b) if a == b)

    po = sum(agreements[c] for c in classes) / n
    pe = sum((count_a[c] / n) * (count_b[c] / n) for c in classes)

    # Comparing against 0.0 needs an absolute tolerance: with the default
    # abs_tol=0.0, math.isclose(x, 0.0) is only true for x == 0.0 exactly.
    if math.isclose(1.0 - pe, 0.0, abs_tol=1e-12):
        return 1.0 if math.isclose(po, 1.0) else 0.0
    return (po - pe) / (1 - pe)

# Every non-hidden subfolder of FOLDER_PATH is treated as one annotator.
annotators = sorted(
    d.name for d in FOLDER_PATH.iterdir()
    if d.is_dir() and not d.name.startswith(".")
)
if len(annotators) < 2:
    raise RuntimeError("Need at least two annotator subfolders.")

# Fail early on a misspelled override: a missing folder would otherwise just
# produce an empty summary without any hint of what went wrong.
for _override in (ANNOTATOR1, ANNOTATOR2):
    if _override and not (FOLDER_PATH / _override).is_dir():
        raise RuntimeError(
            f"Annotator folder {_override!r} not found in {FOLDER_PATH}; "
            f"available: {annotators}"
        )

# Fill in whichever annotator was not given explicitly with the first
# discovered folder that differs from the other one. Taking annotators[0] /
# annotators[1] blindly could pair an annotator with themselves (kappa 1.0)
# when only one override is set. At least two folders exist, so next() always
# finds a candidate.
a1 = ANNOTATOR1 or next(name for name in annotators if name != ANNOTATOR2)
a2 = ANNOTATOR2 or next(name for name in annotators if name != a1)
if a1 == a2:
    raise RuntimeError(
        f"ANNOTATOR1 and ANNOTATOR2 both resolve to {a1!r}; "
        "two different annotators are required."
    )
# The JSON file names found in the first annotator's folder define which files
# are compared (presumably one file per province, going by the variable names).
provinces = sorted({json_file.name for json_file in (FOLDER_PATH / a1).glob("*.json")})

# votes[annotator][file name] -> {record path: label}, as produced by
# extract_votes. A file missing for an annotator simply has no entry.
votes = {}
for ann in (a1, a2):
    per_file = {}
    for prov in provinces:
        fp = FOLDER_PATH / ann / prov
        if not fp.exists():
            continue
        with fp.open("r", encoding="utf-8") as handle:
            data = json.load(handle)
        per_file[prov] = extract_votes(data)
    votes[ann] = per_file

# One summary row per file, followed by a pooled "__OVERALL__" row.
rows = []
overall_a, overall_b = [], []
for prov in provinces:
    first_votes = votes[a1].get(prov, {})
    second_votes = votes[a2].get(prov, {})

    # Pair up the records both annotators voted on, keeping only pairs where
    # both labels are known to LABEL_MAP, and convert them to class codes.
    pairs = [
        (LABEL_MAP[first_votes[key]], LABEL_MAP[second_votes[key]])
        for key in sorted(first_votes.keys() & second_votes.keys())
        if first_votes[key] in LABEL_MAP and second_votes[key] in LABEL_MAP
    ]

    summary = {"province": Path(prov).stem, "N": len(pairs), "kappa": ""}
    if pairs:
        codes_a, codes_b = (list(column) for column in zip(*pairs))
        summary["kappa"] = round(cohen_kappa(codes_a, codes_b, CLASSES), 4)
        overall_a.extend(codes_a)
        overall_b.extend(codes_b)
    rows.append(summary)

# Overall kappa is computed on the pooled pairs of all files; it stays None
# (blank in the CSV) when no comparable pair exists at all.
if overall_a:
    overall_kappa = cohen_kappa(overall_a, overall_b, CLASSES)
    overall_cell = round(overall_kappa, 4)
else:
    overall_kappa = None
    overall_cell = ""
rows.append({"province": "__OVERALL__", "N": len(overall_a), "kappa": overall_cell})

# Persist the summary table. "utf-8-sig" prepends a BOM to the file
# (presumably so spreadsheet applications detect the encoding).
with OUT_CSV.open("w", newline="", encoding="utf-8-sig") as csv_file:
    summary_writer = csv.DictWriter(csv_file, fieldnames=["province", "N", "kappa"])
    summary_writer.writeheader()
    summary_writer.writerows(rows)

# Hand the CSV to the Colab download helper, then report the headline result.
files.download(str(OUT_CSV))
print(f"Annotators: {a1} vs {a2}")
print(f"Overall Cohen's kappa: {overall_kappa}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Annotators: Alireza vs Asal
Overall Cohen's kappa: 0.9493811906256961
