In [37]:
import json
import pickle
import numpy as np
from typing import Dict, List, Tuple

# Small constant added to the denominator of the kappa formula below so the
# division stays defined when the expected agreement equals 1.
EPS = 1e-6

In [42]:
def load_manual_labels_json(file_path: str) -> Dict:
    """Read the manual-label JSON file and key it by integer sample index.

    JSON object keys are always strings, so every top-level key is converted
    with ``int`` before the mapping is returned. The values are passed through
    unchanged.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A dictionary mapping ``int(key)`` to the corresponding JSON value.
    """
    with open(file_path, "r") as handle:
        raw_labels = json.load(handle)
    return {int(key): payload for key, payload in raw_labels.items()}


def load_llm_pkl(file_path: str) -> Dict:
    """Load the pickled LLM results and index them by sample id.

    The unpickled object is iterated; for each record the first element is
    converted with ``int`` and used as the key, and the remaining elements
    (``record[1:]``) are stored as the value. If the same index occurs more
    than once, the last occurrence wins.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only use this on files from a trusted source.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        A dictionary mapping sample index to the rest of that record.
    """
    with open(file_path, "rb") as handle:
        records = pickle.load(handle)

    return {int(record[0]): record[1:] for record in records}


# Input files: the manual labels (JSON) and the LLM results (pickle).
file1 = "llm_autointerp/manual_labels_can_final.json"
file2 = "llm_autointerp/llm_results.pkl"

# Both loaders return dictionaries keyed by integer sample index.
manual_file = load_manual_labels_json(file1)

llm_file = load_llm_pkl(file2)


In [43]:
# Show which sample indices have LLM results.
print(llm_file.keys())

dict_keys([63, 13, 9, 4, 17, 2, 65, 53, 1, 15, 27, 67, 69, 59, 87, 47, 85, 81, 71, 31, 21, 45, 83, 37, 33, 57, 35, 51, 11, 55, 3, 75, 49, 19, 73, 77, 0, 79, 39, 41, 89, 7, 6, 29, 8, 23, 61, 25, 43, 5])


In [59]:
# Pick one sample and print the manual and LLM annotations side by side.
sample_idx = 1

# Only the first entry of the sample's example prompts is shown.
print(f"##### Example Prompts\n {manual_file[sample_idx]['example_prompts'][0]}\n\n")
print(f"##### Manual chain of thought\n{manual_file[sample_idx]['chain_of_thought']}\n\n")
# Element 0 of an LLM result is printed as its chain of thought, element 1 as its labels.
print(f"##### LLM chain of thought\n{llm_file[sample_idx][0]}\n\n")
print(f"manual labels {manual_file[sample_idx]['per_class_scores']}")
print(f"LLM labels    {llm_file[sample_idx][1]}")

##### Example Prompts
 


Example 1: a variance to permit construction of the << dental>>(4) << office>>(1) by the Marion County Board of Zoning Appeals.
Subsequently on September 11, 1959, the Director of the Metropolitan Planning Department filed an affidavit to appeal said decision to the Metropolitan Board of Zoning Appeals, as authorized by statute.
"53-969. Petitions for variance.  The city and county board of zoning appeals and the metropolitan board of zoning appeals are hereby authorized to grant height, bulk, area and use variances in the manner hereinafter set forth. Both city or county board of zoning appeals and the metropolitan board of zoning appeals may grant petitions for variance in their entirety or in




Example 2: eliminate << dental>>(4), << vision>>(1) and << pharmacy>>(0) benefits for low-income adults.

The Trump administration had argued Obama's Medicaid expansion essentially created a new program under Medicaid. Allowing states to cover low-income adults wi

In [48]:
def extract_scores_manual(data: Dict, is_valid: List[bool]) -> Dict[str, List[int]]:
    """Gather manual scores per category for the samples flagged as valid.

    Samples are visited in ascending key order. ``is_valid`` is indexed with
    the integer sample key itself (not with the sample's position), and keys
    that are ``>= len(is_valid)`` are skipped.

    Args:
        data: Mapping from sample index to an entry that contains a
            ``"per_class_scores"`` dictionary of ``{category: score}``.
        is_valid: Flags looked up by sample index; only samples whose flag is
            truthy contribute scores.

    Returns:
        ``{category: [score, ...]}`` with scores in ascending sample order.
    """
    collected: Dict[str, List[int]] = {}
    flag_count = len(is_valid)
    for raw_key in sorted(data.keys()):
        key = int(raw_key)
        entry = data[key]
        # Skip samples without a flag or whose flag is falsy.
        if key >= flag_count or not is_valid[key]:
            continue
        for name, value in entry["per_class_scores"].items():
            collected.setdefault(name, []).append(value)
    return collected


def extract_scores_llm(data: Dict[int, Tuple]) -> Tuple[Dict[str, List[int]], List[bool]]:
    """Collect LLM scores per category and record which samples had scores.

    Samples are visited in ascending key order.

    Args:
        data: Mapping from sample index to that sample's LLM result sequence.
            Element ``1`` of each sequence is either a ``{category: score}``
            dictionary or ``None`` when no scores are available.

    Returns:
        A pair ``(scores_per_category, is_valid)``:

        * ``scores_per_category`` maps each category to the scores of all
          samples that had a score dictionary, in ascending sample order.
        * ``is_valid`` holds one flag per sample in ascending key order
          (so it is positional, not indexed by sample key); ``False`` marks
          samples whose scores were ``None``.
    """
    is_valid: List[bool] = []
    result: Dict[str, List[int]] = {}
    for idx in sorted(data.keys()):
        scores = data[idx][1]  # The scores dictionary is the second element of each result
        if scores is None:
            is_valid.append(False)
            continue
        is_valid.append(True)
        for category, score in scores.items():
            result.setdefault(category, []).append(score)
    return result, is_valid


def extract_paired_llm_manual_scores(
    llm_data: dict[int, tuple[str, dict[str, int], bool, str]],
    manual_data: dict[int, dict],
) -> tuple[dict[str, list[int]], dict[str, list[int]], list[bool]]:
    """Build aligned per-category score lists for the LLM and the manual labels.

    Samples are visited in ascending order of the keys of ``llm_data``. For
    every sample whose LLM scores are not ``None``, the LLM score and the
    manual score of each LLM category are appended at the same position of
    the two output lists, so position ``i`` always refers to the same sample.

    Args:
        llm_data: Mapping from sample index to the LLM result; element ``1``
            is the ``{category: score}`` dictionary or ``None``.
        manual_data: Mapping from sample index to an entry that contains a
            ``"per_class_scores"`` dictionary.

    Returns:
        ``(llm_results_per_class, manual_results_per_class, is_valid)`` where
        ``is_valid`` has one flag per LLM sample in ascending key order and is
        ``False`` for samples whose LLM scores were ``None``.

    Raises:
        KeyError: If an LLM sample index is missing from ``manual_data``, or
            an LLM category is missing from that sample's manual scores.
    """
    is_valid: list[bool] = []
    llm_results_per_class: dict[str, list[int]] = {}
    manual_results_per_class: dict[str, list[int]] = {}

    for idx in sorted(llm_data.keys()):
        llm_scores = llm_data[idx][1]  # The scores dictionary is the second element of each result
        # Looked up for every sample, including ones without LLM scores.
        manual_scores = manual_data[idx]["per_class_scores"]

        if llm_scores is None:
            is_valid.append(False)
            continue

        is_valid.append(True)
        for category, llm_score in llm_scores.items():
            llm_results_per_class.setdefault(category, []).append(llm_score)
            manual_results_per_class.setdefault(category, []).append(manual_scores[category])
    return llm_results_per_class, manual_results_per_class, is_valid


# Earlier two-step extraction, kept for reference:
# llm_labels, is_valid = extract_scores_llm(llm_file)
# manual_labels = extract_scores_manual(manual_file, is_valid)

# Build per-category score lists for both raters in one pass, so position i
# in each list refers to the same sample.
llm_labels, manual_labels, is_valid = extract_paired_llm_manual_scores(llm_file, manual_file)

# Spot-check one category by printing both raters' scores.
test_key = "dentist"

print(llm_labels[test_key][:])
print(manual_labels[test_key][:])

[0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0]
[0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0]


In [54]:
def cohens_kappa(scores1: Dict[str, List[int]], scores2: Dict[str, List[int]]) -> Dict[str, float]:
    """Compute Cohen's kappa between two raters for every category.

    For each category, kappa is ``(p_o - p_e) / (1 - p_e + EPS)`` where
    ``p_o`` is the fraction of samples on which both raters gave the same
    score and ``p_e`` is the agreement expected by chance from each rater's
    score frequencies.

    Args:
        scores1: ``{category: scores}`` from the first rater.
        scores2: ``{category: scores}`` from the second rater. Must contain
            every category of ``scores1`` with a list of the same length,
            where position ``i`` in both lists refers to the same sample.

    Returns:
        ``{category: kappa}`` for each category in ``scores1``. A category
        with no samples yields ``nan``.

    Raises:
        KeyError: If a category of ``scores1`` is missing from ``scores2``.
        ValueError: If the two score lists of a category differ in length.
    """

    def kappa(a: np.ndarray, b: np.ndarray) -> float:
        n = len(a)
        if n == 0:
            # Agreement is undefined without samples; return nan explicitly
            # instead of relying on a 0/0 division.
            return float("nan")

        categories = np.unique(np.concatenate([a, b]))

        # Observed agreement
        observed = np.sum(a == b) / n

        # Expected agreement
        expected = sum((np.sum(a == i) / n) * (np.sum(b == i) / n) for i in categories)

        # EPS keeps the denominator non-zero; when both raters use one and the
        # same score for every sample (expected == 1) the result is 0.
        return float((observed - expected) / (1 - expected + EPS))

    results = {}
    for category in scores1.keys():
        a = np.array(scores1[category])
        b = np.array(scores2[category])
        if len(a) != len(b):
            raise ValueError(
                f"Score lists for category {category!r} differ in length: {len(a)} vs {len(b)}"
            )
        results[category] = kappa(a, b)

    return results


def compute_kappa_for_files(file1: str, file2: str) -> Dict[str, float]:
    """Load a manual-label file and an LLM-result file and compute kappa per category.

    Args:
        file1: Path of the manual-label JSON file.
        file2: Path of the LLM-result pickle file.

    Returns:
        ``{category: kappa}`` as produced by ``cohens_kappa`` on the paired
        LLM and manual scores.
    """
    manual_data = load_manual_labels_json(file1)
    llm_data = load_llm_pkl(file2)

    print(f"Length of manual labels: {len(manual_data)}")
    print(f"Length of LLM labels: {len(llm_data)}")

    # TODO: optionally restrict both inputs to the sample indices they share
    # before pairing.

    # Pair the data loaded from the given files (not the notebook-level
    # `llm_file` / `manual_file` globals).
    llm_labels, manual_labels, is_valid_llm_output = extract_paired_llm_manual_scores(llm_data, manual_data)

    print(f"Number of invalid LLM outputs: {len(is_valid_llm_output) - sum(is_valid_llm_output)}")

    return cohens_kappa(llm_labels, manual_labels)


# Agreement between the LLM and the manual scores, one kappa value per category.
scores = cohens_kappa(llm_labels, manual_labels)
for class_name, kappa_value in scores.items():
    print(f"{class_name}: {kappa_value:.4f}")
# kappa_scores = compute_kappa_for_files(file1, file2)

# print("Cohen's Kappa scores for each category:")
# for category, score in kappa_scores.items():
#     print(f"{category}: {score:.4f}")

gender: 0.5452
professor: 0.6449
nurse: -0.0208
accountant: 0.1929
architect: 0.0000
attorney: 0.6390
dentist: 1.0000
filmmaker: 0.3758


In [58]:
# NOTE: `nonzeros` is created here but not filled in by this cell.
nonzeros = {"llm": {}, "manual": {}}

# For each category, count how many paired samples each rater scored above zero.
for category, llm_category_scores in llm_labels.items():
    score_pairs = list(zip(llm_category_scores, manual_labels[category]))
    nonzero_llm = sum(1 for llm_score, _ in score_pairs if llm_score > 0)
    nonzero_manual = sum(1 for _, manual_score in score_pairs if manual_score > 0)
    print(category, nonzero_llm, nonzero_manual)

gender 9 11
professor 4 5
nurse 2 2
accountant 2 7
architect 2 0
attorney 7 5
dentist 2 2
filmmaker 7 7
