In [40]:
import json
import itertools
import numpy as np
from pathlib import Path

def extract_labels(entry):
    """Collect the labels of one tagged entry into a single flat set.

    Only the categories listed in ``keys_to_check`` are read; any other key
    in ``entry`` is ignored. Labels from all checked categories are merged,
    so the same string appearing under two categories is counted once.

    Args:
        entry: Mapping of category name to its labels. A category value may
            be an iterable of labels (e.g. a list), a single bare string,
            ``None``, or absent altogether.

    Returns:
        set: Union of the labels found under the checked categories.
    """
    keys_to_check = (
        "time_period", "materiality", "region", "colour",
        "purpose", "themes",
    )
    labels = set()
    for key in keys_to_check:
        value = entry.get(key)
        if value is None:
            # Missing or null category contributes no labels.
            continue
        if isinstance(value, str):
            # A bare string is ONE label; set.update() would split it into
            # individual characters. An empty string contributes nothing.
            if value:
                labels.add(value)
        else:
            labels.update(value)
    return labels

def compute_jaccard(set1, set2):
    """Return the Jaccard similarity of two sets.

    The score is the size of the intersection divided by the size of the
    union. When both sets are empty the union is empty too, and the score
    is defined as 1.0 (two empty label sets are treated as identical).
    """
    combined = set1 | set2
    if not combined:
        return 1.0
    shared = set1 & set2
    return len(shared) / len(combined)

def load_files(file_paths):
    """Parse each JSON file and index the parsed content by base file name.

    Args:
        file_paths: Iterable of paths to JSON files, given either as
            ``pathlib.Path`` objects or as plain strings.

    Returns:
        dict: Maps each file's base name (``Path.name``) to its parsed JSON
        content. Two paths with the same base name overwrite each other;
        the later one wins.
    """
    data_by_file = {}
    for path in file_paths:
        # Coerce so plain string paths work too (``.name`` needs a Path).
        path = Path(path)
        # JSON is UTF-8 by specification; do not rely on the platform's
        # locale-dependent default encoding.
        with open(path, "r", encoding="utf-8") as f:
            data_by_file[path.name] = json.load(f)
    return data_by_file

def compute_jaccard_stats(data_by_file):
    """Compute, per ID, the mean pairwise Jaccard score across all files.

    For every ID the label set of each file is extracted with
    ``extract_labels`` and every unordered pair of files is scored with
    ``compute_jaccard``. Only the mean and the number of pairs are
    reported; the spread across pairs is not computed.

    Args:
        data_by_file: Mapping of file name to that file's content, where
            the content is itself a mapping of ID to entry.

    Returns:
        dict: Maps each ID to ``{"mean_jaccard": float, "num_pairs": int}``.
        ``mean_jaccard`` is rounded to 4 decimals and is ``nan`` when fewer
        than two files were supplied (no pairs to compare). An empty
        ``data_by_file`` yields an empty dict.

    Raises:
        KeyError: If an ID of the first file is missing from another file.
    """
    file_names = list(data_by_file)
    if not file_names:
        # Nothing was loaded (e.g. the folder held no JSON files).
        return {}

    # The first file defines which IDs are scored; every other file is
    # required to contain them. IDs that exist only in later files are
    # not scored.
    ids = list(data_by_file[file_names[0]])
    # The file pairs are the same for every ID, so build them once.
    file_pairs = list(itertools.combinations(file_names, 2))
    results = {}

    for id_ in ids:
        label_sets = {}
        for fname in file_names:
            entries = data_by_file[fname]
            if id_ not in entries:
                raise KeyError(
                    f"ID {id_!r} is missing from file {fname!r}; "
                    "all files must contain the same IDs"
                )
            label_sets[fname] = extract_labels(entries[id_])

        scores = [
            compute_jaccard(label_sets[f1], label_sets[f2])
            for f1, f2 in file_pairs
        ]

        # np.mean([]) emits a RuntimeWarning before returning nan; make the
        # "no pairs" case explicit instead.
        mean_score = float(np.mean(scores)) if scores else float("nan")
        results[id_] = {
            "mean_jaccard": round(mean_score, 4),
            "num_pairs": len(scores),
        }

    return results

# Example usage
def main(folder_path):
    """Compute per-ID Jaccard statistics over every ``*.json`` file in a folder.

    Args:
        folder_path: Directory (``str`` or ``Path``) that is searched,
            non-recursively, for files matching ``*.json``.

    Returns:
        The result of ``compute_jaccard_stats`` on the loaded files.
    """
    # Path.glob yields entries in arbitrary, filesystem-dependent order.
    # Sort so the file used as the ID reference and the order of the
    # results are the same on every run and machine.
    file_paths = sorted(Path(folder_path).glob("*.json"))
    data_by_file = load_files(file_paths)
    return compute_jaccard_stats(data_by_file)

# Run the pairwise Jaccard computation over every *.json file in the folder below.
# NOTE(review): despite its name, `input_file` holds a FOLDER path; main() globs it for *.json.
input_file = "output/Objectifying_China/tagged/test_seeds_no_explain/temp1.2"
final_results = main(input_file)
# JSON-formatted (indent=2) string copy of the results.
# NOTE(review): `results` is not used in the cells visible here; confirm it is needed.
results = json.dumps(final_results, indent=2)


In [41]:
# Aggregate the per-ID mean Jaccard scores over all IDs.
mean_scores = [item["mean_jaccard"] for item in final_results.values()]

print(f"mean of the mean score: {np.mean(mean_scores)}")
# This is the spread of the per-ID *mean* scores; no per-ID std is involved.
print(f"std of the mean score: {np.std(mean_scores)}")


mean of the mean score: 0.8119846153846152
std of the std score: 0.11473438788486043


In [5]:
# Display the per-ID results.
# NOTE(review): the captured output below looks stale. It contains a `std_jaccard` key,
# which compute_jaccard_stats as written above does not emit (that line is commented out),
# and this cell's execution count (5) is lower than that of the cells above (40, 41).
# Restart the kernel and run all cells to refresh it.
final_results

{'lnwpgxpl': {'mean_jaccard': 0.9048, 'std_jaccard': 0.0673, 'num_pairs': 3},
 'a8s3x6t7': {'mean_jaccard': 0.9259, 'std_jaccard': 0.0524, 'num_pairs': 3},
 'vc5p22wp': {'mean_jaccard': 1.0, 'std_jaccard': 0.0, 'num_pairs': 3},
 'qb4wa7zc': {'mean_jaccard': 1.0, 'std_jaccard': 0.0, 'num_pairs': 3},
 'im6cgdm0': {'mean_jaccard': 0.8274, 'std_jaccard': 0.0552, 'num_pairs': 3},
 'lhzzzkix': {'mean_jaccard': 0.8519, 'std_jaccard': 0.1048, 'num_pairs': 3},
 'ygehf8bq': {'mean_jaccard': 1.0, 'std_jaccard': 0.0, 'num_pairs': 3},
 'uwcf2d0r': {'mean_jaccard': 0.8667, 'std_jaccard': 0.0943, 'num_pairs': 3},
 'z9xpyepf': {'mean_jaccard': 0.9394, 'std_jaccard': 0.0429, 'num_pairs': 3},
 'c9c55t0n': {'mean_jaccard': 1.0, 'std_jaccard': 0.0, 'num_pairs': 3},
 'lhrmwftm': {'mean_jaccard': 1.0, 'std_jaccard': 0.0, 'num_pairs': 3},
 'c49bx3b3': {'mean_jaccard': 0.9167, 'std_jaccard': 0.0589, 'num_pairs': 3},
 'if7eszt6': {'mean_jaccard': 0.8667, 'std_jaccard': 0.0943, 'num_pairs': 3},
 'uiilau7k': {'m