# Improve Case Data

### Changes:
* **Remove duplicate cases:** Some cases share the same id. They are usually identical in text content, but some link to more images than others. For each id we keep only the case with the highest word count, breaking ties by the highest image count, in order to preserve as much data as possible.
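* **Group regions into broader categories:** The old data had 34 fine-grained region categories. To reduce granularity, these are grouped into 9 broader categories. Each case keeps its original fine-grained regions, nested under the broad category they belong to.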

In [214]:
import json
from pathlib import Path

# Folder holding the processed case exports (relative to the working directory).
DATA_DIR = Path("../data/processed")

# Read each export as UTF-8 text and parse it as JSON in a single expression.
cases_cleaned = json.loads((DATA_DIR / "cases_cleaned.json").read_text(encoding="utf-8"))
cases_summary = json.loads((DATA_DIR / "cases_summary.json").read_text(encoding="utf-8"))

In [215]:
def remove_duplicates(cases):
    """Return one case per ``id``, keeping the richest record for each id.

    A record is "richer" when its ``(word_count, imageCount)`` pair is larger;
    a missing field counts as 0. When several records with the same id tie,
    the one that appears first in ``cases`` is kept.

    Args:
        cases: Iterable of case dicts, each with an ``"id"`` key.

    Returns:
        A list of the surviving case dicts, ordered by the first appearance
        of each id in ``cases``.

    Raises:
        KeyError: If a case has no ``"id"`` key.
    """
    def richness(record):
        return (record.get("word_count", 0), record.get("imageCount", 0))

    # Dicts preserve insertion order, and overwriting a value keeps the key's
    # original position, so ids stay in first-seen order.
    best_by_id = {}
    for candidate in cases:
        case_id = candidate["id"]
        # Strict ">" means an equally rich later duplicate never displaces
        # the earlier one.
        if case_id not in best_by_id or richness(candidate) > richness(best_by_id[case_id]):
            best_by_id[case_id] = candidate
    return list(best_by_id.values())

# Deduplicate both datasets with the same rule, then report the surviving counts.
cases_cleaned, cases_summary = (
    remove_duplicates(dataset) for dataset in (cases_cleaned, cases_summary)
)

print(f"After removing duplicates: {len(cases_cleaned)} cleaned cases, {len(cases_summary)} summary cases.")

After removing duplicates: 5872 cleaned cases, 5872 summary cases.


In [216]:
# Lookup table: fine-grained region label -> broad category.
# The table is written grouped by broad category and flattened by the
# comprehension, so each broad category lists its member labels in one place.
FINE_TO_BROAD = {
    fine_label: broad_label
    for broad_label, fine_labels in (
        (
            "Head & Neck",
            ("Brain/Head", "Face/Sinus", "Orbit/Eye", "Neck (non-spine)"),
        ),
        (
            "Spine",
            (
                "Spine—Cervical",
                "Spine—Thoracic",
                "Spine—Lumbar",
                "Spine—Sacrum/SI",
                "Spine—Unspecified",
            ),
        ),
        ("Chest", ("Chest", "Breast")),
        (
            "Abdomen",
            (
                "Abdomen",
                "Liver",
                "Spleen",
                "Pancreas",
                "Gallbladder/Biliary",
                "Bowel/Colon",
                "Small Intestine",
                "Kidney",
            ),
        ),
        ("Pelvis", ("Pelvis", "Bladder", "Urogenital")),
        ("Vascular", ("Aorta", "Vessels—Other")),
        (
            "Upper Extremity",
            ("Shoulder", "Arm/Humerus", "Elbow", "Forearm", "Wrist/Hand"),
        ),
        (
            "Lower Extremity",
            ("Hip", "Knee", "Leg (Tibia/Fibula)", "Ankle/Foot"),
        ),
        ("Soft Tissue", ("Soft Tissue/Skin",)),
    )
    for fine_label in fine_labels
}

def compute_broad_regions(fine_regions):
    """Group fine-grained region labels under their broad category.

    Args:
        fine_regions: Iterable of fine-grained region labels; each must be a
            key of ``FINE_TO_BROAD``.

    Returns:
        A dict mapping each broad category that occurs to the list of fine
        labels belonging to it, in the order they were encountered.

    Raises:
        KeyError: If a label is not present in ``FINE_TO_BROAD``.
    """
    grouped = {}
    for label in fine_regions:
        category = FINE_TO_BROAD[label]
        if category not in grouped:
            grouped[category] = []
        grouped[category].append(label)
    return grouped

def refine_regions(cases):
    """Replace each case's flat ``regions`` list with a broad-category grouping.

    Every case is modified in place: ``case["regions"]`` changes from a list
    of fine-grained labels to the dict produced by ``compute_broad_regions``.
    Cases whose ``regions`` is already a dict are treated as already refined
    and left untouched, so calling this twice on the same data is harmless.

    Args:
        cases: Iterable of case dicts, each with a ``"regions"`` key.

    Returns:
        None. The case dicts are updated in place.

    Raises:
        KeyError: If a case has no ``"regions"`` key, or if
            ``compute_broad_regions`` rejects an unknown label.
    """
    for case in cases:
        fine_regions = case["regions"]
        if isinstance(fine_regions, dict):
            # Already grouped. Iterating a dict yields its broad-category
            # keys, and some of those ("Chest", "Abdomen", "Pelvis") are also
            # valid fine labels, so regrouping would silently discard the
            # fine labels stored under them instead of failing.
            continue
        case["regions"] = compute_broad_regions(fine_regions)

# Regroup the regions of both datasets. refine_regions assigns to
# case["regions"] on every case and returns nothing, so the existing lists
# are updated in place rather than rebound.
refine_regions(cases_cleaned)
refine_regions(cases_summary)

print("Refined regions to broad categories.")
# Print one case as a sanity check of the new structure.
# NOTE(review): index 4 looks like an arbitrary sample — confirm; it raises
# IndexError if fewer than five cases are loaded.
print("Example case regions:", cases_cleaned[4]["regions"])

Refined regions to broad categories.
Example case regions: {'Abdomen': ['Abdomen', 'Kidney'], 'Spine': ['Spine—Lumbar', 'Spine—Unspecified'], 'Vascular': ['Aorta', 'Vessels—Other']}


In [217]:
# Write the improved datasets next to the originals under new "_v2" names,
# leaving the input files untouched. Non-ASCII characters are written as-is
# (ensure_ascii=False) and the output is indented by two spaces.
(DATA_DIR / "cases_cleaned_v2.json").write_text(
    json.dumps(cases_cleaned, ensure_ascii=False, indent=2),
    encoding="utf-8",
)
(DATA_DIR / "cases_summary_v2.json").write_text(
    json.dumps(cases_summary, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

print("Saved improved data to cases_cleaned_v2.json and cases_summary_v2.json.")

Saved improved data to cases_cleaned_v2.json and cases_summary_v2.json.
