# Combine Metadata Formats

This notebook combines the two different metadata formats used for the analytics dashboard and the case page into a single format. They are combined in the following way:

* Age and gender are taken from the case page version due to better coverage compared to the analytics version
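* Modalities and regions are taken from the analytics version because it supports multiple modalities/regions per case; they replace the single-valued `modality_guess` and `body_region` fields of the case page version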
* Remaining fields are taken from the case page version, with the addition of the date fields (`added_on`, `last_edited_on`) and the `word_count` field from the analytics version

Since all metadata is now combined, there is no need for a separate `cases_metadata.json` file, so only `cases_summary.json` and `cases_cleaned.json` are created.

In [None]:
import json
from pathlib import Path

# --- Load input files ---
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


cleaned = _read_json("../data/processed/old/old_cases_cleaned.json")
summary = _read_json("../data/processed/old/old_cases_summary.json")
metadata = _read_json("../data/processed/old/old_cases_metadata.json")

# --- Build a quick lookup for metadata by case id ---
# Later entries win if the same case_id appears more than once.
meta_by_id = dict((entry["case_id"], entry) for entry in metadata)

def _coerce_age(value):
    """Return ``value`` as an int age, or None when it is not a usable number."""
    # bool is a subclass of int; a JSON true/false is not an age.
    if isinstance(value, bool):
        return None
    if isinstance(value, str):
        text = value.strip()
        if not text.isdigit():
            return None
        try:
            return int(text)
        except ValueError:
            # str.isdigit() accepts characters such as "²" that int() rejects.
            return None
    if isinstance(value, (int, float)):
        try:
            return int(value)  # floats are truncated toward zero
        except (ValueError, OverflowError):
            # int() raises on NaN / ±Infinity, which json.load accepts by default.
            return None
    return None


def merge_fields(case, meta):
    """Merge metadata fields and rename modality/body_region fields.

    Args:
        case: Mapping describing one case. It is shallow-copied, so the
            caller's dict is left untouched.
        meta: Mapping with the metadata for the same case; pass an empty
            dict when no metadata is available.

    Returns:
        A new dict based on ``case`` where:
          * ``modality_guess`` and ``body_region`` are removed,
          * ``patient_age`` is an int, or None if it is missing or cannot be
            converted (non-digit string, NaN/Infinity, bool, other types),
          * ``modalities`` / ``regions`` come from ``meta`` (default ``[]``),
          * ``added_on`` / ``last_edited_on`` / ``word_count`` come from
            ``meta`` (default None).
        The ``modalities`` and ``regions`` values are the same objects held by
        ``meta``; they are not copied.
    """
    case = dict(case)  # make a shallow copy
    case.pop("modality_guess", None)
    case.pop("body_region", None)

    # Normalise patient_age to int-or-None without ever raising on odd input.
    case["patient_age"] = _coerce_age(case.get("patient_age"))

    # Add metadata fields
    case["modalities"] = meta.get("modalities", [])
    case["regions"] = meta.get("regions", [])
    case["added_on"] = meta.get("added_on")
    case["last_edited_on"] = meta.get("last_edited_on")
    case["word_count"] = meta.get("word_count")
    return case


# --- Combine data ---
def _attach_metadata(records):
    """Merge every record with the metadata entry that shares its id.

    Records without a matching metadata entry are merged with an empty dict.
    """
    merged = []
    for record in records:
        merged.append(merge_fields(record, meta_by_id.get(record["id"], {})))
    return merged


new_cleaned = _attach_metadata(cleaned)
new_summary = _attach_metadata(summary)

# --- Save output ---
# Both files are written with the same settings: UTF-8, non-ASCII kept as-is,
# two-space indentation.
for target, payload in (
    ("../data/processed/cases_cleaned.json", new_cleaned),
    ("../data/processed/cases_summary.json", new_summary),
):
    with open(target, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

print("✅ Created cases_cleaned.json and cases_summary.json")


✅ Created cases_cleaned.json and cases_summary.json
