# MedPix Metadata Extraction

## Imports

In [47]:
import json
from pathlib import Path
import re
from collections import Counter

## 1. Load Raw Data

In [48]:
data_path = Path('../data/archive/Cases.json')

with open(data_path, 'r', encoding='utf-8') as f:
    raw_cases = json.load(f)

print(f"Total number of cases: {len(raw_cases)}")

Total number of cases: 7432


## 2. Define Extraction Functions

In [49]:
def extract_added_on(text: str):
    if not isinstance(text, str) or not text:
        return None
    m = re.search(r"added on\s+(\d{4}-\d{2}-\d{2})", text, flags=re.I)
    return m.group(1) if m else None

def extract_last_edited_on(text: str):
    if not isinstance(text, str) or not text:
        return None
    m = re.search(r"last edited on\s+(\d{4}-\d{2}-\d{2})", text, flags=re.I)
    return m.group(1) if m else None

def extract_age(text: str):
    if not isinstance(text, str) or not text:
        return None
    t = text.lower()
    m = re.search(r"\b(\d{1,3})\s*(?:y[/\.-]?\s?o|yo|yr old|years?\s*old)\b", t)
    if not m:
        return None
    try:
        val = int(m.group(1))
        return val if 0 <= val <= 120 else None
    except ValueError:
        return None

def extract_gender(text: str):
    if not isinstance(text, str) or not text:
        return None
    t = text.lower()
    male = re.search(r"\b(male|man)\b", t) is not None
    female = re.search(r"\b(female|woman)\b", t) is not None
    if male and not female:
        return "Male"
    if female and not male:
        return "Female"
    return None  # ambiguous or not found

def extract_modalities(text: str):
    """
    Extract a list of imaging modalities mentioned in the text.
    Returns a list like ["CT", "MRI"] or [] if none found.
    """
    if not isinstance(text, str) or not text:
        return []
    t = text.lower()

    modalities = set()

    if re.search(r"\bct(\s+ivp)?\b|\bcta\b", t):
        modalities.add("CT")
    if re.search(r"\bmri\b|\bmr\b", t):
        modalities.add("MRI")
    if re.search(r"x-?\s?ray|\bradiograph(s)?\b|\bplain film\b", t):
        modalities.add("X-ray")
    if re.search(r"\bultrasound\b|\bus\b|\bsonograph", t):
        modalities.add("Ultrasound")
    if re.search(r"\bfluoro|\bdmx\b", t):
        modalities.add("Fluoroscopy")
    if re.search(r"\bpet\b|\bnuclear\b|\bspect\b", t):
        modalities.add("PET/Nuclear")
    if re.search(r"\bangiogram\b|\bmra\b", t):
        modalities.add("Angiogram")
    if re.search(r"\bmammogram\b|\bmammography\b", t):
        modalities.add("Mammogram")
    if re.search(r"\bdexa\b|\bbone scan\b", t):
        modalities.add("Bone Scan")

    return list(modalities)

def extract_regions(text: str):
    """
    Extract a list of body regions mentioned in the text.
    Returns a list like ["Abdomen", "Chest"] or [] if none found.
    """
    if not isinstance(text, str) or not text:
        return []

    REGION_PATTERNS = {
        "Abdomen": r"\babdomen\b|\babdominal\b|rlq|llq|ruq|luq",
        "Pelvis": r"\bpelvi(c|s)\b|\bpelvic\b",
        "Chest": r"\bchest\b|\bthorax\b|\blungs?\b|\bpulmonary\b",
        "Spine—Cervical": r"\bcervical\b|\bc\d-?\d?\b",
        "Spine—Thoracic": r"\bthoracic spine\b|\bt\d-?\d?\b",
        "Spine—Lumbar": r"\blumbar\b|\bl\d-?\d?\b",
        "Spine—Sacrum/SI": r"\bsacrum\b|\bsacral\b|\bsacroiliac\b|\bsi joint\b",
        "Spine—Unspecified": r"\bspine\b|\bspinal\b",
        "Brain/Head": r"\bbrain\b|\bhead\b|\bcranial\b|\bintracranial\b",
        "Face/Sinus": r"\bsinus(es)?\b|\bparanasal\b|\bmaxillary sinus\b|\bethmoid\b|\bfrontal sinus\b|\bmaxillofacial\b",
        "Orbit/Eye": r"\borbit(al)?\b|\bglobe\b|\boptic (nerve|canal)\b|\bretro[- ]?orbital\b",
        "Neck (non-spine)": r"\bneck\b|\blarynx\b|\bpharyn[gx]\b",
        "Aorta": r"\baortic\b|\babdominal aortic aneurysm\b|\baaa\b|\baorta\b",
        "Vessels—Other": r"\b(ivc|iliac|carotid|subclavian|femoral artery|popliteal|mesenteric)\b",
        "Kidney": r"\bkidney\b|\brenal\b",
        "Bladder": r"\bbladder\b|\bvesical\b",
        "Liver": r"\bliver\b|\bhepatic\b",
        "Spleen": r"\bspleen\b|\bsplenic\b",
        "Pancreas": r"\bpancreas\b|\bpancreatic\b",
        "Gallbladder/Biliary": r"\bgallbladder\b|\bchole(cyst|doch)\b|\bbiliary\b",
        "Bowel/Colon": r"\bcolon\b|\bcolonic\b|\bappendix\b|\bappendicitis\b|\bdiverticul",
        "Small Intestine": r"\bsmall (bowel|intestin\w*)\b|\bileum\b|\bileal\b|\bjejunum\b|\bduodenum\b",
        "Hip": r"\bhip\b|\bacetabul",
        "Knee": r"\bknee\b|\bmenisc|\bacl\b|\bpcl\b|\bpatell",
        "Shoulder": r"\bshoulder\b|\brotator cuff\b",
        "Wrist/Hand": r"\bwrist\b|\bcarpal\b|\bmetacarp|\bhand\b",
        "Ankle/Foot": r"\bankle\b|\bfoot\b|\bmetatars|\btalus\b|\bcalcaneus\b",
        "Leg (Tibia/Fibula)": r"\btibia\b|\bfibula\b|\bleg\b(?!ion)",
        "Arm/Humerus": r"\bhumerus\b|\bupper arm\b",
        "Forearm": r"\bradius\b|\bulna\b|\bforearm\b",
        "Elbow": r"\belbow\b|\boleranon\b",
        "Urogenital": r"\badnex\w*\b|\buterus\b|\bovary\b|\bcervix\b|\buterine\b|\bprostate\b|\btest(i|es)\b|\bscrotum\b|\binguin",
        "Breast": r"\bbreast\b|\bmammary\b|\bmammogram\b|\bmammography\b",
        "Soft Tissue/Skin": r"\bsoft[- ]tissue\b|\bsubcutaneous\b|\bcutaneous\b|\bskin\b",
    }

    hits = []
    t = text.lower()
    for name, pattern in REGION_PATTERNS.items():
        if re.search(pattern, t):
            hits.append(name)

    # remove duplicates, preserve order
    seen = set()
    return [h for h in hits if not (h in seen or seen.add(h))]

def extract_word_count(text):
    """Return length of the case text in words."""
    if not isinstance(text, str) or not text.strip():
        return 0
    # Count words (split on whitespace)
    return len(text.split())

def extract_image_count(case):
    """Return the number of images in a case."""
    imgs = case.get("Image Paths")
    if isinstance(imgs, list):
        return len(imgs)
    return 0

def extract_case_id(case_folder):
    match = re.search(r'case_(-?\d+)', case_folder)
    return match.group(1) if match else None

## 3. Build Metadata Records

In [50]:
cases_metadata = []
for index, case in enumerate(raw_cases):
    text = case.get("Case Title", "") or ""
    folder = case.get("Case Folder", "") or ""

    case_id = extract_case_id(folder)
    if not case_id:
        case_id = f"case_{index}"
    
    cases_metadata.append({
        "case_id": case_id,
        "added_on": extract_added_on(text),
        "last_edited_on": extract_last_edited_on(text),
        "age": extract_age(text),
        "gender": extract_gender(text),
        "modalities": extract_modalities(text),
        "regions": extract_regions(text),
        "image_count": extract_image_count(case),
        "word_count": extract_word_count(text),
    })

# Preview a few
print("Sample metadata records (first 3):")
for rec in cases_metadata[:3]:
    print(rec)

Sample metadata records (first 3):
{'case_id': '8892378009084536600', 'added_on': '2018-03-06', 'last_edited_on': '2024-01-21', 'age': 14, 'gender': None, 'modalities': ['Fluoroscopy', 'CT', 'X-ray', 'MRI'], 'regions': ['Abdomen', 'Chest', 'Spine—Cervical', 'Spine—Thoracic', 'Spine—Lumbar', 'Spine—Unspecified', 'Brain/Head', 'Neck (non-spine)', 'Kidney', 'Bladder', 'Liver', 'Hip', 'Knee', 'Shoulder', 'Wrist/Hand', 'Leg (Tibia/Fibula)', 'Soft Tissue/Skin'], 'image_count': 23, 'word_count': 4806}
{'case_id': '-16278608286148448', 'added_on': '2019-07-26', 'last_edited_on': None, 'age': 27, 'gender': 'Male', 'modalities': ['CT'], 'regions': ['Abdomen', 'Pelvis', 'Chest', 'Spine—Unspecified', 'Brain/Head', 'Neck (non-spine)', 'Kidney', 'Bladder', 'Hip'], 'image_count': 7, 'word_count': 648}
{'case_id': '-9029866025949687595', 'added_on': None, 'last_edited_on': None, 'age': None, 'gender': None, 'modalities': ['CT'], 'regions': ['Abdomen'], 'image_count': 2, 'word_count': 100}


## 4. Save as JSON

In [51]:
out_path = Path("../data/processed/cases_metadata.json")
out_path.parent.mkdir(parents=True, exist_ok=True)

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(cases_metadata, f, ensure_ascii=False, indent=2)

print(f"\n✅ Saved case metadata to {out_path}")


✅ Saved case metadata to ..\data\processed\cases_metadata.json
