# Dataset exploration

In [31]:
import xml.etree.ElementTree as ET

In [32]:
tree = ET.parse('data/annotations.xml')

In [62]:
images = [
    {
        "name": image.attrib["name"],
        "tags": [tag.attrib["label"] for tag in image.findall("tag")]
    }
    for image in tree.findall("image")
]

In [63]:
multiple_quality = []
multiple_views = []
multiple_views_and_extra = []
no_views = []
no_quality = []

acceptable_tags = set([
    "quality 1 (perfect)",
    "quality 2",
    "quality 3",
    "quality 4",
    "quality 5 (unreadable)",
    "ap view",
    "axillary view",
    "scapular y view",
    "grashey view",
    "velpeau",
    "other"
])

for image in images:
    has_velpeau = False
    has_other = False
    view_count = 0
    quality_count = 0

    marks = set()  # use set for uniqueness

    for tag in image["tags"]:
        tag = tag.lower()
        
        if tag not in acceptable_tags:
            marks.add("unknown_tag")
            continue

        if "view" in tag.lower():
            view_count += 1
            if view_count > 1:
                marks.add("multiple_views")
                multiple_views.append(image)

        if "quality" in tag.lower():
            quality_count += 1
            if quality_count > 1:
                marks.add("multiple_quality")
                multiple_quality.append(image)

        if tag == "velpeau":
            has_velpeau = True
        if tag == "other":
            has_other = True

    # post-processing classifications
    if quality_count == 0:
        marks.add("no_quality")
        no_quality.append(image)

    if view_count == 0:
        marks.add("no_views")
        no_views.append(image)

    if view_count > 0 and (has_velpeau or has_other):
        marks.add("multiple_views_and_extra")
        multiple_views_and_extra.append(image)

    # convert set back to list before saving
    image["marks"] = list(marks)


In [64]:
print("\n=== Classification Stats ===")
print(f"Total images: {len(images)}")
print(f"  Multiple quality: {len(multiple_quality)}")
print(f"  Multiple views: {len(multiple_views)}")
print(f"  Multiple views + extra: {len(multiple_views_and_extra)}")
print(f"  No views: {len(no_views)}")
print(f"  No quality: {len(no_quality)}")

# Optional: show quick IDs or indexes for debugging
def summarize(lst, name, key="id"):
    ids = [str(img.get(key, i)) for i, img in enumerate(lst)]
    print(f"{name} ({len(lst)}): {', '.join(ids[:10])}{' ...' if len(ids) > 10 else ''}")

print("\n=== Quick Lists ===")
summarize(multiple_quality, "Multiple quality")
summarize(multiple_views, "Multiple views")
summarize(multiple_views_and_extra, "Multiple views + extra")
summarize(no_views, "No views")
summarize(no_quality, "No quality")


=== Classification Stats ===
Total images: 4211
  Multiple quality: 2
  Multiple views: 2
  Multiple views + extra: 40
  No views: 270
  No quality: 2342

=== Quick Lists ===
Multiple quality (2): 0, 1
Multiple views (2): 0, 1
Multiple views + extra (40): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ...
No views (270): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ...
No quality (2342): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ...


# Dataset Binary True Labels

In [68]:
from load_dataset import make_dataset
import json

BASE_DIR = "./data"
IMAGE_DIR = "./data/images"

dataset_settings = {
    "image_size": (224, 224),
    "label_map": {
        'quality 1 (perfect)':0,
        'quality 2': None,
        'quality 3': None,
        'quality 4': 1,
        'quality 5 (unreadable)': 1,
    }
}

In [69]:
dataset = make_dataset(BASE_DIR, IMAGE_DIR, 
                       image_size=dataset_settings["image_size"],
                       label_map=dataset_settings["label_map"])

In [76]:
pairs = []
hashes = {}

for path, label in zip(dataset.images, dataset.labels):
    pairs.append({ "path": path, "label": label.item() })
    hashes[path] = label.item()

json_str = json.dumps(pairs, indent=2)

print(json_str[:300], "...\n\n")
print("PAIRS", len(pairs))

[
  {
    "path": "MURA-v1.1/Negative/train/XR_SHOULDER/patient00007/study2_negative/image2.png",
    "label": 1.0
  },
  {
    "path": "MURA-v1.1/Negative/train/XR_SHOULDER/patient00074/study4_negative/image3.png",
    "label": 0.0
  },
  {
    "path": "MURA-v1.1/Negative/train/XR_SHOULDER/patient0 ...


PAIRS 732


In [30]:
with open("binary_dataset.json", "w") as file:
    file.write(json_str) 

In [71]:
images_set = set(dataset.images)
images_binary = [image for image in images if image["name"] in images_set]

assert len(images_binary) == len(pairs)

In [92]:
counts_good = {}
counts_bad = {}

counts_good_total = 0
counts_bad_total = 0

for image in images_binary:
    if hashes[image["name"]] == 0:
        counts_good_total += 1
        for tag in image["tags"]:
            counts_good[tag] = counts_good.get(tag, 0) + 1
    else:
        counts_bad_total += 1
        for tag in image["tags"]:
            counts_bad[tag] = counts_bad.get(tag, 0) + 1

counts_good = dict(sorted(counts_good.items()))
counts_bad = dict(sorted(counts_bad.items()))

freqs_good = {key:str(value/counts_good_total*100) + "%" for key, value in counts_good.items()}
freqs_bad = {key:str(value/counts_bad_total*100) + "%" for key, value in counts_bad.items()}

print("0 - no retake: ",json.dumps(freqs_good, indent=2))
print("1 - retake xray: ", json.dumps(freqs_bad, indent=2))

0 - no retake:  {
  "Axillary View": "31.16531165311653%",
  "Grashey View": "68.83468834688347%",
  "quality 1 (perfect)": "100.0%",
  "quality 2": "0.27100271002710025%"
}
1 - retake xray:  {
  "Axillary View": "75.48209366391184%",
  "Grashey View": "24.793388429752067%",
  "other": "5.234159779614325%",
  "quality 4": "81.81818181818183%",
  "quality 5 (unreadable)": "18.181818181818183%"
}
