In [13]:
import json
from pathlib import Path
import shutil

import json, shutil
from pathlib import Path

def has_seg(a):
    """Return True when annotation ``a`` carries a non-empty segmentation.

    Two layouts are recognised under the ``"segmentation"`` key:

    * a list (polygons) - valid when the list is non-empty;
    * a dict (RLE)      - valid when both ``"counts"`` and ``"size"`` are truthy.

    Anything else (missing key, ``None``, other types) yields False.
    """
    seg = a.get("segmentation")
    if isinstance(seg, dict):
        # RLE: both fields have to be present and non-empty.
        return all(bool(seg.get(field)) for field in ("counts", "size"))
    if isinstance(seg, list):
        # Polygons: any non-empty list counts.
        return bool(seg)
    return False

def copy_images_with_seg(
    annotation_json: str,
    source_dir: str,
    dest_dir: str,
    exts=(".jpg", ".jpeg", ".png", ".tif", ".tiff")
):
    """Copy every image that has at least one valid segmentation into ``dest_dir``.

    The annotation file is read as JSON with top-level ``"images"`` and
    ``"annotations"`` lists. An image qualifies when some annotation that
    references its id passes ``has_seg``. Images are matched to files under
    ``source_dir`` (searched recursively) by basename, case-insensitively, and
    copied flat into ``dest_dir``, which is created if needed.

    Args:
        annotation_json: Path to the annotation JSON file.
        source_dir: Root folder that is searched recursively for image files.
        dest_dir: Folder that receives the copies.
        exts: File extensions (with leading dot) treated as images. Matching is
            case-insensitive; a single string is accepted as a one-item filter.

    Returns:
        None. A summary (counts and up to 20 missing names) is printed.

    Notes:
        If several files under ``source_dir`` share a basename, the one visited
        last by ``rglob`` is used. A file that already *is* its own destination
        (possible when ``dest_dir`` lies inside ``source_dir``) is left in place
        and still counted as copied.
    """
    ann = json.loads(Path(annotation_json).read_text(encoding="utf-8"))
    imgs = ann.get("images", [])
    anns = ann.get("annotations", [])

    # ids of images that have at least one valid segmentation
    ids_with_seg = {a["image_id"] for a in anns if "image_id" in a and has_seg(a)}

    # map image id -> file_name (basename)
    id2name = {
        img["id"]: Path(str(img["file_name"]).strip()).name
        for img in imgs
        if "id" in img and "file_name" in img
    }

    # target basenames, lowercased
    targets = {id2name[i].lower() for i in ids_with_seg if i in id2name}

    # The suffix is lowercased before the lookup, so the filter must be too;
    # a bare string would otherwise be matched by substring.
    if isinstance(exts, str):
        exts = (exts,)
    wanted_exts = {e.lower() for e in exts}

    src_root = Path(source_dir)
    dst_root = Path(dest_dir)
    dst_root.mkdir(parents=True, exist_ok=True)

    # index all source images by basename (case-insensitive)
    index = {}
    for p in src_root.rglob("*"):
        if p.is_file() and p.suffix.lower() in wanted_exts:
            index[p.name.lower()] = p

    copied, missing = 0, []
    for name in sorted(targets):
        src = index.get(name)
        if src is None:
            missing.append(name)
            continue
        target = dst_root / src.name
        # When dest_dir sits inside source_dir, an earlier run's output can be
        # picked up by the index; copying a file onto itself raises
        # shutil.SameFileError, so skip the copy in that case.
        if not (target.exists() and src.samefile(target)):
            shutil.copy2(src, target)
        copied += 1

    print(f"Images in JSON: {len(imgs)}")
    print(f"Images with ≥1 valid segmentation: {len(ids_with_seg)}")
    print(f"Copied: {copied}")
    print(f"Missing: {len(missing)}")
    if missing:
        print("Missing examples (first 20):")
        for m in missing[:20]:
            print("  -", m)

# Copy the segmented images referenced by the selected annotation file into ./340_Test.
# Earlier annotation files are kept as commented-out alternatives; only one
# annotation_json line should be active at a time.
copy_images_with_seg(
    # annotation_json="./Data/1200CNV.json",
    # annotation_json="./Data/CNV_521+Outlier (1).json", 
    # annotation_json="./Data/550Noel.json",
    # annotation_json ="./Data/instances_Train.json",
    annotation_json ="./Data/subset_340.json",
    # NOTE(review): machine-specific absolute path. The literal contains a space
    # before "/train"; the recorded run copied 340 files with it, so it is
    # presumably the real folder name - confirm before changing.
    source_dir='/Users/ammaster10/Downloads/Test/oct2017/OCT2017 /train/CNV',
    dest_dir="./340_Test"
)

Images in JSON: 340
Images with ≥1 valid segmentation: 340
Copied: 340
Missing: 0


In [14]:
def count_files_in_folder(folder_path, exts=None):
    """
    Count the files directly inside a folder, optionally filtering by extension.

    Only the folder's immediate children are inspected (no recursion), and
    sub-directories are never counted.

    Args:
        folder_path (str): Path to the folder
        exts (tuple | str, optional): File extensions to include, with leading
            dot (e.g. ``(".jpg", ".png")``). Matching is case-insensitive and a
            single string is accepted. If None or empty, count all files.

    Returns:
        int: Number of matching files, or 0 when the folder does not exist
    """
    path = Path(folder_path)
    if not path.exists():
        print(f"Folder {folder_path} does not exist")
        return 0

    if exts:
        # The suffix is lowercased before the lookup, so the filter has to be
        # lowercased as well; wrap a bare string so it is not matched by substring.
        if isinstance(exts, str):
            exts = (exts,)
        wanted = {e.lower() for e in exts}
    else:
        wanted = None

    count = sum(
        1
        for f in path.glob("*")
        if f.is_file() and (wanted is None or f.suffix.lower() in wanted)
    )

    print(f"Found {count} files in {folder_path}")
    return count

# Example usage
# (the commented-out calls point at machine-specific absolute paths and are kept for reference)
# file_count = count_files_in_folder("/Users/ammaster10/Desktop/OCT2017/train/CNV", exts=(".jpg", ".jpeg", ".png", ".tif", ".tiff"))
# file_count = count_files_in_folder('/Users/ammaster10/Downloads/Test/oct2017/OCT2017 /train/CNV', exts=(".jpg", ".jpeg", ".png", ".tif", ".tiff"))
# Sanity check: count the image files sitting directly in ./340_Test.
file_count = count_files_in_folder('./340_Test', exts=(".jpg", ".jpeg", ".png", ".tif", ".tiff"))

Found 340 files in ./340_Test


In [2]:
import json
from pathlib import Path

# Parse the annotation file and pull out its image and annotation lists.
# Missing keys fall back to empty lists, so the statistics below still run.
ann = json.loads(Path("./Data/instances_Train.json").read_text(encoding="utf-8"))

imgs = ann.get("images", [])
anns = ann.get("annotations", [])

def has_seg(a):
    """Tell whether annotation ``a`` has a usable ``"segmentation"`` entry.

    A list (polygons) is usable when it is non-empty; a dict (RLE) is usable
    when both its ``"counts"`` and ``"size"`` values are truthy. Every other
    value, including a missing key, is treated as "no segmentation".
    """
    segmentation = a.get("segmentation")
    is_polygon_list = isinstance(segmentation, list)
    is_rle_dict = isinstance(segmentation, dict)

    if is_polygon_list:
        return bool(segmentation)
    if not is_rle_dict:
        return False

    has_counts = bool(segmentation.get("counts"))
    return has_counts and bool(segmentation.get("size"))

# Headline counts for the loaded file: total image entries, total annotation
# entries, and how many distinct image ids are referenced by any annotation
# versus by annotations that pass has_seg(). Annotations without an "image_id"
# key are ignored for the two unique-id counts.
img_count            = len(imgs)
ann_count            = len(anns)
uniq_imgids_any      = len({a["image_id"] for a in anns if "image_id" in a})
uniq_imgids_with_seg = len({a["image_id"] for a in anns if "image_id" in a and has_seg(a)})

print("Images:", img_count)
print("Annotations:", ann_count)
print("Unique image_ids (any):", uniq_imgids_any)
print("Unique image_ids (with seg):", uniq_imgids_with_seg)

Images: 1191
Annotations: 819
Unique image_ids (any): 819
Unique image_ids (with seg): 819


In [6]:
import json
from pathlib import Path

# <<< set your input files here >>>
# Annotation files to merge, in order. merge_coco() takes "info" and "licenses"
# from the first entry and assigns new ids in this order.
INPUTS = [
    "./Data/1700Training.json",
    "./Validation/TEMP_VAL_45.json"
]
# Destination of the merged annotation file (parent folders are created by merge_coco()).
OUT_PATH = "./Data/1700Training_WithVal.json"

def load_json(p):
    """Read the UTF-8 encoded JSON file at ``p`` and return the parsed object."""
    with open(p, encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

def merge_coco(paths, out_path):
    """Merge several COCO-style annotation files into one and write it to ``out_path``.

    * ``info`` and ``licenses`` are taken from the first file.
    * Categories are unified by ``name``; only ``id``, ``name`` and
      ``supercategory`` are kept.
    * Images are de-duplicated by ``file_name`` (first occurrence wins). Images
      without a ``file_name`` cannot be matched and are always kept as separate
      entries.
    * Annotations get fresh ids and are re-pointed at the merged image and
      category ids. Annotations whose image or category is unknown within
      their own file are skipped. Annotations are NOT de-duplicated, so an
      image present in two inputs receives the annotations of both.

    All new ids are assigned contiguously starting at 1.

    Args:
        paths: Iterable of input JSON paths, merged in order.
        out_path: Output JSON path; parent folders are created if needed.

    Returns:
        tuple[int, int]: ``(total_images, total_annotations)`` of the merged file.

    Raises:
        IndexError: If ``paths`` is empty.
    """
    paths = list(paths)
    if not paths:
        # Same exception type the former ``paths[0]`` lookup produced, with a clearer message.
        raise IndexError("merge_coco() needs at least one input path")

    # Output skeleton; info/licenses are filled in from the first file below.
    out = {
        "info": {},
        "licenses": [],
        "images": [],
        "annotations": [],
        "categories": [],
    }

    # Maps for ID remapping
    cat_name_to_newid = {}
    img_fname_to_newid = {}

    # Next IDs
    next_cat_id = 1
    next_img_id = 1
    next_ann_id = 1

    # Build categories map across files by name
    def get_or_add_category(cat):
        nonlocal next_cat_id
        name = cat["name"]
        if name not in cat_name_to_newid:
            cat_name_to_newid[name] = next_cat_id
            out["categories"].append({
                "id": next_cat_id,
                "name": name,
                "supercategory": cat.get("supercategory", "")
            })
            next_cat_id += 1
        return cat_name_to_newid[name]

    # Process each file (each one is parsed exactly once)
    for file_index, p in enumerate(paths):
        data = load_json(p)

        if file_index == 0:
            out["info"] = data.get("info", {})
            out["licenses"] = data.get("licenses", [])

        # Build oldCatId -> newCatId for this file
        cat_id_map = {}
        for c in data.get("categories", []):
            cat_id_map[c["id"]] = get_or_add_category(c)

        # Images: dedupe by file_name
        img_id_map = {}
        for img in data.get("images", []):
            fname = img.get("file_name")
            # A missing file_name must not act as a shared dedupe key, otherwise
            # unrelated images would collapse into one entry and their
            # annotations would be attached to the wrong image.
            new_id = img_fname_to_newid.get(fname) if fname is not None else None
            if new_id is None:
                new_id = next_img_id
                next_img_id += 1
                if fname is not None:
                    img_fname_to_newid[fname] = new_id
                new_img = dict(img)
                new_img["id"] = new_id
                out["images"].append(new_img)
            img_id_map[img["id"]] = new_id

        # Annotations: remap image_id and category_id, assign new ann id
        for ann in data.get("annotations", []):
            if ann.get("image_id") not in img_id_map:
                # skip orphaned annotation
                continue
            if ann.get("category_id") not in cat_id_map:
                # skip unknown category
                continue
            new_ann = dict(ann)
            new_ann["id"] = next_ann_id
            new_ann["image_id"] = img_id_map[ann["image_id"]]
            new_ann["category_id"] = cat_id_map[ann["category_id"]]
            out["annotations"].append(new_ann)
            next_ann_id += 1

    # Save merged
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    # Totals
    total_images = len(out["images"])
    total_annotations = len(out["annotations"])
    print(f"Saved: {out_path}")
    print(f"Total images: {total_images}")
    print(f"Total annotations: {total_annotations}")
    return total_images, total_annotations

# Run the merge with the inputs configured above. The recorded output below this
# cell shows the call executing in the notebook.
if __name__ == "__main__":
    merge_coco(INPUTS, OUT_PATH)

Saved: ./Data/1700Training_WithVal.json
Total images: 2397
Total annotations: 2575


In [10]:
# split_coco_subset_and_remainder.py
import json, random
from pathlib import Path

def load(p):
    """Parse the UTF-8 JSON file at ``p`` and return the resulting object."""
    with open(p, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save(obj, p):
    """Write ``obj`` as indented (2 spaces), non-ASCII-preserving UTF-8 JSON to ``p``.

    Missing parent folders of ``p`` are created first.
    """
    target = Path(p)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        json.dump(obj, handle, indent=2, ensure_ascii=False)

def build_coco_from_image_ids(data, keep_image_ids, remap_ids=True):
    """Build a COCO-style dict restricted to the images in ``keep_image_ids``.

    Args:
        data: Source dataset with ``"images"``, ``"annotations"`` and
            ``"categories"`` lists (``"info"`` / ``"licenses"`` are optional).
        keep_image_ids: Iterable of image ids to keep. Any iterable works;
            duplicates are ignored and ids absent from ``data["images"]``
            contribute no image entry.
        remap_ids: When True (default) image, annotation and category ids are
            renumbered contiguously from 1, image and category numbering
            following the sorted original ids. When False the original entries
            are returned unchanged.

    Returns:
        dict with ``info``, ``licenses``, ``images``, ``annotations`` and
        ``categories``. Only categories used by the kept annotations are
        included. Entries of ``data`` are never modified; remapped entries are
        shallow copies.

    Notes:
        An annotation that references a category id missing from
        ``data["categories"]`` is kept in both modes, and no category entry is
        emitted for it.
    """
    images_all = {im["id"]: im for im in data["images"]}
    anns_all   = data["annotations"]
    cats_all   = {c["id"]: c for c in data["categories"]}

    # Materialise the ids once: de-duplicated in caller order for the image
    # list, plus a set for O(1) membership tests while scanning annotations.
    keep_order = list(dict.fromkeys(keep_image_ids))
    keep_set = set(keep_order)

    # images to keep
    keep_images = [images_all[i] for i in keep_order if i in images_all]

    # annotations to keep
    keep_anns = [a for a in anns_all if a["image_id"] in keep_set]

    # categories used
    used_cat_ids = {a["category_id"] for a in keep_anns}
    keep_cats = [cats_all[cid] for cid in used_cat_ids if cid in cats_all]

    if not remap_ids:
        return {
            "info": data.get("info", {}),
            "licenses": data.get("licenses", []),
            "images": keep_images,
            "annotations": keep_anns,
            "categories": keep_cats,
        }

    # remap IDs to contiguous
    oldimg2new = {oid: i + 1 for i, oid in enumerate(sorted(keep_set))}
    oldcat2new = {oid: i + 1 for i, oid in enumerate(sorted(used_cat_ids))}

    images_out = []
    for im in keep_images:
        nim = dict(im)
        nim["id"] = oldimg2new[im["id"]]
        images_out.append(nim)

    anns_out = []
    for i, a in enumerate(keep_anns, start=1):
        na = dict(a)
        na["id"] = i
        na["image_id"] = oldimg2new[a["image_id"]]
        na["category_id"] = oldcat2new[a["category_id"]]
        anns_out.append(na)

    cats_out = []
    # preserve names/supercategory while assigning new contiguous ids
    for old_id, new_id in oldcat2new.items():
        c = cats_all.get(old_id)
        if c is None:
            # Category referenced by an annotation but not declared in the
            # source; the non-remapping branch tolerates this, so do not
            # crash here either.
            continue
        cats_out.append({"id": new_id, "name": c["name"], "supercategory": c.get("supercategory", "")})

    return {
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "images": images_out,
        "annotations": anns_out,
        "categories": cats_out,
    }

def pick_annotated_images(data, n, seed=42):
    """Randomly choose ``n`` image ids among the images that have annotations.

    Args:
        data: Dataset dict with ``"images"`` and ``"annotations"`` lists.
        n: Number of image ids to draw.
        seed: Seed for the draw; the same seed and data give the same selection.

    Returns:
        set: The selected image ids.

    Raises:
        ValueError: If fewer than ``n`` annotated images are available.
    """
    # only images that have at least one annotation
    annotated_ids = {a.get("image_id") for a in data["annotations"]}
    pool = [im["id"] for im in data["images"] if im["id"] in annotated_ids]
    if n > len(pool):
        raise ValueError(f"Requested {n} images but only {len(pool)} annotated images available.")
    # A private generator yields the same sample as seeding the module-level
    # one, without resetting the global random state for the rest of the session.
    rng = random.Random(seed)
    return set(rng.sample(pool, n))

def split_subset_and_remainder(in_path, subset_out, remainder_out, n_subset=340, seed=42, remap_ids=True):
    """Split one annotation file into a random annotated subset and everything else.

    ``n_subset`` image ids are drawn (seeded by ``seed``) from the annotated
    images of ``in_path``. The subset dataset is written to ``subset_out`` and
    a dataset holding every other image of the input - annotated or not - is
    written to ``remainder_out``. ``remap_ids`` is forwarded to
    ``build_coco_from_image_ids`` for both outputs. A one-line summary per
    output file is printed; nothing is returned.
    """
    coco = load(in_path)

    chosen_ids = pick_annotated_images(coco, n_subset, seed=seed)
    leftover_ids = {image["id"] for image in coco["images"]}.difference(chosen_ids)

    # Both datasets are built before anything is written to disk.
    subset_coco = build_coco_from_image_ids(coco, chosen_ids, remap_ids=remap_ids)
    remainder_coco = build_coco_from_image_ids(coco, leftover_ids, remap_ids=remap_ids)

    save(subset_coco, subset_out)
    save(remainder_coco, remainder_out)

    subset_images, subset_anns = len(subset_coco["images"]), len(subset_coco["annotations"])
    rest_images, rest_anns = len(remainder_coco["images"]), len(remainder_coco["annotations"])
    print(f"Saved subset:    {subset_out} | images={subset_images}, anns={subset_anns}")
    print(f"Saved remainder: {remainder_out} | images={rest_images}, anns={rest_anns}")

# Draw 340 annotated images (seed 42) from the merged training+validation file
# and write the subset and the remaining images to separate annotation files.
# NOTE(review): the remainder file name says "1400", but the recorded run below
# reports images=2057 for it, since the remainder holds every image not in the subset.
if __name__ == "__main__":
    split_subset_and_remainder(
        in_path="./Data/1700Training_WithVal.json",
        subset_out="./Data/subset_340.json",
        remainder_out="./Data/remainder_1400.json",
        n_subset=340,
        seed=42,
        remap_ids=True,  # set False to keep original IDs
    )

Saved subset:    ./Data/subset_340.json | images=340, anns=506
Saved remainder: ./Data/remainder_1400.json | images=2057, anns=2069
