In [13]:
import json
from pathlib import Path
import shutil

import json, shutil
from pathlib import Path

def has_seg(a):
    """Return True when annotation ``a`` carries a non-empty segmentation.

    Two encodings are recognised, keyed on the type stored under
    ``"segmentation"``:

    * list (polygons): counts when at least one entry holds data. Nested
      polygons that are themselves empty are ignored, so placeholders such
      as ``[]`` and ``[[]]`` are rejected.
    * dict (RLE): counts when both ``"counts"`` and ``"size"`` are truthy.

    A missing key, ``None`` or any other type yields False.

    Args:
        a: annotation mapping (anything exposing ``.get``).

    Returns:
        bool: whether a usable segmentation is present.
    """
    s = a.get("segmentation")
    if isinstance(s, list):   # polygons
        # Entries that are not sequences (e.g. a flat coordinate list) count
        # as content; an empty nested polygon does not.
        return any(
            len(part) > 0 if isinstance(part, (list, tuple)) else True
            for part in s
        )
    if isinstance(s, dict):   # RLE
        return bool(s.get("counts")) and bool(s.get("size"))
    return False

def copy_images_with_seg(
    annotation_json: str,
    source_dir: str,
    dest_dir: str,
    exts=(".jpg", ".jpeg", ".png", ".tif", ".tiff")
):
    """Copy every image that has at least one segmented annotation.

    The annotation file is read as JSON with top-level ``"images"`` and
    ``"annotations"`` lists. Images referenced by an annotation for which
    ``has_seg`` is true are looked up by basename (case-insensitively)
    anywhere under ``source_dir`` and copied flat into ``dest_dir``.
    A summary is printed; nothing is returned.

    Files already present in ``dest_dir`` are never removed, so after
    repeated runs with different annotation files the folder can hold more
    files than the "Copied" figure of the latest run.

    Args:
        annotation_json: path to the annotation JSON file (UTF-8).
        source_dir: root folder searched recursively for image files.
        dest_dir: folder receiving the copies; created if needed.
        exts: file extensions (with leading dot) treated as images.
            Matching is case-insensitive; a single string is accepted too.
    """
    ann = json.loads(Path(annotation_json).read_text(encoding="utf-8"))
    imgs = ann.get("images", [])
    anns = ann.get("annotations", [])

    # Normalise the extension filter: a bare string would otherwise turn the
    # membership test into a substring match, and upper-case entries would
    # never equal the lower-cased suffix.
    if isinstance(exts, str):
        exts = (exts,)
    allowed_exts = {e.lower() for e in exts}

    # ids of images that have at least one valid segmentation
    ids_with_seg = {a["image_id"] for a in anns if "image_id" in a and has_seg(a)}

    # map image id -> file_name (basename)
    id2name = {
        img["id"]: Path(str(img["file_name"]).strip()).name
        for img in imgs
        if "id" in img and "file_name" in img
    }

    # target basenames, lowercased
    targets = {id2name[i].lower() for i in ids_with_seg if i in id2name}

    src_root = Path(source_dir)
    dst_root = Path(dest_dir)
    dst_root.mkdir(parents=True, exist_ok=True)

    # When the destination lives inside the source tree, files copied by an
    # earlier run must not be picked up as sources again.
    src_resolved = src_root.resolve()
    dst_resolved = dst_root.resolve()
    dest_inside_source = src_resolved in dst_resolved.parents

    # index all source images by basename (case-insensitive); walking in
    # sorted order and keeping the first hit makes duplicate basenames
    # resolve the same way on every run
    index = {}
    for p in sorted(src_root.rglob("*")):
        if not p.is_file() or p.suffix.lower() not in allowed_exts:
            continue
        if dest_inside_source and dst_resolved in p.resolve().parents:
            continue
        index.setdefault(p.name.lower(), p)

    copied, missing = 0, []
    for name in sorted(targets):
        src = index.get(name)
        if src:
            try:
                shutil.copy2(src, dst_root / src.name)
            except shutil.SameFileError:
                # dest_dir is the folder the file already sits in; it is
                # in place, so count it as done rather than aborting.
                pass
            copied += 1
        else:
            missing.append(name)

    print(f"Images in JSON: {len(imgs)}")
    print(f"Images with ≥1 valid segmentation: {len(ids_with_seg)}")
    print(f"Copied: {copied}")
    print(f"Missing: {len(missing)}")
    if missing:
        print("Missing examples (first 20):")
        for m in missing[:20]:
            print("  -", m)

# Annotation files used on earlier runs (swap into annotation_json to redo):
#   "./Data/1200CNV.json"
#   "./Data/CNV_521+Outlier (1).json"
#   "./Data/550Noel.json"
copy_images_with_seg(
    annotation_json="./Data/instances_Train.json",
    # NOTE(review): the folder name "OCT2017 " is written with a trailing
    # space; presumably that matches the directory on disk - do not strip it.
    source_dir="/Users/ammaster10/Downloads/Test/oct2017/OCT2017 /train/CNV",
    dest_dir="./1200",
)

Images in JSON: 1191
Images with ≥1 valid segmentation: 848
Copied: 848
Missing: 0


In [14]:
def count_files_in_folder(folder_path, exts=None):
    """
    Count the files directly inside a folder, optionally filtering by extension.

    Only the folder's immediate entries are examined; sub-folders are not
    descended into. A one-line summary is printed in every case.

    Args:
        folder_path (str): Path to the folder
        exts (tuple, optional): File extensions (with leading dot) to include.
            Matching is case-insensitive and a single string is accepted.
            If None or empty, count all files.

    Returns:
        int: Number of matching files, or 0 when the path does not exist
    """
    path = Path(folder_path)
    if not path.exists():
        print(f"Folder {folder_path} does not exist")
        return 0

    files = (f for f in path.glob("*") if f.is_file())
    if exts:
        # A bare string would make the membership test a substring match
        # (which also lets extension-less files through), and upper-case
        # entries would never equal the lower-cased suffix.
        if isinstance(exts, str):
            exts = (exts,)
        wanted = {e.lower() for e in exts}
        count = sum(1 for f in files if f.suffix.lower() in wanted)
    else:
        # Count all files
        count = sum(1 for _ in files)

    print(f"Found {count} files in {folder_path}")
    return count

# Example usage: count the images sitting in the output folder.
# Folders inspected on earlier runs:
#   "/Users/ammaster10/Desktop/OCT2017/train/CNV"
#   '/Users/ammaster10/Downloads/Test/oct2017/OCT2017 /train/CNV'
# The figure covers every matching file in the folder, including any that
# were already there before the latest copy run.
file_count = count_files_in_folder(
    "./1200",
    exts=(".jpg", ".jpeg", ".png", ".tif", ".tiff"),
)

Found 1700 files in ./1200


In [2]:
import json
from pathlib import Path

# Parse the annotation file once and pull out the two top-level lists;
# a missing key falls back to an empty list.
ann = json.loads(
    Path("./Data/instances_Train.json").read_text(encoding="utf-8")
)
imgs, anns = (ann.get(section, []) for section in ("images", "annotations"))

def has_seg(a):
    """Return True when annotation ``a`` carries a non-empty segmentation.

    Two encodings are recognised, keyed on the type stored under
    ``"segmentation"``:

    * list (polygons): counts when at least one entry holds data. Nested
      polygons that are themselves empty are ignored, so placeholders such
      as ``[]`` and ``[[]]`` are rejected.
    * dict (RLE): counts when both ``"counts"`` and ``"size"`` are truthy.

    A missing key, ``None`` or any other type yields False.

    Args:
        a: annotation mapping (anything exposing ``.get``).

    Returns:
        bool: whether a usable segmentation is present.
    """
    s = a.get("segmentation")
    if isinstance(s, list):   # polygons
        # Entries that are not sequences (e.g. a flat coordinate list) count
        # as content; an empty nested polygon does not.
        return any(
            len(part) > 0 if isinstance(part, (list, tuple)) else True
            for part in s
        )
    if isinstance(s, dict):   # RLE
        return bool(s.get("counts")) and bool(s.get("size"))
    return False

# Headline sizes of the two top-level lists.
img_count = len(imgs)
ann_count = len(anns)

# Distinct image ids referenced by annotations: first across all
# annotations, then only across those that pass has_seg().
uniq_imgids_any = len(
    {rec["image_id"] for rec in anns if "image_id" in rec}
)
uniq_imgids_with_seg = len(
    {rec["image_id"] for rec in anns if "image_id" in rec and has_seg(rec)}
)

print("Images:", img_count)
print("Annotations:", ann_count)
print("Unique image_ids (any):", uniq_imgids_any)
print("Unique image_ids (with seg):", uniq_imgids_with_seg)

Images: 1191
Annotations: 819
Unique image_ids (any): 819
Unique image_ids (with seg): 819
