In [9]:
import json
from pathlib import Path
import shutil

def copy_cnv_images(annotation_json: str,
                    source_dir: str,
                    dest_dir: str,
                    exts=(".jpg", ".jpeg", ".png", ".tif", ".tiff")):
    """
    Copy the images listed in a COCO-style annotation file into dest_dir.

    The "file_name" of every entry under the top-level "images" key is
    reduced to its basename, matched case-insensitively against the files
    found recursively under source_dir, and each match is copied (with
    metadata, via shutil.copy2) into dest_dir. A short report is printed.

    Args:
        annotation_json: Path to the JSON annotation file (read as UTF-8).
        source_dir: Root folder that is searched recursively for images.
        dest_dir: Folder receiving the copies; created if it does not exist.
            Copies are flat, so an existing file with the same name is
            overwritten.
        exts: Accepted file extensions, each with a leading dot. May be a
            single string or any iterable of strings; matching ignores case.

    Returns:
        None. Totals and up to 20 missing names are printed to stdout.

    Raises:
        OSError: If the annotation file cannot be opened or a copy fails.
        json.JSONDecodeError: If the annotation file is not valid JSON.
    """
    # str.endswith() only accepts a str or a tuple, and `in` on a bare string
    # would be a substring test, so normalise to a tuple of lowercase suffixes.
    if isinstance(exts, str):
        exts = (exts,)
    exts = tuple(ext.lower() for ext in exts)

    ann_path = Path(annotation_json)
    src_root = Path(source_dir)
    dst_root = Path(dest_dir)
    dst_root.mkdir(parents=True, exist_ok=True)

    # --- Load JSON and collect target filenames ---
    with ann_path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # unique, case-insensitive basenames
    target_names_set = set()
    for img in data.get("images", []):
        fn = str(img.get("file_name", "")).strip()
        # The source index below is keyed by basename, so drop any directory
        # prefix (either separator style) before comparing.
        base = fn.replace("\\", "/").rsplit("/", 1)[-1].lower()
        if not base or not base.endswith(exts):
            # Skip empty entries and anything that isn't a typical image file
            continue
        target_names_set.add(base)

    # --- Index all files in source by basename (case-insensitive) ---
    # Walk in sorted order and keep the first hit so that duplicate basenames
    # in different subfolders resolve to the same file on every run.
    index = {}
    for p in sorted(src_root.rglob("*")):
        if p.is_file() and p.suffix.lower() in exts:
            index.setdefault(p.name.lower(), p)

    copied = 0
    missing = []

    # --- Copy over matches ---
    for name in sorted(target_names_set):
        src_path = index.get(name)
        if src_path is None:
            missing.append(name)
            continue
        try:
            shutil.copy2(src_path, dst_root / src_path.name)
        except shutil.SameFileError:
            # dest_dir lies inside source_dir and the indexed file is the
            # destination itself: it is already in place.
            pass
        copied += 1

    # --- Report ---
    print(f"Total targets in JSON: {len(target_names_set)}")
    print(f"Copied: {copied}")
    print(f"Missing: {len(missing)}")
    if missing:
        print("Missing examples (first 20):")
        for m in missing[:20]:
            print("  -", m)


# Copy the images referenced by the selected annotation file into ./1200.
# Other annotation files that have been used with this call:
#   /Users/ammaster10/Documents/Github/Year4/CNVresearch/Data/1200CNV.json
#   ./Data/550Noel.json
# NOTE(review): dest_dir is never emptied by this call, so files from earlier
# runs stay in place — confirm that accumulating across runs is intended.
copy_cnv_images(
    "./Data/CNV_521+Outlier (1).json",
    source_dir="/Users/ammaster10/Desktop/OCT2017/train/CNV",
    dest_dir="./1200",
)

Total targets in JSON: 663
Copied: 571
Missing: 92
Missing examples (first 20):
  - cnv-794538-27.jpeg
  - cnv-7997521-6.jpeg
  - cnv-8014630-40.jpeg
  - cnv-8039905-17.jpeg
  - cnv-8056259-3.jpeg
  - cnv-8061223-2.jpeg
  - cnv-8184974-70.jpeg
  - cnv-827677-5.jpeg
  - cnv-829402-16.jpeg
  - cnv-829402-18.jpeg
  - cnv-846962-1.jpeg
  - cnv-846962-10.jpeg
  - cnv-846962-11.jpeg
  - cnv-846962-20.jpeg
  - cnv-846962-5.jpeg
  - cnv-8598714-10.jpeg
  - cnv-8598714-156.jpeg
  - cnv-8598714-29.jpeg
  - cnv-8598714-70.jpeg
  - cnv-8598714-81.jpeg


In [10]:
def count_files_in_folder(folder_path, exts=None):
    """
    Count the regular files directly inside a folder (non-recursive),
    optionally keeping only those with one of the given extensions.

    Args:
        folder_path (str): Path to the folder
        exts (str | iterable of str, optional): File extensions to include,
            each with a leading dot. Matching ignores case. If None or empty,
            count all files.

    Returns:
        int: Number of matching files, or 0 if folder_path does not exist or
        is not a directory (a message is printed in either case).
    """
    path = Path(folder_path)
    if not path.exists():
        print(f"Folder {folder_path} does not exist")
        return 0
    if not path.is_dir():
        # A regular file has no children; say so instead of "Found 0 files".
        print(f"Folder {folder_path} is not a directory")
        return 0

    # Sub-directories are skipped even when their name looks like "x.jpg".
    files = (f for f in path.glob("*") if f.is_file())

    if exts:
        # Wrap a bare string so `in` is a membership test rather than a
        # substring test, and lowercase both sides of the comparison.
        if isinstance(exts, str):
            exts = (exts,)
        wanted = {ext.lower() for ext in exts}
        count = sum(1 for f in files if f.suffix.lower() in wanted)
    else:
        # Count all files
        count = sum(1 for _ in files)

    print(f"Found {count} files in {folder_path}")
    return count

# Count the image files sitting directly in ./1200 (sub-folders are not searched).
# NOTE(review): this counts whatever is in the folder right now, which may
# include files left by earlier copy runs — confirm before relying on the total.
file_count = count_files_in_folder(
    "./1200",
    exts=(".jpg", ".jpeg", ".png", ".tif", ".tiff"),
)

Found 1184 files in ./1200
