### System Setup

In [14]:
# Libraries

import json
import os
import re
from PIL import Image, ImageDraw

In [15]:
# Variables

# Base directory: the main loop lists this folder and hands every immediate
# subdirectory to process_directory().
source_base_dir = "../source"  # Base directory for source data

# Name of the annotation JSON file that process_directory() loads from each
# source directory (it reads the 'images' and 'annotations' entries).
json_file_name = "_annotations.coco.json"

In [16]:
# Function to crop and save images with size filtering
def crop_and_save(image_path, bbox, output_path, min_width=100, min_height=100):
    """Crop one bounding box out of an image and save it, skipping small boxes.

    Args:
        image_path: Path of the source image to open.
        bbox: Sequence ``(left, upper, width, height)`` describing the box.
        output_path: Path the cropped image is written to.
        min_width: Boxes narrower than this are skipped.
        min_height: Boxes shorter than this are skipped.

    Returns:
        True if the crop was saved, False if the box was skipped for being
        smaller than the minimum size.
    """
    left, upper, width, height = bbox

    # Check the size before touching the file so that undersized boxes
    # (the vast majority in practice) cost no image I/O at all.
    if width < min_width or height < min_height:
        return False  # Crop was skipped

    right = left + width
    lower = upper + height

    # The context manager closes the underlying file handle even if crop()
    # or save() raises; saving happens inside the block so the source image
    # is still open while the crop is written.
    with Image.open(image_path) as image:
        cropped_image = image.crop((left, upper, right, lower))
        cropped_image.save(output_path)
    return True  # Crop was saved


### Process Images

In [17]:
import random
import os

# Function to process a single directory
def process_directory(source_dir, sample_size=None, min_width=100, min_height=100):
    """Crop every sufficiently large annotated bounding box in one directory.

    Reads the annotation file named by ``json_file_name`` inside
    ``source_dir``, groups its bounding boxes per image, optionally keeps
    only a random sample of the images, and writes each crop that meets the
    minimum size to ``<source_dir>/processed/<image stem>_crop_<i>.jpg``.
    Progress is printed per image and once for the whole directory.

    Args:
        source_dir: Directory holding the images and the annotation file.
        sample_size: If truthy and smaller than the number of annotated
            images, only this many randomly chosen images are processed.
        min_width: Minimum box width passed on to ``crop_and_save``.
        min_height: Minimum box height passed on to ``crop_and_save``.

    Raises:
        FileNotFoundError: If the annotation file does not exist.
        KeyError: If an annotation refers to an image id that is not listed
            under ``images``.
    """
    json_path = os.path.join(source_dir, json_file_name)

    # Load annotations first: if the file is missing or malformed we fail
    # before leaving an empty 'processed' directory behind.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_dir = os.path.join(source_dir, 'processed')
    os.makedirs(processed_dir, exist_ok=True)

    # Map image_id to file_name
    image_id_to_filename = {image['id']: image['file_name'] for image in data['images']}

    # Group bounding boxes by image_id, keeping annotation order per image
    image_bboxes = {}
    for annotation in data['annotations']:
        image_bboxes.setdefault(annotation['image_id'], []).append(annotation['bbox'])

    # Select random sample if requested
    if sample_size and len(image_bboxes) > sample_size:
        # A set gives O(1) membership tests in the filter below.
        sampled_image_ids = set(random.sample(list(image_bboxes.keys()), sample_size))
        image_bboxes = {img_id: bboxes for img_id, bboxes in image_bboxes.items()
                        if img_id in sampled_image_ids}

    # Keep the log label in sync with the thresholds actually applied.
    size_label = f"≥{min_width}x{min_height}px"

    total_crops = 0
    total_saved = 0

    for image_id, bboxes in image_bboxes.items():
        image_filename = image_id_to_filename[image_id]
        image_path = os.path.join(source_dir, image_filename)
        crop_stem = os.path.splitext(image_filename)[0]

        # Crop and save each bounding box individually if it meets the size requirements
        crops_saved = 0
        for i, bbox in enumerate(bboxes):
            output_path = os.path.join(processed_dir, f"{crop_stem}_crop_{i}.jpg")
            if crop_and_save(image_path, bbox, output_path,
                             min_width=min_width, min_height=min_height):
                crops_saved += 1

        total_crops += len(bboxes)
        total_saved += crops_saved
        print(f"Processed {image_filename}: {crops_saved}/{len(bboxes)} crops saved ({size_label})")

    print(f"Directory {os.path.basename(source_dir)}: {total_saved}/{total_crops} crops saved ({size_label})")


In [None]:
# Main loop with random sampling
# sorted() makes the processing order reproducible; os.listdir() returns
# entries in arbitrary order.
for subdir in sorted(os.listdir(source_base_dir)):
    source_dir = os.path.join(source_base_dir, subdir)
    if not os.path.isdir(source_dir):
        continue
    # Folders without an annotation file (e.g. .ipynb_checkpoints) are not
    # datasets; report and skip them instead of aborting the whole run.
    if not os.path.isfile(os.path.join(source_dir, json_file_name)):
        print(f"Skipping {subdir}: no {json_file_name} found")
        continue
    process_directory(source_dir, 500)  # Process up to 500 random images

Processed image_07038.jpg: 0/1 crops saved (≥100x100px)
Processed image_06647.jpg: 0/6 crops saved (≥100x100px)
Processed image_07561.jpg: 0/22 crops saved (≥100x100px)
Processed image_07099.jpg: 2/2 crops saved (≥100x100px)
Processed image_07001.jpg: 0/9 crops saved (≥100x100px)
Processed image_07658.jpg: 0/28 crops saved (≥100x100px)
Processed image_07944.jpg: 0/6 crops saved (≥100x100px)
Processed image_06945.jpg: 0/1 crops saved (≥100x100px)
Processed image_08014.jpg: 0/3 crops saved (≥100x100px)
Processed image_07854.jpg: 1/153 crops saved (≥100x100px)
Directory valid: 3/231 crops saved (≥100x100px)
Processed image_06186.jpg: 0/6 crops saved (≥100x100px)
Processed image_06033.jpg: 0/4 crops saved (≥100x100px)
Processed image_06424.jpg: 0/55 crops saved (≥100x100px)
Processed image_06124.jpg: 0/1 crops saved (≥100x100px)
Processed image_06298.jpg: 0/25 crops saved (≥100x100px)
Processed image_06358.jpg: 0/7 crops saved (≥100x100px)
Processed image_06221.jpg: 0/14 crops saved (≥100x