# Dataset preparation

## Import Libraries

In [1]:
import os
import random
import shutil
from pathlib import Path
import yaml # PyYAML: pip install pyyaml
import cv2
import time

## Divide image dataset into training and validation datasets

### Configuration 

In [2]:
# --- Paths ---
# Directory holding one sub-folder of images per species
INPUT_BASE_DIR = Path("images")

# Root folder in which the YOLO-structured dataset is generated
OUTPUT_BASE_DIR = Path("datasets") / "bird_dataset_yolo"

# --- Dataset parameters ---
# Species folder name -> class ID. The IDs come from the position in the tuple
# below, so they always start at 0 and stay contiguous.
CLASS_MAPPING = {
    species: class_id
    for class_id, species in enumerate((
        "Australasian_Bittern",
        "Australian_Painted_Snipe",
        "Gang-gang_Cockatoo",
        "Grey_Falcon",
        "Painted_Honeyeater",
        "Pilotbird",
        "Plains_Wanderer",
        "Southern_Whiteface",
        "Swift_Parrot",
        "White-throated_Needletail",
    ))
}

# Fraction of each species' images placed in the training split
# (0.8 -> 80% train, 20% validation)
TRAIN_RATIO = 0.8

# File suffixes (lower-case) that are treated as images
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png"}

def create_yolo_label_file(label_path, class_id):
    """
    Creates a YOLO format label file assuming the object covers the whole image.

    The file holds a single line in the YOLO layout
    ``class_id center_x center_y width height`` (normalized), with the box
    fixed to ``0.5 0.5 1.0 1.0``, i.e. the full image.

    Args:
        label_path: Destination of the ``.txt`` label file (str or path-like).
            An existing file is overwritten.
        class_id: Class index written as the first field of the line.

    Returns:
        bool: True if the file was written, False if writing failed. On
        failure the error is printed and no exception is raised.
    """
    # Format: class_id center_x center_y width height (normalized)
    yolo_line = f"{class_id} 0.5 0.5 1.0 1.0\n"
    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(label_path, 'w', encoding='utf-8') as f:
            f.write(yolo_line)
    except Exception as e:
        print(f"  Error writing label file {label_path}: {e}")
        # Report the failure to the caller instead of only printing it.
        return False
    return True

### Execution

In [3]:
# Begin dividing image dataset into training and validation datasets
print("Starting dataset preparation...")

# Fixed seed so the train/val split is identical on every run. Without it a
# re-run shuffles differently and, because the output folders are not cleared,
# the same image can end up in both images/train and images/val.
SPLIT_SEED = 42

# 1. Create Output Directories
img_train_dir = OUTPUT_BASE_DIR / "images" / "train"
lbl_train_dir = OUTPUT_BASE_DIR / "labels" / "train"
img_val_dir = OUTPUT_BASE_DIR / "images" / "val"
lbl_val_dir = OUTPUT_BASE_DIR / "labels" / "val"

for dir_path in [img_train_dir, lbl_train_dir, img_val_dir, lbl_val_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"Created directory: {dir_path}")

total_images_processed = 0
total_train_images = 0
total_val_images = 0

# Destination label path -> species that wrote it during this run. All species
# share the same output folders, so two files with the same stem from different
# species would otherwise silently overwrite each other's label.
label_owner = {}

# 2. Process Each Species Folder
print("\nProcessing species folders...")
for species_name, class_id in CLASS_MAPPING.items():
    species_input_dir = INPUT_BASE_DIR / species_name
    print(f"\nProcessing species: '{species_name}' (Class ID: {class_id})")
    print(f"  Input directory: {species_input_dir}")

    if not species_input_dir.is_dir():
        print(f"  Warning: Input directory not found for '{species_name}'. Skipping.")
        continue

    # Get all image files for the current species. iterdir() yields entries in
    # arbitrary order, so sort first to make the seeded shuffle reproducible.
    image_files = sorted(
        f for f in species_input_dir.iterdir()
        if f.is_file() and f.suffix.lower() in IMAGE_EXTENSIONS
    )
    num_images = len(image_files)
    print(f"  Found {num_images} images for this species.")

    if num_images == 0:
        print("  No images found. Skipping.")
        continue

    # Shuffle with a per-species generator: the split of one species does not
    # change when images are added to or removed from another species.
    random.Random(f"{SPLIT_SEED}:{species_name}").shuffle(image_files)

    # Calculate split index
    split_index = int(num_images * TRAIN_RATIO)
    train_files = image_files[:split_index]
    val_files = image_files[split_index:]

    print(f"  Splitting into {len(train_files)} train and {len(val_files)} validation images.")

    # Both splits are handled by the same loop; only the destination differs.
    splits = [
        ("training", train_files, img_train_dir, lbl_train_dir),
        ("validation", val_files, img_val_dir, lbl_val_dir),
    ]
    for split_name, split_files, img_dir, lbl_dir in splits:
        print(f"  Processing {split_name} set...")
        copied_count = 0
        for img_path in split_files:
            try:
                dest_lbl_path = lbl_dir / (img_path.stem + ".txt")

                # Refuse to overwrite a label written for another species.
                previous_owner = label_owner.get(dest_lbl_path)
                if previous_owner is not None and previous_owner != species_name:
                    print(f"    Warning: {img_path.name} would overwrite the label of a "
                          f"'{previous_owner}' image. Skipping.")
                    continue

                # Copy image (copy2 preserves metadata)
                shutil.copy2(img_path, img_dir / img_path.name)

                # Create label file
                create_yolo_label_file(dest_lbl_path, class_id)
                label_owner[dest_lbl_path] = species_name

                copied_count += 1
            except Exception as e:
                print(f"    Error processing {img_path.name} for {split_name}: {e}")

        if split_name == "training":
            total_train_images += copied_count
        else:
            total_val_images += copied_count
        total_images_processed += copied_count

print(f"\nProcessed {total_images_processed} total images.")
print(f"  Training images: {total_train_images}")
print(f"  Validation images: {total_val_images}")

# 3. Create data.yaml file
print("\nCreating data.yaml file...")
yaml_path = OUTPUT_BASE_DIR / "data.yaml"

# Create the 'names' list in the correct order of class IDs
names_list = sorted(CLASS_MAPPING, key=CLASS_MAPPING.get)

yaml_content = {
    'path': str(OUTPUT_BASE_DIR.resolve()), # Absolute path recommended by Ultralytics
    # as_posix() keeps forward slashes ('images/train') even on Windows, so the
    # file stays valid when the dataset is moved to another operating system.
    'train': img_train_dir.relative_to(OUTPUT_BASE_DIR).as_posix(),
    'val': img_val_dir.relative_to(OUTPUT_BASE_DIR).as_posix(),
    'nc': len(CLASS_MAPPING),
    # 'test': '', # Add if you create a test set
    'names': names_list
}

try:
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(yaml_content, f, sort_keys=False, default_flow_style=False)
    print(f"Successfully created data.yaml at: {yaml_path}")
except Exception as e:
    print(f"Error creating data.yaml: {e}")

print("\nDataset preparation finished!")
print(f"YOLO dataset structure created at: {OUTPUT_BASE_DIR}")

Starting dataset preparation...
Created directory: datasets\bird_dataset_yolo\images\train
Created directory: datasets\bird_dataset_yolo\labels\train
Created directory: datasets\bird_dataset_yolo\images\val
Created directory: datasets\bird_dataset_yolo\labels\val

Processing species folders...

Processing species: 'Australasian_Bittern' (Class ID: 0)
  Input directory: images\Australasian_Bittern
  Found 74 images for this species.
  Splitting into 59 train and 15 validation images.
  Processing training set...
  Processing validation set...

Processing species: 'Australian_Painted_Snipe' (Class ID: 1)
  Input directory: images\Australian_Painted_Snipe
  Found 45 images for this species.
  Splitting into 36 train and 9 validation images.
  Processing training set...
  Processing validation set...

Processing species: 'Gang-gang_Cockatoo' (Class ID: 2)
  Input directory: images\Gang-gang_Cockatoo
  Found 398 images for this species.
  Splitting into 318 train and 80 validation images.
 

## Resize images

### Configuration 

In [4]:
# Root folder of the YOLO dataset whose images will be resized
DATASET_BASE_DIR = Path("datasets") / "bird_dataset_yolo"

# Edge length in pixels; images are resized to TARGET_SIZE x TARGET_SIZE
# (e.g., 320 or 416)
TARGET_SIZE = 320

# Sub-folders of "<dataset>/images" to process (append "test" if it exists)
IMAGE_SUBDIRS = ["train", "val"]

# File suffixes (lower-case) that are treated as images
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png"}

def resize_image(image_path, target_size):
    """
    Reads, resizes, and overwrites an image file.

    The image is resized to ``target_size`` x ``target_size`` pixels and written
    back to the same path. An image that already has exactly that size is left
    untouched, so running the resize step again does not re-encode it.

    Args:
        image_path: Path of the image file (``pathlib.Path`` or str).
        target_size: Width and height, in pixels, of the resized image.

    Returns:
        tuple: ``(True, None)`` on success, or ``(False, message)`` when the
        image could not be read, resized, or written. Errors raised while
        processing are reported through the message, not re-raised.
    """
    # Accept plain strings too: the messages below rely on `.name`.
    image_path = Path(image_path)
    try:
        # Ensure the path is treated as a string for OpenCV functions
        img_path_str = str(image_path)

        img = cv2.imread(img_path_str)
        if img is None:
            return False, f"Could not read image {image_path.name}"

        # Skip files that are already at the target size. Rewriting them would
        # only re-encode the pixels (lossy for JPEG) without changing the size.
        original_height, original_width = img.shape[:2]
        if original_width == target_size and original_height == target_size:
            return True, None

        # Resize the image - cv2.resize expects (width, height)
        # cv2.INTER_AREA is generally recommended for shrinking images
        resized_img = cv2.resize(img, (target_size, target_size), interpolation=cv2.INTER_AREA)

        # Overwrite the original file
        success = cv2.imwrite(img_path_str, resized_img)
        if not success:
            return False, f"Failed to write resized image {image_path.name}"

        return True, None # Success, no error message

    except Exception as e:
        # Catch any other unexpected errors during processing
        return False, f"Error processing {image_path.name}: {e}"

### Execution

In [5]:
# Begin resizing images
print("Starting image resizing process...")
print(f"Target size: {TARGET_SIZE}x{TARGET_SIZE}")
print(f"Dataset base directory: {DATASET_BASE_DIR}")
print("\n WARNING: This script will overwrite original images in place! ")
print("Make sure you have a backup if needed.")
time.sleep(3) # Pause for 3 seconds to allow cancellation

images_processed = 0
images_failed = 0
failed_files_details = [] # One {'path', 'error'} dict per image that failed
start_time = time.time()

images_root_dir = DATASET_BASE_DIR / "images"

for subdir_name in IMAGE_SUBDIRS:
    current_dir = images_root_dir / subdir_name
    print(f"\nProcessing directory: {current_dir}")

    if not current_dir.is_dir():
        print(f"  Warning: Subdirectory '{subdir_name}' not found. Skipping.")
        continue

    image_files = [f for f in current_dir.iterdir() if f.is_file() and f.suffix.lower() in IMAGE_EXTENSIONS]
    num_images = len(image_files)
    print(f"  Found {num_images} images.")

    if num_images == 0:
        continue # Skip if no images found

    # Width of the progress text currently on screen. Each update is padded to
    # this width so a shorter line fully covers a longer previous one instead
    # of leaving its trailing characters visible.
    prev_width = 0
    for i, img_path in enumerate(image_files, start=1):
        # Update progress indicator on the same line
        progress_line = f"  Processing image {i}/{num_images}: {img_path.name}..."
        print(progress_line.ljust(prev_width), end='\r', flush=True)
        prev_width = len(progress_line)

        success, error_msg = resize_image(img_path, TARGET_SIZE)

        if success:
            images_processed += 1
        else:
            images_failed += 1
            # Store the path and the error message for reporting
            failed_files_details.append({'path': str(img_path), 'error': error_msg})
            # Print the specific error immediately when it occurs, on a new line
            print(f"\n    ERROR: {error_msg}")
            # The next progress update starts on a fresh line: nothing to cover.
            prev_width = 0

    # The leading newline moves past the progress line before the summary text
    print(f"\n  Finished processing directory: {current_dir}")


end_time = time.time()
duration = end_time - start_time

print("\n--- Resizing Summary ---")
print(f"Successfully processed and resized: {images_processed} images")
print(f"Failed to process:                 {images_failed} images")

# --- Report Failed Files ---
# Every failure appends to failed_files_details, so an empty list means that
# no image failed; no separate check of images_failed is needed.
if failed_files_details:
    print("\n--- Failed File Details ---")
    print("The following image files could not be processed and may need review or removal:")
    for item in failed_files_details:
        print(f"  - Path:  {item['path']}")
        print(f"    Error: {item['error']}")
    print("\nConsider checking these files for corruption or unsupported formats.")
else:
    print("\nNo files failed during processing.")
# --- End Report ---

print(f"\nTotal time taken:                  {duration:.2f} seconds")
print("Image resizing complete.")

Starting image resizing process...
Target size: 320x320
Dataset base directory: datasets\bird_dataset_yolo

Make sure you have a backup if needed.

Processing directory: datasets\bird_dataset_yolo\images\train
  Found 937 images.
  Processing image 937/937: White-throated Needletail (Hirundapus caudacutus)_v9.jpg......
  Finished processing directory: datasets\bird_dataset_yolo\images\train

Processing directory: datasets\bird_dataset_yolo\images\val
  Found 239 images.
  Processing image 239/239: White-throated Needletail (Hirundapus caudacutus)_v8.jpg......
  Finished processing directory: datasets\bird_dataset_yolo\images\val

--- Resizing Summary ---
Successfully processed and resized: 1176 images
Failed to process:                 0 images

No files failed during processing.

Total time taken:                  19.25 seconds
Image resizing complete.


In [6]:
# Number of classes in CLASS_MAPPING; the same value is written as 'nc' in data.yaml
len(CLASS_MAPPING)

10