# In [None]:
# "pre-processing_px-128_step_automated-labels_pc-150[dot]ipynb"
# Script 2a: Pollen Crop Preprocessing
# Resizes and converts existing pollen crops to target format (128x128, 1-ch Greyscale).
# 
# 

import os
import cv2
import numpy as np
from tqdm.notebook import tqdm # Use tqdm.notebook for Jupyter!
from PIL import Image, UnidentifiedImageError # Use Pillow for potentially wider format support initially

def _find_image_files(input_dir, output_dir):
    """Return a sorted list of image file paths found recursively under input_dir.

    A directory equal to output_dir is pruned from the walk, so that outputs
    written by an earlier run into a nested output directory are not picked
    up again as inputs. (If output_dir is input_dir itself, nothing is pruned.)
    """
    extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.tif')
    output_key = os.path.normcase(os.path.abspath(output_dir))
    found = []
    for root, dirs, files in os.walk(input_dir):
        # In-place pruning: os.walk (top-down) will not descend into removed entries.
        dirs[:] = [
            d for d in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, d))) != output_key
        ]
        found.extend(
            os.path.join(root, f) for f in files if f.lower().endswith(extensions)
        )
    # os.walk order depends on the filesystem; sorting makes the collision
    # suffixes assigned below reproducible between runs.
    return sorted(found)


def _unique_output_path(output_dir, img_path, used_names):
    """Return (output_path, was_renamed) for img_path, unique within this run.

    The output name is the source basename with a ".png" extension. If that
    name was already handed out during this run (same basename in another
    subfolder, or e.g. "a.jpg" next to "a.png"), a numeric suffix "_1", "_2",
    ... is appended to the stem. used_names is updated in place.
    """
    name = os.path.basename(img_path)
    if name.lower().endswith(".png"):
        stem, ext = name[:-4], name[-4:]  # keep the original spelling of the extension
    else:
        stem, ext = os.path.splitext(name)[0], ".png"

    candidate = stem + ext
    attempt = 0
    # normcase makes the comparison case-insensitive on Windows, where
    # "A.png" and "a.png" are the same file.
    while os.path.normcase(candidate) in used_names:
        attempt += 1
        candidate = f"{stem}_{attempt}{ext}"
    used_names.add(os.path.normcase(candidate))
    return os.path.join(output_dir, candidate), attempt > 0


def preprocess_pollen_crops(input_dir, output_dir, target_size=(128, 128)):
    """
    Reads pollen crops, resizes them, converts to single-channel greyscale,
    and saves them as PNG files to the output directory.

    The input directory is scanned recursively; all outputs are written flat
    into output_dir. When two inputs would map to the same output filename,
    the later one (in sorted path order) gets a numeric suffix instead of
    silently overwriting the earlier one.

    Args:
        input_dir (str): Path to the directory containing the original pollen crops
                         (e.g., the 61,671 filtered images). Subfolders are scanned too.
        output_dir (str): Path to the directory where preprocessed images will be saved.
                          It is created if missing. If it lies inside input_dir, it is
                          excluded from the scan.
        target_size (tuple): Target (height, width) for resizing (e.g., (128, 128)).

    Returns:
        None. Progress and a final summary are printed. Per-file failures are
        reported and counted, and do not abort the run.
    """
    print("--- Starting Pollen Crop Preprocessing ---")
    print(f"Input Directory: {input_dir}")
    print(f"Output Directory: {output_dir}")
    print(f"Target Size: {target_size}")

    if not os.path.isdir(input_dir):
        print(f"Error: Input directory not found: {input_dir}")
        return

    try:
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created/Ensured output directory exists: {output_dir}")
    except OSError as e:
        print(f"Error creating output directory: {e}")
        return

    # Find all image files in the input directory and its subdirectories
    print("Scanning for image files...")
    image_files = _find_image_files(input_dir, output_dir)

    if not image_files:
        print("Error: No image files found in the input directory or subdirectories.")
        return

    print(f"Found {len(image_files)} potential image files. Starting preprocessing...")

    processed_count = 0
    error_count = 0
    renamed_count = 0
    used_names = set()
    pil_size = (target_size[1], target_size[0])  # Pillow expects (width, height)

    # Process each image file
    for img_path in tqdm(image_files, desc="Preprocessing Pollen Crops"):
        try:
            # Open the source in its own context manager so its file handle is
            # released even when conversion fails; the previous form only
            # managed the converted copy, not the opened file.
            with Image.open(img_path) as img_src:
                # 'L' = 8-bit single-channel greyscale (also collapses 3-channel greys).
                # LANCZOS is a high-quality resampling filter for downscaling.
                img_resized_pil = img_src.convert('L').resize(
                    pil_size, Image.Resampling.LANCZOS
                )

            # Reserve the output name only after the image decoded successfully.
            output_filepath, was_renamed = _unique_output_path(
                output_dir, img_path, used_names
            )
            img_resized_pil.save(output_filepath, format='PNG')

            processed_count += 1
            if was_renamed:
                renamed_count += 1

        except FileNotFoundError:
            print(f"  Warning: File not found during processing: {img_path}. Skipping.")
            error_count += 1
        except UnidentifiedImageError:
            print(f"  Warning: Cannot identify image file (corrupt or unsupported): {img_path}. Skipping.")
            error_count += 1
        except Exception as e:
            # Batch boundary: report and keep going so one bad file cannot abort the run.
            print(f"  Error processing file {img_path}: {e}")
            error_count += 1

    # --- Final Summary ---
    print("\n--- Preprocessing Finished ---")
    print(f"Successfully processed and saved: {processed_count} images.")
    print(f"Encountered errors for: {error_count} images.")
    if renamed_count:
        print(f"Renamed {renamed_count} outputs to avoid filename collisions.")
    print(f"Preprocessed images saved to: {output_dir}")

# ==============================================================================
# --- User Configuration Section ---
# ==============================================================================

# 1. Define input and output paths
#    Input directory containing the original (filtered) pollen crops.
#    It may contain subfolders; preprocess_pollen_crops scans it recursively.
#    Example: 'C:\path\to\your\filtered_pollen_crops_61k'
input_pollen_crop_dir = r"C:\Users\praam\Desktop\havetai+vetcyto\task-05_dataset\vet_images_sliced\TrainingStepSet_automated-labels_T_full-size_150-pc_undivided_categorized\all-classes-mixed\square" # <--- MODIFY THIS PATH

#    Output directory where the resized greyscale PNGs will be saved.
#    It is created if it does not exist; all outputs are written directly into it.
#    Example: 'C:\path\to\your\preprocessed_pollen_crops_128'
output_preprocessed_crop_dir = r"C:\Users\praam\Desktop\havetai+vetcyto\task-05_dataset\preprocessing_px-128_step_automated-labels_pc-150" # <--- MODIFY THIS PATH

# 2. Define Target Size
#    The desired (height, width), in pixels, for the preprocessed images.
target_size = (128, 128)

# ==============================================================================
# --- Script Execution ---
# ==============================================================================

# Run the preprocessing function with the configuration above.
# It prints progress and a summary, and returns nothing.
preprocess_pollen_crops(
    input_dir=input_pollen_crop_dir,
    output_dir=output_preprocessed_crop_dir,
    target_size=target_size
)

# After running, check the 'output_preprocessed_crop_dir' for the 128x128 greyscale images.
# These are ready to be used by the DataLoader in the WGAN-GP training script.
