In [2]:
import os
from pathlib import Path
from PIL import Image
from collections import Counter
import json # Used for pretty printing dictionaries

# --- Configuration ---
# Root directory that must contain the 'cracked' and 'non-cracked' category
# subfolders analyzed by main(). Edit this path to match your machine.
DATA_ROOT = r'C:\Users\acking\Desktop\project\DeepCrack-An-SDNET2018-Implementation\raw data\Walls'

# Accepted image extensions. File suffixes are lower-cased before the
# membership test, so matching is case-insensitive; keep these entries lowercase.
IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp']

def analyze_folder(folder_path: Path, category: str) -> dict:
    """
    Collects size and color-mode statistics for the images under a folder.

    The folder is walked recursively. Only regular files are counted;
    sub-directories yielded by the walk are ignored so that 'total_files'
    really is a file count. Files whose suffix is not in IMAGE_EXTENSIONS
    are counted in 'total_files' but otherwise skipped.

    Args:
        folder_path (Path): The Path object for the category folder.
        category (str): The name of the category (e.g., 'cracked'); used only
            in the progress message.

    Returns:
        dict: An empty dict if folder_path is not a directory. Otherwise a
        dict with the keys:
            'total_files'  - number of regular files seen (any extension)
            'valid_images' - number of image files opened successfully
            'errors'       - number of image files that failed to open
            'sizes'        - Counter keyed by "WIDTHxHEIGHT" strings
            'modes'        - Counter keyed by PIL mode strings ('RGB', 'L', ...)
    """
    if not folder_path.is_dir():
        print(f"ERROR: Folder not found at {folder_path}")
        return {}

    print(f"\n--- Analyzing '{category}' category at: {folder_path} ---")

    stats = {
        'total_files': 0,
        'valid_images': 0,
        'errors': 0,
        'sizes': Counter(),  # Keys are "WIDTHxHEIGHT" strings
        'modes': Counter()   # Keys are color modes (e.g., 'RGB', 'L')
    }

    for file_path in folder_path.rglob('*'):
        # rglob('*') yields directories as well as files; skip them so they
        # neither inflate the file count nor get opened as images.
        if not file_path.is_file():
            continue

        stats['total_files'] += 1
        if file_path.suffix.lower() not in IMAGE_EXTENSIONS:
            continue

        try:
            with Image.open(file_path) as img:
                # Store size as a string for cleaner Counter keys and output
                width, height = img.size
                stats['sizes'][f"{width}x{height}"] += 1
                stats['modes'][img.mode] += 1
                stats['valid_images'] += 1
        except Exception as e:
            # Best-effort scan: report the unreadable file and keep going
            print(f"Warning: Could not process {file_path.name}. Error: {e}")
            stats['errors'] += 1

    return stats

def print_summary(results: dict):
    """Prints a formatted summary report of the dataset analysis.

    Args:
        results (dict): Maps a category name to its statistics dict (keys
            'total_files', 'valid_images', 'errors', 'sizes', 'modes').
            An empty statistics dict - what analyze_folder() returns for a
            missing folder - is treated as all-zero counts instead of
            raising KeyError, so the "zero valid images" warning at the end
            can still be shown.

    Returns:
        None. The report is written to standard output.
    """

    print("\n=======================================================")
    print("           DATASET PRE-TRAINING ANALYSIS")
    print("=======================================================")

    total_images_dataset = 0
    total_files_dataset = 0

    # 1. Category Summary
    for category, stats in results.items():
        # .get() everywhere: a category whose folder was missing has no keys
        total_files = stats.get('total_files', 0)
        valid_images = stats.get('valid_images', 0)
        total_images_dataset += valid_images
        total_files_dataset += total_files

        print(f"\n--- Category: {category.upper()} ---")
        print(f"Total Files Scanned: {total_files}")
        print(f"Valid Images Found:  {valid_images}")
        print(f"Files Skipped/Errors: {stats.get('errors', 0)}")

        if valid_images == 0:
            # Nothing to break down (and avoids dividing by zero below)
            continue

        sizes = stats['sizes']
        modes = stats['modes']

        print("\n  >> IMAGE DIMENSIONS (Size Distribution):")
        # Get the 5 most common sizes
        for size, count in sizes.most_common(5):
            percentage = (count / valid_images) * 100
            print(f"    - {size}: {count} images ({percentage:.2f}%)")

        # Check for uniformity
        if len(sizes) == 1:
            print("  * Uniformity Check: All images have the same size.")
        else:
            print(f"  * Uniformity Check: Found {len(sizes)} unique sizes. Standardizing image size during pre-processing is recommended.")

        print("\n  >> COLOR MODES (RGB, Grayscale 'L', etc.):")
        for mode, count in modes.items():
            percentage = (count / valid_images) * 100
            print(f"    - {mode}: {count} images ({percentage:.2f}%)")

        # Check for uniformity
        if len(modes) > 1:
            print("  * Color Mode Check: Found mixed color modes. Ensure you convert all images (e.g., to 'RGB') before feeding them to the network.")
        else:
            print("  * Color Mode Check: All images have the same mode.")

    # 2. Overall Dataset Summary
    print("\n=======================================================")
    print("                OVERALL DATASET STATS")
    print("=======================================================")

    # Calculate category balance
    cracked_count = results.get('cracked', {}).get('valid_images', 0)
    non_cracked_count = results.get('non-cracked', {}).get('valid_images', 0)

    if total_images_dataset > 0:
        cracked_pct = (cracked_count / total_images_dataset) * 100
        non_cracked_pct = (non_cracked_count / total_images_dataset) * 100
    else:
        cracked_pct = 0
        non_cracked_pct = 0

    print(f"Total Valid Images in Dataset: {total_images_dataset}")
    print(f"Total Files Scanned (including non-images): {total_files_dataset}")
    print("\nClass Distribution:")
    print(f"  - Cracked:     {cracked_count} ({cracked_pct:.2f}%)")
    print(f"  - Non-cracked: {non_cracked_count} ({non_cracked_pct:.2f}%)")

    # Final advice: an empty dataset is reported first; otherwise a gap of
    # more than 10 percentage points between the classes counts as imbalance.
    if total_images_dataset == 0:
        print("\n*** WARNING: ZERO VALID IMAGES FOUND. ***")
        print("Please check that the 'DATA_ROOT' path and the folder names ('cracked', 'non-cracked') are correct.")
    elif abs(cracked_pct - non_cracked_pct) > 10:
        print("\n*** ACTION RECOMMENDED: CLASS IMBALANCE DETECTED! ***")
        print("The classes are imbalanced. Consider using techniques like data augmentation, class weighting, or over/undersampling during training.")
    else:
        print("\nData distribution appears reasonably balanced.")


def main():
    """Analyze both category folders under DATA_ROOT and print the combined report."""
    base_dir = Path(DATA_ROOT)

    # The two mandatory categories, analyzed in this order ('cracked' first).
    # Each folder shares its name with the category it holds.
    results = {
        name: analyze_folder(base_dir / name, name)
        for name in ('cracked', 'non-cracked')
    }

    # Print combined summary
    print_summary(results)

if __name__ == "__main__":
    # Requires Pillow: pip install Pillow
    # A missing data root is only reported here; main() is still called
    # afterwards, so the analysis prints its own per-folder errors as well.
    if not Path(DATA_ROOT).exists():
         print(f"Note: Could not find the main data folder at: '{DATA_ROOT}'.")
         print("      Please ensure this path exists and contains the 'cracked' and 'non-cracked' subfolders.")
         # No fallback data is created: DATA_ROOT is a specific, absolute path
    
    main()



--- Analyzing 'cracked' category at: C:\Users\acking\Desktop\project\DeepCrack-An-SDNET2018-Implementation\raw data\Walls\cracked ---

--- Analyzing 'non-cracked' category at: C:\Users\acking\Desktop\project\DeepCrack-An-SDNET2018-Implementation\raw data\Walls\non-cracked ---

           DATASET PRE-TRAINING ANALYSIS

--- Category: CRACKED ---
Total Files Scanned: 3851
Valid Images Found:  3851
Files Skipped/Errors: 0

  >> IMAGE DIMENSIONS (Size Distribution):
    - 256x256: 3851 images (100.00%)
  * Uniformity Check: All images have the same size.

  >> COLOR MODES (RGB, Grayscale 'L', etc.):
    - RGB: 3851 images (100.00%)
  * Color Mode Check: All images have the same mode.

--- Category: NON-CRACKED ---
Total Files Scanned: 14287
Valid Images Found:  14287
Files Skipped/Errors: 0

  >> IMAGE DIMENSIONS (Size Distribution):
    - 256x256: 14287 images (100.00%)
  * Uniformity Check: All images have the same size.

  >> COLOR MODES (RGB, Grayscale 'L', etc.):
    - RGB: 14287 imag

# Create a function for adjusting the brightness (a factor below 1.0 reduces it; a factor above 1.0 increases it)

In [None]:
import os
import numpy as np
from pathlib import Path
from PIL import Image, ImageEnhance
from typing import Tuple, List

# --- Configuration ---
# Root directory that holds the 'cracked' and 'non-cracked' category folders
DATA_ROOT = r'C:\Users\acking\Desktop\project\DeepCrack-An-SDNET2018-Implementation\raw data\Walls'

# Size every image is resized to before it is added to the data array.
# NOTE: this tuple is passed unchanged to PIL's Image.resize(), which expects
# (width, height); for the square 256x256 value the order makes no difference.
TARGET_SIZE: Tuple[int, int] = (256, 256) 

# Brightness multiplier passed to ImageEnhance.Brightness(...).enhance():
# 1.0 is no change, 1.5 is a 50% increase; the value 2 is reported as +100%.
BRIGHTNESS_FACTOR: float = 2 

# Accepted image extensions (compared against the lower-cased file suffix)
IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp']

def load_and_preprocess_data(root_dir: str) -> Tuple[np.ndarray, np.ndarray]:
    """
    Loads images from 'cracked' and 'non-cracked' subfolders, applies
    resizing, brightness enhancement, and normalization, and prepares
    NumPy arrays for model training.

    Each image is converted to RGB, resized to TARGET_SIZE, brightened by
    BRIGHTNESS_FACTOR, converted to float32 and divided by 255. Files are
    visited in sorted path order so the sample order is reproducible
    between runs. Unreadable images are reported and skipped.

    NOTE: every processed image is kept in memory as a float32 array, so
    memory use grows linearly with the number of images.

    Args:
        root_dir (str): The path containing the 'cracked' and 'non-cracked' folders.

    Returns:
        Tuple[np.ndarray, np.ndarray]: X (image data, one float32 array of
        3-channel pixels per image, stacked along axis 0) and y (labels:
        1 for 'cracked', 0 for 'non-cracked'). Two empty arrays are
        returned when no image could be processed.
    """
    root_path = Path(root_dir)
    data: List[np.ndarray] = []
    labels: List[int] = []

    # Define categories and their corresponding labels
    categories = {
        'cracked': 1,
        'non-cracked': 0
    }

    print("Starting data preprocessing...")
    print(f"Target Size: {TARGET_SIZE}, Brightness Factor: {BRIGHTNESS_FACTOR}")

    for category, label in categories.items():
        folder_path = root_path / category

        if not folder_path.is_dir():
            print(f"Warning: Category folder not found: {folder_path}. Skipping.")
            continue

        print(f"\nProcessing category '{category}' (Label {label})...")
        count = 0

        # Recursively find all valid image files. rglob() order depends on
        # the filesystem, so sort for a stable sample order.
        for file_path in sorted(folder_path.rglob('*')):
            # Skip directories (rglob yields them too) and non-image files
            if not file_path.is_file():
                continue
            if file_path.suffix.lower() not in IMAGE_EXTENSIONS:
                continue

            try:
                # 1. Load and convert to RGB for consistency. The `with` is
                # on the opened file itself so its handle is released even
                # if convert() fails; the converted copy is independent.
                with Image.open(file_path) as source_img:
                    img = source_img.convert('RGB')

                # Resize to the target size
                img_resized = img.resize(TARGET_SIZE)

                # 2. Increase Brightness (Enhancement)
                enhancer = ImageEnhance.Brightness(img_resized)
                img_enhanced = enhancer.enhance(BRIGHTNESS_FACTOR)

                # 3. Normalize: float32 array scaled from 0-255 to 0.0-1.0
                img_array = np.array(img_enhanced, dtype=np.float32)
                normalized_array = img_array / 255.0
            except Exception as e:
                # Best-effort load: report the bad file and move on
                print(f"Error processing {file_path.name}: {e}. Skipping.")
                continue

            # 4. Collect Data and Labels
            data.append(normalized_array)
            labels.append(label)
            count += 1

        print(f"Successfully processed {count} images for '{category}'.")

    if not data:
        print("\nFATAL: No valid images were processed. Check DATA_ROOT path and subfolders.")
        return np.array([]), np.array([])

    # Convert lists to final NumPy arrays
    X = np.array(data)
    y = np.array(labels)

    return X, y

def main():
    """Run the preprocessing pipeline on DATA_ROOT and print a summary of the resulting arrays."""

    # Guard clause: without the data root there is nothing to load
    data_root_found = Path(DATA_ROOT).is_dir()
    if not data_root_found:
        print(f"ERROR: The main data root directory does not exist at: {DATA_ROOT}")
        print("Please verify the path and ensure it contains 'cracked' and 'non-cracked' folders.")
        return

    # Load and preprocess the data
    X, y = load_and_preprocess_data(DATA_ROOT)

    banner = "======================================================="
    print("\n" + banner)
    print("           PREPROCESSED DATA SUMMARY")
    print(banner)

    # An empty X means no image made it through preprocessing
    if X.size == 0:
        print("No data was successfully loaded or processed.")
        return

    # Report the shapes of the image array and of the label array
    print(f"Image Data (X) Shape: {X.shape}")
    print(f"Label Data (y) Shape: {y.shape}")

    # Percentage change implied by the brightness multiplier (2 -> 100)
    brightness_gain = BRIGHTNESS_FACTOR * 100 - 100

    print("\nData is now ready for model training:")
    print(f"  - Images are resized to {TARGET_SIZE[0]}x{TARGET_SIZE[1]}.")
    print(f"  - Brightness increased by {brightness_gain:.0f}%.")
    print("  - Pixel values are normalized to the [0.0, 1.0] range.")
    print("You can now split X and y into training and validation sets.")

if __name__ == "__main__":
    # Entry point when this cell/script is run directly.
    # Required third-party libraries: pip install Pillow numpy
    main()
