In [1]:
import os
import sys
import json
import random
from pathlib import Path
import shutil
import hashlib
from concurrent.futures import ProcessPoolExecutor, as_completed

# Add src/ to sys.path
project_root = os.path.abspath("drive/MyDrive/restaurAr-T")
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

import numpy as np
from PIL import Image, ImageDraw, ImageFilter
from tqdm import tqdm

# Local import
from maskerada import (
    ScratchDamage,
    WaterDiscolouration,
    CraquelureDamage,
    NoDamage
)

# Diagnostic: List contents of src_path to check for 'maskerada'
print(f"\nContents of {src_path}:")
!ls -F "{src_path}"


Contents of /content/drive/MyDrive/restaurAr-T/src:
maskerada.py  __pycache__/


In [2]:
def get_deterministic_seed(image_path):
    """
    Generate a reproducible integer seed based on the SHA-256 hash of the image path.

    This ensures consistent damage generation per image, even across runs.

    Args:
        image_path (str): Full path to the input image file.

    Returns:
        int: A 32-bit integer seed derived from the image path.
    """
    return int(hashlib.sha256(image_path.encode()).hexdigest(), 16) % (2**32)

In [3]:
def process_image(args):
    """
    Applies a specific damage class to an image, generates a damage mask,
    and saves both the damaged image and mask to disk.

    Args:
        args (tuple): A 3-tuple containing:
            - image_path (str): Full path to the image to process.
            - base_output_directory (str): Base directory to save outputs.
            - damage_class (class or callable): A damage class or factory.

    Returns:
        dict or None: A dictionary containing metadata about the processed image,
                      or None if processing failed or yielded no result.
                      Fields include:
                      - filename: Original image filename
                      - damage_type: Class name of the applied damage
                      - damaged_image: Path to saved damaged image
                      - damage_mask: Path to saved damage mask
    """
    image_path, base_output_directory, damage_class = args
    original_image_filename = os.path.basename(image_path)
    image_identifier = Path(original_image_filename).stem

    try:
        seed = get_deterministic_seed(image_path)
        random.seed(seed)
        image = Image.open(image_path).convert("RGB")

        damage_instance = damage_class() if callable(damage_class) else damage_class
        damaged_image, damage_mask = damage_instance.apply(image)

        if damaged_image is None or damage_mask is None:
            print(f"[WARNING] {damage_instance.__class__.__name__} returned None for {original_image_filename}")
            return None

        damage_type_name = damage_instance.__class__.__name__
        damage_suffix = damage_type_name.lower().replace("damage", "")

        damage_img_dir = os.path.join(base_output_directory, "generated-damaged-images")
        damage_mask_dir = os.path.join(base_output_directory, "generated-damage-masks")
        os.makedirs(damage_img_dir, exist_ok=True)
        os.makedirs(damage_mask_dir, exist_ok=True)

        base_filename = f"{image_identifier}-{damage_suffix}"
        damaged_image_path = os.path.join(damage_img_dir, base_filename + ".png")
        damage_mask_path = os.path.join(damage_mask_dir, base_filename + "-mask.png")

        damaged_image.convert("RGB").save(damaged_image_path)
        damage_mask.save(damage_mask_path)

        return {
            "filename": original_image_filename,
            "damage_type": damage_type_name,
            "damaged_image": damaged_image_path,
            "damage_mask": damage_mask_path,
        }

    except Exception as e:
        print(f"[ERROR] Failed to process {original_image_filename}: {e}")
        return None

In [4]:
def process_all_images(
    image_directory,
    base_output_directory,
    metadata_output_path,
    damage_classes,
    sample_size=None,
    max_workers=None
):
    """
    Applies damage to a set of input images according to fixed class proportions
    and saves metadata about the generated outputs.

    Each image is assigned to a damage type based on the following distribution:
        - Craquelure: 30%
        - Scratch: 30%
        - Water Damage: 30%
        - No Damage: 10%

    Damage application is done in parallel using multiprocessing.

    Args:
        image_directory (str): Directory containing original input images.
        base_output_directory (str): Directory where damaged images and masks are saved.
        metadata_output_path (str): Path to save a JSON metadata file summarizing results.
        damage_classes (list): List of all possible damage classes (including create_craquelure).
                               This argument is not used to assign classes randomly.
        sample_size (int or None): If specified, subsample N images from the directory.
        max_workers (int or None): Number of worker processes to use in parallel.

    Returns:
        None: Outputs are saved to disk (damaged images, masks, and metadata).
    """
    image_paths = sorted([
        str(p) for p in Path(image_directory).glob("*")
        if p.suffix.lower() in [".jpg", ".jpeg", ".png"]
    ])

    if sample_size:
        random.seed(42)
        image_paths = random.sample(image_paths, sample_size)

    total_images = len(image_paths)

    class_distribution = {
        "Craquelure": 0.30,
        "Scratch": 0.30,
        "WaterDiscolouration": 0.30,
        "NoDamage": 0.10,
    }

    damage_class_map = {
        "Craquelure": create_craquelure,
        "Scratch": ScratchDamage,
        "WaterDiscolouration": WaterDiscolouration,
        "NoDamage": NoDamage,
    }

    class_counts = {
        cls: int(total_images * prop)
        for cls, prop in class_distribution.items()
    }

    random.seed(42)
    random.shuffle(image_paths)

    assigned_tasks = []
    start = 0
    for class_name, count in class_counts.items():
        selected_images = image_paths[start:start + count]
        for img_path in selected_images:
            assigned_tasks.append((img_path, base_output_directory, damage_class_map[class_name]))
        start += count

    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_image, task) for task in assigned_tasks]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            result = future.result()
            if result:
                results.append(result)

    if metadata_output_path:
        with open(metadata_output_path, "w") as f:
            json.dump(results, f, indent=2)

In [5]:
def create_craquelure():
    """
    Factory function for CraquelureDamage to ensure it's pickleable in multiprocessing.
    """
    return CraquelureDamage(crack_mask_dir="drive/MyDrive/restaurAr-T/data/crack-masks")

In [6]:
process_all_images(
    image_directory="drive/MyDrive/restaurAr-T/data/img/full",
    base_output_directory="drive/MyDrive/restaurAr-T/data/v6-damaged-and-mask-dataset",
    metadata_output_path="drive/MyDrive/restaurAr-T/data/v6-damage_metadata.json",
    damage_classes=[
        ScratchDamage,
        WaterDiscolouration,
        create_craquelure,
        NoDamage
    ],
    sample_size=None,
    max_workers=12       # or None for default
)

Processing: 100%|██████████| 47343/47343 [3:14:51<00:00,  4.05it/s]
