In [17]:
# Ensure src is in the import path
import os
import sys

project_root = os.path.abspath("drive/MyDrive/restaurAr-T")
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

In [18]:
import json
import random
import numpy as np
from PIL import Image, ImageDraw, ImageFilter

# Import damage classes
from maskerada import ScratchDamage, WaterDiscolouration, CraquelureDamage, NoDamage

In [19]:
def apply_and_save_damage_set(
    original_image_pil,
    image_identifier,
    damage_applicator_instance,
    damage_type_name,
    base_output_directory
):
    """
    Applies a given damage to an image, saves the damaged image into
    a subdirectory.
    """
    print(f"Applying {damage_type_name} damage to '{image_identifier}'...")

    image_to_damage = original_image_pil.copy()
    damaged_image, damage_mask = damage_applicator_instance.apply(image_to_damage)

    # Define output subdirectories
    damaged_output_dir = os.path.join(
        base_output_directory, "generated-damaged-images"
    )
    os.makedirs(damaged_output_dir, exist_ok=True)

    damage_mask_dir = os.path.join(
        base_output_directory, "generated-damage-masks"
    )
    os.makedirs(damage_mask_dir, exist_ok=True)

    # Prepare filename
    id_str = str(image_identifier)
    id_str_base, id_str_ext = os.path.splitext(id_str)

    if id_str_ext.lower() in (
        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'
    ):
        id_str = id_str_base

    damage_type_name = "".join(
        c if c.isalnum() else '_' for c in damage_type_name
    )
    base_filename = f"{id_str}-{damage_type_name}"
    damaged_filename = f"{base_filename}.png"
    damaged_image_path = os.path.join(damaged_output_dir, damaged_filename)
    mask_filename = f"{base_filename}-mask.png"
    mask_image_path = os.path.join(damage_mask_dir, mask_filename)

    # Save damaged image
    try:
        damaged_image.save(damaged_image_path)
        damage_mask.save(mask_image_path)
    except Exception as e:
        print(
            f"  Error saving images for {damage_type_name} "
            f"on {image_identifier}: {e}"
        )

    return {
        "image_identifier": image_identifier,
        "damage_type": damage_type_name,
        "original_image_path": (
            original_image_pil.filename
            if hasattr(original_image_pil, 'filename') else None
        ),
        "damaged_image_path": damaged_image_path,
        "mask_image_path": mask_image_path,
    }


In [23]:
# Load your original image
original_image = Image.open("drive/MyDrive/restaurAr-T/data/sample-image-canvas-oil-Le-Moulin-de-la-Galette-1876.png")

# Define damage instances
damage_types = [
    ScratchDamage(),
    WaterDiscolouration(),
    CraquelureDamage(crack_mask_dir=os.path.join(project_root, "data", "crack-masks")),
    NoDamage()
]

# Output base directory
base_output_directory = "drive/MyDrive/restaurAr-T/data/data/sample-damaged-and-mask-data"

# Metadata collector
damage_metadata = []

# Apply each damage separately
for dmg_instance in damage_types:
    result = apply_and_save_damage_set(
        original_image_pil=original_image,
        image_identifier="your_image",
        damage_applicator_instance=dmg_instance,
        damage_type_name=dmg_instance.__class__.__name__,
        base_output_directory=base_output_directory
    )
    damage_metadata.append(result)

# Save metadata to JSON
with open("drive/MyDrive/restaurAr-T/data/damage_metadata.json", "w") as f:
    json.dump(damage_metadata, f, indent=2)

Applying ScratchDamage damage to 'your_image'...
Applying WaterDiscolouration damage to 'your_image'...
Applying CraquelureDamage damage to 'your_image'...
Applying NoDamage damage to 'your_image'...


In [27]:
import os

crack_mask_dir_path = os.path.join(project_root, "data", "crack-masks")

if os.path.isdir(crack_mask_dir_path):
    print(f"Directory exists: {crack_mask_dir_path}")
    files = os.listdir(crack_mask_dir_path)
    if files:
        print("Contents of the directory:")
        for f in files:
            print(f"- {f}")
    else:
        print("The directory exists but is empty.")
else:
    print(f"Directory does not exist: {crack_mask_dir_path}")
    print("Please ensure this directory exists in your Google Drive and contains crack mask images.")

Directory exists: /content/drive/MyDrive/restaurAr-T/data/crack-masks
Contents of the directory:
- 1014.jpg
- 1012.jpg
- 1011.jpg
- 1003.jpg
- 1001.jpg
- 1009.jpg
- 100.jpg
- 1000.jpg
- 10.jpg
- 1010.jpg
- 1.jpg
- 1004.jpg
- 1008.jpg
- 1006.jpg
- 101.jpg
- 1007.jpg
- 1002.jpg
- 0.jpg
- 1005.jpg
- 1033.jpg
- 1035.jpg
- 1027.jpg
- 1026.jpg
- 1036.jpg
- 1032.jpg
- 1041.jpg
- 103.jpg
- 1034.jpg
- 1029.jpg
- 1028.jpg
- 1031.jpg
- 1030.jpg
- 1022.jpg
- 1016.jpg
- 1024.jpg
- 102.jpg
- 1020.jpg
- 1025.jpg
- 1021.jpg
- 1018.jpg
- 1019.jpg
- 1023.jpg
- 1013.jpg
- 1015.jpg
- 1017.jpg
- 1090.jpg
- 1085.jpg
- 1081.jpg
- 1083.jpg
- 1089.jpg
- 1086.jpg
- 1084.jpg
- 1080.jpg
- 1077.jpg
- 1082.jpg
- 108.jpg
- 1078.jpg
- 1079.jpg
- 1076.jpg
- 1065.jpg
- 1071.jpg
- 1069.jpg
- 1070.jpg
- 1074.jpg
- 1073.jpg
- 1072.jpg
- 107.jpg
- 1067.jpg
- 1075.jpg
- 1063.jpg
- 1068.jpg
- 1066.jpg
- 1064.jpg
- 1059.jpg
- 1062.jpg
- 106.jpg
- 1050.jpg
- 1048.jpg
- 1053.jpg
- 105.jpg
- 1055.jpg
- 1057.jpg
- 1052.jpg
- 1058

In [28]:
import os

image_directory_path = os.path.join(project_root, "data", "img", "full")

if os.path.isdir(image_directory_path):
    print(f"Directory exists: {image_directory_path}")
    files = os.listdir(image_directory_path)
    if files:
        print("Contents of the directory:")
        for f in files:
            print(f"- {f}")
    else:
        print("The directory exists but is empty.")
else:
    print(f"Directory does not exist: {image_directory_path}")
    print("Please ensure this directory exists in your Google Drive and contains your image files.")

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
- afa849096ea57135c0dda1022fd4a6d8c9d2a5a7.jpg
- fac6c1036f7d22c96559d68e4a14721b6a3978e8.jpg
- 5be7a13bc36697e71044841aa01f010ae2545f51.jpg
- 7bf826691dcd7c20b450d3e4042ba55b9a16dfd3.jpg
- 64fc3933bc56ccb9681094aa01f808237ef9adcb.jpg
- 828748e4409a018d3bdd8ed114b6bc9bc21f7735.jpg
- 78e46c7d9ab5dacb1a0cdff528a82dbf33640863.jpg
- e7d9c7425258b2788a2d44d487bb5b0ffc5a8c74.jpg
- 93757ae9b909175cfd502223004021fab31cd79a.jpg
- 87aa48d87d9b36174ef90e3fdb92a0370c2bc6cf.jpg
- 331543115f948bf46c0a567c0dad90cafe088381.jpg
- 5cf9b7e1f6ebc3de6d20532bf27f39ad4ea3bf2c.jpg
- df6ecd67097aa37600df7fc761df233806d8183f.jpg
- 6917a49ca1de2f8c044d54f8925f3811680bec7d.jpg
- ee124065aba882d7f01b34381d76c314859afd5b.jpg
- 391ad3af3149256b302c3244eb5a658711726fdb.jpg
- 1e3c2313ea1d505b65dfc6aaa989230a2bfcc519.jpg
- 94445b91f5715dc00192754a81c720665cf37fc9.jpg
- ebcdfa3832c03759262e41a177e4edd8adef0c1e.jpg
- 04d26bedf99ff55f4355f15c7e5a85

In [32]:
# Define source and output directories
image_directory = "drive/MyDrive/restaurAr-T/data/img/full"
base_output_directory = "drive/MyDrive/restaurAr-T/data/sample-damaged-and-mask-data"
metadata_output_path = "drive/MyDrive/restaurAr-T/data/damage_metadata.json"

# Collect all image paths from the directory
valid_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
image_paths = [
    os.path.join(image_directory, fname)
    for fname in os.listdir(image_directory)
    if fname.lower().endswith(valid_extensions)
]

# Set a separate seed for sampling
sampling_rng = random.Random(42)

# Sample image paths reproducibly
image_paths = sampling_rng.sample(image_paths, k=5)

# Define damage instances
damage_classes = [ScratchDamage, WaterDiscolouration, CraquelureDamage]

# Metadata collector
damage_metadata = []

# Apply random damage to each image
for img_path in image_paths:
    try:
        original_image = Image.open(img_path)
        damage_class = random.choice(damage_classes)
        damage_instance = damage_class()

        result = apply_and_save_damage_set(
            original_image_pil=original_image,
            image_identifier=os.path.splitext(os.path.basename(img_path))[0],
            damage_applicator_instance=damage_instance,
            damage_type_name=damage_class.__name__,
            base_output_directory=base_output_directory
        )

        damage_metadata.append(result)

    except Exception as e:
        print(f"Error processing '{img_path}': {e}")

# Save metadata to JSON
with open(metadata_output_path, "w") as f:
    json.dump(damage_metadata, f, indent=2)


Applying WaterDiscolouration damage to '20f6803c610f9f5d8ff1b988c9bf4a8b36ef45ff'...
Applying ScratchDamage damage to 'e3165f4e337309690d02128162eac3569c93adff'...
Applying WaterDiscolouration damage to 'bc8f281358fc71cc62cbe78d4b550c5e0f3f4a0d'...
Applying ScratchDamage damage to 'bf77b7d255222474ba2c4c42107fc4ad90a9eee7'...
Applying WaterDiscolouration damage to 'cad97261fcba6d46b84fa585a16307ce0d8df918'...


In [33]:
import os
import sys
project_root = os.path.abspath("..")
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

import json
import random
import hashlib
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from PIL import Image, ImageDraw, ImageFilter
from tqdm import tqdm

# Import damage classes
from maskerada import ScratchDamage, WaterDiscolouration, CraquelureDamage, NoDamage

In [34]:
# --- Utility: Generate a reproducible seed per image path ---
def get_deterministic_seed(image_path):
    """
    Generates a reproducible integer seed from the image path.

    This ensures consistent random behavior per image across runs,
    even when processed in parallel.

    Args:
        image_path (str): Full path to the image.

    Returns:
        int: A 32-bit integer seed derived from the image path.
    """
    return int(hashlib.sha256(image_path.encode()).hexdigest(), 16) % (2**32)

In [35]:
def process_image(args):
    """
    Processes a single image by applying one randomly selected damage type.
    Skips regeneration if output files already exist, but always returns metadata.

    Args:
        args (tuple): Contains:
            - image_path (str): Path to the input image.
            - base_output_directory (str): Where to save results.
            - damage_classes (list): List of available damage classes.

    Returns:
        dict or None: Metadata dictionary if successful, None if errored.
    """
    import hashlib
    image_path, base_output_directory, damage_classes = args

    try:
        seed = int(hashlib.sha256(image_path.encode()).hexdigest(), 16) % (2**32)
        random.seed(seed)

        damage_class = random.choice(damage_classes)
        damage_type_name = damage_class.__name__

        image_identifier = os.path.splitext(os.path.basename(image_path))[0]
        original_image_filename = os.path.basename(image_path)
        damage_type_safe = "".join(c if c.isalnum() else "_" for c in damage_type_name)

        base_filename = f"{image_identifier}-{damage_type_safe}"
        damaged_output_dir = os.path.join(base_output_directory, "generated-damaged-images-v4")
        damage_mask_dir = os.path.join(base_output_directory, "generated-damage-masks-v4")

        damaged_image_path = os.path.join(damaged_output_dir, f"{base_filename}.png")
        mask_image_path = os.path.join(damage_mask_dir, f"{base_filename}-mask.png")

        if not (os.path.exists(damaged_image_path) and os.path.exists(mask_image_path)):
            original_image = Image.open(image_path).convert("RGB")
            damage_instance = damage_class()
            damaged_image, damage_mask = damage_instance.apply(original_image.copy())

            os.makedirs(damaged_output_dir, exist_ok=True)
            os.makedirs(damage_mask_dir, exist_ok=True)

            damaged_image.save(damaged_image_path)
            damage_mask.save(mask_image_path)

        return {
            "image_identifier": image_identifier,
            "original_image_filename": original_image_filename,
            "damage_type": damage_type_name,
            "original_image_path": image_path,
            "damaged_image_path": damaged_image_path,
            "mask_image_path": mask_image_path
        }

    except Exception as e:
        image_filename_fallback = os.path.basename(image_path)
        print(f"[ERROR] {image_filename_fallback}: {e}")
        return None


In [36]:
def process_all_images(
    image_directory,
    base_output_directory,
    metadata_output_path,
    damage_classes,
    sample_size=None,
    max_workers=None
):
    """
    Applies 1 randomly chosen damage type (including NoDamage) per image using multiprocessing.
    Each image may generate multiple variants. Metadata is saved in a JSON file.

    Args:
        image_directory (str): Folder of input images.
        base_output_directory (str): Where to save damaged images and masks.
        metadata_output_path (str): Path to the output .json metadata file.
        damage_classes (list): List of available damage class types (e.g., [Scratch, Water, NoDamage]).
        sample_size (int, optional): Number of images to process. If None, use all.
        max_workers (int, optional): Max number of parallel workers.

    Returns:
        None
    """
    from concurrent.futures import ProcessPoolExecutor, as_completed
    import json
    from tqdm import tqdm
    from collections import Counter

    valid_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')
    image_paths = [
        os.path.join(image_directory, fname)
        for fname in os.listdir(image_directory)
        if fname.lower().endswith(valid_extensions)
    ]

    if sample_size:
        rng = random.Random(42)
        image_paths = rng.sample(image_paths, k=sample_size)

    rng = random.Random(42)
    all_args = []

    for image_path in image_paths:
        # ~10% chance of NoDamage, ~90% chance of real damage
        if rng.random() < 0.1:
            selected_damage = NoDamage
        else:
            real_damages = [d for d in damage_classes if d.__name__ != "NoDamage"]
            selected_damage = rng.choice(real_damages)

        all_args.append((image_path, base_output_directory, [selected_damage]))

    metadata = []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_image, args): args for args in all_args}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            result = future.result()
            if result is not None:
                metadata.append(result)

    from collections import Counter
    damage_counter = Counter([entry['damage_type'] for entry in metadata])

    with open(metadata_output_path, "w") as f:
        json.dump(metadata, f, indent=2)

    print(f"Saved {len(metadata)} image variants to '{metadata_output_path}'.")
    print("Damage type distribution:")
    for damage_type, count in damage_counter.items():
        print(f"- {damage_type}: {count}")


In [37]:
process_all_images(
    image_directory="drive/MyDrive/restaurAr-T/data/img/full",
    base_output_directory="drive/MyDrive/restaurAr-T/data/v4-damaged-and-mask-multiprocessing",
    metadata_output_path="drive/MyDrive/restaurAr-T/data/damage_metadata_v4.json",
    damage_classes=[ScratchDamage, WaterDiscolouration, CraquelureDamage, NoDamage],
    sample_size=None,
    max_workers=None
)

Output hidden; open in https://colab.research.google.com to view.

In [38]:
import pandas as pd
with open('drive/MyDrive/restaurAr-T/data/damage_metadata_v4.json', 'r', encoding='utf-8') as f:
    metadata_list = json.load(f)
df = pd.DataFrame(metadata_list)
print(f'Metadata file shape:\n{df.shape}')
print(f'Columns:\n{list(df.columns)}')
df.head()

Metadata file shape:
(33146, 6)
Columns:
['image_identifier', 'original_image_filename', 'damage_type', 'original_image_path', 'damaged_image_path', 'mask_image_path']


Unnamed: 0,image_identifier,original_image_filename,damage_type,original_image_path,damaged_image_path,mask_image_path
0,10cc94d490eda71106e915e3c91d001b80d796d3,10cc94d490eda71106e915e3c91d001b80d796d3.jpg,ScratchDamage,drive/MyDrive/restaurAr-T/data/img/full/10cc94...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...
1,bb2eb0196b361473ca716a544a998b8c0d2b72bc,bb2eb0196b361473ca716a544a998b8c0d2b72bc.jpg,ScratchDamage,drive/MyDrive/restaurAr-T/data/img/full/bb2eb0...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...
2,01a842cede1f52a881d4a93c8caeaf75519fa983,01a842cede1f52a881d4a93c8caeaf75519fa983.jpg,ScratchDamage,drive/MyDrive/restaurAr-T/data/img/full/01a842...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...
3,c8478202da9f21a77e3d3d1935dd10e824a471ee,c8478202da9f21a77e3d3d1935dd10e824a471ee.jpg,ScratchDamage,drive/MyDrive/restaurAr-T/data/img/full/c84782...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...
4,26829871fd311871a4053837adec209c1b620f2e,26829871fd311871a4053837adec209c1b620f2e.jpg,NoDamage,drive/MyDrive/restaurAr-T/data/img/full/268298...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...,drive/MyDrive/restaurAr-T/data/v4-damaged-and-...
