In [8]:
import os
import shutil

import os
import shutil

def flatten_directory(root_dir, output_dir):
    """
    Copy every file found in the sub-directories of ``root_dir`` into the
    single flat folder ``output_dir``.

    Each copy is renamed ``<immediate parent folder name>_<original file name>``.
    When that name is already taken in ``output_dir`` a numeric suffix is
    inserted before the extension (``name_1.ext``, ``name_2.ext``, ...), so an
    existing file is never overwritten. Files located directly in ``root_dir``
    are not copied.

    Parameters
    ----------
    root_dir : str or os.PathLike
        Directory whose sub-directories are walked recursively.
    output_dir : str or os.PathLike
        Destination folder; created if missing. It may live inside
        ``root_dir`` -- it is then excluded from the walk.

    Returns
    -------
    None
        A one-line summary is printed when the copy is finished.

    Notes
    -----
    Running the function twice on the same folders copies everything again
    under suffixed names; it does not detect files copied by an earlier run.
    """
    os.makedirs(output_dir, exist_ok=True)

    # os.walk yields plain strings, so normalise a Path argument up front;
    # otherwise the "is this the root?" comparison below can never be true.
    root_dir = os.fspath(root_dir)
    output_real = os.path.realpath(output_dir)

    for parent, dirs, files in os.walk(root_dir):
        # Prune the output folder (when nested under root_dir) so files that
        # were just flattened are not picked up and copied a second time.
        # Sorting makes the traversal order -- and therefore which of two
        # colliding files receives the numeric suffix -- reproducible.
        # The list must be edited in place for os.walk to honour it.
        dirs[:] = sorted(
            d for d in dirs
            if os.path.realpath(os.path.join(parent, d)) != output_real
        )

        # Skip the root itself (we only want files from subdirectories)
        if parent == root_dir:
            continue

        parent_name = os.path.basename(parent)

        for file in sorted(files):
            src_path = os.path.join(parent, file)
            new_name = f"{parent_name}_{file}"
            dst_path = os.path.join(output_dir, new_name)

            # Never overwrite: probe name_1, name_2, ... until a free slot is found
            base, ext = os.path.splitext(new_name)
            counter = 1
            while os.path.exists(dst_path):
                dst_path = os.path.join(output_dir, f"{base}_{counter}{ext}")
                counter += 1

            # copy2 also tries to preserve file metadata (timestamps, mode)
            shutil.copy2(src_path, dst_path)

    print(f"Flattened files copied to: {output_dir}")

# Example usage: gather every file stored under the sub-folders of `root_dir`
# into the flat `flat_dir` folder, then report how many entries it contains.
# NOTE(review): re-running this cell copies the files again under suffixed names.
root_dir = "/home/adelb/Documents/Bpartners/Pleiades/dataset/bati_2014_cherbourg/17"
flat_dir = "/home/adelb/Documents/Bpartners/Pleiades/dataset/bati_2014_cherbourg/images"

flatten_directory(root_dir=root_dir, output_dir=flat_dir)

flat_entries = os.listdir(flat_dir)
print(len(flat_entries))


Flattened files copied to: /home/adelb/Documents/Bpartners/Pleiades/dataset/bati_2014_cherbourg/images
5810


In [1]:
import os
import cv2
import numpy as np

def filter_by_mask_blackness_and_pairness(images_dir, masks_dir, threshold=0.95):
    """
    Clean an (image, mask) dataset in place, in three passes.

    An image and a mask form a pair when they have the same file name in
    ``images_dir`` and ``masks_dir``.

    1. For every mask that has a matching image, delete both files when the
       fraction of mask pixels equal to 0 is greater than ``threshold``.
    2. Delete every image that has no matching mask.
    3. For every remaining image, delete the pair when the fraction of zeros
       in the image's last channel is greater than ``threshold``.

    Masks without a matching image and files OpenCV cannot decode are reported
    and left on disk.

    Parameters
    ----------
    images_dir : str
        Folder containing the images.
    masks_dir : str
        Folder containing the masks.
    threshold : float, default 0.95
        Fraction of zero-valued pixels above which a mask (pass 1) or an
        image's last channel (pass 3) is considered empty.

    Returns
    -------
    None
        One summary line is printed after each pass.

    Warning
    -------
    Files are deleted permanently with ``os.remove``; work on a copy of the
    dataset if it must be recoverable.
    """

    def _list_files(directory):
        # Sorted names of the regular files located directly in `directory`.
        return sorted(
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        )

    # ---- Pass 1: drop pairs whose mask is (almost) entirely black ----------
    removed = 0
    kept = 0

    for mask_name in _list_files(masks_dir):
        mask_path = os.path.join(masks_dir, mask_name)
        image_path = os.path.join(images_dir, mask_name)

        if not os.path.exists(image_path):
            print(f"⚠️ Skipping {mask_name} — no matching image found.")
            continue

        # Read the mask as grayscale
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if mask is None:
            print(f"⚠️ Could not read mask {mask_name}. Skipping.")
            continue

        # Only exact zeros count as black -- no tolerance is applied.
        black_fraction = np.mean(mask == 0)

        if black_fraction > threshold:
            os.remove(mask_path)
            os.remove(image_path)
            removed += 1
        else:
            kept += 1

    print(f"✅ Done. Kept {kept} pairs, removed {removed} pairs (threshold={threshold}).")

    # ---- Pass 2: drop images that have no mask ------------------------------
    # Both folders are listed again because pass 1 may have deleted files.
    # A set gives O(1) membership tests instead of scanning a list per image.
    mask_names = set(_list_files(masks_dir))

    removed = 0
    kept = 0

    for img_name in _list_files(images_dir):
        if img_name not in mask_names:
            os.remove(os.path.join(images_dir, img_name))
            removed += 1
        else:
            kept += 1

    print(f"✅ Done. Kept {kept} pairs, removed {removed} odd images.")

    # ---- Pass 3: drop pairs whose image is (almost) entirely empty ----------
    removed = 0
    kept = 0

    # List the folder again: reusing the pre-pass-2 listing would hand
    # cv2.imread the names of images that were just deleted.
    for img_name in _list_files(images_dir):
        image_path = os.path.join(images_dir, img_name)
        img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
        if img is None:
            print(f"⚠️ Could not read image {img_name}. Skipping.")
            continue

        # NOTE(review): for 4-channel files the last channel is presumably the
        # alpha channel; for 3-channel files it is whatever OpenCV stores last
        # -- confirm this is the intended test. Single-channel images have no
        # channel axis, so the whole array is used.
        last_channel = img if img.ndim == 2 else img[:, :, -1]

        if np.mean(last_channel == 0) > threshold:
            mask_path = os.path.join(masks_dir, img_name)
            os.remove(image_path)
            os.remove(mask_path)
            removed += 1
        else:
            kept += 1

    print(f"✅ Done. Kept {kept} pairs, removed {removed} pairs (threshold={threshold}).")
    

# Example usage: prune the flattened dataset in place.
# WARNING: this permanently deletes files from both folders.
images_path = "/home/adelb/Documents/Bpartners/Pleiades/dataset/bati_2014_cherbourg/images"
masks_path = "/home/adelb/Documents/Bpartners/Pleiades/dataset/bati_2014_cherbourg/masks"

filter_by_mask_blackness_and_pairness(
    images_dir=images_path,
    masks_dir=masks_path,
    threshold=0.99,
)


✅ Done. Kept 704 pairs, removed 0 pairs (threshold=0.99).
✅ Done. Kept 704 pairs, removed 0 odd images.
✅ Done. Kept 603 pairs, removed 101 pairs (threshold=0.99).
