In [None]:
import os
from collections import Counter
from tqdm import tqdm
import shutil
from PIL import Image
import numpy as np

def check_folder_file_completeness(dataset_dir, save_problematic=True, max_scene_id=2500):
    """Audit every ``rearrangement_<id>`` folder for missing or unexpected files.

    A scene is reported as problematic when its folder is absent for an ID in
    ``1..max_scene_id``, when the folder cannot be listed, when one of the
    five required files is missing, or when the folder holds any entry other
    than those five files.

    Folders whose suffix is not a plain decimal number (for example
    ``rearrangement_backup``) are skipped and reported, instead of aborting
    the whole audit with a ``ValueError``.

    Args:
        dataset_dir: Directory that contains the ``rearrangement_*`` folders.
        save_problematic: When true and at least one scene is problematic,
            write the IDs to ``problematic_scene_ids.txt`` in the current
            working directory.
        max_scene_id: Highest scene ID expected to exist; IDs start at 1.

    Returns:
        dict with the keys ``complete_scene_ids``, ``incomplete_scene_ids``
        (this list also contains the missing IDs), ``missing_scene_ids``
        (all sorted lists of ints) and ``folder_issues`` (scene ID as a
        string -> human-readable description).
    """
    required_files = {
        'initial_image.png',
        'initial_labels.json',
        'meta.json',
        'target_image.png',
        'target_labels.json'
    }
    prefix = 'rearrangement_'

    # Map each scene folder to its numeric ID; remember folders we cannot parse.
    scene_id_by_dir = {}
    skipped_dirs = []
    for entry in os.listdir(dataset_dir):
        if not entry.startswith(prefix) or not os.path.isdir(os.path.join(dataset_dir, entry)):
            continue
        suffix = entry[len(prefix):]
        if suffix.isdecimal():
            scene_id_by_dir[entry] = int(suffix)
        else:
            skipped_dirs.append(entry)

    existing_dirs = list(scene_id_by_dir)
    existing_scene_ids = set(scene_id_by_dir.values())

    # Find missing scene IDs
    all_expected_ids = set(range(1, max_scene_id + 1))
    missing_scene_ids = all_expected_ids - existing_scene_ids

    complete_folders = []
    incomplete_folders = []
    folder_issues = {}

    print(f"Checking scene IDs 1 to {max_scene_id}...")
    print(f"Found {len(existing_dirs)} existing folders")
    print(f"Missing {len(missing_scene_ids)} folders")
    if skipped_dirs:
        print(f"Ignored {len(skipped_dirs)} folders with a non-numeric suffix: {sorted(skipped_dirs)}")

    # Missing folders are problematic too; sorted so the output is deterministic.
    for scene_id in sorted(missing_scene_ids):
        folder_issues[str(scene_id)] = "Missing folder"
        incomplete_folders.append(str(scene_id))

    # Check existing folders for file completeness
    for scene_dir in tqdm(existing_dirs, desc="Checking folder contents"):
        scene_path = os.path.join(dataset_dir, scene_dir)
        scene_id = str(scene_id_by_dir[scene_dir])

        try:
            files_in_folder = set(os.listdir(scene_path))
        except Exception as e:
            # Best effort: an unreadable folder is recorded, not fatal.
            folder_issues[scene_id] = f"Error reading folder: {e}"
            incomplete_folders.append(scene_id)
            continue

        missing_files = required_files - files_in_folder
        extra_files = files_in_folder - required_files

        # A folder is problematic if it has missing files OR extra files
        if missing_files or extra_files:
            issues = []
            if missing_files:
                issues.append(f"Missing: {sorted(missing_files)}")
            if extra_files:
                issues.append(f"Extra: {sorted(extra_files)}")

            incomplete_folders.append(scene_id)
            folder_issues[scene_id] = "; ".join(issues)
        else:
            complete_folders.append(scene_id)

    # incomplete_folders holds missing + incomplete scenes, hence the subtraction.
    incomplete_only_count = len(incomplete_folders) - len(missing_scene_ids)

    if save_problematic and incomplete_folders:
        with open('problematic_scene_ids.txt', 'w') as f:
            f.write(f"# Problematic Scene IDs (1-{max_scene_id})\n")
            f.write(f"# Total problematic: {len(incomplete_folders)}\n")
            f.write(f"# Missing folders: {len(missing_scene_ids)}\n")
            f.write(f"# Incomplete folders: {incomplete_only_count}\n")
            f.write("#" + "="*50 + "\n")

            for scene_id in sorted(incomplete_folders, key=int):
                issue = folder_issues.get(scene_id, "Unknown issue")
                f.write(f"{scene_id}  # {issue}\n")
        print(f"Saved {len(incomplete_folders)} problematic scene IDs to 'problematic_scene_ids.txt'")

    # Guard against max_scene_id <= 0, which used to raise ZeroDivisionError.
    completion_rate = len(complete_folders) / max_scene_id * 100 if max_scene_id > 0 else 0.0

    print(f"\n=== FOLDER COMPLETENESS SUMMARY ===")
    print(f"Expected scenes: {max_scene_id}")
    print(f"Existing folders: {len(existing_dirs)}")
    print(f"Complete folders: {len(complete_folders)}")
    print(f"Missing folders: {len(missing_scene_ids)}")
    print(f"Incomplete folders: {incomplete_only_count}")
    print(f"Total problematic: {len(incomplete_folders)}")
    print(f"Completion rate: {completion_rate:.2f}%")

    if folder_issues:
        print(f"\n=== SAMPLE FOLDER ISSUES ===")
        sample_issues = list(folder_issues.items())[:10]
        for scene_id, issue in sample_issues:
            print(f"Scene {scene_id}: {issue}")
        if len(folder_issues) > 10:
            print(f"... and {len(folder_issues) - 10} more folders with issues")

    return {
        'complete_scene_ids': sorted(int(x) for x in complete_folders),
        'incomplete_scene_ids': sorted(int(x) for x in incomplete_folders),
        'missing_scene_ids': sorted(missing_scene_ids),
        'folder_issues': folder_issues
    }

def load_problematic_scene_ids(txt_file):
    """Read scene IDs from a report file written by the checks in this notebook.

    Each useful line has the form ``<id>  # <issue>``. Blank lines, lines
    starting with ``#`` and lines whose leading token is not all digits are
    ignored.

    Args:
        txt_file: Path of the report file.

    Returns:
        The scene IDs as strings, in file order. An empty list is returned
        (after printing a notice) when the file does not exist.
    """
    if not os.path.exists(txt_file):
        print(f"File {txt_file} not found!")
        return []

    with open(txt_file, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # Keep only the text in front of the first '#', i.e. the ID column.
        candidates = (
            text.partition('#')[0].strip()
            for text in stripped_lines
            if text and not text.startswith('#')
        )
        return [candidate for candidate in candidates if candidate.isdigit()]

def is_image_black(image_path, threshold: float = 0.01) -> bool:
    """Return True when the image's mean colour intensity is below ``threshold``.

    Intensity is the mean over all pixels and colour channels, scaled to
    ``[0, 1]`` by dividing 8-bit values by 255. Images with an alpha channel,
    a palette or 1-bit pixels are converted to RGB first. Otherwise an opaque
    black RGBA image averages 0.25 because of its alpha channel and is never
    flagged, and palette indices or booleans would be read as intensities.

    Args:
        image_path: Path of the image file to inspect.
        threshold: Upper bound (exclusive) on the mean intensity for an image
            to count as black.

    Returns:
        True if the image is (near) black. False otherwise, and also False
        when the image cannot be read; the error is printed in that case.
    """
    # Modes whose raw array is not plain 8-bit colour intensity.
    modes_needing_rgb = {"1", "P", "PA", "LA", "RGBA"}
    try:
        # The context manager closes the file handle as soon as we are done.
        with Image.open(image_path) as image:
            if image.mode in modes_needing_rgb:
                image = image.convert("RGB")
            pixels = np.asarray(image, dtype=np.float64) / 255.0
        return bool(np.mean(pixels) < threshold)
    except Exception as e:
        # Deliberate best effort: an unreadable image is reported, not fatal.
        print(f"Error checking image {image_path}: {e}")
        return False

def find_black_image_scenes(dataset_dir):
    """List the scenes whose initial or target image is (near) black.

    Only folders named ``rearrangement_<decimal id>`` are inspected. Folders
    with any other suffix (for example ``rearrangement_backup``) are ignored
    rather than raising ``ValueError`` while parsing the ID.

    Args:
        dataset_dir: Directory that contains the ``rearrangement_*`` folders.

    Returns:
        Scene IDs as strings without zero padding, ordered by numeric ID.
        A scene is listed when ``initial_image.png`` or ``target_image.png``
        exists and ``is_image_black`` returns True for it.
    """
    prefix = 'rearrangement_'
    image_names = ('initial_image.png', 'target_image.png')

    rearrangement_dirs = [
        entry for entry in os.listdir(dataset_dir)
        if entry.startswith(prefix)
        and entry[len(prefix):].isdecimal()
        and os.path.isdir(os.path.join(dataset_dir, entry))
    ]
    # Numeric order makes the result independent of the listing order.
    rearrangement_dirs.sort(key=lambda entry: int(entry[len(prefix):]))

    problematic_scenes = []

    for scene_dir in tqdm(rearrangement_dirs, desc="Checking for black images"):
        scene_path = os.path.join(dataset_dir, scene_dir)
        scene_id = str(int(scene_dir[len(prefix):]))

        # any() stops at the first black image, so the second is not decoded.
        image_paths = (os.path.join(scene_path, name) for name in image_names)
        if any(os.path.exists(path) and is_image_black(path) for path in image_paths):
            problematic_scenes.append(scene_id)

    return problematic_scenes

def create_combined_report(dataset_dir, report_file, check_black_images: bool = True, max_scene_id: int = 2500):
    """Run the dataset checks and write one report of all problematic scenes.

    Combines the result of ``check_folder_file_completeness`` with, when
    enabled, the result of ``find_black_image_scenes``.

    The report is rewritten on every call, with only the header when nothing
    is wrong. Previously a clean run left any older report untouched, so a
    later removal step could act on scene IDs from a stale run.

    Args:
        dataset_dir: Directory that contains the ``rearrangement_*`` folders.
        report_file: Path of the report to write. Lines have the form
            ``<id>  # <issue>`` below a ``#``-prefixed header.
        check_black_images: Also flag scenes with (near) black images.
        max_scene_id: Highest scene ID expected to exist.

    Returns:
        Set of problematic scene IDs as strings.
    """
    # Get file completeness issues
    file_completeness_result = check_folder_file_completeness(
        dataset_dir, save_problematic=False, max_scene_id=max_scene_id
    )
    incomplete_scene_ids = file_completeness_result['incomplete_scene_ids']

    # Get black image issues if enabled
    black_image_scenes = find_black_image_scenes(dataset_dir) if check_black_images else []

    # Combine problematic scene IDs (completeness IDs are ints, black-image IDs are strings)
    all_problematic = {str(scene_id) for scene_id in incomplete_scene_ids} | set(black_image_scenes)
    # Copy so the caller's completeness result is not modified below.
    folder_issues = dict(file_completeness_result['folder_issues'])

    # Add black image issues
    for scene_id in black_image_scenes:
        if scene_id in folder_issues:
            folder_issues[scene_id] += "; Black images"
        else:
            folder_issues[scene_id] = "Black images"

    # Always rewrite the report so it reflects this run, even when it is clean.
    with open(report_file, 'w') as f:
        f.write(f"# Problematic Scene IDs (File Issues{' + Black Images' if check_black_images else ''})\n")
        f.write(f"# Total problematic: {len(all_problematic)}\n")
        f.write(f"# File issues: {len(incomplete_scene_ids)}\n")
        if check_black_images:
            f.write(f"# Black images: {len(black_image_scenes)}\n")
        f.write("#" + "="*50 + "\n")

        for scene_id in sorted(all_problematic, key=int):
            issue = folder_issues.get(scene_id, "Unknown issue")
            f.write(f"{scene_id}  # {issue}\n")
    print(f"Saved {len(all_problematic)} problematic scene IDs to '{report_file}'")

    print(f"\n=== COMBINED SUMMARY ===")
    print(f"File completeness issues: {len(incomplete_scene_ids)}")
    if check_black_images:
        print(f"Black image issues: {len(black_image_scenes)}")
    print(f"Total unique problematic scenes: {len(all_problematic)}")

    return all_problematic

# Notebook-wide configuration; the later cells read both names.
dataset_dir = "dataset-sim"  # directory holding the rearrangement_* scene folders
report_file = "problematic_scene_ids.txt"  # report path, relative to the working directory

# Delete any report left over from a previous run so that the later cells
# can never act on stale scene IDs.
if os.path.exists(report_file):
    os.remove(report_file)
    print(f"Removed existing {report_file}")

# Run the file-completeness check plus the black-image check and collect the
# problematic scene IDs (as strings). max_scene_id is left at its default of 2500.
all_problematic = create_combined_report(dataset_dir, report_file, check_black_images=True)

Checking scene IDs 1 to 2500...
Found 2500 existing folders
Missing 0 folders


Checking folder contents: 100%|██████████| 2500/2500 [00:00<00:00, 11644.07it/s]




=== FOLDER COMPLETENESS SUMMARY ===
Expected scenes: 2500
Existing folders: 2500
Complete folders: 2500
Missing folders: 0
Incomplete folders: 0
Total problematic: 0
Completion rate: 100.00%


Checking for black images: 100%|██████████| 2500/2500 [01:11<00:00, 35.02it/s]


=== COMBINED SUMMARY ===
File completeness issues: 0
Black image issues: 0
Total unique problematic scenes: 0





In [None]:
def remove_scenes_from_file(dataset_dir, txt_file, dry_run: bool = True):
    """Delete the scene folders listed in a problematic-scenes report.

    Each ID is first looked up as ``rearrangement_<id zero-padded to 5>``.
    If that folder does not exist, any ``rearrangement_<digits>`` folder with
    the same numeric ID is used instead. This matches the detection code,
    which accepts any amount of zero padding.

    Args:
        dataset_dir: Directory that contains the ``rearrangement_*`` folders.
        txt_file: Report file read with ``load_problematic_scene_ids``.
        dry_run: When true (the default) only print what would be removed.

    Returns:
        None. Progress and a summary are printed.
    """
    problematic_scene_ids = load_problematic_scene_ids(txt_file)

    if not problematic_scene_ids:
        print("No problematic scene IDs found in the file.")
        return

    print(f"Loaded {len(problematic_scene_ids)} scene IDs from {txt_file}")

    # Index the folders by numeric ID for names that are not padded to 5 digits.
    prefix = 'rearrangement_'
    folders_by_id = {}
    if os.path.isdir(dataset_dir):
        for entry in os.listdir(dataset_dir):
            suffix = entry[len(prefix):] if entry.startswith(prefix) else ''
            if suffix.isdecimal():
                folders_by_id.setdefault(int(suffix), entry)

    removed_count = 0
    failed_count = 0
    not_found_count = 0

    for scene_id in tqdm(problematic_scene_ids, desc="Processing scenes"):
        folder_name = f"rearrangement_{scene_id.zfill(5)}"
        folder_path = os.path.join(dataset_dir, folder_name)

        if not os.path.exists(folder_path):
            # Fall back to a differently padded folder with the same ID.
            fallback = folders_by_id.get(int(scene_id)) if scene_id.isdecimal() else None
            folder_path = os.path.join(dataset_dir, fallback) if fallback else None
            # The index can be stale once folders have been deleted in this loop.
            if folder_path is None or not os.path.exists(folder_path):
                not_found_count += 1
                continue

        if dry_run:
            print(f"Would remove: {folder_path}")
            continue

        try:
            shutil.rmtree(folder_path)
            print(f"Removed: {folder_path}")
            removed_count += 1
        except Exception as e:
            # Keep going, but count the failure so the summary is accurate.
            print(f"Error removing {folder_path}: {e}")
            failed_count += 1

    print(f"\n=== REMOVAL SUMMARY ===")
    if dry_run:
        print(f"DRY RUN: Found {len(problematic_scene_ids)} scenes to process")
        print(f"Folders that would be removed: {len(problematic_scene_ids) - not_found_count}")
        print(f"Missing folders (already absent): {not_found_count}")
        print("Set dry_run=False to actually delete the folders.")
    else:
        print(f"Successfully removed: {removed_count} folders")
        if failed_count:
            print(f"Failed to remove: {failed_count} folders")
        print(f"Missing folders (already absent): {not_found_count}")
        print(f"Total processed: {len(problematic_scene_ids)}")

# Dry run first: with dry_run=True the function only prints the folders it
# would delete and removes nothing. Call it again with dry_run=False to delete.
# NOTE(review): the original note said "run in problematic version" --
# presumably the dataset copy that still contains the bad scenes; confirm.
print("=== DRY RUN ===")
remove_scenes_from_file(dataset_dir, report_file, dry_run=True)

=== DRY RUN ===
File problematic_scene_ids.txt not found!
No problematic scene IDs found in the file.


In [None]:
def copy_problematic_scenes_to_folder(dataset_dir, output_dir, txt_file):
    """Copy the scene folders listed in a report into ``output_dir``.

    Each ID is first looked up as ``rearrangement_<id zero-padded to 5>``.
    If that folder does not exist, any ``rearrangement_<digits>`` folder with
    the same numeric ID is used instead. This matches the detection code,
    which accepts any amount of zero padding.

    Scenes already present in ``output_dir`` are skipped and counted, so the
    cell can be re-run without one ``copytree`` error per scene.

    Args:
        dataset_dir: Directory that contains the ``rearrangement_*`` folders.
        output_dir: Destination directory; created if it does not exist.
        txt_file: Report file read with ``load_problematic_scene_ids``.

    Returns:
        None. A summary is printed.
    """
    problematic_scene_ids = load_problematic_scene_ids(txt_file)

    if not problematic_scene_ids:
        print("No problematic scene IDs found in the file.")
        return

    os.makedirs(output_dir, exist_ok=True)
    print(f"Created output directory: {output_dir}")

    # Index the folders by numeric ID for names that are not padded to 5 digits.
    prefix = 'rearrangement_'
    folders_by_id = {}
    if os.path.isdir(dataset_dir):
        for entry in os.listdir(dataset_dir):
            suffix = entry[len(prefix):] if entry.startswith(prefix) else ''
            if suffix.isdecimal():
                folders_by_id.setdefault(int(suffix), entry)

    copied_count = 0
    skipped_count = 0
    failed_count = 0
    not_found_count = 0

    for scene_id in tqdm(problematic_scene_ids, desc="Copying scenes"):
        folder_name = f"rearrangement_{scene_id.zfill(5)}"

        if not os.path.exists(os.path.join(dataset_dir, folder_name)):
            # Fall back to a differently padded folder with the same ID.
            folder_name = folders_by_id.get(int(scene_id)) if scene_id.isdecimal() else None
            if folder_name is None:
                not_found_count += 1
                continue

        source_path = os.path.join(dataset_dir, folder_name)
        dest_path = os.path.join(output_dir, folder_name)

        if os.path.exists(dest_path):
            # copytree refuses to write into an existing directory.
            skipped_count += 1
            continue

        try:
            shutil.copytree(source_path, dest_path)
            copied_count += 1
        except Exception as e:
            # Keep going, but count the failure so the summary is accurate.
            print(f"Error copying {source_path}: {e}")
            failed_count += 1

    print(f"\n=== COPY SUMMARY ===")
    print(f"Successfully copied: {copied_count} folders")
    if skipped_count:
        print(f"Skipped (already in output directory): {skipped_count} folders")
    if failed_count:
        print(f"Failed to copy: {failed_count} folders")
    print(f"Missing folders (not found): {not_found_count}")
    print(f"Total processed: {len(problematic_scene_ids)}")

# Copy every scene listed in the report into ./problematic_scenes for manual
# inspection; the source folders in dataset_dir are left untouched.
# NOTE(review): the original note said "run in correct version" -- the meaning
# is not evident from this notebook; confirm which dataset copy is intended.
copy_problematic_scenes_to_folder(dataset_dir, output_dir="problematic_scenes", txt_file=report_file)

File problematic_scene_ids.txt not found!
No problematic scene IDs found in the file.
