## Interrogate: find images with transparency

In [3]:
import os
import threading
from queue import Queue
from tqdm import tqdm
from PIL import Image, ImageFile

# Pillow setting: load truncated image files instead of raising when pixel data is read.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Pillow setting: None removes the pixel-count limit (the decompression-bomb check),
# so arbitrarily large images can be opened. Only appropriate for trusted datasets.
Image.MAX_IMAGE_PIXELS = None

# Shared bookkeeping for one verification run, updated while the worker threads run.
total_files = transparent_files = error_files = 0
transparent_image_paths = []  # paths of every image flagged as transparent

# One lock per piece of shared state: transparent counter, error counter, path list.
transparent_lock, error_lock, list_lock = (threading.Lock() for _ in range(3))

def has_transparency(image_path):
    """Report whether the image at ``image_path`` carries transparency.

    An image counts as transparent when its mode is ``RGBA`` or ``LA``, or when
    its ``info`` dict has a ``transparency`` entry. Every hit increments the
    module-level ``transparent_files`` counter and appends the path to
    ``transparent_image_paths`` (each under its own lock).

    A file that cannot be opened or inspected is treated as broken and deleted
    from disk; ``error_files`` counts the files that were actually removed.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if the image has transparency, otherwise False (including every
        failure case). This function does not raise for unreadable files or
        for a failed delete.
    """
    global transparent_files, error_files

    # Only the open/inspect step is guarded: a failure here means the file is
    # unreadable, which is the one case that should lead to deletion.
    try:
        with Image.open(image_path) as img:
            is_transparent = img.mode in ('RGBA', 'LA') or 'transparency' in img.info
    except Exception as e:
        print(f"Error processing {image_path}: {e}. Deleting file.")
        try:
            os.remove(image_path)
        except OSError as remove_error:
            # The delete itself can fail (file already gone, no permission).
            # Letting that escape would kill the calling worker thread before
            # it marks the queue item done, so report it and carry on.
            print(f"Could not delete {image_path}: {remove_error}")
            return False
        with error_lock:
            error_files += 1
        return False

    if not is_transparent:
        return False

    with transparent_lock:
        transparent_files += 1
    with list_lock:
        transparent_image_paths.append(image_path)
    return True

def worker(queue, pbar):
    """Consume image paths from ``queue`` until a ``None`` sentinel arrives.

    Each path is passed to ``has_transparency``. Whatever happens while
    checking an item, the item is marked done and the progress bar advances,
    so a ``queue.join()`` in the producer cannot block on a failed item.

    Args:
        queue: Queue yielding image paths, terminated by one ``None`` per worker.
        pbar: Progress bar object exposing ``update(n)``.
    """
    while True:
        image_path = queue.get()
        if image_path is None:
            # Sentinel: acknowledge it and stop this worker.
            queue.task_done()
            break
        try:
            has_transparency(image_path)
        except Exception as e:
            # Keep the thread alive; a dead worker would leave items unprocessed.
            print(f"Unexpected error while checking {image_path}: {e}")
        finally:
            queue.task_done()
            pbar.update(1)

def main(folder_path, num_worker_threads=10, output_path='transparent_images.txt'):
    """Scan ``folder_path`` recursively and record every image with transparency.

    Files ending in .png, .jpg, .jpeg, .webp or .bmp (case-insensitive) are fed
    to a pool of ``worker`` threads. When all of them have been checked, a
    summary is printed and the paths collected in ``transparent_image_paths``
    are written to ``output_path``, one per line.

    Args:
        folder_path: Root directory to walk.
        num_worker_threads: Number of worker threads to start (at least 1).
        output_path: Text file that receives the transparent image paths.

    Raises:
        ValueError: If ``num_worker_threads`` is smaller than 1.
    """
    global total_files, transparent_files, error_files

    if num_worker_threads < 1:
        raise ValueError("num_worker_threads must be at least 1")

    # Start every run from a clean slate so calling main() again does not
    # double-count or write duplicate paths.
    transparent_files = 0
    error_files = 0
    transparent_image_paths.clear()

    image_extensions = ('.png', '.jpg', '.jpeg', '.webp', '.bmp')
    image_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(folder_path)
        for file in files
        if file.lower().endswith(image_extensions)
    ]
    total_files = len(image_files)

    queue = Queue()
    threads = []

    with tqdm(total=total_files, desc="Verifying Images") as pbar:
        try:
            for _ in range(num_worker_threads):
                t = threading.Thread(target=worker, args=(queue, pbar))
                t.start()
                threads.append(t)

            for image_path in image_files:
                queue.put(image_path)

            queue.join()
        finally:
            # Always hand one sentinel to each started worker and wait for it,
            # even if feeding or joining was interrupted, so no thread is left
            # blocked on queue.get().
            for _ in threads:
                queue.put(None)
            for t in threads:
                t.join()

    print(f"Verification complete. Found {transparent_files} transparent images and deleted {error_files} error files out of {total_files}.")

    with open(output_path, 'w') as f:
        f.writelines(f"{path}\n" for path in transparent_image_paths)

if __name__ == "__main__":
    # Root of the dataset that main() walks recursively. This is an absolute,
    # machine-specific path; adjust it before running elsewhere.
    image_folder = "/workspace/train_data/finale-dataset"  
    main(image_folder)


Verifying Images: 100%|██████████| 13680/13680 [00:05<00:00, 2449.84it/s]


Verification complete. Found 3408 transparent images and deleted 0 error files out of 13680.


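## Proceed with cleaning: flatten transparency onto a white background

Run the "Interrogate" cell first so that `transparent_images.txt` is up to date. The outputs recorded below come from a different run of that cell (3408 images found above versus 4289 processed here).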

In [2]:
import os
import threading
from queue import Queue
from tqdm import tqdm
from PIL import Image, ImageFile

# Pillow setting: load truncated image files instead of raising when pixel data is read.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Pillow setting: None removes the pixel-count limit (the decompression-bomb check),
# so arbitrarily large images can be opened. Only appropriate for trusted datasets.
Image.MAX_IMAGE_PIXELS = None

def process_image(image_path):
    """Flatten a transparent image onto a white background, replacing the file.

    Images whose mode is ``RGBA`` or ``LA``, or whose ``info`` dict has a
    ``transparency`` entry, are converted to ``RGBA``, composited over an
    opaque white canvas, converted to ``RGB`` and written back to
    ``image_path``. Other images are left untouched.

    The result is first saved to a sibling temporary file with the same
    extension (so the output format is still chosen from the extension) and
    then moved over the original. A failure while saving therefore leaves the
    original file intact instead of truncated.

    Args:
        image_path: Path of the image to process; overwritten on success.

    Returns:
        None. Errors are reported with ``print`` and not raised.
    """
    try:
        with Image.open(image_path) as img:
            if not (img.mode in ('RGBA', 'LA') or 'transparency' in img.info):
                return

            rgba = img if img.mode == 'RGBA' else img.convert('RGBA')

            # Composite over an opaque white canvas, then drop the alpha channel.
            white_bg = Image.new('RGBA', rgba.size, 'WHITE')
            flattened = Image.alpha_composite(white_bg, rgba).convert('RGB')

        # The source handle is closed at this point, so replacing the file is
        # safe on platforms that refuse to replace an open file.
        stem, ext = os.path.splitext(image_path)
        tmp_path = f"{stem}.flatten-tmp{ext}"
        try:
            flattened.save(tmp_path)
            os.replace(tmp_path, image_path)
        finally:
            # After a successful replace the temp file no longer exists; this
            # only cleans up a partial file left behind by a failed save.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")

def worker(queue, pbar):
    """Drain ``queue``, flattening each image, until a ``None`` sentinel shows up.

    Every real path is handed to ``process_image``, acknowledged with
    ``task_done()`` and reflected on the progress bar. The sentinel is
    acknowledged as well, but does not advance the bar.

    Args:
        queue: Queue of image paths terminated by ``None``.
        pbar: Progress bar object exposing ``update(n)``.
    """
    image_path = queue.get()
    while image_path is not None:
        process_image(image_path)
        queue.task_done()
        pbar.update(1)
        image_path = queue.get()
    # Account for the sentinel that ended the loop.
    queue.task_done()

def main(file_path, num_worker_threads=10):
    """Flatten every image listed in ``file_path`` using a pool of worker threads.

    ``file_path`` is read as a text file with one image path per line.
    Surrounding whitespace is stripped and blank lines are skipped. The paths
    are fed to ``worker`` threads, and a summary is printed once all of them
    have been handled.

    Args:
        file_path: Text file containing one image path per line.
        num_worker_threads: Number of worker threads to start (at least 1).

    Raises:
        ValueError: If ``num_worker_threads`` is smaller than 1.
    """
    if num_worker_threads < 1:
        raise ValueError("num_worker_threads must be at least 1")

    with open(file_path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines (e.g. a trailing newline) are not paths; do not queue them.
        image_paths = [path for path in stripped_lines if path]

    total_files = len(image_paths)

    queue = Queue()
    threads = []

    with tqdm(total=total_files, desc="Processing Images") as pbar:
        try:
            for _ in range(num_worker_threads):
                t = threading.Thread(target=worker, args=(queue, pbar))
                t.start()
                threads.append(t)

            for image_path in image_paths:
                queue.put(image_path)

            queue.join()
        finally:
            # Always hand one sentinel to each started worker and wait for it,
            # even if feeding or joining was interrupted, so no thread is left
            # blocked on queue.get().
            for _ in threads:
                queue.put(None)
            for t in threads:
                t.join()

    print(f"Processing complete. Processed {total_files} images.")

if __name__ == "__main__":
    # Path list to process, one image path per line. The name matches the
    # transparent_images.txt file written by the "Interrogate" cell.
    txt_file = "transparent_images.txt"  
    main(txt_file)


Processing Images: 100%|██████████| 4289/4289 [07:39<00:00,  9.33it/s]

Processing complete. Processed 4289 images.



