## Importing libraries necessary

In [1]:
import os
from concurrent.futures import ThreadPoolExecutor
import humanize
from multiprocessing import Pool
from PIL import Image
import concurrent.futures
import shutil
import pyarrow as pa

In [20]:
Data_directory_1 = "D:/L460"
Data_directory_2 = "D:/L461"
Data_directory_3 = "D:/dataset"

In [3]:
def delete_file(file):
    try:
        os.remove(file)
    except Exception as e:
        print(f"Error deleting file {file}: {e}")

## Deleting XML files :

In [4]:
def delete_xml_files(directory, max_threads=10):
    # Recursively search for all .xml files in the specified directory and its subdirectories
    xml_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(".xml"):
                xml_files.append(os.path.join(root, file))

    # Use a thread pool to delete the files with a maximum number of threads
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        for file in xml_files:
            executor.submit(delete_file, file)

    print(f"All {len(xml_files)} .xml files have been deleted from {directory} and its subdirectories.")
    # Recursively delete files in subdirectories
    for root, dirs, files in os.walk(directory):
        for dir in dirs:
            delete_xml_files(os.path.join(root, dir), max_threads)

## Deleting unwanted images :

In [5]:
def delete_unwanted_images(directory, max_threads=10):
    # Recursively search for all non-xml files in the specified directory and its subdirectories
    view_files = []
    stats_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if "VIEW-1_DIFFUSE.JPG" in file:
                view_files.append(os.path.join(root, file))
            elif "STATS.JPG" in file:
                stats_files.append(os.path.join(root, file))

    # Use a thread pool to delete the files with a maximum number of threads
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        for file in view_files:
            executor.submit(delete_file, file)
        for file in stats_files:
            executor.submit(delete_file,file)

    print(f"All {len(view_files)} files containing 'VIEW-1_DIFFUSE.JPG' and {len(stats_files)} files containing 'STATS.JPG' in their name have been deleted from {directory} and its subdirectories.")
    
    # Recursively delete files in subdirectories
    for root, dirs, files in os.walk(directory):
        for dir in dirs:
            delete_unwanted_images(os.path.join(root, dir), max_threads)

## Number of images :

In [6]:
def get_file_paths(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".JPG"):
                yield os.path.join(root, file)

def calculate_stats(directory):
    images_count = 0
    total_size = 0

    # Use a generator to avoid loading all file paths into memory at once
    file_paths = get_file_paths(directory)

    # Use multiprocessing to parallelize the calculation of file sizes
    with Pool() as pool:
        sizes = pool.map(os.path.getsize, file_paths)
        total_size = sum(sizes)
        images_count = len(sizes)

    # Use humanize to format the total size in a human-readable format
    total_size_humanized = humanize.naturalsize(total_size, binary=True)

    print(f"Number of images: {images_count}")
    print(f"Total size of directory {directory}: {total_size_humanized}")

## Cheking if the size of the images is the same :

In [16]:
def check_images_size(directory):
    sizes = {}
    image_paths = []

    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(".jpg"):
                image_paths.append(os.path.join(root, file))

    def get_image_size(path):
        with Image.open(path) as img:
            return img.size

    with ThreadPoolExecutor() as executor:
        future_to_path = {executor.submit(get_image_size, path): path for path in image_paths}
        for future in concurrent.futures.as_completed(future_to_path):
            path = future_to_path[future]
            try:
                size = future.result()
                sizes[path] = size
            except Exception as e:
                print(f"Error getting size for image {path}: {e}")

    # Check if all image sizes are the same
    sizes_set = set(sizes.values())
    if len(sizes_set) == 1:
        print(f"All images in {directory} have size {sizes_set.pop()}")
    else:
        print(f"Images in {directory} have different sizes: {sizes}")


In [7]:
"""def copy_file(src, dst):
    # Open source file in read mode using PyArrow
    with pa.memory_map(src, mode="r") as mmap:
        # Create or open the destination file in write mode using shutil
        with open(dst, mode="wb") as out_file:
            # Write the content of source file to destination file using shutil
            out_file.write(mmap.read_all())


def copy_files(directory, destination):
    pass_folders = []
    fail_folders = []

    for root, dirs, files in os.walk(directory):
        for dir in dirs:
            if dir.lower() == "pass":
                pass_folders.append(os.path.join(root, dir))
            elif dir.lower() == "fail":
                fail_folders.append(os.path.join(root, dir))

    if not os.path.exists(destination):
        os.makedirs(destination)

    pass_destination = os.path.join(destination, "pass")
    if not os.path.exists(pass_destination):
        os.makedirs(pass_destination)

    fail_destination = os.path.join(destination, "fail")
    if not os.path.exists(fail_destination):
        os.makedirs(fail_destination)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Copy pass files
        pass_futures = []
        for folder in pass_folders:
            for file in os.listdir(folder):
                src = os.path.join(folder, file)
                dst = os.path.join(pass_destination, file)
                pass_futures.append(executor.submit(copy_file, src, dst))

        for future in concurrent.futures.as_completed(pass_futures):
            future.result()

        # Copy fail files
        fail_futures = []
        for folder in fail_folders:
            for file in os.listdir(folder):
                src = os.path.join(folder, file)
                dst = os.path.join(fail_destination, file)
                fail_futures.append(executor.submit(copy_file, src, dst))

        for future in concurrent.futures.as_completed(fail_futures):
            future.result()

    print(f"Copied {len(os.listdir(pass_destination))} pass files from {directory} to {pass_destination}")
    print(f"Copied {len(os.listdir(fail_destination))} fail files from {directory} to {fail_destination}")"""

'def copy_file(src, dst):\n    # Open source file in read mode using PyArrow\n    with pa.memory_map(src, mode="r") as mmap:\n        # Create or open the destination file in write mode using shutil\n        with open(dst, mode="wb") as out_file:\n            # Write the content of source file to destination file using shutil\n            out_file.write(mmap.read_all())\n\n\ndef copy_files(directory, destination):\n    pass_folders = []\n    fail_folders = []\n\n    for root, dirs, files in os.walk(directory):\n        for dir in dirs:\n            if dir.lower() == "pass":\n                pass_folders.append(os.path.join(root, dir))\n            elif dir.lower() == "fail":\n                fail_folders.append(os.path.join(root, dir))\n\n    if not os.path.exists(destination):\n        os.makedirs(destination)\n\n    pass_destination = os.path.join(destination, "pass")\n    if not os.path.exists(pass_destination):\n        os.makedirs(pass_destination)\n\n    fail_destination = os.pat

## Applying preparation :

In [8]:
calculate_stats(Data_directory_1)

Number of images: 290033
Total size of directory D:/L460: 206.1 GiB


In [9]:
calculate_stats(Data_directory_2)

Number of images: 155524
Total size of directory D:/L461: 124.5 GiB


In [10]:
delete_xml_files(Data_directory_1,max_threads=5)

All 242356 .xml files have been deleted from D:/L460 and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1 and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E\ProcRes and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E\ProcRes\2022_52 and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E\ProcRes\2022_52\Fail and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E\ProcRes\2022_52\Pass and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E\ProcRes\2023_01 and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E\ProcRes\2023_01\Fail and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E\ProcRes\2023_01\Pass and its subdirectories.
All 0 .xml files have been deleted from D:/L460\Line1\D-E\ProcRes

In [11]:
delete_xml_files(Data_directory_2,max_threads=5)

All 148954 .xml files have been deleted from D:/L461 and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1 and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E\ProcRes and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E\ProcRes\2022_41 and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E\ProcRes\2022_41\Fail and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E\ProcRes\2022_41\Pass and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E\ProcRes\2022_42 and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E\ProcRes\2022_42\Fail and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E\ProcRes\2022_42\Pass and its subdirectories.
All 0 .xml files have been deleted from D:/L461\Line1\D-E\ProcRes

In [12]:
delete_unwanted_images(Data_directory_1,max_threads=5)

All 101794 files containing 'VIEW-1_DIFFUSE.JPG' and 86445 files containing 'STATS.JPG' in their name have been deleted from D:/L460 and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L460\Line1 and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L460\Line1\D-E and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L460\Line1\D-E\ProcRes and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L460\Line1\D-E\ProcRes\2022_52 and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L460\Line1\D-E\ProcRes\2022_52\Fail and its subdirectories.
All 0 files 

In [13]:
delete_unwanted_images(Data_directory_2,max_threads=5)

All 63662 files containing 'VIEW-1_DIFFUSE.JPG' and 28200 files containing 'STATS.JPG' in their name have been deleted from D:/L461 and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L461\Line1 and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L461\Line1\D-E and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L461\Line1\D-E\ProcRes and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L461\Line1\D-E\ProcRes\2022_41 and its subdirectories.
All 0 files containing 'VIEW-1_DIFFUSE.JPG' and 0 files containing 'STATS.JPG' in their name have been deleted from D:/L461\Line1\D-E\ProcRes\2022_41\Fail and its subdirectories.
All 0 files c

In [14]:
calculate_stats(Data_directory_1)

Number of images: 101794
Total size of directory D:/L460: 27.7 GiB


In [15]:
calculate_stats(Data_directory_2)

Number of images: 63662
Total size of directory D:/L461: 21.7 GiB
