In [2]:
# counting the number of image files in a folder

from pathlib import Path

def count_images_in_folder(folder_path):
    """Count the .jpg and .png files under a folder, including subfolders.

    Only file entries are counted; a directory that happens to be named
    like ``something.jpg`` is not. Suffix matching follows the platform's
    file-name case rules (case-sensitive on POSIX), so ``.JPG`` or ``.jpeg``
    files are not counted on Linux.

    Args:
        folder_path: Folder to scan, as a string or ``Path``.

    Returns:
        int: Number of matching files, or 0 if ``folder_path`` is not an
        existing directory.

    Side effects:
        Prints the count, or an "Invalid folder path" message.
    """
    # Local import so the function does not depend on another cell's imports.
    import os

    image_suffixes = (".jpg", ".png")
    image_count = 0

    # Convert the folder_path to a Path object
    folder = Path(folder_path)

    # is_dir() is already False for a path that does not exist.
    if folder.is_dir():
        # Walk the tree once and check both suffixes per entry, instead of
        # one full traversal per extension. os.walk reports directories
        # separately, so only file entries are inspected here.
        for _root, _subdirs, filenames in os.walk(folder):
            image_count += sum(
                os.path.normcase(name).endswith(image_suffixes)
                for name in filenames
            )

        print(f"Number of image files in {folder}: {image_count}")
    else:
        print(f"Invalid folder path: {folder_path}")

    return image_count




In [5]:
# Count the .jpg/.png files under the CPICS_Validated dataset root.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
count_images_in_folder("/home/hk-project-p0021769/hgf_grc7525/data/with_labels/CPICS_Validated")

Number of image files in /home/hk-project-p0021769/hgf_grc7525/data/with_labels/CPICS_Validated: 1517303


1517303

# getting all the images in a folder

In [None]:
# adding the image files to a set
def get_image_files(directory):
    """Collect the names of all .jpg and .png files under a directory.

    The directory is searched recursively. Only base names are stored (no
    directory part), so files with the same name in different subfolders
    collapse into a single entry.

    Args:
        directory: Directory to scan, as a string or ``Path``.

    Returns:
        set[str]: Base names of the matching files. The set is empty when
        nothing matches or the directory does not exist.
    """
    # Local import so the function does not depend on another cell's imports.
    import os

    image_suffixes = (".jpg", ".png")

    # Initialize a set to store image file names
    image_files = set()

    # Walk the tree once for both suffixes. os.walk accepts str and Path
    # alike and lists directories separately, so a directory named like
    # "x.jpg" never ends up in the result.
    for _root, _subdirs, filenames in os.walk(directory):
        image_files.update(
            name
            for name in filenames
            if os.path.normcase(name).endswith(image_suffixes)
        )

    return image_files


# checking whether any images overlap between LRaw and the other labelled folders in CPICS

In [8]:
# with flag to check if there are common images between LRaw and other folders
from pathlib import Path

def compare_lraw_with_others(base_directory):
    """Report overlaps between each dataset's LRaw folder and its sibling folders.

    Every subdirectory of ``base_directory`` is treated as one dataset. For a
    dataset that contains an ``LRaw`` folder, the image names returned by
    ``get_image_files`` for ``LRaw`` are intersected with those of every other
    subfolder of the dataset, and each non-empty intersection is printed.

    Args:
        base_directory: Directory whose subdirectories are the datasets.

    Returns:
        None. All results are printed.
    """
    for dataset_dir in Path(base_directory).iterdir():
        # Entries that are not directories are skipped silently.
        if not dataset_dir.is_dir():
            continue
        print(f"Processing directory: {dataset_dir.name}")

        lraw_dir = dataset_dir / "LRaw"
        if not (lraw_dir.exists() and lraw_dir.is_dir()):
            print(f"No LRaw folder found in {dataset_dir.name}.")
            continue
        print(f"Found LRaw folder in {dataset_dir.name}")

        # Reference set that every sibling folder is compared against.
        lraw_names = get_image_files(lraw_dir)

        # Becomes True once any sibling shares at least one name with LRaw.
        overlap_seen = False

        for sibling in dataset_dir.iterdir():
            # Only sibling directories are compared; LRaw itself is excluded.
            if not sibling.is_dir() or sibling.name == "LRaw":
                continue

            shared_names = lraw_names & get_image_files(sibling)
            if not shared_names:
                continue

            overlap_seen = True
            print(f"Found {len(shared_names)} common images between LRaw and {sibling.name}:")
            for shared_name in sorted(shared_names):
                print(f"  {shared_name}")

        # One summary line per dataset when nothing overlapped at all.
        if not overlap_seen:
            print(f"No common images between LRaw and any other subfolders in {dataset_dir.name}.")
                
# Run the LRaw overlap check on the validated CPICS dataset root.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
base_directory = "/home/hk-project-p0021769/hgf_grc7525/data/with_labels/CPICS_Validated"  
# Alternative demo-data path, kept commented out:
#base_directory = '/home/hk-project-p0021769/hgf_diz2155/repos/Masterproject-plankton-dinov2/dinov2/data/datasets/demo_data_for_test'
compare_lraw_with_others(base_directory)


Processing directory: 20141008-14
Found LRaw folder in 20141008-14
No common images between LRaw and any other subfolders in 20141008-14.
Processing directory: 20151008-31
Found LRaw folder in 20151008-31
No common images between LRaw and any other subfolders in 20151008-31.
Processing directory: 20141001-07
Found LRaw folder in 20141001-07
No common images between LRaw and any other subfolders in 20141001-07.
Processing directory: 20151101-12
Found LRaw folder in 20151101-12
No common images between LRaw and any other subfolders in 20151101-12.
Processing directory: 20150101-08
Found LRaw folder in 20150101-08
No common images between LRaw and any other subfolders in 20150101-08.
Processing directory: 20141101-30
Found LRaw folder in 20141101-30
No common images between LRaw and any other subfolders in 20141101-30.
Processing directory: 20141015-31
Found LRaw folder in 20141015-31
No common images between LRaw and any other subfolders in 20141015-31.
Processing directory: 20151113-30


### COMMENT: LRaw and the other labelled folders don't have any overlapping images in CPICS