In [5]:
import cv2
import numpy as np
from PIL import Image
import os

In [None]:
# Check if two images are similar using ORB feature matching
def are_images_similar(img1_path, img2_path, threshold=25, min_matches=10):
    """Decide whether two image files are similar using ORB feature matching.

    Both files are loaded as grayscale images, ORB descriptors are computed
    for each, and the descriptors are matched with a cross-checked
    brute-force Hamming matcher. A match counts as "good" when its distance
    is strictly below ``threshold``; the images are reported as similar when
    there are strictly more than ``min_matches`` good matches.

    Args:
        img1_path: Path of the first image file.
        img2_path: Path of the second image file.
        threshold: Exclusive upper bound on the distance of a good match.
        min_matches: Number of good matches that must be exceeded for the
            pair to be reported as similar.

    Returns:
        A tuple ``(is_similar, good_matches)``. ``good_matches`` is the list
        of good matches sorted by ascending distance. ``(False, [])`` is
        returned when either image cannot be loaded or yields no descriptors.
    """
    img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
    img2 = cv2.imread(img2_path, cv2.IMREAD_GRAYSCALE)

    # cv2.imread signals failure by returning None instead of raising.
    if img1 is None or img2 is None:
        print("Error: One or both images could not be loaded.")
        return False, []

    orb = cv2.ORB_create()
    _, des1 = orb.detectAndCompute(img1, None)
    _, des2 = orb.detectAndCompute(img2, None)
    # No descriptors means there is nothing to match against.
    if des1 is None or des2 is None:
        return False, []

    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    # Filter first, then sort: same resulting order (the sort is stable),
    # but only the surviving matches have to be sorted.
    good_matches = sorted(
        (m for m in matcher.match(des1, des2) if m.distance < threshold),
        key=lambda m: m.distance,
    )

    return len(good_matches) > min_matches, good_matches

# Check the whole directory for similar images
def find_similar_images(directory, extensions=('.jpg', '.jpeg', '.png')):
    """Group the image files of ``directory`` by visual similarity.

    File names are filtered by extension (case-insensitively) and processed
    in sorted order so the result is reproducible. Grouping is greedy: each
    not-yet-grouped image becomes the anchor of a new group and is compared
    with every later not-yet-grouped image via ``are_images_similar``; an
    image that matches is added to the anchor's group and never compared
    again. Progress and every detected pair are printed.

    Args:
        directory: Directory whose direct entries are scanned.
        extensions: File-name suffix, or iterable of suffixes, to treat as
            images. Matching ignores case.

    Returns:
        A list of groups; each group is a list of file names (not full
        paths) with at least two entries, the anchor first. Images without
        any similar partner are not included.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    # os.listdir returns entries in arbitrary order; sort for determinism.
    image_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(suffixes)
    )
    total = len(image_files)

    similar_groups = []
    checked_images = set()

    for i, anchor in enumerate(image_files):
        print(f"Checking image {i + 1}/{total}")
        if anchor in checked_images:
            continue

        # Loop-invariant for the inner comparison loop.
        anchor_path = os.path.join(directory, anchor)
        current_group = [anchor]

        for candidate in image_files[i + 1:]:
            if candidate in checked_images:
                continue
            candidate_path = os.path.join(directory, candidate)
            result, _ = are_images_similar(anchor_path, candidate_path)
            if result:
                current_group.append(candidate)
                checked_images.add(candidate)
                print(f"Similar images: {anchor} and {candidate}")

        if len(current_group) > 1:
            similar_groups.append(current_group)
        checked_images.add(anchor)

    return similar_groups

def get_image_size(image_name):
    """Return the pixel count (width * height) of the image at ``image_name``."""
    with Image.open(image_name) as picture:
        width, height = picture.size
    return width * height

In [None]:
# Run the similarity search. The result, similar_images, is a list of groups shaped like
# [[fig1, fig2, ...], [fig3, fig4, ...], ...]; every inner list holds images that were
# found similar to the first image of that list.

directory = 'your-imgages-dir'  # e.g. '/home/user/dataset/class1','/home/user/dataset/class2',...
similar_images = find_similar_images(directory)
print("Similar images groups:", similar_images)

# Persist the groups to a text file: one line per group, names separated by ", "
output_file = "your-output-file.txt"  # e.g. 'similar_images.txt'
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(", ".join(group) + "\n" for group in similar_images)

print(f"Similar images groups have been saved to {output_file}")

In [None]:
# For every group in the in-memory `similar_images` list, keep only the image with the
# largest pixel count (width * height) and delete the other files of the group.
# WARNING: os.remove is irreversible - review `similar_images` before running this cell.

image_directory = r"your-imgages-dir"  # e.g. '/home/user/dataset/class1','/home/user/dataset/class2',...

# Build full paths, skipping files that no longer exist so that re-running this cell
# after a previous (partial) deletion does not crash in get_image_size.
images = [
    [
        path
        for path in (os.path.join(image_directory, image) for image in image_list)
        if os.path.exists(path)
    ]
    for image_list in similar_images
]

# The largest image of each non-empty group is the one that survives.
max_images = [max(image_list, key=get_image_size) for image_list in images if image_list]

# Set membership keeps the "is this a keeper?" test constant-time per file.
images_to_keep = set(max_images)
removed_count = 0
for image_list in images:
    for image in image_list:
        if image not in images_to_keep:
            os.remove(image)
            removed_count += 1

print(f"Kept {len(max_images)} image(s), removed {removed_count} duplicate(s)")
