In [8]:
import os
from PIL import Image
import imagehash
import matplotlib.pyplot as plt

IMAGE_DIR = r"D:\images\Darjeeling\Private\Special"
SIMILARITY_THRESHOLD = 3   # 0 = identical, 5–8 = very similar

def get_phash(image_path):
    img = Image.open(image_path).convert("RGB")
    return imagehash.phash(img)

def find_near_duplicates(folder):
    images = []
    hashes = []

    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            path = os.path.join(folder, file)
            images.append(path)
            hashes.append(get_phash(path))

    similar_pairs = []

    for i in range(len(hashes)):
        for j in range(i + 1, len(hashes)):
            distance = hashes[i] - hashes[j]
            if distance <= SIMILARITY_THRESHOLD:
                similar_pairs.append((images[i], images[j], distance))

    return similar_pairs

def show_similar_images(pairs):
    for img1, img2, dist in pairs:
        plt.figure(figsize=(6, 3))

        plt.subplot(1, 2, 1)
        plt.imshow(Image.open(img1))
        plt.title(os.path.basename(img1))
        plt.axis("off")

        plt.subplot(1, 2, 2)
        plt.imshow(Image.open(img2))
        plt.title(f"{os.path.basename(img2)}\nDistance: {dist}")
        plt.axis("off")

        plt.show()

if __name__ == "__main__":
    pairs = find_near_duplicates(IMAGE_DIR)

    if not pairs:
        print("No near-duplicate images found.")
    else:
        print("Near-duplicate image pairs:")
        for p in pairs:
            print(p[0], "<->", p[1], "Distance:", p[2])

        # show_similar_images(pairs)


Near-duplicate image pairs:
D:\images\Darjeeling\Private\Special\hello.jpg <-> D:\images\Darjeeling\Private\Special\IMG-20251216-WA0087.jpg Distance: 0
D:\images\Darjeeling\Private\Special\IMG-20251215-WA0041.jpg <-> D:\images\Darjeeling\Private\Special\IMG-20251215-WA0042.jpg Distance: 2
D:\images\Darjeeling\Private\Special\IMG-20251215-WA0041.jpg <-> D:\images\Darjeeling\Private\Special\IMG-20251215-WA0193.jpg Distance: 2
D:\images\Darjeeling\Private\Special\IMG-20251215-WA0041.jpg <-> D:\images\Darjeeling\Private\Special\IMG-20251215-WA0195.jpg Distance: 0
D:\images\Darjeeling\Private\Special\IMG-20251215-WA0042.jpg <-> D:\images\Darjeeling\Private\Special\IMG-20251215-WA0193.jpg Distance: 2
D:\images\Darjeeling\Private\Special\IMG-20251215-WA0042.jpg <-> D:\images\Darjeeling\Private\Special\IMG-20251215-WA0195.jpg Distance: 2
D:\images\Darjeeling\Private\Special\IMG-20251215-WA0162.jpg <-> D:\images\Darjeeling\Private\Special\IMG-20251215-WA0186.jpg Distance: 0
D:\images\Darjeeling

In [3]:
dir = IMAGE_DIR + "\\duplicate"
dir

'D:\\images\\Darjeeling\\duplicate'

In [9]:
import shutil

DUPLICATE_DIR = IMAGE_DIR + "\\duplicate"

os.makedirs(DUPLICATE_DIR, exist_ok=True)

def get_phash(image_path):
    return imagehash.phash(Image.open(image_path).convert("RGB"))

def move_duplicates(folder):
    hashes = {}
    
    for file in os.listdir(folder):
        if not file.lower().endswith((".jpg", ".jpeg", ".png")):
            continue

        path = os.path.join(folder, file)
        h = get_phash(path)

        if h in hashes:
            # distance == 0 → identical
            shutil.move(path, os.path.join(DUPLICATE_DIR, file))
            print(f"Moved duplicate: {file}")
        else:
            hashes[h] = path

if __name__ == "__main__":
    move_duplicates(IMAGE_DIR)
    print("Done.")


Moved duplicate: IMG-20251215-WA0186.jpg
Moved duplicate: IMG-20251215-WA0195.jpg
Moved duplicate: IMG-20251215-WA0209.jpg
Moved duplicate: IMG-20251215-WA0212.jpg
Moved duplicate: IMG-20251215-WA0216.jpg
Moved duplicate: IMG-20251215-WA0220.jpg
Moved duplicate: IMG-20251216-WA0045.jpg
Moved duplicate: IMG-20251216-WA0049.jpg
Moved duplicate: IMG-20251216-WA0087.jpg
Done.
