In [2]:
import os
import hashlib
import shutil

In [3]:
def find_images(directory, extensions=('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
    """Recursively collect the paths of image files under ``directory``.

    Args:
        directory: Root folder to walk; every sub-folder is visited.
        extensions: File-name suffixes (including the leading dot) that count
            as images. May be a single string or any iterable of strings.
            Matching is case-insensitive. Defaults to common raster formats.

    Returns:
        A list of paths (the walked folder joined with the file name), in the
        order ``os.walk`` yields them. Empty when nothing matches.
    """
    # str.endswith accepts only a str or a tuple of str, so normalise the
    # argument once instead of on every file.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    image_files = []
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            # Lower-case the name so "IMG_1.JPG" matches ".jpg".
            if filename.lower().endswith(suffixes):
                image_files.append(os.path.join(dirpath, filename))
    return image_files

# Step 2: Function to generate image hashes
def hash_image(filepath, chunk_size=1024 * 1024):
    """Return the MD5 hex digest of a file's raw bytes.

    The digest is computed over the bytes on disk, so two files compare equal
    only when they are byte-for-byte identical. MD5 is used here purely as a
    content fingerprint, not for security.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes read per iteration. Reading in chunks keeps
            memory use bounded regardless of file size. Must be positive.

    Returns:
        The 32-character hexadecimal MD5 digest.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b'' immediately, which would silently hash nothing.
        raise ValueError("chunk_size must be a positive integer")

    hasher = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter(callable, sentinel) keeps calling read() until it returns b''.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

# Step 3: Function to compare image hashes and find duplicates
def find_duplicates(image_files):
    """Pair every repeated file with the first file that had the same hash.

    Args:
        image_files: Iterable of file paths; each one is hashed with
            ``hash_image``.

    Returns:
        A list of ``(duplicate_path, first_seen_path)`` tuples, in the order
        the duplicates were encountered. The first file seen for a given hash
        is treated as the original and never appears as a duplicate.
    """
    first_seen = {}  # digest -> path of the first file that produced it
    repeated = []
    for path in image_files:
        digest = hash_image(path)
        if digest not in first_seen:
            first_seen[digest] = path
            continue
        repeated.append((path, first_seen[digest]))
    return repeated

# Step 4: Function to move duplicate images into a review folder (they are not deleted)
def delete_duplicates(duplicates, dest_dir="./duplicates/"):
    """Move each duplicate file into a review folder and print what was moved.

    Despite the name, nothing is deleted: files are relocated to ``dest_dir``
    so they can be checked by hand before being removed for good.

    Args:
        duplicates: Iterable of ``(duplicate_path, original_path)`` pairs, as
            returned by ``find_duplicates``. Only the duplicate is moved; the
            original path is used for the printed report.
        dest_dir: Folder that receives the duplicates. Created if missing.
            A relative path is resolved against the current working directory.

    Raises:
        OSError: If the folder cannot be created or a file cannot be moved.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dest_dir, exist_ok=True)
    for duplicate, original in duplicates:
        print(f"Duplicate: {duplicate} -> Original: {original}")
        # Duplicates from different folders can share a file name, and
        # shutil.move raises if that name already exists inside the target
        # directory. Pick a free name ("pic_1.jpg", "pic_2.jpg", ...) first.
        stem, ext = os.path.splitext(os.path.basename(duplicate))
        target = os.path.join(dest_dir, stem + ext)
        attempt = 0
        while os.path.exists(target):
            attempt += 1
            target = os.path.join(dest_dir, f"{stem}_{attempt}{ext}")
        # To delete permanently instead of keeping the file for review,
        # replace the move below with os.remove(duplicate).
        shutil.move(duplicate, target)



In [5]:

# Execute the full pipeline: scan for images -> hash and compare -> move duplicates aside.
# NOTE(review): `directory` is a machine-specific absolute path; change it before
# running this notebook on another machine.
# WARNING: delete_duplicates() moves files on disk into ./duplicates/ (relative to
# the kernel's working directory), so re-running this cell is not side-effect free.
directory = "/Users/bab226/Pictures/"
image_files = find_images(directory)
duplicates = find_duplicates(image_files)
delete_duplicates(duplicates)

In [6]:
# Show the root folder that was passed to find_images().
directory

'/Users/bab226/Pictures/'

In [8]:
# Display every path collected by find_images(); the output grows with the size of the scanned folder.
image_files

['/Users/bab226/Pictures/bryan_photos/20230409_081759.jpg',
 '/Users/bab226/Pictures/bryan_photos/IMG_3795.JPG',
 '/Users/bab226/Pictures/bryan_photos/20221028_140514.jpg',
 '/Users/bab226/Pictures/bryan_photos/20221220_153348.jpg',
 '/Users/bab226/Pictures/bryan_photos/20221021_174820.jpg',
 '/Users/bab226/Pictures/bryan_photos/20230421_163457.jpg',
 '/Users/bab226/Pictures/bryan_photos/20230423_084245.jpg',
 '/Users/bab226/Pictures/conor_photos/20221217_131905.jpg',
 '/Users/bab226/Pictures/conor_photos/20221219_165350.jpg',
 '/Users/bab226/Pictures/conor_photos/20221218_193955.jpg',
 '/Users/bab226/Pictures/conor_photos/20221217_131940.jpg',
 '/Users/bab226/Pictures/stephanie_photos/20221015_191458.jpg',
 '/Users/bab226/Pictures/stephanie_photos/20221018_190200.jpg',
 '/Users/bab226/Pictures/stephanie_photos/20221015_191453.jpg',
 '/Users/bab226/Pictures/stephanie_photos/20221028_133118.jpg',
 '/Users/bab226/Pictures/stephanie_photos/20230128_151550.jpg',
 '/Users/bab226/Pictures/st

In [10]:
# Rebuild the digest -> first-seen-path table at notebook scope so it can be
# inspected; find_duplicates() builds the same table but only returns the pairs.
hashes = {}
duplicates = []
for filepath in image_files:
    # image_files was collected before delete_duplicates() moved files away, so
    # some of these paths may no longer exist. Skip them rather than letting
    # hash_image() fail with FileNotFoundError part-way through the loop.
    if not os.path.isfile(filepath):
        continue
    image_hash = hash_image(filepath)
    if image_hash in hashes:
        duplicates.append((filepath, hashes[image_hash]))
    else:
        hashes[image_hash] = filepath

In [13]:
# Inspect the digest -> first-seen file path table built by the loop above.
hashes

{'09e8e2c74cb29db0f421982fcdec362e': '/Users/bab226/Pictures/bryan_photos/20230409_081759.jpg',
 'eccd05b005438bc5612f5d8fb2e7d7a8': '/Users/bab226/Pictures/bryan_photos/IMG_3795.JPG',
 '1e789686bcd453c881b7bad3c7a6d7f4': '/Users/bab226/Pictures/bryan_photos/20221028_140514.jpg',
 '25a55aed1227faf53088c406c4844025': '/Users/bab226/Pictures/bryan_photos/20221220_153348.jpg',
 '5e3f202e386f2153fd46e0c3b81ad6ae': '/Users/bab226/Pictures/bryan_photos/20221021_174820.jpg',
 '35475a5b5478922d4693d63f4ec303eb': '/Users/bab226/Pictures/bryan_photos/20230421_163457.jpg',
 'f3ceb858bd148b9f452b46ca8bc9465b': '/Users/bab226/Pictures/bryan_photos/20230423_084245.jpg',
 '3f15ffc58d95bea1e1fc9d8d6ccb61ff': '/Users/bab226/Pictures/conor_photos/20221217_131905.jpg',
 '9527b0cdb7ddac63c2b791aa196bf6b2': '/Users/bab226/Pictures/conor_photos/20221219_165350.jpg',
 '5d864b026bafd025ae2c589b8a15c05c': '/Users/bab226/Pictures/conor_photos/20221218_193955.jpg',
 'c229fed99379e6123b6dfdfb2dc14bf0': '/Users/ba