In [2]:
from PIL import Image
import imagehash
import os
import random
import shutil

# Root folder of the working dataset. It defaults to the original local path;
# set the DATASET_WORKING_DIR environment variable to run the notebook on
# another machine without editing this cell.
WORKING_DIR = os.environ.get('DATASET_WORKING_DIR', '/Users/alexchilton/Downloads/working')

# One directory per split. The functions in this notebook treat every
# sub-folder of a split directory as a class.
DIR_REAL_TRAIN = os.path.join(WORKING_DIR, 'train')
DIR_REAL_TEST = os.path.join(WORKING_DIR, 'test')
DIR_REAL_VALIDATION = os.path.join(WORKING_DIR, 'validation')

# NOTE(review): the triple-quoted string below is a bare string expression, so
# none of the code inside it is executed. It appears to hold a disabled
# image-resizing step. It refers to DIR_TRAIN, DIR_VALIDATION, DIR_TEST,
# OUTPUT_DIR and process_directory, none of which are defined in the visible
# part of this notebook; those definitions must exist before it is re-enabled.
'''
# Define paths 
input_dirs = {
    'train': DIR_TRAIN,
    'validation': DIR_VALIDATION,
    'test': DIR_TEST
}
output_dir = OUTPUT_DIR

# Create output directory and subdirectories if they don't exist
for subset in input_dirs.keys():
    subset_dir = os.path.join(output_dir, subset)
    if not os.path.exists(subset_dir):
        os.makedirs(subset_dir)


# Process each directory separately
for subset, dir_path in input_dirs.items():
    process_directory(dir_path, os.path.join(output_dir, subset))

print("Resizing and saving images completed.")
'''

def load_images_with_hashes(directory):
    """Compute an average hash for every image found under ``directory``.

    ``directory`` is scanned one level deep: each sub-folder is treated as a
    class, and each file in it whose name ends in .png, .jpg or .jpeg
    (case-insensitive) is opened, converted to RGB and hashed with
    ``imagehash.average_hash``.

    Args:
        directory: Path of the folder that contains one sub-folder per class.

    Returns:
        A dict mapping class name -> {file name -> image hash}. A class
        folder with no readable images maps to an empty dict. Entries of
        ``directory`` that are not folders are ignored.

    Images that cannot be opened or decoded are reported with ``print`` and
    skipped, so one corrupt file does not abort the scan.
    """
    images_with_hashes = {}
    for class_name in os.listdir(directory):
        class_path = os.path.join(directory, class_name)
        if not os.path.isdir(class_path):
            continue
        class_hashes = images_with_hashes[class_name] = {}
        for file_name in os.listdir(class_path):
            if not file_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            file_path = os.path.join(class_path, file_name)
            try:
                # The context manager releases the file handle even when
                # decoding fails part-way or the file has several frames.
                with Image.open(file_path) as img:
                    # Convert to RGB so every image is hashed in the same mode.
                    class_hashes[file_name] = imagehash.average_hash(img.convert('RGB'))
            except Exception as e:
                print(f"Error loading image {file_path}: {e}")
    return images_with_hashes

def find_duplicates(train_hashes, test_hashes):
    """Find training images whose hash also occurs in the test set.

    Only images of the same class are compared. Classes that exist in just
    one of the two mappings produce no duplicates.

    Args:
        train_hashes: Mapping class name -> {file name -> hash} for training.
        test_hashes: Mapping of the same shape for the test set. Hash values
            must be hashable and comparable with ``==``.

    Returns:
        A list of ``(class_name, train_file)`` tuples. Each training file is
        listed at most once, even when several test images share its hash,
        so the result can be passed to a delete step without trying to
        remove the same file twice. Within a class, entries follow the
        iteration order of the training mapping.
    """
    duplicates = []
    for class_name, test_images in test_hashes.items():
        train_images = train_hashes.get(class_name)
        if not train_images or not test_images:
            continue
        # One set per class turns the pairwise comparison into a single
        # membership test per training image.
        test_hash_set = set(test_images.values())
        duplicates.extend(
            (class_name, train_file)
            for train_file, train_hash in train_images.items()
            if train_hash in test_hash_set
        )
    return duplicates

def delete_duplicates(duplicates, train_directory):
    """Remove each listed duplicate from its class folder in the training set.

    ``duplicates`` is an iterable of ``(class_name, file_name)`` pairs; each
    pair is resolved relative to ``train_directory`` and the resulting file
    is removed. The outcome of every removal is printed, and a failure does
    not stop the remaining removals.
    """
    for label, name in duplicates:
        target = os.path.join(train_directory, label, name)
        try:
            os.remove(target)
            print(f"Deleted duplicate file: {target}")
        except Exception as err:
            print(f"Error deleting file {target}: {err}")

# Hash every image of the train split, then of the test split
train_hashes = load_images_with_hashes(directory=DIR_REAL_TRAIN)
test_hashes = load_images_with_hashes(directory=DIR_REAL_TEST)

# Collect the training files whose hash also appears in the test split
duplicates = find_duplicates(train_hashes=train_hashes, test_hashes=test_hashes)

# Remove those files from the training folder on disk
delete_duplicates(duplicates=duplicates, train_directory=DIR_REAL_TRAIN)

In [3]:
def delete_all_images(directory):
    """Empty every class sub-folder of ``directory``.

    Each sub-folder of ``directory`` is visited and every entry directly
    inside it is passed to ``os.remove``, whatever its extension. The
    sub-folders themselves and any files directly under ``directory`` are
    left alone. Each removal is printed; a failed removal is printed as an
    error and the loop moves on.
    """
    for label in os.listdir(directory):
        label_dir = os.path.join(directory, label)
        if not os.path.isdir(label_dir):
            continue
        for entry in os.listdir(label_dir):
            target = os.path.join(label_dir, entry)
            try:
                os.remove(target)
                print(f"Deleted file: {target}")
            except Exception as err:
                print(f"Error deleting file {target}: {err}")

# Empty every class folder of the validation split (destructive: files are removed from disk)
delete_all_images(directory=DIR_REAL_VALIDATION)

Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_10.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_8.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_9.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_2.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_3.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_1.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_4.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_5.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_7.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/apple/Image_6.JPG
Deleted file: /Users/alexchilton/Downloads/working/validation/turnip/Image_10.jpg
Deleted file: /Users/alexchilton/Downloads/working/validation/turnip/Image_8.jpg
Deleted file: /Users/alexchilton/Dow

In [4]:




def move_random_images(train_directory, validation_directory, num_images=10, seed=None):
    """Move a random sample of images per class from training to validation.

    Every sub-folder of ``train_directory`` is treated as a class. A folder
    with the same name is created under ``validation_directory`` if needed,
    and up to ``num_images`` files ending in .png, .jpg or .jpeg
    (case-insensitive) are moved into it.

    Args:
        train_directory: Folder holding one sub-folder per class.
        validation_directory: Destination folder; class sub-folders are
            created on demand.
        num_images: Maximum number of images to move per class. Classes with
            fewer images have all of them moved.
        seed: Optional seed. When given, the same directory contents and the
            same seed always select the same files. When ``None`` (the
            default), the module-level ``random`` state is used, as before.

    Returns:
        None. Every move is printed; a failed move is printed as an error
        and the remaining files are still processed.
    """
    # A private generator makes a seeded run reproducible without disturbing
    # the global random state that other cells may rely on.
    rng = random.Random(seed) if seed is not None else random

    # os.listdir returns entries in arbitrary order, so both listings are
    # sorted; otherwise a fixed seed could still pick different files.
    for class_name in sorted(os.listdir(train_directory)):
        train_class_path = os.path.join(train_directory, class_name)
        validation_class_path = os.path.join(validation_directory, class_name)

        if not os.path.isdir(train_class_path):
            continue

        # Ensure the validation class directory exists
        os.makedirs(validation_class_path, exist_ok=True)

        # List all images in the training class directory
        images = sorted(
            f for f in os.listdir(train_class_path)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )

        # Select random images to move (never more than are available)
        images_to_move = rng.sample(images, min(num_images, len(images)))

        for image in images_to_move:
            src_path = os.path.join(train_class_path, image)
            dest_path = os.path.join(validation_class_path, image)

            try:
                shutil.move(src_path, dest_path)
                print(f"Moved {src_path} to {dest_path}")
            except Exception as e:
                print(f"Error moving file {src_path} to {dest_path}: {e}")

# Take a random sample of 10 images per class out of train and place it in validation
move_random_images(
    train_directory=DIR_REAL_TRAIN,
    validation_directory=DIR_REAL_VALIDATION,
    num_images=10,
)

Moved /Users/alexchilton/Downloads/working/train/apple/Image_23.jpg to /Users/alexchilton/Downloads/working/validation/apple/Image_23.jpg
Moved /Users/alexchilton/Downloads/working/train/apple/Image_81.png to /Users/alexchilton/Downloads/working/validation/apple/Image_81.png
Moved /Users/alexchilton/Downloads/working/train/apple/Image_43.jpg to /Users/alexchilton/Downloads/working/validation/apple/Image_43.jpg
Moved /Users/alexchilton/Downloads/working/train/apple/Image_51.jpg to /Users/alexchilton/Downloads/working/validation/apple/Image_51.jpg
Moved /Users/alexchilton/Downloads/working/train/apple/Image_90.jpg to /Users/alexchilton/Downloads/working/validation/apple/Image_90.jpg
Moved /Users/alexchilton/Downloads/working/train/apple/Image_78.jpg to /Users/alexchilton/Downloads/working/validation/apple/Image_78.jpg
Moved /Users/alexchilton/Downloads/working/train/apple/Image_55.jpg to /Users/alexchilton/Downloads/working/validation/apple/Image_55.jpg
Moved /Users/alexchilton/Downloads