In [None]:
import os
import time
import requests
from pathlib import Path
from duckduckgo_search import DDGS
from PIL import Image

# List of fruits to be downloaded: one entry per class folder of the training set.
# Only directories are kept so that stray files (e.g. ".DS_Store") are not turned
# into search queries; sorting makes the download order reproducible.
fruits = sorted(
    entry.name for entry in os.scandir("./fruits/train/") if entry.is_dir()
)

# Base directory for image storage
base_path = Path("./fruits/predict")

# Function to download and save images
def download_images(dest, query, max_results=10):
    """Search DuckDuckGo for images matching ``query`` and save them as JPEGs.

    Args:
        dest: Directory (``pathlib.Path``) the images are written into.
        query: Search phrase. With spaces replaced by underscores it is also
            used as the file-name prefix of the saved images.
        max_results: Maximum number of search results to request.

    Returns:
        List with the path of every image that was saved successfully.
        A failing download is reported on stdout and skipped.
    """
    from io import BytesIO  # local import: only this function needs it

    images = []

    with DDGS() as ddgs:
        results = list(ddgs.images(query, max_results=max_results))

    file_stem = query.replace(" ", "_")
    for idx, result in enumerate(results):
        img_url = result["image"]
        try:
            response = requests.get(img_url, timeout=10)
            response.raise_for_status()

            # Decode the bytes fetched above instead of requesting the URL a
            # second time (the old second request had no timeout, no status
            # check and read the undecoded raw stream).
            with Image.open(BytesIO(response.content)) as downloaded:
                img = downloaded.convert("RGB")
            img_path = dest / f"{file_stem}_{idx}.jpg"
            img.save(img_path, "JPEG")

            images.append(img_path)
            print(f"[✔] Downloaded {query}: {img_url}")
        except Exception as e:
            # Best effort: one bad URL must not abort the whole batch.
            print(f"[✘] Failed to download {img_url}: {e}")

    return images

# Function to resize images
def resize_images(folder, max_size=400):
    """Shrink every ``*.jpg`` directly inside ``folder`` in place.

    Each image is fitted into a ``max_size`` x ``max_size`` box with
    ``Image.thumbnail`` and written back to the same path as JPEG.
    Failures are reported on stdout and do not stop the loop.
    """
    bounding_box = (max_size, max_size)
    for jpg_path in folder.glob("*.jpg"):
        try:
            picture = Image.open(jpg_path)
            picture.thumbnail(bounding_box)
            picture.save(jpg_path, "JPEG")
            print(f"[✔] Resized: {jpg_path}")
        except Exception as err:
            print(f"[✘] Failed to resize {jpg_path}: {err}")

# Create the base directory if it doesn't exist
base_path.mkdir(parents=True, exist_ok=True)

# Download the images of every fruit first
for fruit in fruits:
    images = download_images(base_path, query=f"{fruit} fruit", max_results=10)

# Resize once, after all downloads have finished. Resizing inside the loop
# re-opened and re-encoded every previously downloaded JPEG on each iteration,
# which degrades image quality and makes the work quadratic.
resize_images(base_path, max_size=400)

print("✅ All images downloaded and resized!")


In [None]:
import os
import hashlib
from PIL import Image, UnidentifiedImageError
import numpy as np

# Define the paths to the two dataset folders cleaned by this cell.
# NOTE(review): `val_data_path` points at the "test" folder, not a "val" folder —
# confirm that this is the intended directory.
train_data_path = './webscrap-fruits/train/'
val_data_path = './webscrap-fruits/test/'

# Function to remove duplicate images
def remove_duplicates(folder_path):
    """Delete byte-identical image files found anywhere under ``folder_path``.

    Files are compared by the MD5 digest of their content. Directories and
    file names are visited in sorted order, so within each group of identical
    files the first path in that order is kept and the others are deleted.
    Only names ending in ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive)
    are considered; everything else is left untouched.

    Args:
        folder_path: Root directory that is scanned recursively.

    Returns:
        None. One line is printed for every file that is removed.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    seen_hashes = set()
    duplicates = []
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place, so os.walk descends in a deterministic order
        for filename in sorted(files):
            # Lower-casing catches ".JPG"; the leading dot rejects "notjpg".
            if not filename.lower().endswith(image_suffixes):
                continue
            file_path = os.path.join(root, filename)
            with open(file_path, 'rb') as f:
                filehash = hashlib.md5(f.read()).hexdigest()
            if filehash in seen_hashes:
                duplicates.append(file_path)
            else:
                seen_hashes.add(filehash)
    # Delete only after the scan so the walk never sees a changing tree.
    for duplicate in duplicates:
        os.remove(duplicate)
        print(f"Removed duplicate: {duplicate}")

# Function to resize images
def resize_images(folder_path, target_size=(100, 100)):
    """Resize every image under ``folder_path`` to ``target_size``, in place.

    Each file whose name ends in ``.jpg``, ``.jpeg`` or ``.png``
    (case-insensitive) is converted to RGB, resized with LANCZOS resampling
    and written back to the same path as JPEG data. Files Pillow cannot
    identify as images are deleted.

    Note: the file name is not changed, so a ``.png`` file holds JPEG data
    after this function has run.

    Args:
        folder_path: Root directory that is scanned recursively.
        target_size: ``(width, height)`` of the resized images.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    for root, _, files in os.walk(folder_path):
        for filename in files:
            # Lower-casing catches ".JPG"; the leading dot rejects "notjpg".
            if not filename.lower().endswith(image_suffixes):
                continue
            file_path = os.path.join(root, filename)
            try:
                # Close the source handle before the same path is overwritten.
                with Image.open(file_path) as source:
                    resized = source.convert("RGB").resize(target_size, Image.LANCZOS)
                resized.save(file_path, "JPEG")
                print(f"Resized image: {file_path}")
            except UnidentifiedImageError:
                print(f"Cannot identify image file: {file_path}")
                os.remove(file_path)

# Clean the training folder first, then the validation folder:
# drop duplicate files, then resize what is left.
for dataset_path in (train_data_path, val_data_path):
    remove_duplicates(dataset_path)
    resize_images(dataset_path)

print("Dataset cleaning completed.")

In [None]:
import os
import shutil

# Define the paths to the training and validation datasets
train_data_path = './fruits/train/'
val_data_path = './fruits/val/'

# Merge every class folder of the validation set into the training set
for fruit_folder in os.listdir(val_data_path):
    val_fruit_path = os.path.join(val_data_path, fruit_folder)
    train_fruit_path = os.path.join(train_data_path, fruit_folder)

    # Stray files directly under val/ (e.g. ".DS_Store") are not class folders.
    # They are skipped here and disappear with the val folder below.
    if not os.path.isdir(val_fruit_path):
        continue

    # Create the train fruit folder if it doesn't exist
    os.makedirs(train_fruit_path, exist_ok=True)

    # Move all images from val fruit folder to train fruit folder
    for image in os.listdir(val_fruit_path):
        src_path = os.path.join(val_fruit_path, image)
        dest_path = os.path.join(train_fruit_path, image)

        # Never overwrite a training image that happens to share the name:
        # pick a free "<stem>_val<N><ext>" name instead.
        if os.path.exists(dest_path):
            stem, ext = os.path.splitext(image)
            suffix = 1
            while os.path.exists(dest_path):
                dest_path = os.path.join(train_fruit_path, f"{stem}_val{suffix}{ext}")
                suffix += 1

        shutil.move(src_path, dest_path)
        print(f"Moved {image} from {val_fruit_path} to {train_fruit_path}")

# Delete the val folder
shutil.rmtree(val_data_path)
print("Validation folder deleted successfully.")


In [2]:
import os
import shutil
from collections import defaultdict

# Define the paths to the training and test datasets in the fruits-360 folder.
# NOTE: `train_data_path` is rebound here; earlier cells of this notebook bind
# the same name to other folders, so the value depends on which cell ran last.
train_data_path = './fruits-360/Training/'
test_data_path = './fruits-360/Test/'

# Function to generate fruit mappings based on subdirectories
def generate_fruit_mapping(data_path):
    """Group the sub-directories of ``data_path`` by the first word of their name.

    Returns a ``defaultdict(list)`` mapping that first word to the list of
    matching directory names, in ``os.listdir`` order. Entries that are not
    directories are ignored.
    """
    groups = defaultdict(list)
    folder_names = (
        entry for entry in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, entry))
    )
    for folder_name in folder_names:
        # The leading word is taken to be the fruit name.
        first_word = folder_name.split(maxsplit=1)[0]
        groups[first_word].append(folder_name)
    return groups

# Function to combine images from multiple subdirectories into a single directory
def combine_fruit_images(src_data_path, dest_data_path, fruit_mapping):
    """Merge the image folders of each fruit's varieties into one folder per fruit.

    For every ``fruit -> [variety folders]`` entry of ``fruit_mapping`` the
    files of each variety folder under ``src_data_path`` are moved into
    ``dest_data_path/<fruit>`` and the emptied variety folder is deleted.

    Two safeguards prevent data loss:

    * A variety folder that *is* the destination folder (e.g. "Banana" when
      merging in place) is left alone. Its files are already in the right
      place, and deleting it would wipe the merged result.
    * A file is never moved over an existing one. On a name clash the variety
      name is prefixed to the file name, plus a counter if still needed.

    Args:
        src_data_path: Directory containing the variety folders.
        dest_data_path: Directory receiving the combined fruit folders. It
            may be the same as ``src_data_path``.
        fruit_mapping: Mapping of fruit name to list of variety folder names.
    """
    for combined_fruit, subdirs in fruit_mapping.items():
        combined_fruit_path = os.path.join(dest_data_path, combined_fruit)
        os.makedirs(combined_fruit_path, exist_ok=True)
        for subdir in subdirs:
            subdir_path = os.path.join(src_data_path, subdir)
            if not os.path.isdir(subdir_path):
                continue
            # In-place merge: this folder is the destination itself.
            if os.path.samefile(subdir_path, combined_fruit_path):
                continue

            name_prefix = subdir.replace(' ', '_')
            for img_file in os.listdir(subdir_path):
                src_img_path = os.path.join(subdir_path, img_file)
                dest_img_path = os.path.join(combined_fruit_path, img_file)

                # Different varieties may reuse the same file names; find a
                # free name rather than overwriting.
                if os.path.exists(dest_img_path):
                    stem, ext = os.path.splitext(img_file)
                    dest_img_path = os.path.join(
                        combined_fruit_path, f"{name_prefix}_{img_file}"
                    )
                    counter = 1
                    while os.path.exists(dest_img_path):
                        dest_img_path = os.path.join(
                            combined_fruit_path,
                            f"{name_prefix}_{stem}_{counter}{ext}",
                        )
                        counter += 1

                shutil.move(src_img_path, dest_img_path)
            shutil.rmtree(subdir_path)

# Generate fruit mappings for training and test datasets in the fruits-360 folder.
# Both mappings are built before anything is moved on disk.
train_fruit_mapping = generate_fruit_mapping(train_data_path)
test_fruit_mapping = generate_fruit_mapping(test_data_path)

# Combine images in the training and test datasets from fruits-360 folder.
# Source and destination are the same directory, so the variety folders are
# merged in place. This step moves files and deletes folders on disk.
combine_fruit_images(train_data_path, train_data_path, train_fruit_mapping)
combine_fruit_images(test_data_path, test_data_path, test_fruit_mapping)

print("Fruit images combined successfully!")

Fruit images combined successfully!


In [4]:
import os
import shutil
from collections import defaultdict
from PIL import Image
import hashlib

# Define the paths to the fruits and fruits-360 datasets.
# The fruits-360 folders are the source of the merge below; the fruits/train
# and fruits/test folders are the destination.
fruits_train_data_path = './fruits/train/'
fruits_test_data_path = './fruits/test/'
fruits_360_train_data_path = './fruits-360/Training/'
fruits_360_test_data_path = './fruits-360/Test/'

# Function to generate fruit mappings based on subdirectories
def generate_fruit_mapping(data_path):
    """Map each leading word of a sub-directory name to the directories sharing it.

    Only directories directly inside ``data_path`` are looked at. The result
    is a ``defaultdict(list)`` keyed by the first whitespace-separated word
    of the directory name (taken to be the fruit name).
    """
    by_fruit = defaultdict(list)
    for entry_name in os.listdir(data_path):
        if not os.path.isdir(os.path.join(data_path, entry_name)):
            continue
        words = entry_name.split()
        by_fruit[words[0]].append(entry_name)
    return by_fruit

# Function to check if an image is corrupted
def is_image_corrupted(image_path):
    """Return True when Pillow fails to open or verify the file at ``image_path``.

    Args:
        image_path: Path of the file to check.

    Returns:
        False if ``Image.open`` and ``verify()`` both succeed, True if either
        raises ``OSError`` or ``SyntaxError``.
    """
    try:
        # The context manager releases the file handle even when verification
        # fails, so checking thousands of files does not leak descriptors.
        with Image.open(image_path) as img:
            img.verify()
    except (OSError, SyntaxError):
        return True
    return False

# Function to check if an image is a duplicate
def is_duplicate_image(image_path, existing_images_hashes):
    """Tell whether the file's MD5 hex digest is already in ``existing_images_hashes``."""
    digest = hashlib.md5()
    with open(image_path, 'rb') as stream:
        digest.update(stream.read())
    return digest.hexdigest() in existing_images_hashes

# Function to combine images from multiple subdirectories into a single directory
def combine_fruit_images(src_data_path, dest_data_path, fruit_mapping, existing_fruits):
    """Move fruits-360 variety images into the matching existing fruit folders.

    For every ``fruit -> [variety folders]`` entry whose lower-cased fruit
    name is in ``existing_fruits``, the readable, not-yet-present images of
    each variety folder are moved into ``dest_data_path/<fruit lower-cased>``.
    The variety folder is then deleted, together with any corrupted or
    duplicate files left in it.

    Changes compared with a plain move:

    * The hash of every moved file is recorded, so duplicates among the
      source folders themselves are also detected.
    * A file is never moved over an existing one. On a name clash the variety
      name is prefixed to the file name, plus a counter if still needed.
    * A variety folder that is the destination folder itself is left alone.

    Args:
        src_data_path: Directory containing the variety folders.
        dest_data_path: Directory containing the per-fruit target folders.
        fruit_mapping: Mapping of fruit name to list of variety folder names.
        existing_fruits: Collection of lower-cased fruit names to merge.
    """
    for combined_fruit, subdirs in fruit_mapping.items():
        fruit_key = combined_fruit.lower()
        if fruit_key not in existing_fruits:
            continue

        combined_fruit_path = os.path.join(dest_data_path, fruit_key)
        os.makedirs(combined_fruit_path, exist_ok=True)

        # Content hashes of everything already in the target folder.
        existing_images_hashes = set()
        for img_file in os.listdir(combined_fruit_path):
            img_path = os.path.join(combined_fruit_path, img_file)
            if not os.path.isfile(img_path):
                continue  # a sub-directory cannot be hashed
            with open(img_path, 'rb') as f:
                existing_images_hashes.add(hashlib.md5(f.read()).hexdigest())

        for subdir in subdirs:
            subdir_path = os.path.join(src_data_path, subdir)
            if not os.path.isdir(subdir_path):
                continue
            # Merging in place: this folder is the destination, keep it.
            if os.path.samefile(subdir_path, combined_fruit_path):
                continue

            name_prefix = subdir.replace(' ', '_')
            for img_file in os.listdir(subdir_path):
                src_img_path = os.path.join(subdir_path, img_file)
                if is_image_corrupted(src_img_path):
                    continue

                with open(src_img_path, 'rb') as f:
                    img_hash = hashlib.md5(f.read()).hexdigest()
                if img_hash in existing_images_hashes:
                    continue

                dest_img_path = os.path.join(combined_fruit_path, img_file)
                # Varieties may reuse file names; find a free name rather
                # than overwriting an image.
                if os.path.exists(dest_img_path):
                    stem, ext = os.path.splitext(img_file)
                    dest_img_path = os.path.join(
                        combined_fruit_path, f"{name_prefix}_{img_file}"
                    )
                    counter = 1
                    while os.path.exists(dest_img_path):
                        dest_img_path = os.path.join(
                            combined_fruit_path,
                            f"{name_prefix}_{stem}_{counter}{ext}",
                        )
                        counter += 1

                shutil.move(src_img_path, dest_img_path)
                # Remember the new content so later copies are skipped.
                existing_images_hashes.add(img_hash)

            # Removes the folder with whatever was skipped above.
            shutil.rmtree(subdir_path)

# Generate fruit mappings for training and test datasets in the fruits-360 folder.
# Both mappings are built before anything is moved on disk.
train_fruit_mapping = generate_fruit_mapping(fruits_360_train_data_path)
test_fruit_mapping = generate_fruit_mapping(fruits_360_test_data_path)

# Get the list of existing fruits in the fruits/train and fruits/test folders.
# Names are lower-cased because combine_fruit_images compares lower-cased names.
existing_train_fruits = [d.lower() for d in os.listdir(fruits_train_data_path) if os.path.isdir(os.path.join(fruits_train_data_path, d))]
existing_test_fruits = [d.lower() for d in os.listdir(fruits_test_data_path) if os.path.isdir(os.path.join(fruits_test_data_path, d))]

# Combine images in the training and test datasets from fruits-360 folder to the respective fruits/train and fruits/test folders.
# This step moves files and deletes the processed fruits-360 folders on disk.
combine_fruit_images(fruits_360_train_data_path, fruits_train_data_path, train_fruit_mapping, existing_train_fruits)
combine_fruit_images(fruits_360_test_data_path, fruits_test_data_path, test_fruit_mapping, existing_test_fruits)

print("Fruit images combined successfully!")

Fruit images combined successfully!


In [1]:
import os
import shutil
import numpy as np
from pathlib import Path
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image

# Paths to the dataset
train_dir = Path("fruits/train")  # Adjust if needed

# Get the number of .jpg images in each class. Only directories are classes:
# a stray file would be counted as a class with 0 images, pull the median down
# and later break oversampling, which needs at least one source image.
class_counts = {
    class_dir.name: len(list(class_dir.glob("*.jpg")))
    for class_dir in sorted(train_dir.iterdir())
    if class_dir.is_dir()
}
if not class_counts:
    # np.median([]) is NaN and int(NaN) fails with an unhelpful message.
    raise FileNotFoundError(f"No class folders found in {train_dir}")
median_count = int(np.median(list(class_counts.values())))  # Use median count to balance

print("Original class distribution:", class_counts)
print("Balancing to:", median_count, "images per class")

# Augmentation settings used to synthesise extra images for small classes
augmentation_settings = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode="nearest",
)
datagen = ImageDataGenerator(**augmentation_settings)

def augment_and_save(img_path, target_dir, target_count):
    """Write ``target_count`` augmented variants of one image into ``target_dir``.

    The image is converted to RGB, resized to 100x100 and fed as a batch of
    one through the module-level ``datagen``. Every draw from the generator
    saves one JPEG with the prefix "aug" into ``target_dir``.

    Args:
        img_path: Path of the source image.
        target_dir: Directory the augmented images are saved into.
        target_count: Number of augmented images to generate.

    NOTE(review): the output file names are chosen by Keras. Confirm that
    they cannot collide when many images are generated into one folder.
    """
    # Force 3-channel RGB. A grayscale or palette image gives a 2-D array
    # the generator cannot batch, and RGBA cannot be saved as JPEG.
    with Image.open(img_path) as source:
        resized = source.convert("RGB").resize((100, 100))  # Resize if needed

    # Batch of one image: shape (1, 100, 100, 3)
    batch = np.expand_dims(np.asarray(resized), axis=0)
    gen = datagen.flow(batch, batch_size=1, save_to_dir=target_dir, save_prefix="aug", save_format="jpg")

    for _ in range(target_count):
        next(gen)  # Each draw generates and saves one augmented image

# Bring every class to `median_count` images: delete surplus files from large
# classes and generate augmented copies for small ones.
for fruit, count in class_counts.items():
    fruit_dir = train_dir / fruit

    if count > median_count:  # Undersampling (Remove extra images)
        excess_images = list(fruit_dir.glob("*.jpg"))[: count - median_count]
        for img in excess_images:
            img.unlink()  # Delete the image file
        print(f"Undersampled {fruit}: Removed {count - median_count} images")

    elif count < median_count:  # Oversampling (Generate new images)
        existing_images = list(fruit_dir.glob("*.jpg"))
        num_needed = median_count - count

        # A class without any .jpg has nothing to augment; cycling through an
        # empty list would raise ZeroDivisionError in the modulo below.
        if not existing_images:
            print(f"Skipped {fruit}: no source images to augment")
            continue

        for i in range(num_needed):
            # Cycle through the originals so each is used about equally often.
            augment_and_save(existing_images[i % len(existing_images)], fruit_dir, 1)
        print(f"Oversampled {fruit}: Added {num_needed} images")

print("Dataset is now balanced!")


Original class distribution: {'apple': 2375, 'avocado': 1545, 'banana': 1721, 'blueberry': 1264, 'cantaloupe': 1455, 'cherry': 1755, 'dragonfruit': 828, 'emblic': 817, 'grape': 1975, 'guava': 1310, 'jackfruit': 825, 'kiwi': 1274, 'lychee': 1297, 'mango': 1429, 'orange': 1136, 'papaya': 1314, 'pear': 1993, 'pineapple': 1451, 'pomegranate': 1303, 'raspberry': 1300, 'strawberry': 1687, 'watermelon': 1272}
Balancing to: 1312 images per class
Undersampled apple: Removed 1063 images
Undersampled avocado: Removed 233 images
Undersampled banana: Removed 409 images
Oversampled blueberry: Added 48 images
Undersampled cantaloupe: Removed 143 images
Undersampled cherry: Removed 443 images
Oversampled dragonfruit: Added 484 images
Oversampled emblic: Added 495 images
Undersampled grape: Removed 663 images
Oversampled guava: Added 2 images
Oversampled jackfruit: Added 487 images
Oversampled kiwi: Added 38 images
Oversampled lychee: Added 15 images
Undersampled mango: Removed 117 images
Oversampled o