Importing all the libraries


In [1]:
import pandas as pd
import shutil
import os
import cv2
import numpy as np
import random
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

ModuleNotFoundError: No module named 'pandas'

Apply random augmentations to an image: rotation, horizontal and vertical shifts, shear,
zoom, horizontal flipping (mirroring), and brightness adjustment.

In [3]:
def augment_image(image):
    """Return one randomly augmented copy of ``image``.

    A new ``ImageDataGenerator`` is configured on every call with random
    rotation, width/height shifts, shear, zoom, horizontal flipping and a
    brightness factor drawn from ``[0.8, 1.2]``; ``fill_mode='nearest'`` is
    used for pixels that a transform leaves empty.

    Args:
        image: A single image array (the notebook passes an RGB image read
            with OpenCV). A leading batch axis is added before it is handed
            to the generator.

    Returns:
        The only image of the generated batch, cast to ``np.uint8``.
    """
    augmentation_settings = dict(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        brightness_range=[0.8, 1.2],
        fill_mode='nearest',
    )
    generator = ImageDataGenerator(**augmentation_settings)

    # The generator works on batches, so wrap the image in a batch of one.
    single_image_batch = np.expand_dims(image, axis=0)
    batch_iterator = generator.flow(single_image_batch, batch_size=1)
    first_batch = next(batch_iterator)
    return first_batch[0].astype(np.uint8)



Process the images listed in the metadata so that each class ends up with exactly 5000 images.
For classes with 5000 or more images, 5000 images are randomly sampled and copied.
For classes with fewer than 5000 images, all originals are copied and randomly chosen originals
are augmented until the class reaches 5000 images.
Each augmented file is named after its original image ID plus a unique augmentation counter
(`<image_id>_aug<N>`), and the same name is used for the saved file and in the metadata CSV.


In [None]:
def _find_image_path(image_folder, image_id, extensions=('.jpg', '.jpeg', '.png')):
    """Return the first existing ``image_folder/<image_id><ext>`` path, or None."""
    for ext in extensions:
        candidate = os.path.join(image_folder, image_id + ext)
        if os.path.exists(candidate):
            return candidate
    return None


def _copy_original(image_id, image_folder, output_folder):
    """Re-save one original as ``<image_id>.jpg``; return its source path, or None on failure."""
    source_path = _find_image_path(image_folder, image_id)
    if source_path is None:
        print(f"Image for {image_id} not found. Skipping.")
        return None
    image = cv2.imread(source_path)
    if image is None:
        print(f"Could not read image {source_path}. Skipping.")
        return None
    target_path = os.path.join(output_folder, image_id + '.jpg')
    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(target_path, image):
        print(f"Could not write image {target_path}. Skipping.")
        return None
    return source_path


def process_dataset(metadata_path, image_folder, output_folder, output_metadata_path,
                    target_per_class=5000):
    """Build a class-balanced copy of an image dataset and write its metadata CSV.

    For every distinct value of the ``dx`` column:

    * if the class has at least ``target_per_class`` images, that many are
      randomly sampled and re-saved as ``.jpg`` in ``output_folder``;
    * otherwise all of its images are re-saved;
    * if fewer than ``target_per_class`` images were saved (small class, or
      some files were missing/unreadable), randomly chosen saved originals are
      augmented with ``augment_image`` and written as
      ``<image_id>_aug<N>.jpg`` until the target is reached.

    A class for which no original could be saved is reported and left short
    instead of looping forever.

    Args:
        metadata_path: CSV file with at least the columns ``image_id`` and ``dx``.
        image_folder: Folder holding the originals, named ``<image_id>`` plus
            one of ``.jpg``, ``.jpeg`` or ``.png``.
        output_folder: Folder that receives the balanced images (created if
            missing).
        output_metadata_path: Where the resulting ``image_id``/``dx`` CSV is
            written.
        target_per_class: Number of images each class should end up with.

    Returns:
        None. Images and the metadata CSV are written to disk.
    """
    os.makedirs(output_folder, exist_ok=True)

    df = pd.read_csv(metadata_path)
    new_metadata = []

    for dx_class in df['dx'].unique():
        class_images = df.loc[df['dx'] == dx_class, 'image_id'].tolist()

        if len(class_images) >= target_per_class:
            selected_images = random.sample(class_images, target_per_class)
        else:
            selected_images = class_images

        # Remember where each successfully saved original lives so that the
        # augmentation step never retries files known to be missing.
        source_paths = {}
        final_images = []
        for image_id in selected_images:
            source_path = _copy_original(image_id, image_folder, output_folder)
            if source_path is None:
                continue
            source_paths[image_id] = source_path
            final_images.append(image_id)

        augmentable_ids = list(source_paths)
        if len(final_images) < target_per_class and not augmentable_ids:
            print(f"No usable images for class {dx_class}; cannot augment. Skipping.")

        aug_counter = 1
        while len(final_images) < target_per_class and augmentable_ids:
            base_image_id = random.choice(augmentable_ids)
            new_image_id = f"{base_image_id}_aug{aug_counter}"
            new_image_path = os.path.join(output_folder, new_image_id + '.jpg')

            saved = False
            image = cv2.imread(source_paths[base_image_id])
            if image is not None:
                # OpenCV loads BGR; augment in RGB and convert back for saving.
                augmented_image = augment_image(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                saved = cv2.imwrite(
                    new_image_path, cv2.cvtColor(augmented_image, cv2.COLOR_RGB2BGR)
                )
            if not saved:
                # Drop the failing base image so the pool shrinks on every
                # failure and the loop is guaranteed to terminate.
                print(f"Could not augment {base_image_id}. Skipping.")
                augmentable_ids.remove(base_image_id)
                continue

            aug_counter += 1
            final_images.append(new_image_id)

        # Every image saved for this class carries the class label.
        new_metadata.extend([image_id, dx_class] for image_id in final_images)

    new_df = pd.DataFrame(new_metadata, columns=['image_id', 'dx'])
    new_df.to_csv(output_metadata_path, index=False)
    print(f"Processed dataset saved to {output_metadata_path}")

# Build the balanced dataset from the raw HAM10000 images and metadata.
process_dataset(
    metadata_path=r"HAM10000_metadata.csv",
    image_folder=r"R_HAM 10000 images",
    output_folder=r"Processed_images",
    output_metadata_path=r"Processed_HAM10000_metadata.csv",
)


Processed dataset saved to Processed_HAM10000_metadata.csv


Paths to the processed (class-balanced) images and their metadata CSV

In [5]:
# Folder holding the balanced images; same value as the output folder passed
# to process_dataset. copy_images reads its source files from here.
dataset_folder = r"Processed_images"
# Metadata CSV (image_id, dx) describing the images in dataset_folder.
metadata_path = r"Processed_HAM10000_metadata.csv"


Define output folders for each split

In [None]:
# Destination folder of each split.
# NOTE(review): these are absolute paths on one specific Windows machine;
# adjust them before running the notebook anywhere else.
train_folder = r"C:\Users\arjun\Desktop\skin_Lesion\Skin-Lesion-Classification-and-Segmentation\Train"
val_folder = r"C:\Users\arjun\Desktop\skin_Lesion\Skin-Lesion-Classification-and-Segmentation\Val"
test_folder = r"C:\Users\arjun\Desktop\skin_Lesion\Skin-Lesion-Classification-and-Segmentation\Test"

# Make sure every destination exists before images are copied into it.
for split_folder in (train_folder, val_folder, test_folder):
    os.makedirs(split_folder, exist_ok=True)


In [7]:
# Load the metadata of the processed dataset; the 'dx' column is used below
# to stratify the splits and 'image_id' to locate the image files.
df = pd.read_csv(metadata_path)

First, split the data into training (70%) and temporary (30%) sets with stratification by 'dx'

In [8]:
# Hold out 30% of the rows (later divided into validation and test),
# stratified on the 'dx' label; the fixed seed keeps the split repeatable.
df_train, df_temp = train_test_split(
    df,
    test_size=0.30,
    stratify=df['dx'],
    random_state=42,
)


Now split the temporary set into validation and test sets.
We want 20% of the full dataset for validation and 10% for testing. Since df_temp holds 30% of the data,
we split it into ~66.67% validation and ~33.33% test.

In [9]:
# About one third of the held-out rows become the test set and the rest the
# validation set, again stratified on the 'dx' label.
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.3333,
    stratify=df_temp['dx'],
    random_state=42,
)


Print the number of samples in each split for confirmation

In [10]:
# Report the size of every split.
split_size_labels = (
    ("Number of training images:", df_train),
    ("Number of validation images:", df_val),
    ("Number of testing images:", df_test),
)
for size_label, split_df in split_size_labels:
    print(size_label, len(split_df))

Number of training images: 24500
Number of validation images: 7000
Number of testing images: 3500


Function to copy image files based on metadata entries into a specified destination folder

In [11]:
def copy_images(metadata_df, dest_folder, src_folder=None):
    """Copy the ``.jpg`` file of every listed image into ``dest_folder``.

    Args:
        metadata_df: DataFrame with an ``image_id`` column; each value is a
            file name without extension.
        dest_folder: Folder that receives the copies (created if missing).
        src_folder: Folder to copy from. Defaults to the notebook-level
            ``dataset_folder`` so existing two-argument calls keep working.

    Returns:
        None. A warning is printed for every image whose source file does
        not exist; such images are skipped.
    """
    if src_folder is None:
        # Backward-compatible fallback to the notebook-level setting.
        src_folder = dataset_folder
    os.makedirs(dest_folder, exist_ok=True)

    # Only the image_id column is needed, so iterate over it directly
    # instead of materialising every row with iterrows().
    for image_id in metadata_df['image_id']:
        # The processed images are all saved as .jpg files.
        file_name = image_id + ".jpg"
        src_path = os.path.join(src_folder, file_name)
        if os.path.exists(src_path):
            shutil.copy(src_path, os.path.join(dest_folder, file_name))
        else:
            print(f"Warning: {src_path} not found.")


Copy images for each split

In [12]:
# Copy the images of each split into its own folder.
split_destinations = (
    (df_train, train_folder),
    (df_val, val_folder),
    (df_test, test_folder),
)
for split_df, split_folder in split_destinations:
    copy_images(split_df, split_folder)

Save the corresponding metadata for each split

In [None]:
# Write one metadata CSV per split (without the DataFrame index).
split_metadata_files = (
    (df_train, r"C:\Users\arjun\Desktop\skin_Lesion\Skin-Lesion-Classification-and-Segmentation\train_metadata.csv"),
    (df_val, r"C:\Users\arjun\Desktop\skin_Lesion\Skin-Lesion-Classification-and-Segmentation\val_metadata.csv"),
    (df_test, r"C:\Users\arjun\Desktop\skin_Lesion\Skin-Lesion-Classification-and-Segmentation\test_metadata.csv"),
)
for split_df, split_csv_path in split_metadata_files:
    split_df.to_csv(split_csv_path, index=False)

print("Dataset and metadata successfully split into training, validation, and testing sets.")


Dataset and metadata successfully split into training, validation, and testing sets.
