
# <h1> Data preprocessing for Skin Cancer Detection. 
This notebook contains the steps for preprocessing the skin lesion images.

### The following steps will be implemented for preprocessing the images: 
• Load the dataset \
• Remove the duplicates \
• Splitting the dataset into train, validation and test sets \
• Processing the images \
• Resize the images and rescale the picture values \
• And all the necessary steps required for efficient training of the models.


## 1. Import Libraries

First, we need to import the necessary libraries.

In [1]:
import os
import shutil
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array

## 2. Define Directories

Next, we define the directories for the raw data and the processed data.

In [2]:
# Disable oneDNN optimizations
BASE_DIR = r"ML-MinorProject\ISIC-2020 Dataset"
TRAIN_DIR = os.path.join(BASE_DIR, "ISIC_2020_Training_JPEG", "train")
TEST_DIR = os.path.join(BASE_DIR, "ISIC_2020_Test_JPEG", "ISIC_2020_Test_Input")
PROCESSED_TRAIN_DIR = os.path.join(BASE_DIR, "processed_train")
PROCESSED_VAL_DIR = os.path.join(BASE_DIR, "processed_val")
PROCESSED_TEST_DIR = os.path.join(BASE_DIR, "processed_test")

## 3. Load and Preprocess Data

We load the data, remove duplicates, and split it into training, validation, and test sets.

In [3]:
# Load duplicate image list
duplicate_csv_path = os.path.join(BASE_DIR, "ISIC_2020_Training_Duplicates.csv")
duplicate_df = pd.read_csv(duplicate_csv_path)

# Keep only image_name_1 and remove image_name_2
duplicate_images = set(duplicate_df['image_name_1'] + ".jpg")  # Assuming filenames end with .jpg

# Collect image filenames
image_filenames = glob.glob(os.path.join(TRAIN_DIR, "*.jpg"))

# Remove duplicate images
filtered_filenames = [img for img in image_filenames if os.path.basename(img) not in duplicate_images]

# Create dataframe
labels = ["malignant" if "malignant" in fname.lower() else "benign" for fname in filtered_filenames]
data_df = pd.DataFrame({"filename": filtered_filenames, "label": labels})

# Train-validation split
train_df, val_df = train_test_split(data_df, test_size=0.2, random_state=42, stratify=data_df["label"])

FileNotFoundError: [Errno 2] No such file or directory: 'ML-MinorProject\\ISIC-2020 Dataset\\ISIC_2020_Training_Duplicates.csv'

## 4. Create Directories for Organized Dataset

We create directories for the processed training, validation, and test datasets.

In [22]:
# Create directories if they don't exist
os.makedirs(PROCESSED_TRAIN_DIR, exist_ok=True)
os.makedirs(PROCESSED_VAL_DIR, exist_ok=True)
os.makedirs(PROCESSED_TEST_DIR, exist_ok=True)

for category in ["benign", "malignant"]:
    os.makedirs(os.path.join(PROCESSED_TRAIN_DIR, category), exist_ok=True)
    os.makedirs(os.path.join(PROCESSED_VAL_DIR, category), exist_ok=True)
    os.makedirs(os.path.join(PROCESSED_TEST_DIR, category), exist_ok=True)


## 5. Load Test Ground Truth

We load the test ground truth data from the `ISIC_2020_Test_GroundTruth.csv` file.

In [4]:
# Load test ground truth
test_ground_truth_csv_path = os.path.join(BASE_DIR, "ISIC_2020_Test_GroundTruth.csv")
test_ground_truth_df = pd.read_csv(test_ground_truth_csv_path)
# Create the 'label' column based on the 'target' column
test_ground_truth_df["label"] = test_ground_truth_df["target"].apply(lambda x: "malignant" if x == 1 else "benign")

# Map 'image_name' to file paths
test_ground_truth_df["filename"] = test_ground_truth_df["isic_id"].apply(lambda x: os.path.join(TEST_DIR, f"{x}.jpg"))

# Debug: Check the first few rows of the ground truth DataFrame
print("test_ground_truth_df:")
print(test_ground_truth_df.head())

# Filter out rows where the file does not exist in TEST_DIR
test_ground_truth_df = test_ground_truth_df[test_ground_truth_df["filename"].apply(os.path.exists)]

# Debug: Check the number of benign and malignant images
print("Number of benign images in ground truth:", len(test_ground_truth_df[test_ground_truth_df["label"] == "benign"]))
print("Number of malignant images in ground truth:", len(test_ground_truth_df[test_ground_truth_df["label"] == "malignant"]))

# Create test_df directly from the ground truth DataFrame
test_df = test_ground_truth_df[["filename", "label"]]

# Debug: Check the first few rows of test_df
print("test_df:")
print(test_df.head())

# Debug: Check malignant images in test_df
test_df_malignant = test_df[test_df["label"] == "malignant"]
print("test_df_malignant:")
print(test_df_malignant.head())

FileNotFoundError: [Errno 2] No such file or directory: 'ML-MinorProject\\ISIC-2020 Dataset\\ISIC_2020_Test_GroundTruth.csv'

In [34]:
test_df_malignant = test_df[test_df["label"] == "malignant"]
print("test_df_malignant: ", test_df_malignant.head())

test_df_malignant:                                                filename      label
24   G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\...  malignant
66   G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\...  malignant
102  G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\...  malignant
110  G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\...  malignant
192  G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\...  malignant


## 6. Move test images
We now move the test images to `processed_test` directory according to their classes.


In [36]:
# Move test images to processed_test directory
for _, row in test_df.iterrows():
    label = row["label"]
    src_path = row["filename"]
    dest_dir = os.path.join(PROCESSED_TEST_DIR, label)

    # Ensure the destination directory exists
    os.makedirs(dest_dir, exist_ok=True)

    # Check if the source file exists
    if os.path.exists(src_path):
        try:
            # Move the file
            shutil.move(src_path, os.path.join(dest_dir, os.path.basename(src_path)))
            print(f"Moved {src_path} to {dest_dir}")
        except Exception as e:
            print(f"Error moving file {src_path} to {dest_dir}: {e}")
    else:
        print(f"File not found: {src_path}")

# Debugging: Check the number of malignant and benign images in processed_test
print(f"Number of benign images in processed_test: {len(os.listdir(os.path.join(PROCESSED_TEST_DIR, 'benign')))}")
print(f"Number of malignant images in processed_test: {len(os.listdir(os.path.join(PROCESSED_TEST_DIR, 'malignant')))}")

Moved G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\ISIC_2020_Test_JPEG\ISIC_2020_Test_Input\ISIC_0052060.jpg to G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\processed_test\benign
Moved G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\ISIC_2020_Test_JPEG\ISIC_2020_Test_Input\ISIC_0052349.jpg to G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\processed_test\benign
Moved G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\ISIC_2020_Test_JPEG\ISIC_2020_Test_Input\ISIC_0058510.jpg to G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\processed_test\benign
Moved G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\ISIC_2020_Test_JPEG\ISIC_2020_Test_Input\ISIC_0073313.jpg to G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\processed_test\benign
Moved G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\ISIC_2020_Test_JPEG\ISIC_2020_Test_Input\ISIC_0073502.jpg to G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\processed_test\benign
Moved G:\OneDrive\ML-MinorProject\ISIC-2020 Dataset\ISIC_2020_Test_JPEG\ISIC_2020_Test_Input\IS

## 7. Load Train Ground Truth 

In [37]:
# Load train ground truth
train_ground_truth_csv_path = os.path.join(BASE_DIR, "ISIC_2020_Training_GroundTruth.csv")
train_ground_truth_df = pd.read_csv(train_ground_truth_csv_path)

# Create dataframe for train data
train_filenames = glob.glob(os.path.join(TRAIN_DIR, "*.jpg"))
train_ground_truth_df["filename"] = train_ground_truth_df["image_name"].apply(lambda x: os.path.join(TRAIN_DIR, f"{x}.jpg"))
train_ground_truth_df["label"] = train_ground_truth_df["target"].apply(lambda x: "malignant" if x == 1 else "benign")

## 8. Splitting the Train images
We split the train images into train set and validation set.

In [38]:
# Split training data into train and validation sets
train_split_df, val_split_df = train_test_split(
    train_ground_truth_df, test_size=0.2, random_state=42, stratify=train_ground_truth_df["label"]
)

## 9. Move Images
We move the images to `processed_train` images and `processed_val` images in accordance to the split.

In [39]:
# Move training images to processed_train directory
for _, row in train_split_df.iterrows():
    label = row["label"]
    src_path = row["filename"]
    dest_dir = os.path.join(PROCESSED_TRAIN_DIR, label)
    if os.path.exists(src_path):  # Ensure the file exists before moving
        shutil.move(src_path, os.path.join(dest_dir, os.path.basename(src_path)))

# Move validation images to processed_val directory
for _, row in val_split_df.iterrows():
    label = row["label"]
    src_path = row["filename"]
    dest_dir = os.path.join(PROCESSED_VAL_DIR, label)
    if os.path.exists(src_path):  # Ensure the file exists before moving
        shutil.move(src_path, os.path.join(dest_dir, os.path.basename(src_path)))

## Conclusion

The data preprocessing is complete, and the images are ready for training the models.