Let's gather a few datasets from the ISIC database
# Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

To install the isic Datasets we will use their CLI

In [None]:
!pip install isic-cli

# Metadata loading & Analysis

Before we start looking at the photos, let's gather a few datasets and evaluate the demographic metadata that we have.  For the purposes of this project, we're going to focus on age, gender, and the location of the skin lesion.
We'll focus on the following datasets
- [BCN 20000](https://www.nature.com/articles/s41597-024-03387-w)
- [HAM10000](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/DBW86T)
- [ISIC 2024](https://challenge2024.isic-archive.com/)
- [Hospital Italiano de Buenos Aires Skin Lesions](https://www.nature.com/articles/s41597-023-02630-0)

# Dataset Downloads

International Skin Imaging Collaboration (ISIC) archive is a massive resource for images and metadata for our project.  Let's take a quick look at the available data.

In [None]:
!isic collection list

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\BCN\MetaData', exist_ok=True)
BCN_id = 249
!isic metadata download -c {BCN_id} -o "E:\Capstone Skin Cancer Project\Datasets\BCN\MetaData\BCN_Metadata.csv"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\BCN\Image', exist_ok=True)
!isic image download --collections 249 "E:\Capstone Skin Cancer Project\Datasets\BCN\Image"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\HAM\MetaData', exist_ok=True)
HAM_id = 212
!isic metadata download -c {HAM_id} -o "E:\Capstone Skin Cancer Project\Datasets\HAM\MetaData\HAM_Metadata.csv"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\HAM\Image', exist_ok=True)
!isic image download --collections 212 "E:\Capstone Skin Cancer Project\Datasets\HAM\Image"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\BuenosAires\MetaData', exist_ok=True)
BA_id = 390
!isic metadata download -c {BA_id} -o "E:\Capstone Skin Cancer Project\Datasets\BuenosAires\MetaData\BA_Metadata.csv"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\BuenosAires\Image', exist_ok=True)
!isic image download --collections {BA_id} "E:\Capstone Skin Cancer Project\Datasets\BuenosAires\Image"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\Braff\MetaData', exist_ok=True)
Braff_id = 410
!isic metadata download -c {Braff_id} -o "E:\Capstone Skin Cancer Project\Datasets\Braff\MetaData\Braff_Metadata.csv"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\Braff\Image', exist_ok=True)
!isic image download --collections {Braff_id} "E:\Capstone Skin Cancer Project\Datasets\Braff\Image"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\Melo\MetaData', exist_ok=True)
melo_id = 294
!isic metadata download -c {melo_id} -o "E:\Capstone Skin Cancer Project\Datasets\Melo\MetaData\Melo_Metadata.csv"

In [None]:
os.makedirs(r'E:\Capstone Skin Cancer Project\Datasets\Melo\Image', exist_ok=True)
!isic image download --collections {melo_id} "E:\Capstone Skin Cancer Project\Datasets\Melo\Image"

Lets take a look at the data columns that we currently have, then clean the data up so we can keep the items that we will be looking for to see if there's any correlation between the data points and cancer.

In [None]:
BCN = pd.read_csv(r'E:\Capstone Skin Cancer Project\Datasets\BCN\MetaData\BCN_Metadata.csv')
BCN.head(5)

In [None]:
HAM = pd.read_csv(r'E:\Capstone Skin Cancer Project\Datasets\HAM\MetaData\HAM_Metadata.csv')
HAM.head(5)

In [None]:
BA = pd.read_csv(r'E:\Capstone Skin Cancer Project\Datasets\BuenosAires\MetaData\BA_Metadata.csv', low_memory=False)
BA.head(5)

In [None]:
BRAFF = pd.read_csv(r'E:\Capstone Skin Cancer Project\Datasets\Braff\MetaData\Braff_Metadata.csv')
BRAFF.head(5)

In [None]:
MELO = pd.read_csv(r'E:\Capstone Skin Cancer Project\Datasets\Melo\MetaData\Melo_Metadata.csv', low_memory=False)
MELO.head(5)

After all data has been read, we'll make a function to format all of the data

In [None]:
columns_to_keep = ['isic_id', 'age_approx', 'sex', 'anatom_site_general', 'diagnosis_1', 'diagnosis']
translation_dict = {
		'upper extremity': 'Shoulders & Arms',
		'head/neck':       'Head & Neck',
		'palms/soles':     'Palms & Soles',
		'anterior torso':  'Front Torso',
		'lower extremity': 'Legs',
		'oral/genital':    'Mouth & Groin',
		'posterior torso': 'Back',
		'lateral torso':   'Side Torso (Ribs)', }


def dataFormatting(table):
	if 'diagnosis' not in table.columns and 'diagnosis_2' in table.columns:
		table['diagnosis'] = table['diagnosis_2']

	# Update the 'diagnosis' column if it's blank by using the value from 'diagnosis_1'
	table.loc[table['diagnosis'].isna(), 'diagnosis'] = table['diagnosis_1']

	# Normalize the text in the 'diagnosis' column for consistent filtering
	table['diagnosis'] = table['diagnosis'].str.strip().str.lower()

	# Remove rows where diagnosis_1 (Benign/Malignant) is "Indeterminate"
	table = table[table['diagnosis_1'].str.strip() != "Indeterminate"]

	table = table.dropna(subset=columns_to_keep)  # Drop any columns that don't have the data we need.

	formatted_table = table[columns_to_keep]
	formatted_table = formatted_table.rename(columns={'age_approx':          'Age',
	                                                  'sex':                 'Gender',
	                                                  'anatom_site_general': 'Location',
	                                                  'diagnosis_1':         'Benign/Malignant',
	                                                  'diagnosis':           'Diagnosis'
	                                                  })
	formatted_table['Location'] = formatted_table['Location'].replace(translation_dict)
	formatted_table['Age'] = formatted_table['Age'].astype(int)
	return formatted_table

Lets make one more helper function to find out how many rows are in each table

In [None]:
def rowCount(tables):
	for name, table in tables.items():
		print(f"{name} has {table.shape[0]} rows")
	total_rows = sum(table.shape[0] for table in tables.values())
	print(f"There are a total of {total_rows} rows in all tables")

In [None]:
formatted_BCN = dataFormatting(BCN)
formatted_BCN.head(5)

In [None]:
formatted_HAM = dataFormatting(HAM)
formatted_HAM.head(5)

In [None]:
formatted_MELO = dataFormatting(MELO)
formatted_MELO.head(5)

In [None]:
formatted_BA = dataFormatting(BA)
formatted_BA.head(5)

In [None]:
formatted_BRAFF = dataFormatting(BRAFF)
formatted_BRAFF.head(5)

In [None]:
tables = {
		"BCN":   formatted_BCN,
		"HAM":   formatted_HAM,
		"MELO":  formatted_MELO,
		"BRAFF": formatted_BRAFF,
		"BA":    formatted_BA
}
rowCount(tables)

In [None]:
all_data_duplicated = pd.concat([formatted_BCN, formatted_HAM, formatted_MELO, formatted_BRAFF, formatted_BA],
                                ignore_index=True)
all_data = all_data_duplicated.drop_duplicates(subset='isic_id')
all_data.to_csv(r'E:\Capstone Skin Cancer Project\Datasets\all_datasets_combined.csv', index=False)
print(f'There are {len(all_data)} rows in the combined dataset')
duplicate_isic_ids = all_data[all_data['isic_id'].duplicated()]

# Output result
if duplicate_isic_ids.empty:
	print("All isic_id values are unique.")
else:
	print(f"The following isic_id values are duplicated:\n{duplicate_isic_ids}")

In [None]:
# Group data by 'Gender' and 'Benign/Malignant' and count occurrences
gender_bm_counts = all_data.groupby([
		'Gender',
		'Benign/Malignant']).size().reset_index(name='Count')

# Create the bar plot
ax = sns.barplot(
		data=gender_bm_counts,
		x='Benign/Malignant',
		y='Count',
		hue='Gender',
		palette=['#FBE8A1', '#FFDCF4']
)

# Add bar labels
for bar in ax.patches:
	count = int(bar.get_height())
	ax.text(
			bar.get_x() + bar.get_width() / 2,
			count,
			f"{count}",
			ha='center',
			va='bottom',
			fontsize=12,
			color='black'
	)

plt.legend(title="Gender", loc='upper right')

plt.xlabel("")
plt.ylabel("Count")
plt.title("Count of Gender per Benign and Malignant Categories")

plt.show()

In [None]:
malignant_data = all_data[all_data['Benign/Malignant'] == 'Malignant']
age_gender_counts = malignant_data.groupby(['Age', 'Gender']).size().reset_index(name='Count')

male_counts = age_gender_counts[age_gender_counts['Gender'] == 'male']
female_counts = age_gender_counts[age_gender_counts['Gender'] == 'female']

plt.plot(male_counts['Age'], male_counts['Count'], label='Male', color='blue')
plt.plot(female_counts['Age'], female_counts['Count'], label='Female', color='pink')

plt.xlabel('Age')
plt.ylabel('Count of Malignant Cases')
plt.title('Count of Malignant Cases by Age and Gender')
plt.legend(title='Gender')
plt.grid(True)
plt.show()

In [None]:
location_gender_counts = malignant_data.groupby(['Location', 'Gender']).size().reset_index(name='Count')

# Separate the counts by gender
male_location_counts = location_gender_counts[location_gender_counts['Gender'] == 'male']
female_location_counts = location_gender_counts[location_gender_counts['Gender'] == 'female']

# Sort each by count in descending order
male_location_counts = male_location_counts.sort_values(by='Count', ascending=False)
female_location_counts = female_location_counts.sort_values(by='Count', ascending=False)

ax = sns.barplot(
		data=location_gender_counts,
		x='Location',
		y='Count',
		hue='Gender',
		palette=['blue', 'pink']
)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

plt.legend(title="Gender", loc='upper right')
plt.xlabel("Location")
plt.ylabel("Count")
plt.title("Count of Malignant Diagnoses by Location and Gender")

plt.tight_layout()
plt.show()

max_length = max(len(male_location_counts), len(female_location_counts))

print(f"{'Male:':<30}{'Female:'}")

for i in range(max_length):
	male_str = f"{male_location_counts.iloc[i]['Location']}: {male_location_counts.iloc[i]['Count']}" if i < len(
			male_location_counts) else ""
	female_str = f"{female_location_counts.iloc[i]['Location']}: {female_location_counts.iloc[i]['Count']}" if i < len(
			female_location_counts) else ""
	print(f"{male_str:<30}{female_str}")

In [None]:
Benign_data = all_data[all_data['Benign/Malignant'] == 'Benign']
age_gender_counts = Benign_data.groupby(['Age', 'Gender']).size().reset_index(name='Count')

male_counts = age_gender_counts[age_gender_counts['Gender'] == 'male']
female_counts = age_gender_counts[age_gender_counts['Gender'] == 'female']

plt.plot(male_counts['Age'], male_counts['Count'], label='Male', color='blue')
plt.plot(female_counts['Age'], female_counts['Count'], label='Female', color='pink')

plt.xlabel('Age')
plt.ylabel('Count of Benign Cases')
plt.title('Count of Benign Cases by Age and Gender')
plt.legend(title='Gender')
plt.grid(True)
plt.show()

In [None]:
Benign_location_gender_counts = Benign_data.groupby(['Location', 'Gender']).size().reset_index(name='Count')

# Separate the counts by gender
Benign_male_location_counts = Benign_location_gender_counts[Benign_location_gender_counts['Gender'] == 'male']
Benign_female_location_counts = Benign_location_gender_counts[Benign_location_gender_counts['Gender'] == 'female']

# Sort each by count in descending order
Benign_male_location_counts = Benign_male_location_counts.sort_values(by='Count', ascending=False)
Benign_female_location_counts = Benign_female_location_counts.sort_values(by='Count', ascending=False)

ax = sns.barplot(
		data=Benign_location_gender_counts,
		x='Location',
		y='Count',
		hue='Gender',
		palette=['blue', 'pink']
)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

plt.legend(title="Gender", loc='upper right')
plt.xlabel("Location")
plt.ylabel("Count")
plt.title("Count of Benign Diagnoses by Location and Gender")

plt.tight_layout()
plt.show()

max_length = max(len(Benign_male_location_counts), len(Benign_female_location_counts))

print(f"{'Male:':<30}{'Female:'}")

for i in range(max_length):
	male_str = f"{Benign_male_location_counts.iloc[i]['Location']}: {Benign_male_location_counts.iloc[i]['Count']}" if i < len(
			Benign_male_location_counts) else ""
	female_str = f"{Benign_female_location_counts.iloc[i]['Location']}: {Benign_female_location_counts.iloc[i]['Count']}" if i < len(
			Benign_female_location_counts) else ""
	print(f"{male_str:<30}{female_str}")

In [None]:
from sklearn.preprocessing import OneHotEncoder

metadata_path = r"E:\Capstone Skin Cancer Project\Datasets\all_datasets_combined.csv"

# Load the dataset
all_data = pd.read_csv(metadata_path)

categorical_features = ['Gender', 'Location', 'Diagnosis']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_features = encoder.fit_transform(all_data[categorical_features])

# Convert to DataFrame
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

# Concatenate encoded metadata with the original DataFrame
all_data_encoded = pd.concat([all_data, encoded_df], axis=1)

# Drop original categorical columns
all_data_encoded.drop(columns=categorical_features, inplace=True)

all_data_encoded.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Select numerical features
numerical_features = ['Age']

# Initialize the scaler
scaler = StandardScaler()

# Normalize numerical columns
all_data_encoded[numerical_features] = scaler.fit_transform(all_data_encoded[numerical_features])

# Display normalized dataset
all_data_encoded.head()

In [None]:
# Convert labels to numerical format
all_data_encoded['label'] = all_data_encoded['Benign/Malignant'].map({'Benign': 0, 'Malignant': 1})

# Drop the original label column
all_data_encoded.drop(columns=['Benign/Malignant'], inplace=True)

# Display the dataset with labels
all_data_encoded.head()

In [None]:
all_data_encoded.to_csv(r"E:\Capstone Skin Cancer Project\Datasets\all_data_encoded.csv", index=False)

In [None]:
image_dirs = [
		r"E:\Capstone Skin Cancer Project\Datasets\BCN\Image",
		r"E:\Capstone Skin Cancer Project\Datasets\Braff\Image",
		r"E:\Capstone Skin Cancer Project\Datasets\BuenosAires\Image",
		r"E:\Capstone Skin Cancer Project\Datasets\HAM\Image",
		r"E:\Capstone Skin Cancer Project\Datasets\Melo\Image"
]
# Store all available images in a set
image_files = set()
for directory in image_dirs:
	for file in os.listdir(directory):
		if file.endswith(".jpg"):
			image_files.add(file)

# Check for missing images
missing_images = all_data_encoded[~all_data_encoded['isic_id'].apply(lambda x: f"{x}.jpg").isin(image_files)]

if not missing_images.empty:
	print("🚨 Missing images found! Listing them below:")
	print(missing_images['isic_id'].tolist())


In [None]:
# Store full image paths in the metadata
def find_image_path(isic_id):
	filename = f"{isic_id}.jpg"
	for directory in image_dirs:
		full_path = os.path.join(directory, filename)
		if os.path.exists(full_path):
			return full_path
	return None

In [None]:
# Add image path column
all_data_encoded['image_path'] = all_data_encoded['isic_id'].apply(find_image_path)

In [None]:
# Drop missing images from the dataset
all_data_encoded = all_data_encoded.dropna(subset=['image_path'])

In [None]:
# Save updated dataset with image paths
all_data_encoded.to_csv(r"E:\Capstone Skin Cancer Project\Datasets\all_data_with_paths.csv", index=False)
print("✅ Image verification complete! Updated dataset saved.")

In [21]:
all_data_encoded = pd.read_csv(r"E:\Capstone Skin Cancer Project\Datasets\all_data_with_paths.csv")

In [22]:
import cv2
import numpy as np

# Store resolutions
resolutions = []

for image_path in all_data_encoded['image_path']:
	img = cv2.imread(image_path)
	if img is not None:
		height, width, _ = img.shape
		resolutions.append((width, height))

# Convert to NumPy array
resolutions = np.array(resolutions)

# Find min/max resolution
min_width, min_height = resolutions.min(axis=0)
max_width, max_height = resolutions.max(axis=0)

print(f"📏 Min Resolution: {min_width}x{min_height}")
print(f"📏 Max Resolution: {max_width}x{max_height}")

📏 Min Resolution: 41x41
📏 Max Resolution: 7360x5184


In [38]:
from sklearn.model_selection import train_test_split

# Split dataset
train_data, test_data = train_test_split(all_data_encoded, test_size=0.2, random_state=42, shuffle=True)

print(f"✅ Train samples: {len(train_data)}")
print(f"✅ Test samples: {len(test_data)}")


✅ Train samples: 332372
✅ Test samples: 83093


Lets start building the model

In [18]:
import tensorflow as tf

# Create the model using MobileNetV2 directly
base_model = tf.keras.applications.MobileNetV3Large(
		input_shape=(224, 224, 3),
		include_top=False,
		weights='imagenet'
)

model = tf.keras.Sequential([
		base_model,
		tf.keras.layers.GlobalAveragePooling2D(),
		tf.keras.layers.Dense(2, activation='softmax')
])

# Print model summary
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v3/weights_mobilenet_v3_large_224_1.0_float_no_top_v2.h5
[1m12683000/12683000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Lets test some methods to apply a masking method for the segmentation model

In [30]:
import cv2
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt


def create_mask_otsu_test(image):
    """
    Create an enhanced binary mask using an improved preprocessing pipeline:
    1. Convert to grayscale.
    2. Enhance contrast using CLAHE.
    3. Denoise with a bilateral filter.
    4. Sharpen using an unsharp mask filter.
    5. Optionally smooth with a Gaussian blur.
    6. Apply Otsu's thresholding.
    7. Clean up with morphological operations.
    """
    # Convert image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Enhance local contrast using CLAHE
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # Use a bilateral filter to reduce noise while preserving edges
    denoised = cv2.bilateralFilter(enhanced, d=9, sigmaColor=75, sigmaSpace=75)

    # Sharpen the image using an unsharp masking kernel
    sharpening_kernel = np.array([[-1, -1, -1],
                                  [-1,  9, -1],
                                  [-1, -1, -1]])
    sharpened = cv2.filter2D(denoised, -1, sharpening_kernel)

    # Optional: Apply Gaussian Blur to reduce any high-frequency artifacts
    blurred = cv2.GaussianBlur(sharpened, (5, 5), 0)

    # Apply Otsu's thresholding to create the binary mask
    _, mask = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Use morphological opening to remove small noise artifacts from the mask
    kernel_morph = np.ones((3, 3), np.uint8)
    mask_clean = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel_morph, iterations=1)

    return mask_clean

In [24]:
def create_mask_watershed(image):
	"""Create mask using Watershed segmentation"""
	# Convert to grayscale
	gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
	# Apply Gaussian blur
	blurred = cv2.GaussianBlur(gray, (5, 5), 0)
	# Otsu's thresholding for markers
	_, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

	# Noise removal using morphological operations
	kernel = np.ones((3, 3), np.uint8)
	opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

	# Sure background area
	sure_bg = cv2.dilate(opening, kernel, iterations=3)

	# Finding sure foreground area
	dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
	_, sure_fg = cv2.threshold(dist_transform, 0.7 * dist_transform.max(), 255, 0)
	sure_fg = np.uint8(sure_fg)

	# Finding unknown region
	unknown = cv2.subtract(sure_bg, sure_fg)

	# Marker labelling
	_, markers = cv2.connectedComponents(sure_fg)
	markers = markers + 1
	markers[unknown == 255] = 0

	# Apply watershed
	markers = cv2.watershed(image, markers)
	mask = np.zeros_like(gray)
	mask[markers > 1] = 255
	return mask

In [25]:
def create_mask_adaptive(image):
	"""Create mask using adaptive thresholding"""
	gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
	blurred = cv2.GaussianBlur(gray, (5, 5), 0)
	mask = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
	                             cv2.THRESH_BINARY_INV, 11, 2)
	return mask

In [26]:
def process_and_save_results(image_path, output_dir):
	"""Process an image with different segmentation methods and save results"""
	# Read and preprocess image
	image = cv2.imread(str(image_path))
	image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

	# Create output directory if it doesn't exist
	output_dir = Path(output_dir)
	output_dir.mkdir(parents=True, exist_ok=True)

	# Get base filename
	base_name = image_path.stem

	# Apply different segmentation methods
	masks = {
			'otsu':      create_mask_otsu_test(image),
			'watershed': create_mask_watershed(image),
			'adaptive':  create_mask_adaptive(image)
	}

	# Create figure to display results
	plt.figure(figsize=(15, 5))

	# Plot original image
	plt.subplot(141)
	plt.imshow(image)
	plt.title('Original')
	plt.axis('off')

	# Plot masks
	for i, (method, mask) in enumerate(masks.items(), 2):
		plt.subplot(1, 4, i)
		plt.imshow(mask, cmap='gray')
		plt.title(method.capitalize())
		plt.axis('off')

		# Save individual mask
		cv2.imwrite(str(output_dir / f"{base_name}_{method}_mask.png"), mask)

	# Save comparison figure
	plt.savefig(str(output_dir / f"{base_name}_comparison.png"))
	plt.close()

In [None]:
def maskTest():
	# Define paths
	validation_dir = Path(r"E:\Capstone Skin Cancer Project\Datasets\All Images\Mask Validation")
	output_dir = validation_dir / "segmentation_results"

	# Process each image in the validation directory
	for image_path in validation_dir.glob("*.jpg"):
		try:
			process_and_save_results(image_path, output_dir)
			print(f"Processed {image_path.name}")
		except Exception as e:
			print(f"Error processing {image_path.name}: {str(e)}")
	print("Segmentation results saved.")

In [31]:
maskTest()

Processed ISIC_0000466.jpg
Processed ISIC_0000469.jpg
Processed ISIC_0000482.jpg
Processed ISIC_0000484.jpg
Processed ISIC_0000487.jpg
Processed ISIC_0000502.jpg
Processed ISIC_0000511.jpg
Processed ISIC_0000513.jpg
Processed ISIC_0000516.jpg
Processed ISIC_0000517.jpg
Processed ISIC_0000518.jpg
Processed ISIC_0000519.jpg
Processed ISIC_0000520.jpg
Processed ISIC_0000521.jpg
Processed ISIC_0000522.jpg
Processed ISIC_0000526.jpg
Processed ISIC_0000531.jpg
Processed ISIC_0000533.jpg
Processed ISIC_0000547.jpg
Processed ISIC_0000548.jpg
Processed ISIC_0000549.jpg
Processed ISIC_0000550.jpg
Processed ISIC_0000551.jpg
Processed ISIC_0000552.jpg
Processed ISIC_0053528.jpg
Processed ISIC_0053530.jpg
Processed ISIC_0053531.jpg
Processed ISIC_0053549.jpg
Processed ISIC_0053599.jpg
Processed ISIC_0053675.jpg
Processed ISIC_0053758.jpg
Processed ISIC_0053759.jpg
Processed ISIC_0053760.jpg
Processed ISIC_0053761.jpg
Processed ISIC_0053762.jpg
Processed ISIC_0053763.jpg
Processed ISIC_0053764.jpg
P

In [42]:
import tensorflow as tf
import cv2
import numpy as np
from pathlib import Path

def create_mask_otsu(image):
    """
    Create an enhanced binary mask using an improved preprocessing pipeline:
    1. Convert to grayscale.
    2. Enhance contrast using CLAHE.
    3. Denoise with a bilateral filter.
    4. Sharpen using an unsharp mask filter.
    5. Optionally smooth with a Gaussian blur.
    6. Apply Otsu's thresholding.
    7. Clean up with morphological operations.
    """
    # Convert image to grayscale
     # Convert image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Enhance local contrast using CLAHE
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # Use a bilateral filter to reduce noise while preserving edges
    denoised = cv2.bilateralFilter(enhanced, d=9, sigmaColor=75, sigmaSpace=75)

    # Sharpen the image using an unsharp masking kernel
    sharpening_kernel = np.array([[-1, -1, -1],
                                 [-1,  9, -1],
                                 [-1, -1, -1]])
    sharpened = cv2.filter2D(denoised, -1, sharpening_kernel)

    # Optional: Apply Gaussian Blur to reduce any high-frequency artifacts
    blurred = cv2.GaussianBlur(sharpened, (5, 5), 0)

    # Apply Otsu's thresholding to create the binary mask
    _, mask = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Use morphological opening to remove small noise artifacts from the mask
    kernel_morph = np.ones((3, 3), np.uint8)
    mask_clean = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel_morph, iterations=1)

    return mask_clean

In [43]:
def preprocess_image_and_mask(image_path):
    """Preprocess an image and create its segmentation mask."""
    # Read and preprocess image
    image = cv2.imread(str(image_path))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Create mask
    try:
        mask = create_mask_otsu(image)
        mask = (mask > 0).astype(np.uint8)  # Convert to binary 0/1
    except Exception as e:
        print(f"Warning: Mask creation failed for {image_path}. Using fallback mask.")
        mask = np.ones(image.shape[:2], dtype=np.uint8)  # Fallback: use entire image

    # Resize both image and mask to 224x224
    image = cv2.resize(image, (224, 224))
    mask = cv2.resize(mask, (224, 224), interpolation=cv2.INTER_NEAREST)

    # Normalize image to [0,1]
    image = image.astype(np.float32) / 255.0

    return image, mask

In [44]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_list_feature(value):
    """Returns a float_list from a numpy array."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value.flatten()))

def serialize_example(image, mask, metadata, label):
    """
    Creates a tf.Example message ready to be written to a file.
    """
    # Convert image to bytes
    image_bytes = tf.io.encode_jpeg(tf.cast(image * 255, tf.uint8)).numpy()

    # Convert mask to bytes - ensure mask is 3D
    mask_3d = np.expand_dims(mask, axis=-1)  # Add channel dimension
    mask_bytes = tf.io.encode_jpeg(tf.cast(mask_3d * 255, tf.uint8)).numpy()

    feature = {
        'image': _bytes_feature(image_bytes),
        'mask': _bytes_feature(mask_bytes),
        'metadata': tf.train.Feature(float_list=tf.train.FloatList(value=metadata)),
        'label': tf.train.Feature(float_list=tf.train.FloatList(value=[label]))
    }

    return tf.train.Example(features=tf.train.Features(feature=feature))

In [45]:
def write_tfrecord(data, filename):
    """Write dataset to TFRecord including segmentation masks."""
    with tf.io.TFRecordWriter(filename) as writer:
        for idx, row in data.iterrows():
            try:
                # Process image and create mask
                image, mask = preprocess_image_and_mask(row['image_path'])

                # Get metadata and label
                metadata_cols = [col for col in data.columns if col not in ['isic_id', 'image_path', 'label']]
                metadata = row[metadata_cols].values.astype(np.float32)
                label = row['label']

                # Create and write TF Example
                tf_example = serialize_example(image, mask, metadata, label)
                writer.write(tf_example.SerializeToString())

                if idx % 100 == 0:
                    print(f"Processed {idx} images")

            except Exception as e:
                print(f"Error processing image {row['image_path']}: {str(e)}")
                continue

In [46]:
def parse_tfrecord(example_proto):
    """Parse TFRecord dataset."""
    feature_description = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'mask': tf.io.FixedLenFeature([], tf.string),
        'metadata': tf.io.VarLenFeature(tf.float32),
        'label': tf.io.FixedLenFeature([1], tf.float32)
    }

    features = tf.io.parse_single_example(example_proto, feature_description)

    # Decode image and mask
    image = tf.io.decode_jpeg(features['image'], channels=3)
    image = tf.cast(image, tf.float32) / 255.0

    mask = tf.io.decode_jpeg(features['mask'], channels=1)
    mask = tf.squeeze(mask)  # Remove the channel dimension when reading
    mask = tf.cast(mask, tf.float32) / 255.0

    # Handle metadata
    metadata = tf.sparse.to_dense(features['metadata'])

    return (image, mask, metadata), features['label']

In [None]:
write_tfrecord(train_data, r"E:\Capstone Skin Cancer Project\Datasets\train.tfrecord")
write_tfrecord(test_data, r"E:\Capstone Skin Cancer Project\Datasets\test.tfrecord")

print("✅ TFRecord creation complete!")

Processed 180600 images
Processed 233500 images
Processed 171000 images
Processed 105500 images
Processed 96400 images
Processed 26600 images
Processed 39600 images
Processed 189600 images
Processed 131200 images
Processed 181600 images
Processed 226300 images
Processed 74200 images
Processed 55300 images
Processed 90300 images
Processed 98500 images
Processed 392400 images
Processed 15700 images
Processed 233400 images
Processed 111100 images
Processed 339100 images
Processed 161000 images
Processed 221700 images
Processed 15900 images
Processed 399600 images
Processed 345400 images
Processed 273600 images
Processed 118100 images
Processed 340500 images
Processed 14900 images
Processed 290100 images
Processed 104200 images
Processed 233300 images
Processed 70100 images
Processed 368500 images
Processed 163200 images
Processed 385500 images
Processed 265100 images
Processed 213600 images
Processed 228900 images
Processed 287700 images
Processed 83700 images
Processed 339500 images
Proc

In [None]:
# Example of how to read the data back
dataset = tf.data.TFRecordDataset(r"E:\Capstone Skin Cancer Project\Datasets\train.tfrecord")
parsed_dataset = dataset.map(parse_tfrecord)