In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
from pathlib import Path
import tensorflow as tf

In [3]:
# Load the metadata table. Later cells read its `Gender_female`, `Gender_male`,
# `label` and `image_path` columns.
all_data_encoded = pd.read_csv(
	filepath_or_buffer=r"E:\Capstone Skin Cancer Project\Datasets\all_data_with_paths.csv"
)


In [4]:
def _sample_group(frame, gender_col, label_value, n=5500, seed=58):
	"""Return `n` randomly sampled rows where `gender_col` == 1 and `label` == `label_value`."""
	selector = (frame[gender_col] == 1) & (frame['label'] == label_value)
	return frame[selector].sample(n, random_state=seed)


# Draw the same number of rows from every (gender, label) combination so the
# resulting dataset is balanced on both attributes.
female_label_0 = _sample_group(all_data_encoded, 'Gender_female', 0)
male_label_0 = _sample_group(all_data_encoded, 'Gender_male', 0)
female_label_1 = _sample_group(all_data_encoded, 'Gender_female', 1)
male_label_1 = _sample_group(all_data_encoded, 'Gender_male', 1)

# Stack the four groups, shuffle the rows, and renumber the index from 0.
balanced_groups = [female_label_0, male_label_0, female_label_1, male_label_1]
filtered_data = pd.concat(balanced_groups)
filtered_data = filtered_data.sample(frac=1, random_state=42)
filtered_data = filtered_data.reset_index(drop=True)

# NOTE: this rebinds `all_data_encoded` to the balanced subset, so re-running
# this cell without re-running the CSV load samples from already-filtered data.
all_data_encoded = filtered_data

In [5]:
def create_mask_otsu(image):
	"""
	Build a binary mask from an RGB image with an Otsu-threshold pipeline.

	Steps, in order:
	1. RGB -> grayscale.
	2. Local contrast enhancement with CLAHE (clip limit 2.0, 8x8 tiles).
	3. Edge-preserving denoising with a bilateral filter.
	4. Sharpening with a 3x3 convolution kernel (centre 9, neighbours -1).
	5. 5x5 Gaussian blur to soften high-frequency artifacts from sharpening.
	6. Otsu thresholding to 0 / 255.
	7. One pass of 3x3 morphological opening to drop small specks.

	Args:
		image: RGB image array (channel order matters for the grayscale
			conversion). Assumes an 8-bit image -- TODO confirm for other dtypes.

	Returns:
		Single-channel mask whose pixels are 0 or 255.

	NOTE(review): THRESH_BINARY marks pixels *brighter* than the Otsu threshold
	as foreground. If the region of interest is darker than its surroundings,
	THRESH_BINARY_INV may be what is intended -- confirm against sample masks.
	"""
	grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

	# CLAHE boosts local contrast without blowing out already-bright regions.
	contrast_boosted = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(grayscale)

	# Bilateral filtering smooths noise while keeping edges sharp.
	edge_preserved = cv2.bilateralFilter(contrast_boosted, d=9, sigmaColor=75, sigmaSpace=75)

	# Sharpening kernel: weights sum to 1, so overall brightness is preserved.
	sharpen_weights = np.array([
		[-1, -1, -1],
		[-1, 9, -1],
		[-1, -1, -1],
	])
	crisp = cv2.filter2D(edge_preserved, -1, sharpen_weights)

	# Light blur to tame artifacts introduced by the sharpening step.
	smoothed = cv2.GaussianBlur(crisp, (5, 5), 0)

	# With THRESH_OTSU the explicit threshold (0) is ignored and chosen automatically.
	otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
	binary_mask = cv2.threshold(smoothed, 0, 255, otsu_flags)[1]

	# Opening (erode then dilate) removes isolated foreground specks.
	opening_element = np.ones((3, 3), np.uint8)
	return cv2.morphologyEx(binary_mask, cv2.MORPH_OPEN, opening_element, iterations=1)

In [6]:
def preprocess_image_and_mask(image_path):
	"""
	Load an image from disk and produce a resized, normalized image plus mask.

	Args:
		image_path: Path (str or path-like) to the image file.

	Returns:
		Tuple `(image, mask)`:
		- image: float32 RGB array of shape (224, 224, 3), scaled to [0, 1].
		- mask: uint8 array of shape (224, 224) with values 0 or 1.

	Raises:
		OSError: if the file cannot be read or decoded by OpenCV.
	"""
	# cv2.imread does not raise on failure; it returns None. Check explicitly so
	# the caller gets a clear message instead of an opaque cvtColor assertion.
	image = cv2.imread(str(image_path))
	if image is None:
		raise OSError(f"Could not read image (missing or undecodable file): {image_path}")
	image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

	# Create mask at the original resolution
	try:
		mask = create_mask_otsu(image)
		mask = (mask > 0).astype(np.uint8)  # Convert 0/255 to binary 0/1
	except Exception as e:
		# Best-effort: keep the sample but report why segmentation failed.
		print(f"Warning: Mask creation failed for {image_path}. Using fallback mask. Reason: {e}")
		mask = np.ones(image.shape[:2], dtype=np.uint8)  # Fallback: use entire image

	# Resize both image and mask to 224x224; nearest-neighbour keeps the mask
	# strictly binary (no interpolated in-between values).
	image = cv2.resize(image, (224, 224))
	mask = cv2.resize(mask, (224, 224), interpolation=cv2.INTER_NEAREST)

	# Normalize image to [0,1]
	image = image.astype(np.float32) / 255.0

	return image, mask

In [7]:
def _bytes_feature(value):
	"""Wrap one string / bytes value in a `tf.train.Feature` holding a single-item BytesList."""
	payload = tf.train.BytesList(value=[value])
	return tf.train.Feature(bytes_list=payload)


def _float_list_feature(value):
	"""Wrap an array in a `tf.train.Feature` holding its flattened values as a FloatList.

	`value` must provide a `.flatten()` method (e.g. a numpy array).
	"""
	flat_values = value.flatten()
	float_list = tf.train.FloatList(value=flat_values)
	return tf.train.Feature(float_list=float_list)

In [8]:
def serialize_example(image, mask, metadata, label, mask_format="jpeg"):
	"""
	Creates a tf.Example message ready to be written to a file.

	Args:
		image: Image array; it is multiplied by 255 and cast to uint8 before
			JPEG encoding. Assumes a float HxWx3 array scaled to [0, 1] -- TODO
			confirm against callers.
		mask: 2-D mask array; a trailing channel axis is added and the values
			are multiplied by 255 before encoding. Assumes values are 0/1 --
			TODO confirm against callers.
		metadata: Iterable of numeric values stored as a float list.
		label: Single numeric label stored as a one-element float list.
		mask_format: "jpeg" (default, keeps the existing on-disk format) or
			"png". JPEG is lossy, so a decoded mask may contain values other
			than 0/255 near edges; "png" stores the mask losslessly. Readers
			must decode with a matching or format-agnostic decoder.

	Returns:
		tf.train.Example with features 'image', 'mask', 'metadata', 'label'.

	Raises:
		ValueError: if `mask_format` is not "jpeg" or "png".
	"""
	if mask_format not in ("jpeg", "png"):
		raise ValueError(f"mask_format must be 'jpeg' or 'png', got {mask_format!r}")

	# Convert image to bytes
	image_bytes = tf.io.encode_jpeg(tf.cast(image * 255, tf.uint8)).numpy()

	# Convert mask to bytes - the encoders need a 3D (H, W, C) tensor
	mask_3d = np.expand_dims(mask, axis=-1)  # Add channel dimension
	mask_uint8 = tf.cast(mask_3d * 255, tf.uint8)
	encode_mask = tf.io.encode_png if mask_format == "png" else tf.io.encode_jpeg
	mask_bytes = encode_mask(mask_uint8).numpy()

	feature = {
			'image':    _bytes_feature(image_bytes),
			'mask':     _bytes_feature(mask_bytes),
			'metadata': tf.train.Feature(float_list=tf.train.FloatList(value=metadata)),
			'label':    tf.train.Feature(float_list=tf.train.FloatList(value=[label]))
	}

	return tf.train.Example(features=tf.train.Features(feature=feature))

In [9]:
def write_tfrecord(data, filename):
	"""
	Write dataset to TFRecord including segmentation masks.

	Each row of `data` is turned into one tf.train.Example containing the
	preprocessed image, its mask, the row's metadata values and its label.
	Rows that fail at any step are reported via `print` and skipped, so the
	output file may contain fewer records than `data` has rows.

	Args:
		data: DataFrame with at least 'image_path' and 'label' columns. Every
			column other than 'isic_id', 'image_path' and 'label' is treated as
			metadata and must be convertible to float32.
		filename: Destination path of the TFRecord file.
	"""
	# The metadata column set is the same for every row, so compute it once
	# instead of on every loop iteration.
	non_metadata = ('isic_id', 'image_path', 'label')
	metadata_cols = [col for col in data.columns if col not in non_metadata]

	with tf.io.TFRecordWriter(filename) as writer:
		for _, row in data.iterrows():
			try:
				# Process image and create mask
				image, mask = preprocess_image_and_mask(row['image_path'])

				# Get metadata and label
				metadata = row[metadata_cols].values.astype(np.float32)
				label = row['label']

				# Create and write TF Example
				tf_example = serialize_example(image, mask, metadata, label)
				writer.write(tf_example.SerializeToString())
			except Exception as e:
				# Best-effort: report the failing sample and keep going.
				print(f"Error processing image {row['image_path']}: {str(e)}")
				continue

In [10]:
from sklearn.model_selection import train_test_split

# Two-stage split:
#   1) hold out 20% of all rows as the test set;
#   2) carve 30% of the remaining rows off as the validation set.
# Net proportions are 56% train / 24% validation / 20% test.
train_val_pool, test_data = train_test_split(
	all_data_encoded,
	test_size=0.2,
	shuffle=True,
	random_state=42,
)
train_data, val_data = train_test_split(
	train_val_pool,
	test_size=0.3,
	shuffle=True,
	random_state=24,
)

print(f"✅ Train samples: {len(train_data)}")
print(f"✅ Test samples: {len(test_data)}")
print(f"✅ Validation samples: {len(val_data)}")

✅ Train samples: 12320
✅ Test samples: 4400
✅ Validation samples: 5280


In [11]:
# Write one TFRecord file per split, in train -> validation -> test order.
for split_frame, record_path in (
	(train_data, r"E:\Capstone Skin Cancer Project\Datasets\train.tfrecord"),
	(val_data, r"E:\Capstone Skin Cancer Project\Datasets\validation.tfrecord"),
	(test_data, r"E:\Capstone Skin Cancer Project\Datasets\test.tfrecord"),
):
	write_tfrecord(split_frame, record_path)

print("✅ TFRecord creation complete!")

✅ TFRecord creation complete!
