In [None]:
!pip install opendatasets mahotas scikit-image

In [None]:
import opendatasets as od
import os
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc, confusion_matrix
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import mahotas  # for Haralick Texture
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive at /content/drive (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

# Set the path to save model weights in Google Drive, creating the folder if needed
checkpoint_dir = '/content/drive/MyDrive/nature/'
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
# Paths (relative to the working directory) where the dataset files are expected
base_dir = './skin-cancer-mnist-ham10000/'
image_dir_1 = './skin-cancer-mnist-ham10000/HAM10000_images_part_1'
image_dir_2 = './skin-cancer-mnist-ham10000/HAM10000_images_part_2'

# Fetch the HAM10000 dataset from Kaggle
dataset_url = "https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000"
od.download(dataset_url)

In [None]:
# Load and preprocess grayscale images
def load_and_preprocess_grayscale_image(image_id, img_size=(224, 224)):
    """Load one dataset image by id and return it as a normalized grayscale array.

    The file ``<image_id>.jpg`` is looked up in ``image_dir_1`` first and then
    in ``image_dir_2``.

    Args:
        image_id: Image identifier, i.e. the file name without ``.jpg``.
        img_size: Target size passed to ``cv2.resize``.

    Returns:
        Float array with a trailing channel axis of length 1 and values
        scaled from the 0-255 range down to [0, 1].

    Raises:
        FileNotFoundError: If the file exists in neither image folder.
        OSError: If the file exists but OpenCV cannot decode it.
    """
    file_name = f'{image_id}.jpg'
    for folder in (image_dir_1, image_dir_2):
        img_path = os.path.join(folder, file_name)
        if os.path.exists(img_path):
            break
    else:
        # Loop finished without a break: neither folder holds the file
        raise FileNotFoundError(f"Image {image_id}.jpg not found in either folder.")

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None rather than
        # raising, which would otherwise surface as an opaque cvtColor error.
        raise OSError(f"Image file {img_path} exists but could not be decoded.")

    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img_gray = cv2.resize(img_gray, img_size)
    img_gray = img_gray / 255.0  # Normalize to [0, 1]
    return np.expand_dims(img_gray, axis=-1)  # Append the channel axis

In [None]:
# Compute Hu Moments (Shape Feature)
def compute_hu_moments(img_gray):
    """Return the Hu moment invariants of an image on a signed log10 scale.

    Each invariant ``h`` is mapped to ``-sign(h) * log10(|h| + 1e-12)``; the
    small offset keeps the logarithm finite when ``h`` is exactly zero.

    Args:
        img_gray: 2D single-channel image passed straight to ``cv2.moments``.

    Returns:
        Flattened 1-D array of the log-scaled Hu moments.
    """
    raw_hu = cv2.HuMoments(cv2.moments(img_gray))
    log_magnitude = np.log10(np.abs(raw_hu) + 1e-12)
    signed_log = -np.sign(raw_hu) * log_magnitude
    return signed_log.flatten()

In [None]:
# Compute Haralick Texture (Texture Feature)
def compute_haralick_texture(img_gray):
    """Return Haralick texture features of a [0, 1]-scaled grayscale image.

    The image is rescaled to 8-bit gray levels before the features are computed.
    Values are rounded to the nearest level and clipped to [0, 255]: a plain
    ``astype(np.uint8)`` truncates (so a pixel that is a hair below level k
    becomes k-1) and wraps around for values outside the range.

    Args:
        img_gray: 2D array of gray values, expected in [0, 1].

    Returns:
        1-D feature vector: the mean over axis 0 of what
        ``mahotas.features.haralick`` returns for the quantized image.
    """
    scaled = np.asarray(img_gray, dtype=np.float64) * 255.0
    img_quantized = np.clip(np.rint(scaled), 0, 255).astype(np.uint8)
    return mahotas.features.haralick(img_quantized).mean(axis=0)

In [None]:
# Read the metadata CSV that sits in the dataset's base folder
metadata_path = os.path.join(base_dir, 'HAM10000_metadata.csv')
metadata = pd.read_csv(metadata_path)

In [None]:
# Assign each diagnosis code in 'dx' an integer id, in order of first appearance,
# and store that id per row in a new 'label' column
dx_classes = metadata['dx'].unique()
label_map = dict(zip(dx_classes, range(len(dx_classes))))
metadata['label'] = metadata['dx'].map(label_map)

In [None]:
# Draw 100 images from each class, using the same fixed seed for every class.
# The groups are sampled in an explicit loop rather than via groupby().apply():
# applying over the grouping column is deprecated in recent pandas, and once the
# grouping column is excluded the 'label' column would be missing from the result.
sampled_metadata = pd.concat(
    [class_rows.sample(n=100, random_state=42) for _, class_rows in metadata.groupby('label')],
    ignore_index=True,
)

In [None]:
# Offline augmentation settings: random rotations, shifts, zooms and horizontal
# flips, with fill_mode='nearest' for pixels exposed by a transform
augmentation_settings = dict(
    rotation_range=90,
    width_shift_range=0.15,
    height_shift_range=0.2,
    zoom_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest',
)
augmentation_datagen = ImageDataGenerator(**augmentation_settings)

In [None]:
# Accumulators for the augmented images and their class labels
augmented_images, augmented_labels = [], []

In [None]:
# Build the augmented dataset: every sampled image contributes a fixed number of
# randomly transformed copies (the untransformed original itself is not added).
AUGMENTATIONS_PER_IMAGE = 80

# (Re)create the accumulators here so re-running this cell starts from scratch
# instead of appending duplicates, or failing once the lists have been
# converted to NumPy arrays further down.
augmented_images = []
augmented_labels = []

# The generator is given no seed of its own; seeding NumPy's global RNG is
# intended to make the random transforms repeatable between runs
# (assumes ImageDataGenerator draws from np.random - TODO confirm).
np.random.seed(42)

for _, row in sampled_metadata.iterrows():
    image_id = row['image_id']
    label = row['label']
    img_gray = load_and_preprocess_grayscale_image(image_id, img_size=(96, 96))
    img_batch = np.expand_dims(img_gray, axis=0)  # Add batch dimension

    # flow() keeps yielding batches indefinitely, so pull exactly as many as needed
    augmented_stream = augmentation_datagen.flow(img_batch, batch_size=1)
    for _copy_idx in range(AUGMENTATIONS_PER_IMAGE):
        augmented_images.append(next(augmented_stream)[0])
        augmented_labels.append(label)

In [None]:
# Stack the collected images and labels into NumPy arrays
augmented_images, augmented_labels = np.array(augmented_images), np.array(augmented_labels)

In [None]:
# Convert integer class ids to one-hot vectors.
# The ndim guard keeps this cell safe to re-run: without it a second execution
# would one-hot-encode the already encoded 2D label matrix.
if np.ndim(augmented_labels) == 1:
    augmented_labels = tf.keras.utils.to_categorical(augmented_labels, num_classes=len(label_map))

In [None]:
# Hold out 20% of the augmented samples (stratified by class); the remaining 80% is the training set.
# NOTE(review): the split is made per augmented sample, so augmented copies of
# one source image can land on both sides of the split. Confirm this is
# intended, since it can make the held-out metrics optimistic.
holdout_split = train_test_split(
    augmented_images,
    augmented_labels,
    stratify=augmented_labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Divide the held-out samples evenly (stratified by class) into a validation set and a final test set
val_test_split = train_test_split(
    X_test,
    y_test,
    stratify=y_test,
    test_size=0.5,
    random_state=42,
)
X_val, X_final_test, y_val, y_final_test = val_test_split

In [None]:
# Model inputs: the 96x96 single-channel image plus the two hand-crafted feature vectors
input_gray = tf.keras.layers.Input(
    name='gray_input',
    shape=(96, 96, 1),
)
input_hu = tf.keras.layers.Input(
    name='hu_moments_input',
    shape=(7,),
)
input_haralick = tf.keras.layers.Input(
    name='haralick_texture_input',
    shape=(13,),
)

In [None]:
# Grayscale image branch: three convolutional stages (32, 64, 128 filters), each
# ending in max-pooling and dropout, then flattened for fusion with the other inputs.
# The layers are listed in application order and threaded through in a loop.
gray_branch_layers = [
    tf.keras.layers.BatchNormalization(),
    # Stage 1
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((3, 3)),
    tf.keras.layers.Dropout(0.25),
    # Stage 2
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Dropout(0.25),
    # Stage 3
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
]

x = input_gray
for layer in gray_branch_layers:
    x = layer(x)

In [None]:
# Hu-moment and Haralick branches: each feature vector gets its own
# Dense(32, relu) projection followed by batch normalization
def _dense_bn_branch(feature_input, units=32):
    """Project a feature-vector input through Dense(relu) and BatchNormalization."""
    projected = tf.keras.layers.Dense(units, activation='relu')(feature_input)
    return tf.keras.layers.BatchNormalization()(projected)


hu_features = _dense_bn_branch(input_hu)
haralick_features = _dense_bn_branch(input_haralick)

In [None]:
# Fuse the image branch with both feature branches, then classify:
# Dense(1024, relu) -> BatchNormalization -> Dropout(0.5) -> softmax with one unit per class
combined_features = tf.keras.layers.concatenate([x, hu_features, haralick_features])

head_layers = (
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
)
x = combined_features
for head_layer in head_layers:
    x = head_layer(x)

output = tf.keras.layers.Dense(len(label_map), activation='softmax')(x)

In [None]:
# Assemble the three-input, single-output model; the input order here is the
# order in which arrays must be passed to fit() and predict()
model = tf.keras.Model(
    inputs=[input_gray, input_hu, input_haralick],
    outputs=output,
)

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy on the one-hot
# labels, and accuracy as the reported metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
def extract_hu_haralick_features(X):
    """Compute Hu-moment and Haralick feature vectors for every image in ``X``.

    Args:
        X: Iterable of image arrays; each is squeezed (singleton axes removed)
            before the features are computed.

    Returns:
        Tuple ``(hu, haralick)`` of arrays holding one feature row per image,
        in the same order as ``X``.
    """
    per_image = [
        (compute_hu_moments(plane), compute_haralick_texture(plane))
        for plane in (sample.squeeze() for sample in X)
    ]
    hu_rows = [pair[0] for pair in per_image]
    haralick_rows = [pair[1] for pair in per_image]
    return np.array(hu_rows), np.array(haralick_rows)

# Extract the hand-crafted features for the train, validation and final test images
(
    (X_train_hu, X_train_haralick),
    (X_val_hu, X_val_haralick),
    (X_final_test_hu, X_final_test_haralick),
) = [extract_hu_haralick_features(split) for split in (X_train, X_val, X_final_test)]


In [None]:
# Train for 30 epochs, monitoring the validation set after each epoch
train_inputs = [X_train, X_train_hu, X_train_haralick]
val_inputs = [X_val, X_val_hu, X_val_haralick]
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    batch_size=32,
    epochs=30,
)

# Predict on the final test set and reduce probabilities / one-hot labels to class ids
test_inputs = [X_final_test, X_final_test_hu, X_final_test_haralick]
y_pred_prob = model.predict(test_inputs)
y_pred = y_pred_prob.argmax(axis=1)
y_true = y_final_test.argmax(axis=1)


In [None]:
# Evaluate the model on the final test set.
# The Hu-moment and Haralick inputs must be the features extracted from the test
# images: feeding all-zero arrays here would score the model on inputs it was
# never trained on and ignore the hand-crafted features entirely.
y_pred_prob = model.predict([X_final_test, X_final_test_hu, X_final_test_haralick])
y_pred = np.argmax(y_pred_prob, axis=1)  # Predicted class id per sample
y_true = np.argmax(y_final_test, axis=1)  # True class id recovered from the one-hot labels

In [None]:
# Compute precision, recall and f1-score per class.
# Class ids and names are passed explicitly (as lists, in label_map order) so the
# report keeps one row per class even if some class is absent from the test
# fold; without `labels`, the inferred classes could mismatch `target_names`.
class_names = list(label_map.keys())
class_ids = list(label_map.values())
report = classification_report(y_true, y_pred, labels=class_ids, target_names=class_names)
print("Classification Report:\n", report)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class).
# `labels` pins the matrix to one row/column per class in label_map order, so the
# tick labels below always line up with the matrix even if a class is missing
# from the test fold.
class_names = list(label_map.keys())
conf_matrix = confusion_matrix(y_true, y_pred, labels=list(label_map.values()))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()