In [1]:
from google.colab import drive
drive.mount('/content/drive')
!unzip -q "/content/drive/MyDrive/2019isic_and_2016isic.zip" -d "/content"


Mounted at /content/drive


In [2]:
# ==============================================================================
# FINAL SCALABLE SCRIPT (GENERATOR-FIRST STRATEGY)
# DermaSense - June 14, 2025
#
# STRATEGY:
# 1. GENERATOR-FIRST DATA HANDLING: Abandons in-memory loading and SMOTE.
#    Uses Keras's `flow_from_directory` for the training set to ensure
#    extreme memory efficiency and scalability.
# 2. CLASS WEIGHT BALANCING: Re-introduces the `class_weight` method to
#    handle imbalance by adjusting the loss function, which uses no extra RAM.
# 3. PATIENT TRAINING & UPGRADED ARCHITECTURE are retained.
# ==============================================================================

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import TopKCategoricalAccuracy
import numpy as np
import os
from sklearn.utils import class_weight

# --- Configuration ---
IMG_SIZE = (300, 300)  # EfficientNetB3 prefers this size
BATCH_SIZE = 32
DATA_DIR = "/content/processed_224x224"  # Path to your data
MODEL_NAME = "best_dermasense_b3_scalable_model.keras"
TRAIN_DIR = os.path.join(DATA_DIR, "train")
VAL_DIR = os.path.join(DATA_DIR, "val")
TEST_DIR = os.path.join(DATA_DIR, "test")

print("🚀 Starting DermaSense with SCALABLE Generator-First Strategy")

# ==============================================================================
# 1. SCALABLE DATA PREPARATION (GENERATORS + CLASS WEIGHTS)
# ==============================================================================

# --- Step 1: Create Data Generators ---
# For training, we apply both preprocessing and augmentation.
train_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input,
    rotation_range=20,
    width_shift_range=0.15,
    height_shift_range=0.15,
    zoom_range=0.15,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest'
)

# For validation and test, we ONLY apply preprocessing.
val_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input
)

# Define test_datagen similar to val_datagen
test_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input
)

print("\n🧠 Setting up data generators...")
train_generator = train_datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=True,
    interpolation='lanczos'
)

val = val_datagen.flow_from_directory(
    VAL_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False,
    interpolation='lanczos'
)

test = test_datagen.flow_from_directory(
    TEST_DIR,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False,
    interpolation='lanczos'
)

# --- Step 2: Calculate Class Weights for Balancing ---
print("\n⚖️ Calculating class weights to handle imbalance...")
# Get the class labels from the training generator
class_labels = train_generator.classes

# Calculate the weights
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(class_labels),
    y=class_labels
)

# Create a dictionary mapping class indices to their weights
class_weight_dict = {i: w for i, w in enumerate(weights)}
print("Class weights calculated successfully:")
print(class_weight_dict)


# ==============================================================================
# 2. MODEL DEFINITION & LOSS FUNCTION (Restored "Champion" Head)
# ==============================================================================
class FocalLoss(tf.keras.losses.Loss):
    def __init__(self, alpha=0.25, gamma=2.0, **kwargs): super().__init__(**kwargs); self.alpha=alpha; self.gamma=gamma
    def call(self, y_true, y_pred):
        ce = tf.keras.losses.categorical_crossentropy(y_true, y_pred, from_logits=False)
        p_t = tf.exp(-ce); return tf.reduce_mean(self.alpha * tf.pow(1 - p_t, self.gamma) * ce)

base_model = EfficientNetB3(weights='imagenet', include_top=False, input_shape=(*IMG_SIZE, 3))
base_model.trainable = False

inputs = base_model.input
x = layers.GlobalAveragePooling2D()(base_model.output)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(384, activation='swish')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.25)(x)
x = layers.Dense(192, activation='swish')(x)
x = layers.Dropout(0.15)(x)
outputs = layers.Dense(train_generator.num_classes, activation='softmax')(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

# ==============================================================================
# 3. CALLBACKS & COMPILATION
# ==============================================================================
early_stop = EarlyStopping(monitor='val_top_2_accuracy', patience=15, restore_best_weights=True, verbose=1, mode='max')
reduce_lr = ReduceLROnPlateau(monitor='val_top_2_accuracy', factor=0.2, patience=5, min_lr=1e-7, verbose=1, mode='max')
checkpoint = ModelCheckpoint(MODEL_NAME, monitor='val_top_2_accuracy', save_best_only=True, verbose=1, mode='max')
callbacks = [early_stop, reduce_lr, checkpoint]

# ==============================================================================
# 4. PATIENT TRAINING STRATEGY
# ==============================================================================

# --- Phase 1: Frozen Base Model ---
print("\n🎯 Phase 1: Training classifier head...")
optimizer_phase1 = tf.keras.optimizers.AdamW(learning_rate=0.001, weight_decay=0.0001)
model.compile(optimizer=optimizer_phase1, loss=FocalLoss(), metrics=['accuracy', TopKCategoricalAccuracy(k=2, name='top_2_accuracy')])
history_phase1 = model.fit(
    train_generator,
    epochs=40,
    validation_data=val,
    callbacks=callbacks,
    class_weight=class_weight_dict, # Apply the balancing weights here
    verbose=1
)

# --- Phase 2: Full Fine-Tuning ---
print("\n🔧 Phase 2: Unfreezing all layers for full fine-tuning...")
base_model.trainable = True
optimizer_fine_tune = tf.keras.optimizers.AdamW(learning_rate=2e-5, weight_decay=0.0001)
model.compile(optimizer=optimizer_fine_tune, loss=FocalLoss(), metrics=['accuracy', TopKCategoricalAccuracy(k=2, name='top_2_accuracy')])
print("Model re-compiled for fine-tuning.")
history_fine_tune = model.fit(
    train_generator,
    epochs=80,
    initial_epoch=len(history_phase1.history['loss']),
    validation_data=val,
    callbacks=callbacks,
    class_weight=class_weight_dict, # Continue using weights in fine-tuning
    verbose=1
)

print("\n🚀 SCALABLE TRAINING COMPLETE!")

# ==============================================================================
# 5. FINAL EVALUATION
# ==============================================================================
print("\n" + "="*50 + "\n🎯 FINAL EVALUATION ON TEST SET\n" + "="*50)
print(f"Loading best model weights from '{MODEL_NAME}'...")
try:
    model.load_weights(MODEL_NAME)
    results = model.evaluate(test, verbose=1)
    print(f"\n🏆 Final Model Results:")
    print(f"  - Test Accuracy (Top-1): {results[1]:.4f} ({results[1]*100:.2f}%)")
    print(f"  - Test Top-2 Accuracy:   {results[2]:.4f} ({results[2]*100:.2f}%)")
    print(f"  - Test Loss:             {results[0]:.4f}")
except Exception as e:
    print(f"Could not load or evaluate {MODEL_NAME}. Error: {e}")

🚀 Starting DermaSense with SCALABLE Generator-First Strategy

🧠 Setting up data generators...
Found 14783 images belonging to 7 classes.
Found 2528 images belonging to 7 classes.
Found 2516 images belonging to 7 classes.

⚖️ Calculating class weights to handle imbalance...
Class weights calculated successfully:
{0: np.float64(2.2781630451533363), 1: np.float64(0.8598766868310842), 2: np.float64(0.4500015220236827), 3: np.float64(10.301742160278746), 4: np.float64(0.508391223605475), 5: np.float64(0.9390205170551992), 6: np.float64(21.33189033189033)}
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb3_notop.h5
[1m43941136/43941136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step



🎯 Phase 1: Training classifier head...


  self._warn_if_super_not_called()


Epoch 1/40
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.5476 - loss: 0.2347 - top_2_accuracy: 0.7292

  self._warn_if_super_not_called()



Epoch 1: val_top_2_accuracy improved from -inf to 0.82318, saving model to best_dermasense_b3_scalable_model.keras
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m720s[0m 1s/step - accuracy: 0.5477 - loss: 0.2346 - top_2_accuracy: 0.7293 - val_accuracy: 0.6254 - val_loss: 0.1400 - val_top_2_accuracy: 0.8232 - learning_rate: 0.0010
Epoch 2/40
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.6488 - loss: 0.1329 - top_2_accuracy: 0.8329
Epoch 2: val_top_2_accuracy did not improve from 0.82318
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m619s[0m 1s/step - accuracy: 0.6488 - loss: 0.1329 - top_2_accuracy: 0.8329 - val_accuracy: 0.6472 - val_loss: 0.1327 - val_top_2_accuracy: 0.8232 - learning_rate: 0.0010
Epoch 3/40
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.6763 - loss: 0.1144 - top_2_accuracy: 0.8532
Epoch 3: val_top_2_accuracy improved from 0.82318 to 0.83703, saving model to b

  self._warn_if_super_not_called()


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 621ms/step - accuracy: 0.8647 - loss: 0.0605 - top_2_accuracy: 0.9516

🏆 Final Model Results:
  - Test Accuracy (Top-1): 0.8565 (85.65%)
  - Test Top-2 Accuracy:   0.9690 (96.90%)
  - Test Loss:             0.0523


In [3]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, top_k_accuracy_score, roc_auc_score, confusion_matrix

# ==============================================================================
# 1. GET MODEL PREDICTIONS
# ==============================================================================
# This section assumes your 'model' and 'test' generator are already loaded.
print("🔍 Predicting on the test set...")
pred_probs = model.predict(test, verbose=1)
y_true = test.classes
class_names = list(test.class_indices.keys())

# Get the Top-1 prediction for each sample
y_pred_top1 = np.argmax(pred_probs, axis=1)


# ==============================================================================
# 2. PER-CLASS F1-SCORES (TOP-1)
# ==============================================================================
# The standard classification report provides the per-class F1-scores.
print("\n" + "="*60)
print("📊 Standard Classification Report (Per-Class F1, etc.)")
print("="*60)
# The 'f1-score' column contains the requested metric for each class.
# We will save this report to extract metrics later.
report = classification_report(y_true, y_pred_top1, target_names=class_names, digits=4, output_dict=True)
print(classification_report(y_true, y_pred_top1, target_names=class_names, digits=4))


# ==============================================================================
# 3. OVERALL ACCURACY METRICS
# ==============================================================================
print("\n" + "="*60)
print("🎯 Overall Accuracy Metrics")
print("="*60)

# Calculate Top-1 and Top-2 accuracy scores
top1_accuracy = accuracy_score(y_true, y_pred_top1)
top2_accuracy = top_k_accuracy_score(y_true, pred_probs, k=2)

print(f"Top-1 Accuracy: {top1_accuracy:.4f} ({top1_accuracy*100:.2f}%)")
print(f"Top-2 Accuracy: {top2_accuracy:.4f} ({top2_accuracy*100:.2f}%)")


# ==============================================================================
# 4. CLINICAL UTILITY METRICS FOR MELANOMA (SOTA)
# ==============================================================================
# This section calculates the key metrics for the binary task of
# "Melanoma vs. Non-Melanoma" which is critical for publication.
print("\n" + "="*60)
print("🔬 Clinical Utility Metrics for Melanoma Detection (SOTA)")
print("="*60)

# Find the class index for melanoma
melanoma_idx = test.class_indices.get('melanoma', None)

if melanoma_idx is not None:
    # Create binary true labels (1 for melanoma, 0 for others)
    y_true_binary = (y_true == melanoma_idx).astype(int)

    # Get the predicted probabilities for the melanoma class
    y_scores_melanoma = pred_probs[:, melanoma_idx]

    # --- Calculate AUC ---
    auc = roc_auc_score(y_true_binary, y_scores_melanoma)
    print(f"Melanoma vs. Others AUC: {auc:.4f}")

    # --- Calculate Specificity ---
    # Specificity = True Negatives / (True Negatives + False Positives)
    tn, fp, fn, tp = confusion_matrix(y_true_binary, y_pred_top1 == melanoma_idx).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    print(f"Melanoma Specificity:    {specificity:.4f}")

    # --- Retrieve Sensitivity (Recall) from the main report ---
    sensitivity = report['melanoma']['recall']
    print(f"Melanoma Sensitivity:    {sensitivity:.4f}")

else:
    print("Could not find 'melanoma' class to calculate clinical metrics.")

# ==============================================================================
# 5. FINAL SOTA METRICS SUMMARY
# ==============================================================================
print("\n" + "="*60)
print("🏆 Final SOTA-Ready Metrics Summary")
print("="*60)

# Balanced Accuracy is the macro average of the recall scores
balanced_accuracy = report['macro avg']['recall']

print(f"Balanced Multi-Class Accuracy: {balanced_accuracy:.4f} ({balanced_accuracy*100:.2f}%)")
if melanoma_idx is not None:
    print(f"Melanoma vs. Others AUC:         {auc:.4f}")
    print(f"Melanoma Sensitivity (Recall):   {sensitivity:.4f}")
    print(f"Melanoma Specificity:            {specificity:.4f}")

🔍 Predicting on the test set...
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 635ms/step

📊 Standard Classification Report (Per-Class F1, etc.)
                      precision    recall  f1-score   support

   actinic_keratosis     0.6061    0.6667    0.6349        90
basal_cell_carcinoma     0.9520    0.8841    0.9168       561
         benign_mole     0.8387    0.9821    0.9048      1006
      dermatofibroma     0.9143    0.8205    0.8649        39
            melanoma     0.8149    0.6519    0.7244       385
seborrheic_keratosis     0.8697    0.7433    0.8016       413
     vascular_lesion     0.9545    0.9545    0.9545        22

            accuracy                         0.8565      2516
           macro avg     0.8500    0.8148    0.8288      2516
        weighted avg     0.8593    0.8565    0.8531      2516


🎯 Overall Accuracy Metrics
Top-1 Accuracy: 0.8565 (85.65%)
Top-2 Accuracy: 0.9690 (96.90%)

🔬 Clinical Utility Metrics for Melanoma Detection (SOTA)
Me