# Melanoma Classification (Kaggle) 

Colab-ready notebook for training an EfficientNet-based melanoma classifier on the Kaggle Melanoma Skin Cancer dataset. The notebook downloads the dataset via Kaggle API, performs EDA, handles class imbalance, trains with callbacks, and saves the model to Google Drive.

**How to use in Colab**
- Upload your `kaggle.json` when prompted (Kaggle > Account > Create API Token).
- Runtime: GPU recommended (Runtime > Change runtime type > GPU).
- Everything else runs end-to-end in Colab (no local files needed); the final cell asks for permission to mount Google Drive so the trained model can be saved there.

In [None]:
# Install lightweight dependencies (TensorFlow is preinstalled on Colab)
!pip install -q kaggle seaborn

In [None]:
import os
import random
import shutil
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             precision_recall_fscore_support)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from google.colab import files, drive

# Seed every random number generator the notebook uses (Python, NumPy,
# TensorFlow) so sampling, shuffling and splits repeat across runs.
SEED = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Matplotlib style sheet used by every figure below.
plt.style.use("seaborn-v0_8")

## Download dataset via Kaggle API
- Upload `kaggle.json` when prompted below.
- The dataset is saved under `/content/data` and automatically unzipped.

In [None]:
DATA_DIR = Path("/content/data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Configure Kaggle credentials
if not Path("/root/.kaggle/kaggle.json").exists():
    print("
Please upload your kaggle.json (Kaggle > Account > Create API Token)")
    uploaded = files.upload()
    if "kaggle.json" not in uploaded:
        raise FileNotFoundError("kaggle.json is required to access the Kaggle API")
    os.makedirs("/root/.kaggle", exist_ok=True)
    shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")
    os.chmod("/root/.kaggle/kaggle.json", 0o600)
else:
    print("kaggle.json already present; skipping upload")

# Download + unzip (idempotent)
!kaggle datasets download -d bhaveshmittal/melanoma-cancer-dataset -p /content/data -q --force
!unzip -qo /content/data/melanoma-cancer-dataset.zip -d /content/data

In [None]:
# Locate dataset folders and class subdirectories
# File suffixes (lower-case) that count as images when scanning the dataset.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".bmp"}

def has_images(path: Path) -> bool:
    """Return True when ``path`` directly contains at least one image file.

    Only the immediate children of ``path`` are inspected (no recursion), and
    suffixes are compared case-insensitively against ``IMAGE_EXTS``.

    Filesystem problems (missing path, not a directory, permission denied)
    are treated as "no images" and yield False. Other exceptions propagate so
    that programming errors are not hidden.
    """
    try:
        return any(
            entry.is_file() and entry.suffix.lower() in IMAGE_EXTS
            for entry in path.iterdir()
        )
    except OSError:
        return False

def find_dataset_root(download_root: Path) -> Path:
    """Pick the directory that holds the extracted dataset.

    Resolution order:

    1. ``download_root`` itself when it directly contains a split folder
       (train/val/test naming variants), i.e. the archive unpacked without a
       wrapper directory. Without this check one of the split folders would
       be returned as the "root" and the other splits could not be found.
    2. The most recently modified child directory whose name contains
       "melanoma" or "cancer" (case-insensitive).
    3. The most recently modified child directory of any name.
    4. ``download_root`` when it has no child directories.
    """
    split_names = ("train", "training", "val", "valid", "validation", "test", "testing")
    if any((download_root / name).is_dir() for name in split_names):
        return download_root

    # Newest first, so a fresh extraction wins over leftovers from earlier runs.
    candidates = sorted(
        (p for p in download_root.glob("*") if p.is_dir()),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    keyword_hits = [
        p for p in candidates
        if "melanoma" in p.name.lower() or "cancer" in p.name.lower()
    ]
    if keyword_hits:
        return keyword_hits[0]
    return candidates[0] if candidates else download_root

def find_split_dir(root: Path, names):
    """Return the first existing ``root / name``, trying ``names`` in order.

    Returns ``None`` when none of the names exists under ``root``.
    """
    existing = (root / name for name in names if (root / name).exists())
    return next(existing, None)

def find_class_dir(base_dir: Path | None, fallback_root: Path) -> Path | None:
    """Locate the directory whose immediate subfolders are the class folders.

    The search starts at ``base_dir`` when it is given and exists, otherwise
    at ``fallback_root``. A directory qualifies when it has at least two
    subdirectories and every one of them directly contains an image
    (see ``has_images``). Among qualifying directories the one with the most
    descendants (files and folders) is returned; ``None`` if nothing qualifies.
    """
    search_root = base_dir if base_dir and base_dir.exists() else fallback_root

    def is_class_root(path: Path) -> bool:
        subdirs = [d for d in path.iterdir() if d.is_dir()]
        return len(subdirs) >= 2 and all(has_images(sd) for sd in subdirs)

    # rglob("*") only yields descendants, so search_root must be checked
    # explicitly; otherwise a layout such as train/<class>/<images> is missed
    # when search_root is the "train" folder itself.
    candidates = [
        path
        for path in (search_root, *search_root.rglob("*"))
        if path.is_dir() and is_class_root(path)
    ]
    if not candidates:
        return None
    # max() keeps the first of equally sized candidates, matching a stable
    # descending sort followed by taking element 0.
    return max(candidates, key=lambda p: sum(1 for _ in p.rglob("*")))

# Step 1: find the extracted dataset folder and its (optional) split folders.
dataset_root = find_dataset_root(DATA_DIR)
train_dir = find_split_dir(dataset_root, ["train", "training"])
val_dir = find_split_dir(dataset_root, ["val", "valid", "validation"])
test_dir = find_split_dir(dataset_root, ["test", "testing"])

# Step 2: within each split, descend to the folder whose children are the class folders.
# Train falls back to searching the whole dataset root; val/test stay None when absent.
train_dir = find_class_dir(train_dir, dataset_root)
val_dir = find_class_dir(val_dir, dataset_root) if val_dir else None
test_dir = find_class_dir(test_dir, dataset_root) if test_dir else None

print(f"Dataset root: {dataset_root}")
print(f"Train dir:  {train_dir}")
print(f"Val dir:    {val_dir}")
print(f"Test dir:   {test_dir}")
assert train_dir is not None, "Could not locate train directory with class subfolders."

In [None]:
# Gather file paths and labels

def gather_files(root: Path):
    """Collect image paths and their class labels from a class-folder layout.

    Every immediate subdirectory of ``root`` is treated as one class; its name
    is the label. Images are searched recursively inside each class folder and
    matched case-insensitively against ``IMAGE_EXTS``. Class folders without
    images contribute nothing.

    Both class folders and the files inside them are visited in sorted order,
    so the returned lists are deterministic. The seeded sampling and
    ``train_test_split`` calls later in the notebook depend on this order.

    Returns:
        (paths, labels): two parallel lists of ``Path`` objects and label strings.
    """
    image_paths, labels = [], []
    class_dirs = sorted(d for d in root.iterdir() if d.is_dir())
    for class_dir in class_dirs:
        class_files = sorted(
            p for p in class_dir.rglob("*")
            if p.is_file() and p.suffix.lower() in IMAGE_EXTS
        )
        if not class_files:
            continue
        image_paths.extend(class_files)
        labels.extend([class_dir.name] * len(class_files))
    return image_paths, labels

# Training data is mandatory; validation and test folders are optional and
# default to empty lists (they are carved out of train later when missing).
train_files, train_labels = gather_files(train_dir)

if val_dir:
    val_files, val_labels = gather_files(val_dir)
else:
    val_files, val_labels = [], []

if test_dir:
    test_files, test_labels = gather_files(test_dir)
else:
    test_files, test_labels = [], []

print(f"Train images: {len(train_files)}")
print(f"Val images:   {len(val_files)}")
print(f"Test images:  {len(test_files)}")

if not train_files:
    raise RuntimeError("No training images found. Check dataset structure.")

## Exploratory Data Analysis

In [None]:
# Class distribution: bar chart of image counts per label, then the raw counts
train_df = pd.DataFrame({"label": train_labels})
label_order = sorted(train_df["label"].unique())

fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train_df, x="label", order=label_order, ax=ax)
ax.set_title("Training class distribution")
plt.setp(ax.get_xticklabels(), rotation=30)
fig.tight_layout()
plt.show()

print(train_df["label"].value_counts())

In [None]:
# Image dimension summary (sample up to 200 images for speed)
sample_paths = random.sample(train_files, min(200, len(train_files)))
sizes = []
unreadable = []  # (path, exception) for every sampled file that could not be opened
for path in sample_paths:
    try:
        with Image.open(path) as img:
            sizes.append(img.size)  # (width, height)
    except Exception as exc:
        # Best-effort EDA: keep going, but remember the failure so it can be reported.
        unreadable.append((path, exc))

if unreadable:
    first_path, first_error = unreadable[0]
    print(
        f"Skipped {len(unreadable)} unreadable image(s) out of {len(sample_paths)} sampled; "
        f"first: {first_path} ({first_error})"
    )

size_df = pd.DataFrame(sizes, columns=["width", "height"])
print(size_df.describe())
plt.figure(figsize=(6, 4))
sns.scatterplot(data=size_df, x="width", y="height", alpha=0.6)
plt.title("Sample image dimensions")
plt.tight_layout()
plt.show()

In [None]:
# Visualize a few samples: up to six random training images on a two-row grid
n_samples = min(6, len(train_files))
example_paths = random.sample(train_files, n_samples)
n_cols = (n_samples + 1) // 2
fig, axes = plt.subplots(2, n_cols, figsize=(12, 6))
axes = axes.ravel()
for position, ax in enumerate(axes):
    if position < n_samples:
        sample_path = example_paths[position]
        with Image.open(sample_path) as img:
            ax.imshow(img)
        # The parent folder name is the class label
        ax.set_title(sample_path.parent.name)
    # Hide the axes on image panels and on any unused panel alike
    ax.axis("off")
plt.suptitle("Random training samples", fontsize=14)
plt.tight_layout()
plt.show()

## Train / Validation / Test split
- Uses existing `val` / `test` folders if present.
- Otherwise stratifies splits from the training set.

In [None]:
# Create stratified splits when needed
if len(test_files) == 0:
    # No test folder was found: hold out 15% of train, preserving class ratios
    train_files, test_files, train_labels, test_labels = train_test_split(
        train_files,
        train_labels,
        stratify=train_labels,
        test_size=0.15,
        random_state=SEED,
    )
    print(f"Created test split with {len(test_files)} images")

if len(val_files) == 0:
    # No validation folder was found: hold out 15% of the remaining train data
    train_files, val_files, train_labels, val_labels = train_test_split(
        train_files,
        train_labels,
        stratify=train_labels,
        test_size=0.15,
        random_state=SEED,
    )
    print(f"Created validation split with {len(val_files)} images")

# Class ids follow the alphabetical order of the training labels
class_names = sorted(set(train_labels))
label_to_index = dict(zip(class_names, range(len(class_names))))
index_to_label = dict(enumerate(class_names))


def encode_labels(labels):
    """Map label strings to integer class ids using ``label_to_index``.

    Args:
        labels: iterable of label strings (may be empty).

    Returns:
        1-D ``np.int32`` array with one class id per input label.

    Raises:
        KeyError: if a label is not a key of ``label_to_index`` (for example a
            validation/test folder name that never occurs in training). The
            message lists every offending label.
    """
    unknown = sorted({lbl for lbl in labels if lbl not in label_to_index})
    if unknown:
        raise KeyError(
            f"Labels {unknown} are not among the known classes {sorted(label_to_index)}"
        )
    return np.array([label_to_index[lbl] for lbl in labels], dtype=np.int32)

# Integer-encode the labels of all three splits with the shared mapping
train_label_ids, val_label_ids, test_label_ids = (
    encode_labels(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)

print(f"Classes: {class_names}")

## Build TensorFlow datasets and the augmentation pipeline

In [None]:
# Input pipeline settings
IMG_SIZE = (224, 224)         # every image is resized to this (height, width)
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE   # let tf.data choose parallelism / prefetch depth

# Data augmentation applied on-the-fly
augmentation = keras.Sequential(
    layers=[
        layers.RandomFlip(mode="horizontal"),
        layers.RandomRotation(factor=0.08),
        layers.RandomZoom(height_factor=0.15),
        layers.RandomContrast(factor=0.1),
    ],
    name="augmentation",
)

def load_and_preprocess(path, label):
    """Read one image file and return it as a float32 tensor resized to ``IMG_SIZE``.

    The image is decoded to three channels (animated inputs are not expanded
    into frames) and left in the 0-255 value range; no rescaling happens here.
    ``label`` is passed through unchanged.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_image(raw_bytes, channels=3, expand_animations=False)
    resized = tf.image.resize(decoded, IMG_SIZE)
    # keep 0-255 range; scaling is left to the model-side preprocessing
    return tf.cast(resized, tf.float32), label


def make_dataset(paths, labels, training=False):
    """Build a batched, prefetched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        paths: sequence of image paths (converted to a string array).
        labels: labels aligned with ``paths``.
        training: when True, shuffle the (path, label) pairs with a buffer
            covering the full dataset, reshuffling on every iteration.
    """
    path_strings = np.array(paths, dtype=str)
    pipeline = tf.data.Dataset.from_tensor_slices((path_strings, labels))
    if training:
        # Shuffle before decoding so the buffer holds paths rather than images
        pipeline = pipeline.shuffle(
            buffer_size=len(paths), seed=SEED, reshuffle_each_iteration=True
        )
    pipeline = pipeline.map(load_and_preprocess, num_parallel_calls=AUTOTUNE)
    return pipeline.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Only the training pipeline is shuffled; validation and test keep file order
# so predictions line up with val_label_ids / test_label_ids.
train_ds = make_dataset(paths=train_files, labels=train_label_ids, training=True)
val_ds = make_dataset(paths=val_files, labels=val_label_ids, training=False)
test_ds = make_dataset(paths=test_files, labels=test_label_ids, training=False)

## Handle class imbalance with class weights

In [None]:
# "balanced" weights from scikit-learn, computed on the training labels only
class_weights_array = compute_class_weight(
    class_weight="balanced",
    classes=np.arange(len(class_names)),
    y=train_label_ids,
)
# Keras expects a plain {class_id: weight} dict with Python int / float values
class_weights = dict(enumerate(map(float, class_weights_array)))
print("Class weights:", class_weights)

## Build and train EfficientNet model

In [None]:
# Model / training hyperparameters
BACKBONE = "B0"  # switch to "B3" for EfficientNetB3
INPUT_SHAPE = (*IMG_SIZE, 3)  # (height, width, RGB channels)
LEARNING_RATE = 1e-4
EPOCHS = 15
DROPOUT_RATE = 0.35


def get_backbone(name: str, input_shape):
    """Create an ImageNet-pretrained EfficientNet feature extractor (no classifier head).

    Args:
        name: EfficientNet variant, "B0" through "B7" (case-insensitive,
            surrounding whitespace ignored).
        input_shape: input shape passed to the Keras application constructor.

    Returns:
        The ``tf.keras.applications.EfficientNet<variant>`` model built with
        ``include_top=False`` and ``weights="imagenet"``.

    Raises:
        ValueError: if ``name`` is not a supported variant. An unsupported
            name used to fall back to B0 silently, which trained a different
            backbone than the one the model was named after.
    """
    variant = name.strip().upper()
    supported = tuple(f"B{i}" for i in range(8))
    if variant not in supported:
        raise ValueError(
            f"Unsupported EfficientNet variant {name!r}; expected one of {supported}"
        )
    constructor = getattr(tf.keras.applications, f"EfficientNet{variant}")
    return constructor(include_top=False, input_shape=input_shape, weights="imagenet")


# Instantiate the pretrained feature extractor selected by BACKBONE.
backbone = get_backbone(BACKBONE, INPUT_SHAPE)
backbone.trainable = False  # start with frozen backbone

# Functional-API graph: augmentation -> EfficientNet preprocessing -> backbone
# -> global average pooling -> dropout -> softmax over the classes.
inputs = keras.Input(shape=INPUT_SHAPE)
x = augmentation(inputs)
# NOTE(review): Keras documents efficientnet.preprocess_input as a pass-through
# (the EfficientNet models rescale internally) — confirm for the installed version.
x = tf.keras.applications.efficientnet.preprocess_input(x)
# The backbone is always called with training=False, independent of fit/predict.
x = backbone(x, training=False)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(DROPOUT_RATE)(x)
# One output unit per class; softmax pairs with the sparse categorical loss below.
outputs = layers.Dense(len(class_names), activation="softmax")(x)

# The model name embeds the backbone variant; it is reused for the saved file name.
model = keras.Model(inputs, outputs, name=f"efficientnet_{BACKBONE.lower()}_melanoma")
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss="sparse_categorical_crossentropy",  # labels are integer class ids
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Best-so-far model is written to the Colab working directory during training.
checkpoint_path = "best_melanoma_model.keras"
# All three callbacks watch validation loss.
callbacks = [
    # Save the model whenever val_loss improves.
    keras.callbacks.ModelCheckpoint(
        checkpoint_path, monitor="val_loss", save_best_only=True, verbose=1
    ),
    # Stop after 4 epochs without improvement and restore the best weights.
    keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=4, restore_best_weights=True, verbose=1
    ),
    # Halve the learning rate after 2 stagnant epochs, never going below 1e-6.
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=2, min_lr=1e-6, verbose=1
    ),
]

# class_weight applies the per-class weights computed earlier to the training loss.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    class_weight=class_weights,
)

## Learning curves

In [None]:
# One panel per metric, each showing the training and validation curve per epoch
history_df = pd.DataFrame(history.history)
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 4))

for ax, metric, title in ((loss_ax, "loss", "Loss"), (acc_ax, "accuracy", "Accuracy")):
    ax.plot(history_df[metric], label="train")
    ax.plot(history_df[f"val_{metric}"], label="val")
    ax.set_title(title)
    ax.legend()

fig.tight_layout()
plt.show()

## Evaluate on test set

In [None]:
# Predict class probabilities for the (unshuffled) test set and take the arg-max class
test_probs = model.predict(test_ds)
test_pred = np.argmax(test_probs, axis=1)

# Headline metrics; precision/recall/F1 are support-weighted averages over the classes
acc = accuracy_score(test_label_ids, test_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    test_label_ids, test_pred, average="weighted", zero_division=0
)

print(f"Test Accuracy:  {acc:.4f}")
print(f"Test Precision: {prec:.4f}")
print(f"Test Recall:    {rec:.4f}")
print(f"Test F1:        {f1:.4f}")

# Per-class breakdown
print("\nClassification report:")
print(classification_report(test_label_ids, test_pred, target_names=class_names, zero_division=0))

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(test_label_ids, test_pred)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

## Save model to Google Drive

In [None]:
# Mount Google Drive (Colab asks for authorization) so the model outlives the session.
drive.mount('/content/drive')
MODEL_DIR = Path("/content/drive/MyDrive/melanoma_models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
# This saves the in-memory `model` as it stands after training, not the checkpoint file.
# NOTE(review): whether these are the best-epoch weights depends on how the installed
# Keras applies EarlyStopping(restore_best_weights=True) when training runs all epochs — confirm.
save_path = MODEL_DIR / f"{model.name}_best.keras"
model.save(save_path)
print(f"Saved model to: {save_path}")