<a href="https://colab.research.google.com/github/anshulchauhan502/btp/blob/main/step1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# ===========================
# Cell 0 - Runtime & Setup
# ===========================
# Mount Drive, set base paths, toggle debug (small_run)
from pathlib import Path
import os
import random
import datetime
SEED = 42
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

# Toggle small_run for quick debug (True -> tiny dataset + fewer epochs)
SMALL_RUN = False  # set True for a fast pipeline check
DRIVE_MOUNT_PATH = "/content/drive"
DATA_ROOT = "/content/drive/MyDrive/ham10000_data"
IMAGES_DIR = "/content/drive/MyDrive/ham10000_data/all_images"
METADATA_CSV = "/content/drive/MyDrive/ham10000_data/HAM10000_metadata.csv"
EXPERIMENT_DIR = "/content/drive/MyDrive/ham10000_data/experiments/efficientnet_b0_binary"

# Create artifact directories
Path(EXPERIMENT_DIR).mkdir(parents=True, exist_ok=True)
print("Experiment dir:", EXPERIMENT_DIR)
print("Images dir:", IMAGES_DIR)
print("Metadata csv:", METADATA_CSV)
print("Small run:", SMALL_RUN)

# If running in Colab, mount drive (uncomment if needed)
# from google.colab import drive
# drive.mount('/content/drive')


Experiment dir: /content/drive/MyDrive/ham10000_data/experiments/efficientnet_b0_binary
Images dir: /content/drive/MyDrive/ham10000_data/all_images
Metadata csv: /content/drive/MyDrive/ham10000_data/HAM10000_metadata.csv
Small run: False


In [4]:
# ===========================
# Cell 1 - Install & check versions (optional pinning)
# ===========================
# In general, prefer Colab's default TF. If you hit incompatibilities, uncomment lines below.
# Recommended tested combo often: tensorflow>=2.10,<2.14 ; scikit-learn, pandas, matplotlib up-to-date.

# !pip install -q "tensorflow>=2.10,<2.14" "flower==2.2.0"  # only use if you need to pin
# !pip install -q matplotlib scikit-learn pandas

import sys, pkgutil
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn
import matplotlib
import math

print("Python:", sys.version.splitlines()[0])
print("TensorFlow:", tf.__version__)
print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)
print("sklearn:", sklearn.__version__)
print("matplotlib:", matplotlib.__version__)

# Ensure GPU available (recommended)
gpus = tf.config.list_physical_devices('GPU')
print("GPUs found:", gpus)


Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
TensorFlow: 2.19.0
NumPy: 2.0.2
Pandas: 2.2.2
sklearn: 1.6.1
matplotlib: 3.10.0
GPUs found: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
# ===========================
# Cell 2 - Imports & utilities
# ===========================
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from collections import Counter
import json

# Reproducibility helpers (TF)
tf.random.set_seed(SEED)

def save_json(obj, path):
    with open(path, 'w') as f:
        json.dump(obj, f, indent=2)


In [6]:
# ===========================
# Cell 3 - Load metadata & basic EDA
# ===========================
df = pd.read_csv(METADATA_CSV)
print("Metadata columns:", df.columns.tolist())
print("Sample rows:")
display(df.head())
print("Total rows:", len(df))

# HAM10000 typical columns: image_id, dx, dx_type, age, sex, localization, etc.
assert 'image_id' in df.columns or 'imageId' in df.columns, "No image_id column found in metadata!"
# normalize column name
if 'imageId' in df.columns and 'image_id' not in df.columns:
    df = df.rename(columns={'imageId':'image_id'})

# quick class distribution
print("Original diagnosis counts:")
print(df['dx'].value_counts())


Metadata columns: ['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization']
Sample rows:


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


Total rows: 10015
Original diagnosis counts:
dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [7]:
# ===========================
# Cell 4 - Create binary labels (malignant vs benign)
# ===========================
# DEFAULT mapping (change if you want)
# malignant list chosen: melanoma (mel), basal cell carcinoma (bcc), actinic keratoses/intraepithelial carcinoma (akiec)
MALIGNANT = ['mel', 'bcc', 'akiec']

# Create label column: 1 = malignant, 0 = benign
df['label_binary'] = df['dx'].apply(lambda x: 1 if x in MALIGNANT else 0)
print("Binary distribution (malignant=1):")
print(df['label_binary'].value_counts())

# Inspect sample file names and check image files exist
def first_existing_image(image_id):
    for ext in ['.jpg', '.jpeg', '.png']:
        p = os.path.join(IMAGES_DIR, image_id + ext)
        if os.path.exists(p):
            return p
    return None

# Map image paths
df['image_path'] = df['image_id'].astype(str).apply(lambda x: first_existing_image(x))
missing_count = df['image_path'].isna().sum()
print("Images missing (no file found for image_id):", missing_count)
if missing_count > 0:
    print("Examples of missing ids:", df[df['image_path'].isna()]['image_id'].tolist()[:10])
    # Optionally filter out missing
    df = df.dropna(subset=['image_path']).reset_index(drop=True)
    print("Filtered metadata rows:", len(df))


Binary distribution (malignant=1):
label_binary
0    8061
1    1954
Name: count, dtype: int64
Images missing (no file found for image_id): 2513
Examples of missing ids: ['ISIC_0032417', 'ISIC_0032129', 'ISIC_0032343', 'ISIC_0032128', 'ISIC_0032013', 'ISIC_0031967', 'ISIC_0033539', 'ISIC_0032283', 'ISIC_0032463', 'ISIC_0032306']
Filtered metadata rows: 7502


In [8]:
# ===========================
# Cell 5 - Train/val/test stratified split
# ===========================
# Stratified split into train/val/test (70/15/15)
train_df, temp_df = train_test_split(df, test_size=0.30, stratify=df['label_binary'], random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label_binary'], random_state=SEED)

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))

# Small-run: subsample for quick debug
if SMALL_RUN:
    train_df = train_df.sample(n=min(200, len(train_df)), random_state=SEED)
    val_df = val_df.sample(n=min(50, len(val_df)), random_state=SEED)
    test_df = test_df.sample(n=min(50, len(test_df)), random_state=SEED)
    print("Small-run sizes:", len(train_df), len(val_df), len(test_df))

# Save splits for reproducibility
train_df.to_csv(os.path.join(EXPERIMENT_DIR, "train_metadata.csv"), index=False)
val_df.to_csv(os.path.join(EXPERIMENT_DIR, "val_metadata.csv"), index=False)
test_df.to_csv(os.path.join(EXPERIMENT_DIR, "test_metadata.csv"), index=False)


Train/Val/Test sizes: 5251 1125 1126


In [9]:
# ===========================
# Cell 6 - Dataset utils: tf.data pipeline and augmentations
# ===========================
IMG_SIZE = 224
BATCH_SIZE = 32 if not SMALL_RUN else 16
AUTOTUNE = tf.data.experimental.AUTOTUNE

def preprocess_image(path, label, img_size=IMG_SIZE):
    # Reads image file, decodes, resizes and scales to [0,1]
    image = tf.io.read_file(path)
    image = tf.image.decode_jpeg(image, channels=3)  # works with jpg/jpeg
    image = tf.image.convert_image_dtype(image, tf.float32)  # scales to [0,1]
    image = tf.image.resize(image, [img_size, img_size])
    return image, label

def augment(image, label):
    # simple augmentation pipeline (tf.image ops)
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    image = tf.image.random_brightness(image, max_delta=0.1)
    image = tf.image.random_contrast(image, lower=0.9, upper=1.1)
    image = tf.image.random_saturation(image, lower=0.9, upper=1.1)
    # small rotation via tf.keras
    image = tf.keras.preprocessing.image.random_rotation(image.numpy(), rg=10, row_axis=0, col_axis=1, channel_axis=2)
    return tf.convert_to_tensor(image, dtype=tf.float32), label

def make_dataset(df_meta, shuffle=True, augment_data=False):
    paths = df_meta['image_path'].values
    labels = df_meta['label_binary'].astype(np.int32).values
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(paths), seed=SEED)
    ds = ds.map(lambda p, l: tf.py_function(func=preprocess_image, inp=[p, l], Tout=(tf.float32, tf.int32)),
                num_parallel_calls=AUTOTUNE)
    # set shapes after py_function
    def set_shapes(img, lbl):
        img.set_shape([IMG_SIZE, IMG_SIZE, 3])
        lbl.set_shape([])
        return img, lbl
    ds = ds.map(set_shapes, num_parallel_calls=AUTOTUNE)
    if augment_data:
        # use simple augmentation wrapper using tf.py_function for rotation (could be slower)
        def aug_py(image, label):
            img, lbl = tf.py_function(func=augment, inp=[image, label], Tout=(tf.float32, tf.int32))
            img.set_shape([IMG_SIZE, IMG_SIZE, 3])
            lbl.set_shape([])
            return img, lbl
        ds = ds.map(aug_py, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds

train_ds = make_dataset(train_df, shuffle=True, augment_data=True)
val_ds = make_dataset(val_df, shuffle=False, augment_data=False)
test_ds = make_dataset(test_df, shuffle=False, augment_data=False)

# quick sanity: show a batch shape
for batch in train_ds.take(1):
    images, labels = batch
    print("Batch shapes:", images.shape, labels.shape)
    break


Batch shapes: (32, 224, 224, 3) (32,)


In [10]:
# ===========================
# Cell 7 - Class weights & imbalance handling
# ===========================
from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(train_df['label_binary'])
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df['label_binary'])
class_weights_dict = {int(classes[i]): float(class_weights[i]) for i in range(len(classes))}
print("Class weights:", class_weights_dict)
save_json(class_weights_dict, os.path.join(EXPERIMENT_DIR, "class_weights.json"))


Class weights: {0: 0.6073328706916493, 1: 2.8292025862068964}


In [11]:
# ===========================
# Cell 8 - Build EfficientNetB0 model
# ===========================
def build_model(input_shape=(IMG_SIZE, IMG_SIZE, 3), base_trainable=False):
    base = tf.keras.applications.EfficientNetB0(include_top=False, weights='imagenet', input_shape=input_shape, pooling='avg')
    base.trainable = base_trainable  # whether to fine-tune base
    inputs = layers.Input(shape=input_shape)
    x = tf.keras.applications.efficientnet.preprocess_input(inputs)  # correct preprocessing
    x = base(x, training=False)
    x = layers.Dropout(0.4)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)  # binary
    model = models.Model(inputs, outputs)
    return model

model = build_model(base_trainable=False)
model.summary()


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# ===========================
# Cell 9 - Compile, callbacks, and training
# ===========================
EPOCHS = 20 if not SMALL_RUN else 3
LEARNING_RATE = 1e-4

model.compile(
    optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

checkpoint_path = os.path.join(EXPERIMENT_DIR, "best_model.h5")
cb = [
    callbacks.ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1),
    callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1),
    callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    class_weight=class_weights_dict,
    callbacks=cb
)

# Save training history
hist_path = os.path.join(EXPERIMENT_DIR, "history.json")
save_json(history.history, hist_path)
print("Training complete, history saved to:", hist_path)


Epoch 1/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7s/step - accuracy: 0.4268 - auc: 0.5010 - loss: 0.7036
Epoch 1: val_loss improved from inf to 0.68734, saving model to /content/drive/MyDrive/ham10000_data/experiments/efficientnet_b0_binary/best_model.h5




[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1381s[0m 8s/step - accuracy: 0.4270 - auc: 0.5010 - loss: 0.7035 - val_accuracy: 0.8231 - val_auc: 0.5000 - val_loss: 0.6873 - learning_rate: 1.0000e-04
Epoch 2/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 906ms/step - accuracy: 0.4958 - auc: 0.4775 - loss: 0.7013
Epoch 2: val_loss did not improve from 0.68734
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 972ms/step - accuracy: 0.4958 - auc: 0.4776 - loss: 0.7013 - val_accuracy: 0.1769 - val_auc: 0.5000 - val_loss: 0.6988 - learning_rate: 1.0000e-04
Epoch 3/20
[1m164/165[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 919ms/step - accuracy: 0.4546 - auc: 0.4754 - loss: 0.6930
Epoch 3: val_loss did not improve from 0.68734
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 977ms/step - accuracy: 0.4548 - auc: 0.4756 - loss

In [None]:
# ===========================
# Cell 10 - Evaluation on test set (detailed metrics)
# ===========================
# load best model (checkpoint may have been saved)
model.load_weights(checkpoint_path)
print("Loaded best weights from:", checkpoint_path)

# get predictions and labels
y_true = []
y_probs = []
for batch in test_ds:
    imgs, labels = batch
    probs = model.predict(imgs)
    y_true.extend(labels.numpy().tolist())
    y_probs.extend(probs.ravel().tolist())

y_true = np.array(y_true)
y_probs = np.array(y_probs)
y_pred = (y_probs >= 0.5).astype(int)

# Metrics
print("Classification report (test):")
print(classification_report(y_true, y_pred, digits=4))
cm = confusion_matrix(y_true, y_pred)
print("Confusion matrix:\n", cm)
auc = roc_auc_score(y_true, y_probs)
print("ROC AUC:", auc)

# Save metrics
metrics = {
    "confusion_matrix": cm.tolist(),
    "roc_auc": float(auc),
    "classification_report": classification_report(y_true, y_pred, output_dict=True)
}
save_json(metrics, os.path.join(EXPERIMENT_DIR, "metrics.json"))
print("Saved metrics.json")
