In [1]:
import numpy as np
import copy
import pandas as pd
import math, re, os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2
import keras
from pathlib import Path
from functools import partial
from sklearn.model_selection import train_test_split, StratifiedGroupKFold, cross_val_score
import cv2

import warnings

print("Tensorflow version " + tf.__version__)

Tensorflow version 2.17.1


In [2]:
# Kaggle TPUs currently appear to have issues, so this notebook runs on GPU instead.
# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available: 1


# Global variables

In [3]:
# --- Input locations (Kaggle dataset mount) ---
DIR_PATH = "/kaggle/input/siim-isic-melanoma-classification/"
TRAIN_PATH = DIR_PATH + "tfrecords/train*.tfrec"   # glob pattern for training TFRecords
TEST_PATH = DIR_PATH + "tfrecords/test*.tfrec"     # glob pattern for test TFRecords

TRAIN_JPEG_PATH = DIR_PATH + "jpeg/train/"
TEST_JPEG_PATH = DIR_PATH + "jpeg/test/"

TRAIN_TABDATA_PATH = DIR_PATH + "train.csv"
TEST_TABDATA_PATH = DIR_PATH + "test.csv"

# --- Input pipeline settings ---
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 128
SHUFFLE_BUFFER_SIZE = BATCH_SIZE * 16
IMAGE_SIZE = [1024, 1024] # for TFRecord images
IMAGE_RESIZE = [128, 128]  # size every image is resized to before reaching the model

EPOCHS = 5

# --- Run-mode switches (each one gates the cells wrapped in the matching `if`) ---
TRAIN_ON_FULL_DATA = False   # fit on the TFRecord training set and save the model
TRAIN_VALID_SPLIT = True     # fit on the jpeg train split and evaluate on the validation split
TEST_PREDICT = False         # load a saved model, predict the test set, write submission.csv

RANDOM_SEED = 0
# Seed Python, NumPy and TensorFlow in one call so shuffling, augmentation and
# weight initialisation are repeatable on a fresh "Restart & Run All".
keras.utils.set_random_seed(RANDOM_SEED)

# Tabular data

In [4]:
# Read the training metadata and attach the on-disk jpeg path of every image.
train = pd.read_csv(TRAIN_TABDATA_PATH).assign(
    image_path=lambda frame: frame["image_name"].map(
        lambda name: os.path.join(TRAIN_JPEG_PATH, f"{name}.jpg")
    )
)
# Test metadata is loaded as-is (no image_path column is added).
test = pd.read_csv(TEST_TABDATA_PATH)

In [None]:
train.head()

In [None]:
test.head()

# EDA

In [None]:
print(train.shape, test.shape)

## Missing values

In [None]:
# Per-column null counts for both tables, with a divider between the two reports.
for position, (table_name, table) in enumerate((("Train", train), ("Test", test))):
    if position:
        print("\n----------------\n")
    print(table_name)
    print(table.isnull().sum())

We can replace null values in the columns sex and anatom_site with "unknown".
Let's check what the null values in age_approx should be replaced with.

In [None]:
train.age_approx.describe()

In [None]:
sum(train.age_approx == 0)

In [None]:
train.loc[train.age_approx == 0, :]

In [None]:
train.loc[train.age_approx != 0, :].describe()

In [None]:
train.age_approx.unique(), test.age_approx.unique()

age_approx values are in multiples of 5. 

There are 2 rows with the value 0, so missing values can't be replaced with 0. It's strange that age_approx is 0 for some rows, but we'll leave those as they are; they could be any age under 5.

We'll replace missing values with -10 for visualization purposes and also add an age_missing indicator column.

In [None]:
# Apply the same imputation to BOTH tables so train and test share one schema.
# Previously test.age_approx was never filled, so test.age_missing was always 0
# and any NaN in test would have survived; test.sex was not filled either.
for table in (train, test):
    # Categorical gaps become an explicit "unknown" level.
    table["sex"] = table["sex"].fillna("unknown")
    table["anatom_site_general_challenge"] = table["anatom_site_general_challenge"].fillna("unknown")

    # -10 is a sentinel for "age not recorded"; 0 cannot be used because it
    # already occurs as a real age_approx value.
    table["age_approx"] = table["age_approx"].fillna(-10)

    # Indicator derived from the sentinel, which keeps this cell idempotent on re-run.
    table["age_missing"] = (table["age_approx"] == -10).astype(int)

In [None]:
train.age_approx.unique(), test.age_approx.unique()

In [None]:
# Re-check per-column null counts for both tables after imputation.
for position, (table_name, table) in enumerate((("Train", train), ("Test", test))):
    if position:
        print("\n----------------\n")
    print(table_name)
    print(table.isnull().sum())

In [None]:
train.head()

In [None]:
test.head()

## Target distribution

In [None]:
target_counts = train.target.value_counts()
target_counts

In [None]:
target_counts[0]*100/sum(target_counts), target_counts[1]*100/sum(target_counts)

98.237 % of the images are benign cases and only 1.762 % are malignant cases.

## Unique images

In [None]:
print(train.shape, test.shape)
print(len(train.image_name.unique()), len(test.image_name.unique()))

So all images are unique in both train and test

## Images per patient

In [None]:
# One row per patient holding that patient's image count. The column is named
# "count" explicitly: pandas >= 2.0 already uses that name for value_counts(),
# older versions name it after the source column, which would break the
# later lookups of the "count" column.
train_patientids = train.patient_id.value_counts().rename("count").to_frame()
display(train_patientids.describe())
test_patientids = test.patient_id.value_counts().rename("count").to_frame()
display(test_patientids.describe())

In [None]:
def hist_and_box(df, col_name, main_title, title, hist_xlabel, bin_range=None, binwidth=5):
    """Draw a histogram (with KDE) and a boxplot of one column side by side.

    Args:
        df: DataFrame holding the column to plot.
        col_name: Name of the column in ``df`` whose values are plotted.
        main_title: Figure-level title, set with ``plt.suptitle``.
        title: Short description appended to "Histogram of " / "Boxplot of ".
        hist_xlabel: Label for the histogram x-axis and the boxplot y-axis.
        bin_range: Optional (low, high) pair forwarded to seaborn as ``binrange``.
        binwidth: Histogram bin width. Defaults to 5, the value that used to be
            hard-coded, so existing calls are unaffected.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    values = df[col_name].values
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 5))

    sns.histplot(data=values, binwidth=binwidth, binrange=bin_range, kde=True, ax=hist_ax)
    hist_ax.set_title("Histogram of " + title)
    hist_ax.set_xlabel(hist_xlabel)
    hist_ax.set_ylabel("Frequency")

    sns.boxplot(data=values, ax=box_ax)
    box_ax.set_title("Boxplot of " + title)
    box_ax.set_ylabel(hist_xlabel)

    plt.suptitle(main_title)

    plt.tight_layout()
    plt.show()

In [None]:
# Silence the pandas deprecation warning that is emitted while plotting.
warnings.filterwarnings("ignore", message="use_inf_as_na option is deprecated")

# Same figure for each split: distribution of the number of images per patient.
for patient_counts, split_title in ((train_patientids, "Train data"), (test_patientids, "Test data")):
    hist_and_box(patient_counts, "count", split_title, "images per patient", "Number of images for a patient")

## Patient ID overlap

In [None]:
# Compare the sets of patient ids found in train and test.
train_patients = set(train.patient_id.unique())
test_patients = set(test.patient_id.unique())
venn2(subsets=(train_patients, test_patients),
      set_labels=('Train Patient IDs', 'Test Patient IDs'))
plt.show()

There are no common patient ids between train and test. For the train-validation split, we'll need to ensure the same.

## Sex

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# One panel per split: counts of each sex category with the bar heights labelled.
for ax, table, panel_title in zip(axes, (train, test), ("Train data", "Test data")):
    sns.countplot(data=table, x="sex", ax=ax)
    for container in ax.containers:
        ax.bar_label(container)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Sex counts in train, split by target class.
fig, ax = plt.subplots()
sns.countplot(data=train, x="sex", hue="target", ax=ax)
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Sex and target count")
ax.tick_params(axis='x', labelrotation=90)

plt.tight_layout()
plt.show()

## Anatomical site

In [None]:
print(np.sort(train.anatom_site_general_challenge.unique()))
print(np.sort(test.anatom_site_general_challenge.unique()))

In [None]:
anatom_site_order = train.anatom_site_general_challenge.value_counts().index
anatom_site_order

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# One panel per split, both using the train frequency order so the bars line up.
for ax, table, panel_title in zip(axes, (train, test), ("Train data", "Test data")):
    sns.countplot(data=table, x="anatom_site_general_challenge",
                  order=anatom_site_order,
                  ax=ax)
    for container in ax.containers:
        ax.bar_label(container)
    ax.set_title(panel_title)
    ax.tick_params(axis='x', labelrotation=90)

plt.tight_layout()
plt.show()

In [None]:
# Anatomical-site counts in train, split by target class.
fig, ax = plt.subplots()
sns.countplot(data=train, x="anatom_site_general_challenge",
              hue="target",
              order=anatom_site_order,
              ax=ax)
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Anatom_site and target count")
ax.tick_params(axis='x', labelrotation=90)

plt.tight_layout()
plt.show()

## Age

In [None]:
# Summary statistics of age_approx for both tables, with a divider in between.
for position, (table_name, table) in enumerate((("Train", train), ("Test", test))):
    if position:
        print("\n----------------\n")
    print(table_name)
    print(table.age_approx.describe())

In [None]:
# Age distribution per split; the bin range starts at -10 so the missing-age sentinel is visible.
for table, split_title in ((train, "Train data"), (test, "Test data")):
    hist_and_box(table, "age_approx", split_title, "patient age", "Patient age", (-10,100))

In [None]:
# Age distribution in train, one figure per target class.
for target_value, class_title in ((1, "Malignant"), (0, "Benign")):
    hist_and_box(train.loc[train.target==target_value, :], "age_approx", class_title, "patient age", "Patient age", (-10,100))

In [None]:
train.loc[(train.target==1) & (train.age_approx==-10.0), :]

The proportion of categories of sex, anatom_site, age_approx seem to be visibly similar in train and test data. So that need not be an additional consideration during train-validation split.

# Check images

In [None]:
# Show ten resized example images per class in a 2x5 grid.
for heading, target_value in (("Examples : Malignant (With Melanoma)", 1),
                              ("Examples : Benign (Without Melanoma)", 0)):
    print(heading)
    # Seeded sample so the same images appear on every run.
    imgs = train.loc[train.target==target_value].sample(10, random_state=RANDOM_SEED).image_name.values
    plt.figure(figsize=(20,8))
    for i, k in enumerate(imgs):
        img_path = TRAIN_JPEG_PATH + k + ".jpg"
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None instead of raising when a file cannot be read.
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # dsize is passed as a tuple, which every OpenCV version accepts.
        img = cv2.resize(img, tuple(IMAGE_RESIZE))
        # OpenCV loads BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(2,5,i+1)
        plt.axis('off')
        plt.imshow(img)
    plt.show()

# Load data

In [None]:
def decode_image(image):
    """Decode JPEG bytes into a 3-channel image resized to IMAGE_RESIZE.

    Pixel values are deliberately not divided by 255 here; the EfficientNetV2
    model built later in this notebook is created with
    ``include_preprocessing=True``. No explicit reshape is done because the
    resize already fixes the spatial size.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    return tf.image.resize(decoded, IMAGE_RESIZE)

## Load TFRecord data

In [None]:
def read_tfrecord(example, labeled):
    """Parse one serialized TFRecord example.

    Args:
        example: Serialized example to parse.
        labeled: If true, the record is read with an int64 "target" feature;
            otherwise with a string "image_name" feature.

    Returns:
        ``(image, label)`` with the label cast to int32 when ``labeled``,
        else ``(image, image_name)``. The image comes from ``decode_image``.
    """
    # Both record kinds carry the encoded image; only the second feature differs.
    extra_key, extra_dtype = ("target", tf.int64) if labeled else ("image_name", tf.string)
    tfrecord_format = {
        "image": tf.io.FixedLenFeature([], tf.string),
        extra_key: tf.io.FixedLenFeature([], extra_dtype),
    }
    parsed = tf.io.parse_single_example(example, tfrecord_format)
    image = decode_image(parsed['image'])
    if not labeled:
        return image, parsed['image_name']
    return image, tf.cast(parsed['target'], tf.int32)

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a dataset of parsed examples from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: Forwarded to ``read_tfrecord``; selects (image, label) or
            (image, image_name) elements.
        ordered: When false, deterministic ordering is switched off so elements
            are consumed as soon as they stream in (faster). When true, the
            default options are kept.

    Returns:
        A ``tf.data.Dataset`` of unbatched elements produced by ``read_tfrecord``.
    """
    options = tf.data.Options()
    if not ordered:
        # `deterministic` is the supported replacement for the deprecated
        # `experimental_deterministic` option.
        options.deterministic = False  # disable order, increase speed
    # num_parallel_reads interleaves reads from multiple files.
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
    dataset = dataset.with_options(options)
    dataset = dataset.map(partial(read_tfrecord, labeled=labeled), num_parallel_calls=AUTOTUNE)
    return dataset

In [None]:
# Resolve the TFRecord glob patterns into concrete file lists.
train_filenames = tf.io.gfile.glob(TRAIN_PATH)
test_filenames = tf.io.gfile.glob(TEST_PATH)

print(f"Train TFRecord Files: {len(train_filenames)}")
print(f"Test TFRecord Files: {len(test_filenames)}")

In [None]:
def count_data_items(filenames):
    """Sum the per-file item counts encoded in TFRecord file names.

    Each name is expected to contain ``-<digits>.`` (for example
    ``train00-2071.tfrec``); the digits are taken as that file's item count.

    Args:
        filenames: Iterable of file names or paths.

    Returns:
        Integer total of all counts (0 for an empty input).

    Raises:
        ValueError: If a file name does not contain a ``-<digits>.`` count.
    """
    # Compile once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None or not match.group(1):
            raise ValueError(f"No item count found in TFRecord file name: {filename!r}")
        counts.append(int(match.group(1)))
    # Explicit integer dtype so an empty list yields integer 0 rather than float 0.0.
    return np.sum(counts, dtype=np.int64)

# Totals derived from the counts embedded in the TFRecord file names.
num_training_images = count_data_items(train_filenames)
num_test_images = count_data_items(test_filenames)
print(f"Dataset: {num_training_images} training images, {num_test_images} unlabeled test images")

## Load jpeg data

In [None]:
def load_image(image_path, label):
    """Read the file at ``image_path``, decode it with ``decode_image``, and pair it with ``label``."""
    raw_bytes = tf.io.read_file(image_path)
    return decode_image(raw_bytes), label

# image_paths = train["image_path"].values
# labels = train["target"].values

def load_jpeg_dataset(image_paths, labels):
    """Build a dataset of (image, label) pairs from jpeg file paths.

    Args:
        image_paths: Paths of the jpeg files to read.
        labels: Labels aligned element-wise with ``image_paths``.

    Returns:
        An unbatched ``tf.data.Dataset`` of ``load_image`` results, in the
        same order as the inputs.
    """
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    # Read and decode files in parallel; the default deterministic setting
    # keeps the output order identical to the sequential map used before.
    dataset = dataset.map(load_image, num_parallel_calls=AUTOTUNE)
    return dataset

# Data Augmentation

In [None]:
def augmentation_pipeline(image, label):
    """Randomly flip the image left-right; the label is returned unchanged."""
    flipped = tf.image.random_flip_left_right(image)
    return flipped, label

# Get datasets

In [None]:
def get_training_dataset(tfrecord=True, filenames=None, image_paths=None, labels=None):
    """Build the augmented, shuffled, endlessly repeating, batched training pipeline.

    Args:
        tfrecord: Read from TFRecord ``filenames`` when true, otherwise from
            jpeg ``image_paths`` / ``labels``.
        filenames: TFRecord files (used when ``tfrecord`` is true).
        image_paths: Jpeg paths (used when ``tfrecord`` is false).
        labels: Labels matching ``image_paths``.
    """
    source = (
        load_dataset(filenames, labeled=True)
        if tfrecord
        else load_jpeg_dataset(image_paths, labels)  # jpeg
    )
    return (
        source
        .map(augmentation_pipeline, num_parallel_calls=AUTOTUNE)
        .shuffle(SHUFFLE_BUFFER_SIZE)
        .repeat()
        .batch(BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

def get_validation_dataset(ordered=False, tfrecord=True, filenames=None, image_paths=None, labels=None):
    """Build the batched, cached validation pipeline (no augmentation, shuffle or repeat).

    Args:
        ordered: Forwarded to ``load_dataset``; only used on the TFRecord path.
        tfrecord: Read from TFRecord ``filenames`` when true, otherwise from
            jpeg ``image_paths`` / ``labels``.
        filenames: TFRecord files (used when ``tfrecord`` is true).
        image_paths: Jpeg paths (used when ``tfrecord`` is false).
        labels: Labels matching ``image_paths``.
    """
    source = (
        load_dataset(filenames, labeled=True, ordered=ordered)
        if tfrecord
        else load_jpeg_dataset(image_paths, labels)  # jpeg
    )
    return source.batch(BATCH_SIZE).cache().prefetch(AUTOTUNE)

def get_test_dataset(ordered=False, filenames=None):   # only use TFRecord for predicting test data
    """Build the batched test pipeline from TFRecord files.

    Args:
        ordered: Forwarded to ``load_dataset``.
        filenames: TFRecord files to read. Defaults to the notebook-level
            ``test_filenames``. The previous body referenced an undefined
            ``filenames`` name and raised NameError when called.

    Returns:
        A batched, prefetched dataset of ``(image, image_name)`` elements.
    """
    if filenames is None:
        filenames = test_filenames
    dataset = load_dataset(filenames, labeled=False, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

In [None]:
def show_batch(image_batch, label_batch):
    """Plot up to 25 images of a batch in a 5x5 grid, titled by class.

    Args:
        image_batch: Batch of images with pixel values on a 0-255 scale; they
            are divided by 255 for display.
        label_batch: Labels aligned with ``image_batch``; a truthy label is
            titled "Malignant", otherwise "Benign".
    """
    plt.figure(figsize=(10,10))
    image_batch = tf.cast(image_batch, tf.float32) / 255.0
    # Cap at the batch size so a batch with fewer than 25 images does not raise IndexError.
    n_shown = min(25, int(image_batch.shape[0]))
    for n in range(n_shown):
        plt.subplot(5,5,n+1)
        plt.imshow(image_batch[n])
        plt.title("Malignant" if label_batch[n] else "Benign")
        plt.axis("off")

In [None]:
# Build the TFRecord-backed training pipeline and preview one batch from it.
# `train_dataset` is reused later by the full-data training cell.
train_dataset = get_training_dataset(filenames=train_filenames)
image_batch, label_batch = next(iter(train_dataset))
show_batch(image_batch.numpy(), label_batch.numpy())

# Train validation split using jpeg data

Since TFRecord images are grouped together into different files, performing train-validation split using patientid column would be easier with jpeg files.

Let's perform an 80-20 train-validation split that takes the target and patient_id columns into account. Train and validation datasets need 
* equal proportion of target column values.
* non-overlapping sets of patientids

In [None]:
# 5 folds -> the first fold gives roughly an 80/20 split that is stratified on
# target and keeps each patient_id entirely on one side.
splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
fold_iterator = splitter.split(train.image_name, train.target, train.patient_id)
train_index, valid_index = next(fold_iterator)

n_rows = train.shape[0]
print(len(train_index), len(valid_index))
print(len(train_index)/n_rows, len(valid_index)/n_rows)

We could also use the remaining 4 splits generated above, but we'll only use the 1st split for quick iteration and then train on the full data once we identify the best model and hyperparameters.

In [None]:
# The splitter returns POSITIONAL row indices, so select with .iloc.
# (.loc only gave the same rows because `train` has a default RangeIndex.)
train_data = train.iloc[train_index, :]
valid_data = train.iloc[valid_index, :]

In [None]:
# Check that no patient id appears on both sides of the split.
trainset_patients = set(train_data.patient_id.unique())
validset_patients = set(valid_data.patient_id.unique())
venn2(subsets=(trainset_patients, validset_patients),
      set_labels=('Trainset Patient IDs', 'Validset Patient IDs'))
plt.show()

As expected, the two sets of patient ids do not overlap.

In [None]:
# Percentage of benign (0) and malignant (1) rows in each side of the split.
t_target_counts = train_data.target.value_counts()
v_target_counts = valid_data.target.value_counts()

for class_counts in (t_target_counts, v_target_counts):
    total = sum(class_counts)
    print(class_counts[0]*100/total, class_counts[1]*100/total)

Proportion of target categories is same in train and validation

In [None]:
# Build the jpeg-backed training pipeline from the train split and preview one batch.
train_jpeg_dataset = get_training_dataset(tfrecord=False, 
                                          image_paths=train_data.image_path.values, 
                                          labels=train_data.target.values)
image_batch, label_batch = next(iter(train_jpeg_dataset))
show_batch(image_batch.numpy(), label_batch.numpy())

In [None]:
# Build the jpeg-backed validation pipeline from the validation split and preview one batch.
valid_jpeg_dataset = get_validation_dataset(tfrecord=False,
                                            image_paths=valid_data.image_path.values,
                                            labels=valid_data.target.values)
image_batch, label_batch = next(iter(valid_jpeg_dataset))
show_batch(image_batch.numpy(), label_batch.numpy())

Recreate the train and validation datasets, since we have already drawn a batch from each of them for show_batch.

In [None]:
# Fresh pipelines for model fitting, one per split, each built by its own factory.
train_jpeg_dataset, valid_jpeg_dataset = (
    build_dataset(tfrecord=False,
                  image_paths=split.image_path.values,
                  labels=split.target.values)
    for build_dataset, split in ((get_training_dataset, train_data),
                                 (get_validation_dataset, valid_data))
)

In [None]:
# Build the classifier only when one of the training modes is enabled.
if TRAIN_ON_FULL_DATA or TRAIN_VALID_SPLIT:

    # EfficientNetV2-S backbone without its classification top.
    # include_preprocessing=True keeps the backbone's own input preprocessing,
    # presumably why decode_image does not rescale pixels (TODO confirm).
    base_model = keras.applications.EfficientNetV2S(
        include_top=False,
        input_shape=(*IMAGE_RESIZE, 3),
        include_preprocessing=True
    )
    # Freeze the backbone so only the new head below is trainable.
    base_model.trainable = False
        
    inputs = keras.Input(shape=(*IMAGE_RESIZE, 3))
    # training=False: call the backbone in inference mode.
    x = base_model(inputs, training=False)
    # Head: pool the feature maps, apply dropout, then a single sigmoid unit
    # for the binary target.
    x = keras.layers.GlobalAveragePooling2D()(x)
    x = keras.layers.Dropout(0.2)(x)
    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
    
    model = keras.Model(inputs, outputs)
    
    model.summary(show_trainable=True)

In [None]:
# Compile only when one of the training modes is enabled.
if TRAIN_ON_FULL_DATA or TRAIN_VALID_SPLIT:

    # Learning rate starts at 1e-5 and is multiplied by 0.9 every 10000 steps.
    lr_scheduler = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-5, 
        decay_steps=10000, 
        decay_rate=0.9)
    
    # Use the same `keras` namespace as the model and the schedule; mixing in
    # `tf.keras` can pair objects from different Keras implementations.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr_scheduler, epsilon=0.001),
        loss='binary_crossentropy',  
        metrics=[keras.metrics.AUC(name='auc')])

In [None]:
if TRAIN_VALID_SPLIT:
    # Number of batches needed to cover each split once (last batch may be partial).
    steps_per_epoch, validation_steps = (
        int(np.ceil(len(split) / BATCH_SIZE)) for split in (train_data, valid_data)
    )
    # The training dataset repeats forever, so steps_per_epoch defines an epoch.
    history = model.fit(
        train_jpeg_dataset,
        epochs=EPOCHS,
        steps_per_epoch=steps_per_epoch,
        validation_data=valid_jpeg_dataset,
        validation_steps=validation_steps,
    )

In [None]:
if TRAIN_VALID_SPLIT:
    # One figure per metric, each comparing the training and validation curves.
    history_frame = pd.DataFrame(history.history)
    for metric_pair in (['loss', 'val_loss'], ['auc', 'val_auc']):
        history_frame.loc[:, metric_pair].plot()

In [None]:
if TRAIN_ON_FULL_DATA:
    # Batches per pass over the TFRecord training set (last batch may be partial).
    batches_per_pass = np.ceil(num_training_images / BATCH_SIZE)
    steps_per_epoch = int(batches_per_pass)
    history = model.fit(
        train_dataset,
        epochs=EPOCHS,
        steps_per_epoch=steps_per_epoch,
    )

In [None]:
if TRAIN_ON_FULL_DATA:
    # Training loss and AUC per epoch on one chart (no validation curves in this mode).
    history_frame = pd.DataFrame(history.history)
    tracked_metrics = ['loss', 'auc']
    history_frame.loc[:, tracked_metrics].plot()

In [None]:
# Persist the model trained on the full data, in the native .keras format.
if TRAIN_ON_FULL_DATA:
    model.save("effnetv2_s_1.keras")

In [None]:
# For test prediction, replace `model` with a previously saved model loaded
# from a Kaggle input path.
if TEST_PREDICT:
    model = keras.models.load_model("/kaggle/input/melanoma-classification/tensorflow2/effnetv2_s_1/1/effnetv2_s_1.keras")
    model.summary()

# Predict on test data

In [None]:
if TEST_PREDICT:
    # ordered=True keeps a deterministic element order, so the image names
    # read from this dataset afterwards line up with the predictions.
    test_dataset = get_test_dataset(ordered=True)
    # The model only takes images; drop the image_name component.
    test_images = test_dataset.map(lambda image, idnum: image)
    # `steps` must be an integer; np.ceil alone returns a float.
    prediction_steps = int(np.ceil(num_test_images / BATCH_SIZE))
    prediction_prob = model.predict(test_images, steps=prediction_steps)

    print(prediction_prob)

# Create submission file

In [None]:
if TEST_PREDICT:
    image_names = np.array([img_name.numpy().decode("utf-8") 
                            for img, img_name in iter(test_dataset.unbatch())])
    
    submission = pd.DataFrame(dict(image_name=image_names, target=prediction_prob[:, 0]))
    submission = submission.sort_values('image_name') 
    submission.to_csv('submission.csv', index=False)
    !head submission.csv

# References
* https://www.kaggle.com/code/jessemostipak/getting-started-tpus-cassava-leaf-disease
* https://www.kaggle.com/code/amyjang/tensorflow-transfer-learning-melanoma
* https://www.kaggle.com/code/cdeotte/triple-stratified-kfold-with-tfrecords
* https://www.kaggle.com/code/ibtesama/siim-baseline-keras-vgg16
