In [None]:
# kaggle/python Docker image: https://github.com/kaggle/docker-python
# Current preserved directory (/kaggle/working/) 20GB
# Temporary files (unpreserved after session) /kaggle/temp/

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image
from tensorflow import keras
from typing import Tuple
from keras.layers import MaxPool2D, Conv2D, Flatten, Dense, BatchNormalization, Activation, Dropout
from keras import Sequential
from sklearn.metrics import auc, roc_curve
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.utils.vis_utils import plot_model

# Root of the competition dataset.
FILE_PATH_BASE = '/kaggle/input/histopathologic-cancer-detection'
# Directories holding the training / test image patches.
TRAIN_FILES_PATH = f'{FILE_PATH_BASE}/train/'
TEST_FILES_PATH = f'{FILE_PATH_BASE}/test/'
# CSV with the `id` and `label` columns for the training images.
TRAIN_LABELS_PATH = f'{FILE_PATH_BASE}/train_labels.csv'
# os.listdir returns entries in arbitrary order; sort the listings so that
# TRAIN_FILES[0] and the row order derived from TEST_FILES are identical on every run.
TRAIN_FILES = sorted(os.listdir(TRAIN_FILES_PATH))
TEST_FILES = sorted(os.listdir(TEST_FILES_PATH))

# Description of the project, data and objectives
This project's main objective is to develop an algorithm that can identify metastatic cancer in small image patches taken from larger digital pathology scans. The performance of the model is evaluated on the area under the ROC curve between the predicted probability and the observed target. The data is already divided into two separate folders:

1. Train data
2. Test data

The true labels of the train data are provided in a separate CSV file.

Here are the size and dimensions of the data:

In [None]:
# Count the image files listed for each split.
n_samples_train, n_samples_test = len(TRAIN_FILES), len(TEST_FILES)

# Read a single training patch to inspect the per-image array shape.
image = plt.imread(f'{FILE_PATH_BASE}/train/{TRAIN_FILES[0]}')
print(image.shape)

# One print call, one line per statistic.
print(
    f'Number of training images: {n_samples_train}',
    f'Number of test images: {n_samples_test}',
    f'Number of total images: {n_samples_train+n_samples_test}',
    sep='\n',
)

The dataset contains 277,483 total images, with 220,025 ($\approx$ 80%) images for training and 57,458 ($\approx$ 20%) images for testing. After loading one image we can see the dimensions are 96 X 96 X 3.

# EDA
Below is a sample of pathology images that are positive and negative. After some brief research on how specialists classify these images (I am not one by any means), it seems that the main difference between them is the shape of the tissue in the tumors. If the tissue has an irregular shape and size, this might indicate metastatic cancer.

It is also important to note the following detail in the instruction: "A positive label indicates that the center 32x32px region of a patch contains at least one pixel of tumor tissue. Tumor tissue in the outer region of the patch does not influence the label. This outer region is provided to enable fully-convolutional models that do not use zero-padding, to ensure consistent behavior when applied to a whole-slide image"

In [None]:
# Seed for the example draw so the same patches are shown on every run.
SAMPLE_SEED = 2023
N_SAMPLES_PER_CLASS = 10


def plot_labelled_samples(image_ids, title: str, image_dir: str = TRAIN_FILES_PATH):
    """Show the given training patches side by side in a single row.

    Parameters
    ----------
    image_ids : sequence of image ids (file names without the `.tif` extension).
    title : title drawn above every patch.
    image_dir : directory the `.tif` files are read from.
    """
    fig, axes = plt.subplots(
        1, len(image_ids), figsize=(25, 4), squeeze=False,
        subplot_kw={'xticks': [], 'yticks': []},
    )
    for ax, id_ in zip(axes[0], image_ids):
        ax.set_title(title)
        # Draw on the explicit axes (not the implicit "current" one) and close the
        # file handle as soon as the image has been handed to matplotlib.
        with Image.open(os.path.join(image_dir, f'{id_}.tif')) as im:
            ax.imshow(im)
    return fig


train_labels_df = pd.read_csv(TRAIN_LABELS_PATH)
sample_true = (
    train_labels_df.query('label == 1')
    .sample(N_SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)['id']
    .values
)
sample_false = (
    train_labels_df.query('label == 0')
    .sample(N_SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)['id']
    .values
)
plot_labelled_samples(sample_true, 'True')
plot_labelled_samples(sample_false, 'False')
plt.show()

In [None]:
# Class balance of the training labels.
# value_counts() orders rows by frequency, not by label. Sorting by label guarantees
# that position 0 is the negative class and position 1 the positive class, which is
# what the unpacking and the pie labels below assume.
counts = train_labels_df.label.value_counts().sort_index()
f_counts, t_counts = counts
print(f'False counts: {f_counts},',f'True counts: {t_counts},', f'Imbalance Ratio: {round(f_counts/t_counts,2)}')
fig, ax = plt.subplots()
ax.pie(counts, labels=['False', 'True'], autopct='%1.1f%%')
ax.set_title('Training label distribution')
plt.show()

The dataset is imbalanced, but not to an extent that compromises training with the full set. 

The plan for the analysis and the architecture will be the following.
* Further separate the training dataset, into training and validation (80-20) split.
* Balance the dataset
* Use a convolutional neural network to compress the images
* Tweak hyperparameters such as the number of filters, number of layers, activation functions, loss function and learning rate
* Plot the training vs. validation loss over epochs
* Plot the training vs. validation AUC over epochs
* Predict test data and get the results.

In [None]:
# Seed for the down-sampling and shuffle so the balanced set is reproducible.
BALANCE_SEED = 2023

positive_rows = train_labels_df[train_labels_df.label == 1]
negative_rows = train_labels_df[train_labels_df.label == 0]
# Down-sample to the size of the smaller class instead of a hard-coded row count.
n_per_class = min(len(positive_rows), len(negative_rows))

# Shuffle the *balanced* frame; the original shuffled `train_labels_df` here, which
# silently replaced the balanced set with the full, imbalanced one.
train_labels_df_balanced = (
    pd.concat([
        positive_rows.sample(n_per_class, random_state=BALANCE_SEED),
        negative_rows.sample(n_per_class, random_state=BALANCE_SEED),
    ])
    .sample(frac=1, random_state=BALANCE_SEED)
    .reset_index(drop=True)
)

# Both classes now have the same count, so frequency ordering is a tie; sort by label
# to keep position 0 = False and position 1 = True.
counts = train_labels_df_balanced.label.value_counts().sort_index()
f_counts, t_counts = counts
print(f'False counts: {f_counts},',f'True counts: {t_counts},', f'Imbalance Ratio: {round(f_counts/t_counts,2)}')
fig, ax = plt.subplots()
ax.pie(counts, labels=['False', 'True'], autopct='%1.1f%%')
ax.set_title('Balanced training label distribution')
plt.show()

# Persist both label sets to the working directory.
train_labels_df_balanced.to_csv('/kaggle/working/balanced_train_labels.csv', index=False)
train_labels_df.to_csv('/kaggle/working/train_labels.csv', index=False)

In [None]:
class project_data:
    """Builds the Keras image generators for the train, validation and test splits.

    Parameters
    ----------
    train_path : directory containing the labelled training images.
    test_path : directory containing the test images.
    csv_label_path : CSV with an `id` column (image file name without `.tif`) and
        the label column named by `y_col`.
    seed : seed passed to the training and validation generators.
    batch_size : batch size of the training and validation generators; the test
        generator uses twice this value.
    validation_split : fraction of the labelled rows reserved for validation.
    target_size : `target_size` passed to every generator.
    y_col : name of the label column in the CSV.
    test_files : optional iterable of file names inside `test_path`. Defaults to the
        notebook-level `TEST_FILES` listing so the directory is not scanned twice.

    Attributes
    ----------
    df_label_mapping : labelled rows with an added `file_name` column and string labels.
    df_test_mapping : one row per test file with `id` and `file_name`.
    train_generator, validation_generator, test_generator : the Keras iterators.
    """

    def __init__(self, train_path: str, test_path: str, csv_label_path: str, seed: int, batch_size: int = 32, validation_split = 0.2, target_size: Tuple[int,int] = (96,96), y_col: str = 'label', test_files = None):
        self.batch_size = batch_size
        self.target_size = target_size
        self.y_col = y_col

        self.df_label_mapping = pd.read_csv(csv_label_path).assign(
            file_name = lambda df_: df_.id + '.tif',
        )
        # flow_from_dataframe with class_mode='binary' expects the labels as strings.
        self.df_label_mapping[y_col] = self.df_label_mapping[y_col].astype('str')

        if test_files is None:
            # Reuse the listing made at the top of the notebook (avoids a duplicate scan).
            test_files = TEST_FILES
        # splitext instead of f[:-4]: does not assume a three-letter extension.
        self.df_test_mapping = pd.DataFrame(
            [[os.path.splitext(f)[0], f] for f in test_files],
            columns=['id', 'file_name'],
        )

        # NOTE: augmentation options such as rotation_range / horizontal_flip /
        # vertical_flip are ImageDataGenerator constructor arguments; add them here
        # (not to flow_from_dataframe) if augmentation is wanted.
        image_gen = ImageDataGenerator(
            rescale=1./255,
            validation_split=validation_split,
        )

        # Arguments shared by the training and validation iterators; only `subset` differs.
        labelled_flow_kwargs = dict(
            dataframe=self.df_label_mapping,
            directory=train_path,
            x_col='file_name',
            y_col=y_col,
            target_size=target_size,
            batch_size=batch_size,
            class_mode='binary',
            color_mode='rgb',
            shuffle=True,
            seed=seed,
        )
        self.train_generator = image_gen.flow_from_dataframe(
            subset='training', **labelled_flow_kwargs
        )
        self.validation_generator = image_gen.flow_from_dataframe(
            subset='validation', **labelled_flow_kwargs
        )

        # Unlabelled test iterator: shuffle=False keeps predictions in the same row
        # order as df_test_mapping.
        self.test_generator = ImageDataGenerator(
            rescale=1./255
        ).flow_from_dataframe(
            dataframe=self.df_test_mapping,
            directory=test_path,
            x_col='file_name',
            y_col=None,
            target_size=target_size,
            batch_size=batch_size*2,
            class_mode=None,
            color_mode='rgb',
            shuffle=False,
        )

In [None]:
# Train on the full label set. Use the absolute path the CSV was written to, so this
# cell does not depend on the notebook's current working directory.
# NOTE(review): balanced_train_labels.csv is also written earlier but is not used
# here - confirm that training on the unbalanced set is intended.
data = project_data(TRAIN_FILES_PATH, TEST_FILES_PATH, '/kaggle/working/train_labels.csv', seed=2023)

# Model Architecture
I tried different architectures; the one that had the best performance has the following structure:
1. Three convolutional blocks (convolutional layer, normalization, activation: relu, convolutional layer, normalization, activation: relu, max pool) with no. filters (32,64,96), kernels (3,3)
2. A densely connected layer with 256 units
3. A final densely connected layer with a sigmoid activation function

I played around with different parameters like input size, number of filters, normalization, data augmentation (flips & rotations, etc), number of parameters, dropout, learning rate, optimization, epochs etc.

In [None]:
# Filters in the first convolutional block; later blocks use 2x and 3x this value.
# Kept as its own constant: the original derived the filter counts from
# data.batch_size, so tuning the batch size would silently change the architecture.
BASE_FILTERS = 32


def conv_block(filters: int, **first_conv_kwargs) -> list:
    """Return the layers of one block: two (3x3 Conv -> BatchNorm -> ReLU) stages + 2x2 max pool.

    `first_conv_kwargs` are forwarded to the first Conv2D only (used for `input_shape`).
    """
    return [
        Conv2D(filters, (3, 3), **first_conv_kwargs),
        BatchNormalization(),
        Activation('relu'),
        Conv2D(filters, (3, 3)),
        BatchNormalization(),
        Activation('relu'),
        MaxPool2D(2, 2),
    ]


gpus = tf.config.list_logical_devices('GPU')
print("Number of available GPUs: ", len(gpus))
strategy = tf.distribute.MirroredStrategy(gpus)

# Build and compile inside the strategy scope so the model is created under it.
with strategy.scope():
    model = Sequential([
        # Input size follows the generators' target size (RGB -> 3 channels).
        *conv_block(BASE_FILTERS, input_shape=(*data.target_size, 3)),
        *conv_block(BASE_FILTERS * 2),
        *conv_block(BASE_FILTERS * 3),

        Flatten(),
        Dense(256, use_bias=False),
        BatchNormalization(),
        Activation('relu'),

        Dense(1, activation='sigmoid'),
    ])

    # The project is scored on ROC AUC, so track it next to accuracy.
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
    )

history = model.fit(
    data.train_generator,
    epochs=10,
    validation_data=data.validation_generator,
    verbose=1,
    callbacks=[
        # Stop after 2 epochs without val_loss improvement and restore the best weights.
        EarlyStopping(monitor='val_loss', min_delta=0, patience=2, mode='auto', restore_best_weights=True),
    ],
)

In [None]:
# Write the layer diagram to disk, then load the PNG back for inline display.
model_plot_path = 'model.png'
plot_model(
    model,
    to_file=model_plot_path,
    show_layer_names=True,
    show_shapes=True,
)
display(Image.open(model_plot_path))

In [None]:
def plot_training_curve(history, metric: str, title: str, ylabel: str) -> None:
    """Plot one metric from a Keras History for the training and validation data.

    Parameters
    ----------
    history : object returned by `model.fit`; its `.history` dict is read.
    metric : key of the training metric; the validation curve is read from `val_<metric>`.
    title : figure title.
    ylabel : y-axis label.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='Train')
    # These values come from validation_data passed to fit, not from the test set,
    # so the legend says "Validation" (the original said "Test").
    ax.plot(history.history[f'val_{metric}'], label='Validation')
    ax.set(title=title, xlabel='Epoch', ylabel=ylabel)
    ax.legend(loc='upper left')
    plt.show()


plot_training_curve(history, 'accuracy', 'Model accuracy', 'Accuracy')
plot_training_curve(history, 'loss', 'Model loss', 'Loss')

In [None]:
# Model outputs for every test image, in the order of data.test_generator.
test_results = model.predict(data.test_generator)
# Threshold at 0.5 to get hard 0/1 labels as a flat integer array.
above_threshold = test_results > 0.5
test_labels = above_threshold.flatten().astype(int)

In [None]:
# Predictions must line up one-to-one with the test id rows.
assert len(test_results) == len(data.df_test_mapping), 'prediction / id count mismatch'

# Submit the predicted probabilities rather than thresholded 0/1 labels: the score is
# the area under the ROC curve, which is computed from the ranking of the predictions,
# and thresholding throws that ranking away.
# .assign builds a new frame, avoiding the SettingWithCopyWarning raised by writing a
# column into the `[['id']]` slice.
df = data.df_test_mapping[['id']].assign(label=test_results.flatten())
df.to_csv('/kaggle/working/test_labels_2.csv', index=False)

# Read the file back as a sanity check; show only the first rows.
pd.read_csv('/kaggle/working/test_labels_2.csv').head()