In [None]:
# kaggle/python Docker image: https://github.com/kaggle/docker-python
# Current preserved directory (/kaggle/working/) 20GB
# Temporary files (unpreserved after session) /kaggle/temp/
#import numpy as np
import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt
#import matplotlib.image as mpimg
import tensorflow as tf
#from tensorflow.keras.preprocessing.image import ImageDataGenerator
#from tensorflow.keras.preprocessing import image
#from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image
from tensorflow import keras

# Root folder of the competition dataset; every other input path is derived from it.
FILE_PATH_BASE = '/kaggle/input/histopathologic-cancer-detection'

# Image folders (trailing slash kept) and the CSV holding the training labels.
TRAIN_FILES_PATH = f'{FILE_PATH_BASE}/train/'
TEST_FILES_PATH = f'{FILE_PATH_BASE}/test/'
Y_TRAIN_PATH = f'{FILE_PATH_BASE}/train_labels.csv'

# File names (extension included) found in each image folder.
TRAIN_FILES, TEST_FILES = os.listdir(TRAIN_FILES_PATH), os.listdir(TEST_FILES_PATH)

# Description of the project, data and objectives
The main objective of this project is to develop an algorithm that identifies metastatic cancer in small image patches taken from larger digital pathology scans. Model performance is evaluated by the area under the ROC curve (AUC) computed between the predicted probabilities and the observed targets. The data is already divided into two separate folders:

1. Train data
2. Test data

The true labels for the training data are provided in a separate CSV file.

Here are the size and dimensions of the data:

In [None]:
# Dataset sizes, taken from the two directory listings.
n_samples_train, n_samples_test = len(TRAIN_FILES), len(TEST_FILES)

# Read a single training image only to inspect its array dimensions.
image = plt.imread(f'{FILE_PATH_BASE}/train/{TRAIN_FILES[0]}')
print(image.shape)

# One print call, one line per count.
print(
    f'Number of training images: {n_samples_train}',
    f'Number of test images: {n_samples_test}',
    f'Number of total images: {n_samples_train+n_samples_test}',
    sep='\n',
)

The dataset contains 277,483 images in total: 220,025 ($\approx$ 80%) for training and 57,458 ($\approx$ 20%) for testing. Loading one image shows that its dimensions are $96 \times 96 \times 3$ (height $\times$ width $\times$ color channels).

# EDA
Below is a sample of pathology images labelled positive and negative. After some brief research into how specialists classify these images (I am by no means one myself), the main difference between them seems to be the shape of the tissue in the tumors: tissue with irregular shape and size might indicate metastatic cancer.

It is also important to note the following detail from the competition instructions: "A positive label indicates that the center 32x32px region of a patch contains at least one pixel of tumor tissue. Tumor tissue in the outer region of the patch does not influence the label. This outer region is provided to enable fully-convolutional models that do not use zero-padding, to ensure consistent behavior when applied to a whole-slide image."

In [None]:
train_labels_df = pd.read_csv(Y_TRAIN_PATH)

# Draw a few random image ids per class; the fixed seed keeps the gallery
# reproducible across kernel restarts.
n_per_class = 10
sample_true = train_labels_df.query('label == 1').sample(n_per_class, random_state=2023)['id'].values
sample_false = train_labels_df.query('label == 0').sample(n_per_class, random_state=2023)['id'].values

# One figure (a single row of thumbnails) per class.
for title, sample_ids in (('True', sample_true), ('False', sample_false)):
    fig = plt.figure(figsize=(25, 4))
    for idx, id_ in enumerate(sample_ids):
        ax = fig.add_subplot(1, len(sample_ids), idx + 1, xticks=[], yticks=[])
        ax.set_title(title)
        # The context manager closes the file handle; np.asarray loads the pixels
        # into memory before the file is closed. Drawing goes through `ax`
        # explicitly instead of the pyplot "current axes" state.
        with Image.open(f'{FILE_PATH_BASE}/train/{id_}.tif') as im:
            ax.imshow(np.asarray(im))

In [None]:
# Count images per class, indexed explicitly by label value (0 -> 'False', 1 -> 'True').
# value_counts() orders by frequency, so unpacking its result positionally would silently
# swap the classes whenever label 1 happens to be the more frequent one.
counts = train_labels_df.label.value_counts().reindex([0, 1], fill_value=0)
f_counts, t_counts = counts.loc[0], counts.loc[1]
print(f'False counts: {f_counts},',f'True counts: {t_counts},', f'Imbalance Ratio: {round(f_counts/t_counts,2)}')

# Pie slices follow the [0, 1] index order, matching the labels below.
fig, ax = plt.subplots()
ax.pie(counts, labels=['False', 'True'], autopct='%1.1f%%')
ax.set_title('Class distribution of the training labels')
plt.show()

The dataset is imbalanced, so it is best to balance it before training; below, the majority (negative) class is undersampled to match the number of positive samples.

The plan for the analysis and the architecture is the following:
* Further split the training dataset into training and validation sets (80-20 split).
* Balance the dataset.
* Use a convolutional neural network (CNN) to compress the images into features for classification.
* Tweak hyperparameters such as the number of filters, number of layers, activation functions, loss function, and learning rate.
* Plot the training vs. validation loss over the epochs.
* Plot the training vs. validation AUC over the epochs.
* Predict on the test data and get the results.

In [None]:
# Undersample to a 50/50 class split. The per-class size is derived from the data
# (size of the smaller class) instead of a hard-coded count, and the seed makes
# the selection reproducible.
positives = train_labels_df[train_labels_df.label == 1]
negatives = train_labels_df[train_labels_df.label == 0]
n_per_class = min(len(positives), len(negatives))
train_labels_df_balanced = pd.concat([
    positives.sample(n_per_class, random_state=2023),
    negatives.sample(n_per_class, random_state=2023),
])

# Index the counts by label value (0 -> 'False', 1 -> 'True') so the unpacking and the
# pie labels do not depend on value_counts' frequency ordering (ambiguous on a tie).
counts = train_labels_df_balanced.label.value_counts().reindex([0, 1], fill_value=0)
f_counts, t_counts = counts.loc[0], counts.loc[1]
print(f'False counts: {f_counts},',f'True counts: {t_counts},', f'Imbalance Ratio: {round(f_counts/t_counts,2)}')
fig, ax = plt.subplots()
ax.pie(counts, labels=['False', 'True'], autopct='%1.1f%%')
ax.set_title('Class distribution after balancing')
plt.show()

# Persist the balanced label table for the data-loading class.
train_labels_df_balanced.to_csv('/kaggle/working/balanced_train_labels.csv', index=False)

In [None]:
FILE_BASE = '/kaggle/input/histopathologic-cancer-detection'
class project_data:
    """Train/validation/test file bookkeeping plus endless batch generators.

    The labels CSV (columns ``id`` and ``label``) is shuffled with a fixed seed and
    split into disjoint training and validation id lists. Both of those splits are
    read from ``img_train_path``; the test split lists the files in ``img_test_path``.

    Args:
        img_train_path: Folder containing ``<id>.tif`` images for train and validation.
        img_test_path: Folder containing the test images.
        csv_train_labels_path: CSV file with ``id`` and ``label`` columns.
        validation_pct: Fraction of the labelled rows held out for validation.
        batch_size: Number of images per generated batch.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``validation_pct`` is
            outside ``[0, 1)``.
    """

    def __init__(self, img_train_path: str, img_test_path: str, csv_train_labels_path: str, validation_pct: float = 0.2, batch_size: int = 25):
        if batch_size <= 0:
            raise ValueError(f'batch_size must be positive, got {batch_size}')
        if not 0 <= validation_pct < 1:
            raise ValueError(f'validation_pct must be in [0, 1), got {validation_pct}')
        self.batch_size = batch_size
        # Keep the folders handed in by the caller instead of relying on a global.
        self.img_train_path = img_train_path
        self.img_test_path = img_test_path
        # Fixed seed: the same shuffle (and therefore the same split) on every run.
        df_train_labels = pd.read_csv(csv_train_labels_path).sample(frac = 1, random_state=2023)
        self.n_samples = df_train_labels.shape[0]
        self.n_train_samples = int(self.n_samples*(1-validation_pct))
        self.train_labels = df_train_labels.set_index('id')["label"].to_dict()
        shuffled_ids = df_train_labels.id.values
        self.data_files = {
            # Slices meet at n_train_samples without overlapping, so no training
            # image leaks into the validation set.
            'train': shuffled_ids[:self.n_train_samples],
            'validation': shuffled_ids[self.n_train_samples:],
            # os.listdir returns file names that already include the extension.
            'test': os.listdir(img_test_path)
        }
        # Ceiling division: one epoch covers every sample, including a final
        # partial batch, which keeps epochs aligned with the generator wrap-around.
        self.train_steps_per_epoch = int(np.ceil(self.n_train_samples/batch_size))
        self.n_validation_samples = len(self.data_files['validation'])
        self.validation_steps = int(np.ceil(self.n_validation_samples/batch_size))

    def _image_path(self, dataset: str, name: str) -> str:
        """Return the on-disk path of one entry of ``self.data_files[dataset]``."""
        if dataset == 'test':
            # Test entries are directory listings, i.e. full file names.
            return os.path.join(self.img_test_path, name)
        # Train and validation entries are bare ids of images in the train folder.
        return os.path.join(self.img_train_path, f'{name}.tif')

    def _read_image(self, path: str):
        """Load a single image file as an array."""
        return plt.imread(path)

    def get_batch_function(self, dataset: str = 'train'):
        """Yield ``(images, labels)`` batches for ``dataset`` forever.

        Args:
            dataset: One of ``'train'``, ``'validation'`` or ``'test'``.

        Yields:
            A tuple of two arrays: the stacked images of the batch and their labels.
            For ``'test'`` there are no labels, so the second array holds ``None``
            placeholders (one per image).

        Raises:
            KeyError: If ``dataset`` is not a known split (raised on first iteration).
            ValueError: If the requested split contains no files (raised on first
                iteration).
        """
        files = self.data_files[dataset]
        if len(files) == 0:
            raise ValueError(f'no files available for dataset {dataset!r}')
        is_test = dataset == 'test'
        c = 0
        while True:
            batch = files[c:c+self.batch_size]
            images = np.array([self._read_image(self._image_path(dataset, f)) for f in batch])
            labels = np.array([None if is_test else self.train_labels[f] for f in batch])
            yield (images, labels)
            c += self.batch_size
            # Start over once the split is exhausted so later epochs get real data
            # instead of empty slices.
            if c >= len(files):
                c = 0

In [None]:
# Build the dataset helper from the balanced labels file, then open one
# endless batch generator per split.
data = project_data(
    img_train_path=TRAIN_FILES_PATH,
    img_test_path=TEST_FILES_PATH,
    csv_train_labels_path='/kaggle/working/balanced_train_labels.csv',
    batch_size=32,
)
train_generator, validation_generator, test_generator = (
    data.get_batch_function(split) for split in ('train', 'validation', 'test')
)

In [None]:
gpus = tf.config.list_logical_devices('GPU')
print("Number of available GPUs: ", len(gpus))

# MirroredStrategy expects device *names*. With no GPU visible, pass None so the
# strategy picks its own default devices instead of receiving an empty list.
strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus] or None)

# Model construction AND compilation happen inside the strategy scope so that every
# variable they create (weights, optimizer slots, metric state) is owned by the strategy.
with strategy.scope():
    # NOTE(review): images are fed exactly as the batch generator yields them; no
    # pixel rescaling is applied in this cell.
    model = keras.Sequential([
        # 32 filters: a model hyperparameter, deliberately independent of the batch size.
        keras.layers.Conv2D(32,(3,3),activation='relu',input_shape=(96,96,3)),
        keras.layers.MaxPool2D(2,2),
        keras.layers.Conv2D(64,(3,3),activation='relu'),
        keras.layers.MaxPool2D(2,2),
        keras.layers.Conv2D(128,(3,3),activation='relu'),
        keras.layers.MaxPool2D(2,2),
        keras.layers.Conv2D(128,(3,3),activation='relu'),
        keras.layers.MaxPool2D(2,2),
        keras.layers.Flatten(),
        keras.layers.Dense(512,activation='relu'),
        # Single sigmoid unit: predicted probability of the positive class.
        keras.layers.Dense(1,activation='sigmoid')
    ])
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy', 'AUC'])

# The generators are endless, so the step counts define where each epoch ends.
history = model.fit(
    train_generator,
    epochs = 10,
    steps_per_epoch=data.train_steps_per_epoch,
    validation_data=validation_generator,
    validation_steps=data.validation_steps,
    verbose=1
)
    
    #model.evaluate(validation_generator)
    #validation_generator.reset()
    #preds = model.predict(validation_generator, verbose=1)
    
    #fpr, tpr, _ = roc_curve(validation_generator.classes, preds)
    #roc_auc = auc(fpr, tpr)
    #plt.figure()
    #lw = 2
    #plt.plot(fpr, tpr, color='darkorange',
    #lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    #plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    #plt.xlabel('False Positive Rate')
    #plt.ylabel('True Positive Rate')
    #plt.title('Receiver operating characteristic example')
    #plt.legend(loc="lower right")
    #plt.show()

In [None]:
# Tabulate the per-epoch metrics recorded by model.fit; the bare History object
# would only display its repr.
pd.DataFrame(history.history).rename_axis('epoch')