Preprocessing from:
https://github.com/gsurma/histopathologic_cancer_detector/blob/master/histopathologic_cancer_detector.ipynb
Model from:
https://www.kaggle.com/fmarazzi/baseline-keras-cnn-roc-fast-10min-0-925-lb

-----------------------------------------
# Data visualization

In [1]:
%matplotlib inline

import numpy as np 
import pandas as pd 
from glob import glob 
from skimage.io import imread 
import os
import shutil
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
# from keras.applications.nasnet import NASNetMobile
# from keras.applications import NASNetMobile
# from keras.applications.xception import Xception
from keras.utils.vis_utils import plot_model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, Average
from keras.layers import Input, Concatenate, GlobalMaxPooling2D, BatchNormalization
from keras.layers import Conv2D, MaxPool2D, Activation
from keras.models import Model
from keras.models import Sequential
from keras.callbacks import CSVLogger, ModelCheckpoint
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import RMSprop, Adam
from keras.models import load_model
# from livelossplot import PlotLossesKeras

# import src.constants

Using TensorFlow backend.


In [2]:
# Sanity check: show which compute devices TensorFlow can see from this kernel.
from keras import backend as K
# NOTE(review): this is a private Keras-backend helper and its return value is
# discarded here (it is not the last expression of the cell), so nothing from
# this call is displayed.
K.tensorflow_backend._get_available_gpus()
from tensorflow.python.client import device_lib
# Prints every local device descriptor known to TensorFlow.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6547872457234652627
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 3927445834682178080
physical_device_desc: "device: XLA_GPU device"
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 9037226364990484591
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7390353818
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12562898854748317704
physical_device_desc: "device: 0, name: Tesla P4, pci bus id: 0000:00:04.0, compute capability: 6.1"
]


In [17]:
# Output files (all written relative to the notebook's working directory)
TRAINING_LOGS_FILE = "training_logs.csv"  # per-epoch CSVLogger output
MODEL_SUMMARY_FILE = "model_summary.txt"
MODEL_PLOT_FILE = "model_plot.png"  # architecture diagram written by create_model()
MODEL_FILE = "model.h5"  # checkpoint of the run without augmentation
MODEL_WITH_AUG_FILE = "model_with_augmentation.h5"  # checkpoint of the augmented run
TRAINING_PLOT_FILE = "training.png"  # loss curves from plot_epochs()
VALIDATION_PLOT_FILE = "validation.png"  # accuracy curves from plot_epochs()
# ROC_PLOT_FILE = "roc.png"
KAGGLE_SUBMISSION_FILE = "kaggle_submission.csv"

# Hyperparams
SAMPLE_COUNT = 85000 # rows sampled PER CLASS (label 0 and label 1) by prepare_data_for_augmentation; TODO(review): the author questioned why 85000 - confirm the rationale
TRAINING_RATIO = 0.9  # train_size passed to train_test_split
IMAGE_SIZE = 96  # images are fed to the network as IMAGE_SIZE x IMAGE_SIZE x 3
EPOCHS = 13
BATCH_SIZE = 192  # batch size of the directory generators
VERBOSITY = 1
TESTING_BATCH_SIZE = 5000  # number of test files handled per chunk in test()

In [4]:
# Data (https://www.kaggle.com/c/histopathologic-cancer-detection/data)
#input_dir = '../input/'
def prepare_data_for_augmentation(need_to_prepare = False):
    """Return the training/validation folder paths, optionally rebuilding them.

    With ``need_to_prepare`` false (the default) nothing is read or written and
    the two hard-coded paths are returned as-is.

    With ``need_to_prepare`` true the function:

    1. lists ``data/train/*.tif`` and joins the files with ``data/train_labels.csv``
       on the ``id`` column (the file name without extension);
    2. draws the same number of label-0 and label-1 rows (``SAMPLE_COUNT`` each,
       or fewer if a class is smaller than that);
    3. makes a stratified split using ``TRAINING_RATIO`` as the training share;
    4. rebuilds ``data/training/{0,1}`` and ``data/validation/{0,1}`` from
       scratch and copies each file into the folder of its split and label.

    The target folders are wiped before copying. Sampling and splitting are
    random, so keeping copies from an earlier run would let the same image
    appear in both the training and the validation tree.

    Args:
        need_to_prepare: rebuild the folder trees when true.

    Returns:
        ``[training_path, validation_path]``.

    Raises:
        FileNotFoundError: preparation was requested but no ``.tif`` file was
            found under ``data/train``.
    """
    data_dir = 'data/'
    training_path = 'data/training'
    validation_path = 'data/validation'
    if not need_to_prepare:
        return [training_path, validation_path]

    source_dir = os.path.join(data_dir, 'train')
    data_frame = pd.DataFrame({'path': glob(os.path.join(source_dir, '*.tif'))})
    if data_frame.empty:
        raise FileNotFoundError('no .tif files found in ' + source_dir)

    # Derive the id from the file name itself rather than from a fixed position
    # in the path, so the directory depth and separator do not matter.
    data_frame['id'] = data_frame['path'].map(
        lambda path: os.path.splitext(os.path.basename(path))[0])
    labels = pd.read_csv(os.path.join(data_dir, 'train_labels.csv'))
    data_frame = data_frame.merge(labels, on='id')

    # Balanced sample; clamp so a class smaller than SAMPLE_COUNT does not raise.
    negatives = data_frame[data_frame.label == 0]
    positives = data_frame[data_frame.label == 1]
    per_class = min(SAMPLE_COUNT, len(negatives), len(positives))
    data_frame = pd.concat([negatives.sample(per_class),
                            positives.sample(per_class)]).reset_index(drop=True)
    # Only paths and labels are needed to copy files; the pixel data is not
    # decoded here.
    data_frame = data_frame[['path', 'id', 'label']]

    training, validation = train_test_split(data_frame,
                                            train_size=TRAINING_RATIO,
                                            stratify=data_frame['label'])

    # Start from empty trees so files from a previous random split cannot leak
    # between training and validation.
    for folder in [training_path, validation_path]:
        if os.path.isdir(folder):
            shutil.rmtree(folder)
        for subfolder in ['0', '1']:
            os.makedirs(os.path.join(folder, subfolder), exist_ok=True)

    for split_frame, split_path in [(training, training_path), (validation, validation_path)]:
        for source, label in zip(split_frame['path'], split_frame['label']):
            destination = os.path.join(split_path, str(label), os.path.basename(source))
            if not os.path.exists(destination):
                shutil.copyfile(source, destination)

    return [training_path, validation_path]

In [5]:
def prepare_data_generator(with_augmentation, training_path, validation_path):
    """Build the three directory-backed Keras generators used by the notebook.

    Args:
        with_augmentation: when equal to ``True`` the training images are
            randomly flipped, rotated, zoomed, shifted, sheared and
            channel-shifted; otherwise they are only rescaled.
        training_path: folder with one sub-folder per class for training.
        validation_path: folder with one sub-folder per class for validation.

    Returns:
        ``[training_generator, validation_generator, testing_generator]``.
        The testing generator reads the same ``validation_path`` as the
        validation generator but with ``shuffle=False``.
    """
    # Augmentation settings are applied on top of the common 1/255 rescale.
    augmentation_options = {}
    if with_augmentation == True:
        augmentation_options = dict(
            horizontal_flip=True,
            vertical_flip=True,
            rotation_range=180,
            zoom_range=0.4,
            width_shift_range=0.3,
            height_shift_range=0.3,
            shear_range=0.3,
            channel_shift_range=0.3)
    training_data_generator = ImageDataGenerator(rescale=1./255, **augmentation_options)

    # Settings shared by every flow_from_directory call below.
    flow_options = dict(
        target_size=(IMAGE_SIZE, IMAGE_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='binary')

    training_generator = training_data_generator.flow_from_directory(
        training_path, **flow_options)

    plain_validation = ImageDataGenerator(rescale=1./255)
    validation_generator = plain_validation.flow_from_directory(
        validation_path, **flow_options)

    plain_testing = ImageDataGenerator(rescale=1./255)
    testing_generator = plain_testing.flow_from_directory(
        validation_path, shuffle=False, **flow_options)

    return [training_generator, validation_generator, testing_generator]


In [6]:
# Model
def create_model():
    """Build, compile, summarise and plot the baseline CNN.

    The network has three convolutional stages (32, 64 and 128 filters), each
    with two 3x3 convolutions followed by 2x2 max-pooling and dropout, then a
    256-unit dense layer and a single sigmoid output. It is compiled with
    Adam(0.01), binary cross-entropy and the accuracy metric. The layer summary
    is printed and an architecture diagram is written to MODEL_PLOT_FILE.

    Returns:
        The compiled Sequential model.
    """
    kernel_size = (3, 3)
    pool_size = (2, 2)
    stage_filters = (32, 64, 128)
    dropout_conv = 0.3
    dropout_dense = 0.5

    model = Sequential()

    def add_conv_bn_relu(filters):
        # Bias-free convolution -> batch norm -> ReLU.
        model.add(Conv2D(filters, kernel_size, use_bias=False))
        model.add(BatchNormalization())
        model.add(Activation("relu"))

    def close_stage():
        # Every stage ends with down-sampling and dropout.
        model.add(MaxPool2D(pool_size = pool_size))
        model.add(Dropout(dropout_conv))

    # Stage 1: the first convolution carries the input shape and its own ReLU.
    model.add(Conv2D(stage_filters[0], kernel_size, activation = 'relu',
                     input_shape = (IMAGE_SIZE, IMAGE_SIZE, 3)))
    add_conv_bn_relu(stage_filters[0])
    close_stage()

    # Stages 2 and 3: two conv/bn/relu units each.
    for filters in stage_filters[1:]:
        add_conv_bn_relu(filters)
        add_conv_bn_relu(filters)
        close_stage()

    # Classifier head.
    model.add(Flatten())
    model.add(Dense(256, use_bias=False))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(dropout_dense))
    model.add(Dense(1, activation = "sigmoid"))

    model.compile(Adam(0.01), loss = "binary_crossentropy", metrics=["accuracy"])
    model.summary()

    plot_model(model,
               to_file=MODEL_PLOT_FILE,
               show_shapes=True,
               show_layer_names=True)

    return model

In [7]:
def train(model, training_generator, validation_generator, model_name):
    """Fit ``model`` for EPOCHS epochs and return the Keras training history.

    Args:
        model: compiled Keras model to train.
        training_generator: generator yielding training batches; its length is
            used as ``steps_per_epoch``.
        validation_generator: generator yielding validation batches; its length
            is used as ``validation_steps``.
        model_name: file the checkpoint callback writes to whenever ``val_acc``
            improves.

    Returns:
        The object returned by ``fit_generator``. Callers assign it
        (``history = train(...)``) and ``plot_epochs`` reads its ``.history``.
    """
    #  Training
#     earlystopper = EarlyStopping(monitor='val_loss', patience=2, verbose=1, restore_best_weights=True)
#     reducel = ReduceLROnPlateau(monitor='val_loss', patience=1, verbose=1, factor=0.1)
    callbacks = [
        # Keep only the best model (highest validation accuracy) on disk.
        ModelCheckpoint(model_name,
                        monitor='val_acc',
                        verbose=VERBOSITY,
                        save_best_only=True,
                        mode='max'),
        # Overwrite the per-epoch log on every call.
        CSVLogger(TRAINING_LOGS_FILE,
                  append=False,
                  separator=';'),
    ]
    history = model.fit_generator(training_generator,
                                  steps_per_epoch=len(training_generator),
                                  validation_data=validation_generator,
                                  validation_steps=len(validation_generator),
                                  epochs=EPOCHS,
                                  callbacks=callbacks)
    # Hand the history back: without this the caller's `history` is None.
    return history
#     PlotLossesKeras(),


#     history = model.fit_generator(training_generator,
#                                   steps_per_epoch=len(training_generator), 
#                                   validation_data=validation_generator,
#                                   validation_steps=len(validation_generator),
#                                   epochs=EPOCHS,
#                                   verbose=0,
#                                   callbacks=[PlotLossesKeras(),
#                                              ModelCheckpoint(MODEL_FILE,
#                                                              monitor='val_acc',
#                                                              verbose=VERBOSITY,
#                                                              save_best_only=True,
#                                                              mode='max'),
#                                              CSVLogger(TRAINING_LOGS_FILE,
#                                                        append=False,
#                                                        separator=';')])

In [8]:
def plot_epochs(history):
    """Save the per-epoch loss and accuracy curves as two image files.

    Args:
        history: object whose ``.history`` dict holds the per-epoch lists
            ``loss``, ``val_loss``, ``acc`` and ``val_acc``.

    The loss figure is written to TRAINING_PLOT_FILE and the accuracy figure to
    VALIDATION_PLOT_FILE; each figure is closed after saving.
    """
    records = history.history
    epochs = list(range(1, len(records['loss']) + 1))

    # (figure title, output file, [(history key, legend label), ...])
    figures = [
        ('training', TRAINING_PLOT_FILE,
         [('loss', "training_loss"), ('val_loss', "validation_loss")]),
        ('validation', VALIDATION_PLOT_FILE,
         [('acc', "training_accuracy"), ('val_acc', "validation_accuracy")]),
    ]

    for title, output_file, curves in figures:
        # Training series in blue, validation series in red.
        for (key, label), color in zip(curves, ('blue', 'red')):
            plt.plot(epochs, records[key], color=color, label=label)
        plt.legend(loc='best')
        plt.title(title)
        plt.xlabel('epoch')
        plt.savefig(output_file, bbox_inches='tight')
        plt.close()

In [9]:
def plot_AUC(model, testing_generator, filename, weights_file=None):
    """Plot the ROC curve of ``model`` on ``testing_generator``, save and show it.

    Args:
        model: Keras model to evaluate. Its weights are overwritten by
            ``weights_file`` before predicting.
        testing_generator: generator exposing ``.classes`` (true labels). It
            must iterate in a fixed order (``shuffle=False``) so predictions
            line up with ``.classes``.
        filename: image file the ROC figure is written to.
        weights_file: weights to load before predicting. Defaults to
            MODEL_FILE for backward compatibility; pass the checkpoint that
            belongs to ``model`` (for example MODEL_WITH_AUG_FILE) when
            evaluating a different run.
    """
    # ROC testing plot
    if weights_file is None:
        weights_file = MODEL_FILE
    model.load_weights(weights_file)

    predictions = model.predict_generator(testing_generator, steps=len(testing_generator), verbose=VERBOSITY)
    # The sigmoid output has shape (n, 1); roc_curve expects a flat score vector.
    scores = np.ravel(predictions)
    false_positive_rate, true_positive_rate, _ = roc_curve(testing_generator.classes, scores)
    area_under_curve = auc(false_positive_rate, true_positive_rate)

    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive_rate, true_positive_rate, label='AUC = {:.3f}'.format(area_under_curve))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    # Save BEFORE show: showing the figure first leaves nothing for savefig to
    # write, which results in a blank image file.
    plt.savefig(filename, bbox_inches='tight')
    plt.show()
    plt.close()


In [10]:
def test(predictor=None):
    """Predict every ``data/test/*.tif`` image and write the Kaggle submission.

    Files are processed in chunks of TESTING_BATCH_SIZE. Each chunk is read,
    scaled to [0, 1] and scored with a single batched ``predict`` call. The
    resulting ``id``/``label`` table is written to KAGGLE_SUBMISSION_FILE.

    Args:
        predictor: model exposing ``predict``. When omitted, the notebook-level
            variable ``model`` is used (the original behaviour).
    """
    if predictor is None:
        predictor = model  # notebook-global fallback

    # Sorted so the submission rows come out in a reproducible order.
    testing_files = sorted(glob(os.path.join('data/test/', '*.tif')))

    chunks = []
    for index in range(0, len(testing_files), TESTING_BATCH_SIZE):
        batch_paths = testing_files[index:index + TESTING_BATCH_SIZE]
        # id = file name without extension, independent of directory depth.
        ids = [os.path.splitext(os.path.basename(path))[0] for path in batch_paths]
        images = np.stack([imread(path) for path in batch_paths], axis=0)
        # One batched call per chunk instead of one predict() call per image.
        scores = predictor.predict(images.astype('float32') / 255.0,
                                   batch_size=BATCH_SIZE)
        chunks.append(pd.DataFrame({'id': ids, 'label': np.ravel(scores)}))

    # Concatenate once at the end; keep the header even when no file was found.
    if chunks:
        submission = pd.concat(chunks, ignore_index=True)
    else:
        submission = pd.DataFrame(columns=['id', 'label'])
    submission.to_csv(KAGGLE_SUBMISSION_FILE, index=False, header=True)

In [11]:
# Run without augmentation.
# need_to_prepare=False only returns the two dataset folder paths
# ([training_path, validation_path]); no files are read or copied.
paths = prepare_data_for_augmentation(need_to_prepare=False)

In [12]:
# Build [training, validation, testing] generators; training images are only
# rescaled here (no augmentation).
generators = prepare_data_generator(
    with_augmentation = False, 
    training_path = paths[0], 
    validation_path = paths[1])

Found 195526 images belonging to 2 classes.
Found 32606 images belonging to 2 classes.
Found 32606 images belonging to 2 classes.


In [13]:
# Build and compile a fresh CNN (also prints the summary and writes the
# architecture plot).
model = create_model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 94, 94, 32)        896       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 92, 92, 32)        9216      
_________________________________________________________________
batch_normalization_1 (Batch (None, 92, 92, 32)        128       
_________________________________________________________________
activation_1 (Activation)    (None, 92, 92, 32)        0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 46, 46, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)  

In [14]:
# Rebind `model` to the model stored in MODEL_FILE, replacing whatever the
# name referred to before. Requires MODEL_FILE to exist from an earlier run.
model = load_model(MODEL_FILE)
model.summary()

Instructions for updating:
Use tf.cast instead.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 94, 94, 32)        896       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 92, 92, 32)        9216      
_________________________________________________________________
batch_normalization_1 (Batch (None, 92, 92, 32)        128       
_________________________________________________________________
activation_1 (Activation)    (None, 92, 92, 32)        0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 46, 46, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 46, 46, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)           

In [15]:
# Train on the non-augmented generators; the best checkpoint is written to
# MODEL_FILE.
history = train(model = model, 
                training_generator = generators[0],
                validation_generator = generators[1],
                model_name = MODEL_FILE)

Epoch 1/13
   2/1019 [..............................] - ETA: 1:20:45 - loss: 0.1511 - acc: 0.9583

KeyboardInterrupt: 

In [16]:
# plot_epochs(history)
# ROC curve for the run without augmentation. generators[2] is the third
# generator returned by prepare_data_generator (the non-shuffled one).
plot_AUC(model, generators[2], "roc_without_aug.png")



In [None]:
# Score data/test and write KAGGLE_SUBMISSION_FILE. test() uses the notebook
# variable `model`, so that variable must hold the model to be submitted.
test()

In [12]:
# Rebuild the generators with augmentation enabled for the training data.
# NOTE: this rebinds `generators`, so cells that use it afterwards see the
# augmented set.
generators = prepare_data_generator(
    with_augmentation = True, 
    training_path = paths[0], 
    validation_path = paths[1])

Found 195526 images belonging to 2 classes.
Found 32606 images belonging to 2 classes.
Found 32606 images belonging to 2 classes.


In [15]:
# Fresh, untrained network for the augmented run (same architecture as the
# baseline, built by the same function).
model_with_aug = create_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 94, 94, 32)        896       
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 92, 92, 32)        9216      
_________________________________________________________________
batch_normalization_7 (Batch (None, 92, 92, 32)        128       
_________________________________________________________________
activation_7 (Activation)    (None, 92, 92, 32)        0         
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 46, 46, 32)        0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 46, 46, 32)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 44, 44, 64)        18432     
__________

In [None]:
# Rebind `model_with_aug` to the model stored in MODEL_WITH_AUG_FILE; requires
# that file to exist from an earlier run.
model_with_aug = load_model(MODEL_WITH_AUG_FILE)

In [None]:
# Train on the augmented generators; the best checkpoint is written to
# MODEL_WITH_AUG_FILE.
history = train(model = model_with_aug, 
                training_generator = generators[0],
                validation_generator = generators[1],
                model_name = MODEL_WITH_AUG_FILE)

Instructions for updating:
Use tf.cast instead.
Epoch 1/13

Epoch 00001: val_acc improved from -inf to 0.50396, saving model to model_with_augmentation.h5
Epoch 2/13

In [None]:
# plot_epochs(history)
# ROC curve for the augmented run.
# NOTE(review): called this way, plot_AUC loads the weights stored in
# MODEL_FILE into model_with_aug before predicting, so the curve may not
# reflect the MODEL_WITH_AUG_FILE checkpoint - confirm before trusting it.
plot_AUC(model_with_aug, generators[2], "roc_with_aug.png")
# test()