Taken from https://www.kaggle.com/qitvision/a-complete-ml-pipeline-fast-ai

-----------------------------------------
# Data visualization

In [1]:
%matplotlib inline

import numpy as np 
import pandas as pd 
from glob import glob 
from skimage.io import imread 
import os
import shutil
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
# from keras.applications.nasnet import NASNetMobile
from keras.applications import NASNetMobile
from keras.applications.xception import Xception
from keras.utils.vis_utils import plot_model
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, Average, Input, Concatenate, GlobalMaxPooling2D
from keras.models import Model
from keras.callbacks import CSVLogger, ModelCheckpoint
from keras.optimizers import Adam
# !pip install livelossplot
from livelossplot import PlotLossesKeras

# import src.constants

Using TensorFlow backend.


In [2]:
# Output files
TRAINING_LOGS_FILE = "training_logs.csv"
MODEL_SUMMARY_FILE = "model_summary.txt"
MODEL_PLOT_FILE = "model_plot.png"
MODEL_FILE = "model.h5"
TRAINING_PLOT_FILE = "training.png"
VALIDATION_PLOT_FILE = "validation.png"
ROC_PLOT_FILE = "roc.png"
KAGGLE_SUBMISSION_FILE = "kaggle_submission.csv"

# Hyperparams
SAMPLE_COUNT = 85000 #??? why only 85000
TRAINING_RATIO = 0.9
IMAGE_SIZE = 96
EPOCHS = 10
BATCH_SIZE = 192
VERBOSITY = 1
TESTING_BATCH_SIZE = 5000

In [3]:
# Data (https://www.kaggle.com/c/histopathologic-cancer-detection/data)
#input_dir = '../input/'
def prepare_data_for_augmentation():
    data_dir = 'data/'
    training_dir = data_dir + 'train/'
    data_frame = pd.DataFrame({'path': glob(os.path.join(training_dir,'*.tif'))})
    data_frame['id'] = data_frame.path.map(lambda x: x.split('/')[2].split('.')[0])
    labels = pd.read_csv(data_dir + 'train_labels.csv')
    data_frame = data_frame.merge(labels, on='id')
    negatives = data_frame[data_frame.label == 0].sample(SAMPLE_COUNT)
    positives = data_frame[data_frame.label == 1].sample(SAMPLE_COUNT)
    data_frame = pd.concat([negatives, positives]).reset_index()
    data_frame = data_frame[['path', 'id', 'label']]
    data_frame['image'] = data_frame['path'].map(imread)

    training_path = 'data/training'
    validation_path = 'data/validation'

    for folder in [training_path, validation_path]:
        for subfolder in ['0', '1']:
            path = os.path.join(folder, subfolder)
            os.makedirs(path, exist_ok=True)

    training, validation = train_test_split(data_frame, train_size=TRAINING_RATIO, stratify=data_frame['label'])

    data_frame.set_index('id', inplace=True)

    for images_and_path in [(training, training_path), (validation, validation_path)]:
        images = images_and_path[0]
        path = images_and_path[1]
        for image in images['id'].values:
            file_name = image + '.tif'
            label = str(data_frame.loc[image,'label'])
            destination = os.path.join(path, label, file_name)
            if not os.path.exists(destination):
                source = os.path.join(data_dir + 'train', file_name)
                shutil.copyfile(source, destination)
    
    return [training_path, validation_path]

In [4]:
def prepare_data_generator(with_augmentation, training_path, validation_path):
    if (with_augmentation == True):
        # Data augmentation
        training_data_generator = ImageDataGenerator(
            rescale=1./255,
            horizontal_flip=True,
            vertical_flip=True,
            rotation_range=180,
            zoom_range=0.4, 
            width_shift_range=0.3,
            height_shift_range=0.3,
            shear_range=0.3,
            channel_shift_range=0.3)
    else:
        # just rescale it now
        training_data_generator = ImageDataGenerator(rescale=1./255)

    # Data generation
    training_generator = training_data_generator.flow_from_directory(
        training_path,
        target_size=(IMAGE_SIZE,IMAGE_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='binary')

    validation_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
        validation_path,
        target_size=(IMAGE_SIZE,IMAGE_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='binary')

    testing_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
        validation_path,
        target_size=(IMAGE_SIZE,IMAGE_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='binary',
        shuffle=False)
    
    return [training_generator, validation_generator, testing_generator]


In [5]:
# Model
def create_model():
    input_shape = (IMAGE_SIZE, IMAGE_SIZE, 3)
    input_tensor = Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3))
    inputs = Input(input_shape)

    xception = Xception(include_top=False, input_shape=input_shape)(inputs)
    nas_net = NASNetMobile(include_top=False, input_tensor=input_tensor)(inputs)

    outputs = Concatenate(axis=-1)([GlobalAveragePooling2D()(xception), GlobalAveragePooling2D()(nas_net)])
    outputs = Dropout(0.5)(outputs)
    outputs = Dense(1, activation='sigmoid')(outputs)

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(lr=0.0001, decay=0.00001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    plot_model(model,
               to_file=MODEL_PLOT_FILE,
               show_shapes=True,
               show_layer_names=True)
    
    return model

In [18]:
def train(model, training_generator, validation_generator):
    #  Training
    
    history = model.fit_generator(training_generator,
                                  steps_per_epoch=len(training_generator), 
                                  validation_data=validation_generator,
                                  validation_steps=len(validation_generator),
                                  epochs=EPOCHS,
                                  verbose=VERBOSITY)
#                                   callbacks=[PlotLossesKeras(),
# #                                              ModelCheckpoint(MODEL_FILE,
# #                                                              monitor='val_acc',
# #                                                              verbose=VERBOSITY,
# #                                                              save_best_only=True,
# #                                                              mode='max'),
#                                              CSVLogger(TRAINING_LOGS_FILE,
#                                                        append=False,
#                                                        separator=';')])

In [7]:
def plot_epochs(history):
    # Training plots
    epochs = [i for i in range(1, len(history.history['loss'])+1)]

    plt.plot(epochs, history.history['loss'], color='blue', label="training_loss")
    plt.plot(epochs, history.history['val_loss'], color='red', label="validation_loss")
    plt.legend(loc='best')
    plt.title('training')
    plt.xlabel('epoch')
    plt.savefig(TRAINING_PLOT_FILE, bbox_inches='tight')
    plt.close()

    plt.plot(epochs, history.history['acc'], color='blue', label="training_accuracy")
    plt.plot(epochs, history.history['val_acc'], color='red',label="validation_accuracy")
    plt.legend(loc='best')
    plt.title('validation')
    plt.xlabel('epoch')
    plt.savefig(VALIDATION_PLOT_FILE, bbox_inches='tight')
    plt.close()

In [8]:
def plot_AUC(model, testing_generator):
    # ROC testing plot
    model.load_weights(MODEL_FILE)
    predictions = model.predict_generator(testing_generator, steps=len(testing_generator), verbose=VERBOSITY)
    false_positive_rate, true_positive_rate, threshold = roc_curve(testing_generator.classes, predictions)
    area_under_curve = auc(false_positive_rate, true_positive_rate)

    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive_rate, true_positive_rate, label='AUC = {:.3f}'.format(area_under_curve))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.savefig(ROC_PLOT_FILE, bbox_inches='tight')
    plt.close()


In [9]:
def test():
    testing_files = glob(os.path.join('data/test/','*.tif'))
    submission = pd.DataFrame()
    for index in range(0, len(testing_files), TESTING_BATCH_SIZE):
        data_frame = pd.DataFrame({'path': testing_files[index:index+TESTING_BATCH_SIZE]})
        data_frame['id'] = data_frame.path.map(lambda x: x.split('/')[2].split(".")[0])
        data_frame['image'] = data_frame['path'].map(imread)
        images = np.stack(data_frame.image, axis=0)
        predicted_labels = [model.predict(np.expand_dims(image/255.0, axis=0))[0][0] for image in images]
        predictions = np.array(predicted_labels)
        data_frame['label'] = predictions
        submission = pd.concat([submission, data_frame[["id", "label"]]])
    submission.to_csv(KAGGLE_SUBMISSION_FILE, index=False, header=True)

In [10]:
#run without augmentation
paths = prepare_data_for_augmentation()



In [11]:
generators = prepare_data_generator(
    with_augmentation = False, 
    training_path = paths[0], 
    validation_path = paths[1])

Found 218375 images belonging to 2 classes.
Found 72458 images belonging to 2 classes.
Found 72458 images belonging to 2 classes.


In [12]:
model = create_model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 96, 96, 3)    0                                            
__________________________________________________________________________________________________
xception (Model)                (None, 3, 3, 2048)   20861480    input_2[0][0]                    
__________________________________________________________________________________________________
NASNet (Model)                  (None, 3, 3, 1056)   4269716     input_2[0][0]                    
__________________________________________________________________________________________________
global_average_poolin

In [19]:
history = train(model = model, 
                training_generator = generators[0],
                validation_generator = generators[1])

Epoch 1/10


AbortedError: Operation received an exception:Status: 5, message: could not create a view primitive descriptor, in file tensorflow/core/kernels/mkl_slice_op.cc:435
	 [[{{node training/Adam/gradients/NASNet/concatenate_3/concat_grad/Slice}}]]

In [None]:
plot_epochs(history)
plot_AUC(model)
# test()

In [None]:
prepare_data_generator(with_augmentation=Trues)
model = create_model()
history = train(model)
plot_epochs(history)
plot_AUC(model=model, testing_generator=generators[0])
# test()