In [25]:
import os

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics

from keras.preprocessing.image import ImageDataGenerator

In [8]:
# Folders holding the training and testing .tif images.
dir_train_images, dir_test_images = (
    './data/training/',
    './data/testing/',
)

# CSV files: training ids with their labels, and the ids expected in the
# submission (read through its `id` column when loading the test images).
dir_train_labels, dir_test_ids = (
    './data/labels_training.csv',
    './data/sample_submission.csv',
)

In [22]:
def load_data(dir_data, dir_labels, training=True):
    """Load the images listed in a CSV file into one NumPy array.

    Parameters
    ----------
    dir_data : str
        Directory containing the images; each image is expected at
        ``<dir_data>/<id>.tif``. A trailing path separator is optional.
    dir_labels : str
        Path to a CSV file with an ``id`` column and, when ``training`` is
        true, a ``label`` column.
    training : bool, default True
        Selects what is returned next to the image array.

    Returns
    -------
    (data, labels) when ``training`` is true, otherwise (data, ids).
    ``data`` is ``np.array`` applied to the list of decoded images, in the
    row order of the CSV file; ``labels`` / ``ids`` are the corresponding
    column values.
    """
    labels_pd = pd.read_csv(dir_labels)
    ids = labels_pd.id.values

    # str() handles integer, NumPy-scalar and plain-string identifiers alike;
    # `.astype(str)` exists only on NumPy scalars and raised on string ids.
    # os.path.join makes the result independent of a trailing slash.
    data = np.array([
        mpl.image.imread(os.path.join(dir_data, str(identifier) + '.tif'))
        for identifier in ids
    ])

    if training:
        return data, labels_pd.label.values
    return data, ids
    
def cv_performance_assessment(X, y, k, clf, random_state=None):
    """Collect out-of-fold positive-class scores with stratified k-fold CV.

    Parameters
    ----------
    X : indexable by integer index arrays (e.g. NumPy array of samples)
    y : NumPy array of labels; ``y.shape[0]`` sizes the score buffer
    k : int
        Number of stratified folds.
    clf : estimator exposing ``fit`` and ``predict_proba``
        Refitted on the training part of every fold.
    random_state : int or None, default None
        Forwarded to ``StratifiedKFold`` so the shuffled split can be made
        reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    np.ndarray of float, one entry per sample: column 1 of
    ``clf.predict_proba`` taken from the fold in which that sample was held
    out.

    NOTE(review): ``preprocess_and_extract_features`` is not defined in the
    visible part of this notebook; it must already exist in the kernel
    namespace when this function is called.
    """
    # Scores are floats, so use a float buffer rather than dtype='object'.
    # NaN-initialised so an index that was never written would be noticeable.
    prediction_scores = np.full(y.shape[0], np.nan, dtype=float)

    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    for train_index, val_index in kf.split(X, y):
        # Extract the training and validation data for this fold
        X_train, X_val = X[train_index], X[val_index]
        y_train = y[train_index]

        # Train the classifier
        X_train_features = preprocess_and_extract_features(X_train)
        clf = clf.fit(X_train_features, y_train)

        # Score the held-out samples of this fold
        X_val_features = preprocess_and_extract_features(X_val)
        cpred = clf.predict_proba(X_val_features)

        # Keep only the second probability column for the held-out rows
        prediction_scores[val_index] = cpred[:, 1]
    return prediction_scores

def plot_roc(labels, prediction_scores):
    """Plot a ROC curve with its AUC on the current matplotlib axes.

    Parameters
    ----------
    labels : array-like
        Ground-truth labels; the value 1 is treated as the positive class.
    prediction_scores : array-like
        Scores for the positive class, one per label.

    Draws a dashed gray diagonal labelled 'Chance' and the ROC curve, whose
    legend entry shows the AUC to three decimals. Returns nothing.
    """
    fpr, tpr, _ = metrics.roc_curve(labels, prediction_scores, pos_label=1)
    auc = metrics.roc_auc_score(labels, prediction_scores)
    legend_string = 'AUC = {:0.3f}'.format(auc)

    plt.plot([0, 1], [0, 1], '--', color='gray', label='Chance')
    plt.plot(fpr, tpr, label=legend_string)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # grid() takes a boolean; the string 'on' only worked by being truthy.
    plt.grid(True)
    plt.axis('square')
    plt.legend()
    plt.tight_layout()

In [143]:
# MODEL
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.layers import Activation, Dropout, Flatten, Dense, LeakyReLU
from keras.regularizers import l2
from keras import optimizers

# Binary classifier for 101x101 RGB images, declared as one ordered layer list.
model = Sequential([
    # Stage 1: strided 5x5 convolution, leaky ReLU, 3x3 max pooling
    Conv2D(filters=32, kernel_size=(5, 5), strides=(2, 2),
           input_shape=(101, 101, 3)),
    LeakyReLU(alpha=0.05),
    MaxPooling2D(pool_size=(3, 3)),

    # Stage 2: L2-regularised strided convolution, leaky ReLU, batch norm
    Conv2D(filters=48, kernel_size=(5, 5), strides=(2, 2),
           kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)),
    LeakyReLU(alpha=0.05),
    BatchNormalization(),

    # Stage 3: strided convolution, leaky ReLU, batch norm
    Conv2D(filters=64, kernel_size=(5, 5), strides=(2, 2)),
    LeakyReLU(alpha=0.05),
    BatchNormalization(),

    # Collapse the (height, width, features) maps into a 1D feature vector
    Flatten(),

    # Classifier head: regularised dense layer, dropout, single sigmoid unit
    Dense(64, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)),
    LeakyReLU(alpha=0.05),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
])

# COMPILE
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [144]:
# Label table for the Keras generators: `filename` is the id with a '.tif'
# suffix and `label` is converted to strings.
train_df = pd.read_csv(dir_train_labels).assign(
    filename=lambda frame: frame['id'].astype(str) + '.tif',
    label=lambda frame: frame['label'].astype(str),
)

In [145]:
# Optional: uncomment to work on a random 500-row subset for quick experiments.
#train_df = train_df.sample(500)

In [140]:
# Augmentation configuration used for the training subset only.
train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        validation_split=0.20)

# Validation images are only rescaled: evaluating on sheared/zoomed/flipped
# images would not reflect performance on the unaugmented test images.
# The same validation_split is required so both generators carve train_df
# into the same two subsets.
# NOTE(review): this relies on flow_from_dataframe splitting by row position
# rather than randomly - confirm for the installed Keras version.
validation_datagen = ImageDataGenerator(
        rescale=1./255,
        validation_split=0.20)

# Reads the files named in train_df['filename'] from dir_train_images and
# yields batches of augmented images with their binary labels.
train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_df,
        directory=dir_train_images,
        x_col="filename",
        y_col="label",
        target_size=(101, 101),
        batch_size=32,
        class_mode='binary',
        subset='training',
        seed=45)

# One batch of 300 covers the whole validation subset reported by Keras;
# shuffling is unnecessary for evaluation.
validation_generator = validation_datagen.flow_from_dataframe(
        dataframe=train_df,
        directory=dir_train_images,
        x_col="filename",
        y_col="label",
        target_size=(101, 101),
        class_mode='binary',
        subset='validation',
        batch_size=300,
        shuffle=False,
        seed=45)

Found 1200 validated image filenames belonging to 2 classes.
Found 300 validated image filenames belonging to 2 classes.


In [146]:
# Keep the returned History object so the per-epoch metrics can be inspected
# or plotted afterwards instead of only showing its repr.
# 100 steps of 32-image batches per epoch is more than one pass over the
# 1200 training images, so each epoch sees several augmented variants.
history = model.fit_generator(
        train_generator,
        steps_per_epoch=100,
        epochs=50,
        validation_data=validation_generator)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7f1e19f7a940>

In [153]:
# Load the test images in the row order of the sample-submission file.
test_data, ids = load_data(dir_test_images, dir_test_ids, training=False)
# Same 1/255 rescaling the training generators apply.
test_data = test_data / 255
# predict() returns the sigmoid output directly; predict_proba was only a
# deprecated alias of it on Sequential models.
test_scores = model.predict(test_data)

# One score per id; ravel() flattens the (n, 1) prediction array.
submission_file = pd.DataFrame({'id':    ids,
                                'score': test_scores.ravel()})
submission_file.to_csv('submission.csv',
                           columns=['id','score'],
                           index=False