In [10]:
# import os, re, and skimage packages to load images
import os
import re
from skimage import io, color
from skimage.color import rgb2gray
from skimage.io import imread

# import numpy, pandas, shutil, and sklearn packages to implement k-fold cross-validation
import numpy as np
import pandas as pd
import shutil
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# import keras packages to generate model
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.utils import plot_model
from keras.callbacks import History

# import matplot for visualizing training results
import matplotlib.pyplot as plt


Using TensorFlow backend.


In [None]:
# --- Image parameters, data locations, and augmentation settings ---

# Directories the generators read the training / validation / test images from.
train_data_dir = '../../data/train'
validation_data_dir = '../../data/validation'
test_data_dir = '../../data/test'

# Width and height passed to the generators as the target image size.
img_width = 512
img_height = 512

# Number of images per training / validation batch.
batch_size = 16

# Number of training epochs.
epoch = 100

# Generator settings for the training images: rescale by 1/255 and enable
# horizontal and vertical flips. (rotation_range=20 is currently disabled.)
train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    horizontal_flip=True,
    vertical_flip=True,
)

# Generator settings for validation / test images: rescaling only.
test_datagen = ImageDataGenerator(rescale=1. / 255)



In [None]:
# Build the three directory-backed batch generators. Each one reads the images
# found in the class sub-folders of its directory as grayscale images of size
# (img_width, img_height) with binary labels (the model is compiled with
# binary_crossentropy, so binary labels are needed).
# NOTE(review): presumably the directory is scanned when flow_from_directory is
# called, so the folders must already be populated at this point - confirm.
_flow_options = dict(
    color_mode='grayscale',
    target_size=(img_width, img_height),
    class_mode='binary',
)

# training batches, produced with the augmentation configured in train_datagen
train_generator = train_datagen.flow_from_directory(
    train_data_dir, batch_size=batch_size, **_flow_options)

# validation batches (rescaling only)
validation_generator = test_datagen.flow_from_directory(
    validation_data_dir, batch_size=batch_size, **_flow_options)

# test images one per batch; shuffle=False keeps them in the same order
test_generator = test_datagen.flow_from_directory(
    test_data_dir, batch_size=1, shuffle=False, **_flow_options)


In [None]:
def build_model():
    """Build and compile the convolutional network used for binary classification.

    Architecture (a small CNN, not the 16-layer VGG network):
      * three blocks of 3x3 convolution -> ReLU -> 2x2 max-pooling with
        32, 32 and 64 filters,
      * flatten -> dense(64) -> ReLU -> dropout(0.5),
      * dense(1) -> sigmoid, giving one probability per image.

    The input is a single-channel image of shape (img_width, img_height, 1),
    read from the notebook-level ``img_width`` / ``img_height`` when called.

    Returns
    -------
    A compiled ``Sequential`` model (binary cross-entropy loss, Adam optimizer,
    accuracy metric). Each call returns a new, untrained model.
    """
    net = Sequential()

    net.add(Conv2D(32, (3, 3), input_shape=(img_width, img_height, 1)))
    net.add(Activation('relu'))
    net.add(MaxPooling2D(pool_size=(2, 2)))

    net.add(Conv2D(32, (3, 3)))
    net.add(Activation('relu'))
    net.add(MaxPooling2D(pool_size=(2, 2)))

    net.add(Conv2D(64, (3, 3)))
    net.add(Activation('relu'))
    net.add(MaxPooling2D(pool_size=(2, 2)))

    net.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
    net.add(Dense(64))
    net.add(Activation('relu'))
    net.add(Dropout(0.5))
    net.add(Dense(1))
    net.add(Activation('sigmoid'))

    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

    # pass the configured optimizer object itself; the string 'adam' would
    # silently ignore the settings above
    net.compile(loss='binary_crossentropy',
                optimizer=adam,
                metrics=['accuracy'])
    return net


model = build_model()

plot_model(model, to_file='../../output/models/vgg16_model.png')

In [None]:
# train the model on the training generator, validating on the validation generator
def train_model(train_generator=None, validation_generator=None,
                validation_samples=None, train_samples=None,
                epoch=None, batch_size=None, model=None):
    """Train the model and return the training history.

    Every argument left as ``None`` is resolved when the function is *called*
    rather than when it is defined, so the current notebook state is used:

    * ``train_generator``, ``validation_generator``, ``epoch``, ``batch_size``
      and ``model`` fall back to the notebook-level variables of the same name;
    * ``train_samples`` / ``validation_samples`` fall back to the ``samples``
      attribute of the corresponding generator.

    Parameters
    ----------
    train_generator, validation_generator
        Batch generators for the training and validation images.
    validation_samples, train_samples
        Number of validation / training images. Either an integer or any sized
        collection (its ``len()`` is used).
    epoch
        Number of epochs to train for.
    batch_size
        Batch size used to turn the sample counts into steps per epoch.
    model
        Object whose ``fit_generator`` method is called.

    Returns
    -------
    Whatever ``model.fit_generator`` returns (for Keras, the History object).
    """
    notebook = globals()

    def _or_notebook(value, name):
        # explicit argument wins; otherwise read the notebook variable now
        return notebook[name] if value is None else value

    def _count(samples):
        # accept an integer or a sized collection such as a DataFrame
        return len(samples) if hasattr(samples, '__len__') else int(samples)

    train_generator = _or_notebook(train_generator, 'train_generator')
    validation_generator = _or_notebook(validation_generator, 'validation_generator')
    epoch = _or_notebook(epoch, 'epoch')
    batch_size = _or_notebook(batch_size, 'batch_size')
    model = _or_notebook(model, 'model')

    if train_samples is None:
        train_samples = train_generator.samples
    if validation_samples is None:
        validation_samples = validation_generator.samples

    # at least one step, so a set smaller than one batch is still used
    steps_per_epoch = max(1, _count(train_samples) // batch_size)
    validation_steps = max(1, _count(validation_samples) // batch_size)

    # the value returned by fit_generator is the history, so no callback
    # has to be passed in to obtain it
    return model.fit_generator(
            train_generator,
            steps_per_epoch=steps_per_epoch,
            epochs=epoch,
            validation_data=validation_generator,
            validation_steps=validation_steps)
#    weights_filename = '../../output/weights/vgg16_weights_Mar22-730.h5' + i
#    model.save_weights('../../output/weights/vgg16_weights_Mar22-730.h5')  # always save your weights after training or during training

# Generator over the test directory: grayscale images with target size
# (img_width, img_height), one image per batch, binary labels. Shuffling is
# disabled so the images come back in the same order every time.
_test_flow_options = {
    'target_size': (img_width, img_height),
    'batch_size': 1,
    'color_mode': 'grayscale',
    'class_mode': 'binary',
    'shuffle': False,
}
test_generator = test_datagen.flow_from_directory(test_data_dir, **_test_flow_options)

# run prediction generator on test dataset
def predict_test(model=None, test_data_dir=None,
                 img_width=None, img_height=None,
                 total_test_images=None, test_generator=None):
    """Run the model over the test images and return its predictions.

    Arguments left as ``None`` are resolved at call time:

    * ``model``, ``test_data_dir``, ``img_width`` and ``img_height`` fall back
      to the notebook-level variables of the same name;
    * ``test_generator`` - when not supplied, a new unshuffled, batch-size-1,
      grayscale generator is created from the notebook-level ``test_datagen``
      for ``test_data_dir`` at size ``(img_width, img_height)``;
    * ``total_test_images`` falls back to ``test_generator.samples``.

    Parameters
    ----------
    model
        Object whose ``predict_generator`` method is called.
    test_data_dir, img_width, img_height
        Used only to build the generator when ``test_generator`` is not given.
    total_test_images
        Number of prediction steps to run.
    test_generator
        Generator yielding the test images.

    Returns
    -------
    Whatever ``model.predict_generator`` returns for ``total_test_images`` steps.
    """
    notebook = globals()

    def _or_notebook(value, name):
        # explicit argument wins; otherwise read the notebook variable now
        return notebook[name] if value is None else value

    model = _or_notebook(model, 'model')

    if test_generator is None:
        test_generator = notebook['test_datagen'].flow_from_directory(
            _or_notebook(test_data_dir, 'test_data_dir'),
            target_size=(_or_notebook(img_width, 'img_width'),
                         _or_notebook(img_height, 'img_height')),
            batch_size=1,
            color_mode='grayscale',
            class_mode='binary',
            # keep the file order fixed so predictions line up with filenames
            shuffle=False)

    if total_test_images is None:
        total_test_images = test_generator.samples

    # start from the first image even if the generator was used before
    test_generator.reset()

    probabilities = model.predict_generator(
        test_generator,
        steps=total_test_images)
    return probabilities

# filenames of the test images as listed by the (unshuffled) test generator,
# kept for evaluating the predictions
image_name = test_generator.filenames
# NOTE: `probabilities` is a local variable of predict_test(), so it cannot be
# displayed from this cell; use the value from a predict_test() call instead.
# compare predictions to ground truth and acquire overall prediction accuracy


In [None]:

# used to copy files according to each fold
def copy_images(df, directory, data_root="../../data", source_subdir="images", extension=".jpg"):
    """Rebuild ``<data_root>/<directory>`` and copy this fold's images into it.

    The destination folder is deleted if it exists and re-created, one
    sub-folder is made per distinct value of ``df['class']``, and every file
    ``<data_root>/<source_subdir>/<filename><extension>`` listed in
    ``df['filename']`` is copied into the sub-folder of its class. A file that
    cannot be copied is reported with ``print`` and skipped; the remaining
    files are still copied.

    Parameters
    ----------
    df
        Table with a 'filename' column (file names without extension) and a
        'class' column (used as the sub-folder name).
    directory
        Name of the destination folder below ``data_root``.
    data_root
        Folder holding both the source images and the destination folder.
    source_subdir
        Folder below ``data_root`` in which all images are kept together.
    extension
        File extension appended to each filename.

    Returns
    -------
    None
    """
    destination_directory = "{}/{}".format(data_root, directory)
    print("copying {} files to {}...".format(directory, destination_directory))

    # remove all files from previous fold, then start from an empty folder
    if os.path.exists(destination_directory):
        shutil.rmtree(destination_directory)
    os.makedirs(destination_directory)

    # create subfolders for each class (format() also copes with non-string labels)
    for c in set(df['class']):
        class_directory = "{}/{}".format(destination_directory, c)
        if not os.path.exists(class_directory):
            os.makedirs(class_directory)

    # this is the path to all of your images kept together in a separate folder
    source_directory = "{}/{}".format(data_root, source_subdir)

    # copy files for this fold from the directory holding all the files
    for filename, label in zip(df['filename'], df['class']):
        path_from = "{}/{}{}".format(source_directory, filename, extension)
        path_to = "{}/{}".format(destination_directory, label)
        try:
            shutil.copy(path_from, path_to)
        except (IOError, OSError, shutil.Error) as e:
            # best effort: report the file that failed and carry on
            print("Error when copying {}: {}".format(filename, str(e)))


In [None]:

# Read the table of image filenames (e.g., GUID filenames) and classes, then
# separate the 'Label' column (df_y) from the remaining columns (df_x).
df = pd.read_csv('../../data/labels.csv')
df_y = df.pop('Label')  # takes 'Label' out of df and keeps it as df_y
df_x = df               # same object as df, which no longer has 'Label'

# stratified splitter producing 10 folds
skf = StratifiedKFold(n_splits=10)

# result lists for the cross-validation loop, all starting empty
total_actual, total_predicted = [], []
total_val_accuracy, total_val_loss = [], []
total_test_accuracy = []



In [None]:
# Remember the model's starting weights so that every fold begins from the same
# state instead of continuing from the previous fold (whose training data
# contains this fold's test images).
# NOTE: only the layer weights are restored; optimizer state is not reset.
initial_weights = model.get_weights()

for i, (train_index, test_index) in enumerate(skf.split(df_x, df_y)):
    x_train, x_test = df_x.iloc[train_index], df_x.iloc[test_index]
    y_train, y_test = df_y.iloc[train_index], df_y.iloc[test_index]

    train = pd.concat([x_train, y_train], axis=1)
    test = pd.concat([x_test, y_test], axis=1)

    # copy_images() names the class sub-folders from a 'class' column; when the
    # label table does not provide one, derive it from the label
    if 'class' not in train.columns:
        train = train.assign(**{'class': train['Label'].astype(str)})
        test = test.assign(**{'class': test['Label'].astype(str)})

    # take 20% of the training data from this fold for validation during training
    # (seeded with the fold number so a re-run draws the same sample)
    validation = train.sample(frac=0.2, random_state=i)

    # make sure validation data does not include training data
    train = train[~train['filename'].isin(list(validation['filename']))]

    # copy the images according to the fold, into the folders the generators
    # read from (copy_images writes below ../../data/, where these folders live)
    copy_images(train, os.path.basename(train_data_dir))
    copy_images(validation, os.path.basename(validation_data_dir))
    copy_images(test, os.path.basename(test_data_dir))

    print('**** Running fold '+ str(i))

    # re-create the generators now that this fold's files are in place, so they
    # list the images just copied rather than an earlier directory state
    train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        color_mode='grayscale',
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode='binary')
    validation_generator = test_datagen.flow_from_directory(
        validation_data_dir,
        color_mode='grayscale',
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode='binary')

    # start this fold from the initial weights
    model.set_weights(initial_weights)

    # train on this fold; the sample counts are the numbers of files the
    # generators actually found (files that failed to copy are not counted)
    history = train_model(train_generator=train_generator,
                          validation_generator=validation_generator,
                          validation_samples=validation_generator.samples,
                          train_samples=train_generator.samples)

    # history.history maps each metric name to its list of per-epoch values;
    # keep the final epoch's validation figures for this fold
    fold_metrics = history.history
    # the accuracy key is 'val_acc' or 'val_accuracy' depending on the Keras release
    val_acc_key = 'val_acc' if 'val_acc' in fold_metrics else 'val_accuracy'
    val_accuracy = fold_metrics[val_acc_key][-1]
    val_loss = fold_metrics['val_loss'][-1]

    # append validation accuracy and loss for average calculation later on
    total_val_accuracy.append(val_accuracy)
    total_val_loss.append(val_loss)

    # here you will call a predict() method that will predict the images on the "test" subfolder 
    # this function returns the actual classes and the predicted classes in the same order
#    total_test_images = len(x_test)
#    actual, predicted = predict()

    # append accuracy from the predictions on the test data
#    total_test_accuracy.append(accuracy_score(actual, predicted))

    # append all of the actual and predicted classes for your final evaluation
#    total_actual = total_actual + actual
#    total_predicted = total_predicted + predicted

    # this is optional, but you can also see the performance on each fold as the process goes on
#    print(classification_report(total_actual, total_predicted))
#    print(confusion_matrix(total_actual, total_predicted))

# the prediction step above is still disabled, so the test-set report is only
# printed once there are predictions to report on
if total_actual:
    print(classification_report(total_actual, total_predicted))
    print(confusion_matrix(total_actual, total_predicted))
print("Validation accuracy on each fold:")
print(total_val_accuracy)
print("Mean validation accuracy: {}%".format(np.mean(total_val_accuracy) * 100))

print("Validation loss on each fold:")
print(total_val_loss)
print("Mean validation loss: {}".format(np.mean(total_val_loss)))

#print("Test accuracy on each fold:")
#print(total_test_accuracy)
#print("Mean test accuracy: {}%".format(np.mean(total_test_accuracy) * 100))
