In [None]:
import glob
import os.path
import re
import itertools
from collections import Counter

import numpy
from sklearn.preprocessing import scale
import tensorflow
import keras

import deepometry.model

# Evaluate

After training a model to classify single cell images, it is often useful to evaluate the performance of the model on an unseen annotated dataset. Evaluation helps predict model performance on unseen data.

Suppose we have the following directory structure where images from one experiment (`experiment_02`) have been classified as one of three classes (`class_A`, `class_B`, or `class_C`). Data from this experiment was not shown to the model during training. Images are saved as NPY files with patient prefixes:

    /data/parsed/
        experiment_02/
            class_A/
                patient_A__32e88e1ac3a8f44bf8f77371155553b9.npy
                patient_A__3dc56a0c446942aa0da170acfa922091.npy  
                ...
            class_B/
                patient_A__8068ef7dcddd89da4ca9740bd2ccb31e.npy
                patient_A__8348deaa70dfc95c46bd02984d28b873.npy
                ...
            class_C/  
                patient_A__c1ecbca7bd98c01c1d3293b64cd6739a.npy
                patient_A__c56cfb8e7e7121dd822e47c67d07e2d4.npy
                ...
                

The data can be used to evaluate a model for classifying image data as one of the three classes. The `collect_pathnames` and `load` functions defined below will select images to use for evaluating the model and generate the labels for the evaluation images.

In [None]:
def collect_pathnames(directories, labels):
    """
    Collect the pathnames of the NPY samples stored below each directory.

    Every subdirectory of a directory is treated as one class. The NPY files of each
    subdirectory are shuffled and truncated to ``nsamples`` entries, where ``nsamples`` is the
    size of the largest class in that directory, so in practice no sample is dropped.

    :param directories: List of directories to select samples from. Assumes subdirectories of each directory
                        correspond to class labels. Contents of subdirectories are NPY files containing data
                        of that label.
    :param labels: List of class labels. Currently not used for the selection.
    :return: List of pathnames. Directories without subdirectories contribute nothing.
    """
    pathnames = []

    for directory in directories:
        subdirectories = sorted(glob.glob(os.path.join(directory, "*")))

        # One list of NPY pathnames per subdirectory (i.e. per class).
        subdirectory_pathnames = [
            glob.glob("{}/*.npy".format(subdirectory)) for subdirectory in subdirectories
        ]

        # max() of an empty sequence raises ValueError, so skip directories without subdirectories.
        if not subdirectory_pathnames:
            continue

        nsamples = max(len(class_pathnames) for class_pathnames in subdirectory_pathnames)
        print(nsamples)

        # Shuffle each class separately and append it directly to the flat result.
        for class_pathnames in subdirectory_pathnames:
            pathnames.extend(numpy.random.permutation(class_pathnames)[:nsamples])

    return pathnames


def load(pathnames, labels, patient_to_include=None):
    """
    Load training and target data.

    Assumes data is stored in a directory corresponding to some class label: the name of the
    directory containing a sample is looked up in ``labels`` to obtain its target index.

    :param pathnames: List of image pathnames.
    :param labels: List of class labels. Target indices refer to positions in ``sorted(labels)``.
    :param patient_to_include: Optional substring. When given, only pathnames containing it are loaded.
                               ``None`` (the default) keeps every pathname.
    :return: Tuple (training, target) data, as NumPy arrays of dtype ``uint8``.
    :raises ValueError: If no existing sample file is left to load.
    :raises KeyError: If the directory name of a sample is not one of ``labels``.
    """
    if patient_to_include is not None:
        print('Before exclusion: ',len(pathnames))
        pathnames = [pathname for pathname in pathnames if patient_to_include in pathname]
        print('After exclusion: ',len(pathnames))

    # Drop paths that are not files *before* allocating: the arrays come from numpy.empty, so a
    # row skipped inside the loop would be returned holding uninitialised memory.
    pathnames = [pathname for pathname in pathnames if os.path.isfile(pathname)]

    if not pathnames:
        raise ValueError("No sample files to load; check the pathnames and the patient filter.")

    # All samples are assumed to share the shape of the first one.
    x = numpy.empty((len(pathnames),) + _shape(pathnames[0]), dtype=numpy.uint8)

    y = numpy.empty((len(pathnames),), dtype=numpy.uint8)

    label_to_index = {label: index for index, label in enumerate(sorted(labels))}

    for index, pathname in enumerate(pathnames):
        # The class label is the name of the directory that contains the sample.
        label = os.path.split(os.path.dirname(pathname))[-1]

        x[index] = numpy.load(pathname)

        y[index] = label_to_index[label]

    return x, y


def _shape(pathname):
    """
    Infer the shape of the sample data from a single sample.

    :param pathname: Path to a sample.
    :return: Sample dimensions.
    """
    return numpy.load(pathname).shape

In [None]:
# Point at the experiment directory itself: collect_pathnames() looks for NPY files in the
# immediate subdirectories of each entry, which must therefore be the class folders.
directories = ["/data/parsed/experiment_02"]

labels = ["class_A", "class_B", "class_C"]

pathnames = collect_pathnames(directories, labels)

# Only samples whose pathname contains this patient prefix are loaded.
patient_to_test = 'patient_A'
x, y = load(pathnames, labels, patient_to_test)

In [None]:
# Build a TensorFlow session restricted to the GPU named in visible_device_list (device "3" here).
# NOTE(review): the previous comment said "GPU 1" — confirm which device is intended.
configuration = tensorflow.ConfigProto()
# allow_growth asks TensorFlow to grow its GPU memory use on demand rather than reserving it up front.
configuration.gpu_options.allow_growth = True
configuration.gpu_options.visible_device_list = "3"
session = tensorflow.Session(config = configuration)

# Make Keras use this session for all subsequent model operations.
keras.backend.set_session(session)

In [None]:
# Build the model for the sample shape of the loaded data (x without its leading sample axis).
# NOTE(review): units=4 while `labels` lists three classes — presumably this has to match the
# number of classes the saved weights were trained with; confirm.
model = deepometry.model.Model(shape=x.shape[1:], units=4)

model.compile()

In [None]:
# Load previously trained weights into the wrapped model; the weights path is hard-coded.
model.model.load_weights('/model/resnet/model.h5')

# Classification test

The evaluation data `x` is next passed to the model for prediction, and the predictions are compared with the target data `y`. **A previously trained model is required.** Its weights are loaded with `load_weights` in the cell above. See the `fit` notebook for instructions on training a model. 

Evaluation data is provided to the model in batches; the cell below uses batches of 50 samples. Use `batch_size` to configure the number of samples per batch. A smaller `batch_size` requires less memory.

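The `predict` method outputs one score per class for each sample. The class with the highest score is taken as the predicted label (`predicted`) and compared with the annotated label (`expected`) in the confusion matrices and the accuracy score below.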

In [None]:
# Score every evaluation sample in batches of 50 and keep, for each sample,
# the index of the highest score along the last axis as the predicted class.
predicted = numpy.argmax(model.predict(x=x, batch_size=50), axis=-1)

# Ground-truth class indices as returned by load().
expected = y

In [None]:
import matplotlib.pyplot
import sklearn.metrics
import pandas
import seaborn

In [None]:
%matplotlib inline

In [None]:
# Rows are the expected (true) classes, columns the predicted classes.
confusion = sklearn.metrics.confusion_matrix(expected, predicted)

confusion = pandas.DataFrame(confusion)

matplotlib.pyplot.figure(figsize=(12, 8))

# fmt="d" prints the exact integer counts; seaborn's default annotation format (".2g")
# rounds them to two significant digits, which misreports any count of 100 or more.
seaborn.heatmap(confusion, annot=True, fmt="d").set(
    xlabel="Predicted class index",
    ylabel="Expected class index",
    title="Confusion matrix (sample counts)",
);

#output_directory = '/results/'
#matplotlib.pyplot.savefig( os.path.join(output_directory, 'confusion_matrix_absolute.eps') , format='eps', dpi=600)

In [None]:
# Rows are the expected (true) classes, columns the predicted classes.
confusion = sklearn.metrics.confusion_matrix(expected, predicted)

# Normalise each row by the number of samples of that expected class. A row whose total is zero
# (a class that was predicted but never expected) stays at 0 instead of triggering a 0/0 division.
row_totals = confusion.sum(axis=1)[:, numpy.newaxis]
confusion = numpy.divide(
    confusion.astype('float'),
    row_totals,
    out=numpy.zeros(confusion.shape, dtype='float'),
    where=row_totals != 0,
)

confusion = pandas.DataFrame(confusion)

matplotlib.pyplot.figure(figsize=(12, 8))

seaborn.heatmap(confusion, annot=True, fmt=".2f").set(
    xlabel="Predicted class index",
    ylabel="Expected class index",
    title="Confusion matrix (fraction of each expected class)",
);

#output_directory = '/results/'
#matplotlib.pyplot.savefig( os.path.join(output_directory, 'confusion_matrix_percent.eps') , format='eps', dpi=600)

In [None]:
# Fraction of evaluation samples whose predicted class equals the expected class.
sklearn.metrics.accuracy_score(y_true=expected, y_pred=predicted)

# For unsupervised t-SNE/PCA

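The cells below export a table of extracted features and a matching metadata file, to be loaded into the Embedding Projector at http://projector.tensorflow.org.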

In [None]:
def collect_pathnames(directories, labels, nsamples=5000):
    """
    Collect pathnames grouped by label, keeping at most ``nsamples`` per label and directory.

    For every directory, each label is matched against the entries of all subdirectories with
    the glob pattern ``<subdirectory>/*<label>*``. The matches of one label are pooled across
    subdirectories, shuffled and truncated to ``nsamples``.

    This definition replaces the earlier ``collect_pathnames`` in the notebook namespace.

    NOTE(review): the pattern matches any entry whose name contains the label, including
    directories and non-NPY files — confirm the directory layout this is meant for.

    :param directories: List of directories to select samples from.
    :param labels: List of labels to search for in the names of subdirectory entries.
    :param nsamples: Maximum number of pathnames kept per label and directory. The default of 5000
                     limits the number of datapoints to be displayed on projector.tensorflow.org;
                     ``None`` keeps all matches.
    :return: List of pathnames, grouped by directory and then by label.
    """
    pathnames = []

    for directory in directories:
        subdirectories = sorted(glob.glob(os.path.join(directory, "*")))

        for label in labels:
            # Pool the matches of this label across all subdirectories, in subdirectory order.
            label_pathnames = list(itertools.chain.from_iterable(
                glob.glob("{}/*{}*".format(subdirectory, label)) for subdirectory in subdirectories
            ))

            pathnames.extend(numpy.random.permutation(label_pathnames)[:nsamples])

    return pathnames

In [None]:
pathnames = collect_pathnames(directories, labels)
# load() takes the patient filter as its third argument; reuse the patient selected for the
# evaluation above. Note that this overwrites the x and y used by the classification test.
x, y = load(pathnames, labels, patient_to_test)

In [None]:
from keras.layers import *
from keras.models import Sequential

In [None]:
# Keep a handle on the layer list of the wrapped model so individual layers can be picked by index.
layers = model.model.layers

In [None]:
#model.model.summary()

In [None]:
# Show the second-to-last layer, which is the one wrapped as `abstract_model` in the next cell.
print(layers[-2])

In [None]:
abstract_model = None # Drop any abstract_model left over from a previous run of this cell
# Wrap only the second-to-last layer of the trained model in a new Sequential model.
# NOTE(review): a Sequential built this way applies that single layer directly to its input; it
# yields whole-network features only if layers[-2] is itself a nested model — confirm against
# the layer printed above.
abstract_model = Sequential([layers[-2]])

In [None]:
# Run the loaded samples through the feature extractor, 50 samples per batch.
extracted_features = abstract_model.predict(x=x, batch_size=50)

In [None]:
# Standardise the features with sklearn's scale() and write them as a tab-separated table.
numpy.savetxt(
    fname='/results/table_of_features.txt',
    X=scale(extracted_features),
    delimiter='\t',
)

In [None]:
def save_metadata(file, targets=None, class_names=None):
    """
    Write the class name of every sample to a text file, one name per line.

    :param file: Path of the file to write. An existing file is overwritten.
    :param targets: Sequence of class indices, one per sample. Defaults to the notebook-level ``y``.
    :param class_names: List of class labels. Defaults to the notebook-level ``labels``.
                        Indices in ``targets`` refer to positions in ``sorted(class_names)``.
    """
    if targets is None:
        targets = y
    if class_names is None:
        class_names = labels

    # Sort once: the class indices were assigned from the sorted label list.
    ordered_names = sorted(class_names)

    with open(file, 'w') as f:
        for target in targets:
            f.write('{}\n'.format(ordered_names[target]))

# Write one class name per loaded sample; y comes from the same load() call as the x used for
# the feature table, so the lines are in the same order as its rows.
save_metadata('/results/metadata.tsv')

print('Done.')