In [4]:
import numpy

In [8]:
# Scratch check: the median of three integers, truncated to a plain Python int.
int(numpy.median([3, 4, 5]))

4

In [None]:
import glob
import os
import itertools
import re
from collections import Counter

import numpy
import tensorflow
import keras

import deepometry.model

# Fit

After parsing raw .CIF files into NumPy arrays, we expect to have the following directory structure, where the arrays (.NPY files) are saved in subdirectories named after the class labels (e.g. `positive`, `negative`, etc.).

    /data/
        patient_A/
            positive/
                32e88e1ac3a8f44bf8f77371155553b9.npy
                3dc56a0c446942aa0da170acfa922091.npy  
                ...
            negative/
                8068ef7dcddd89da4ca9740bd2ccb31e.npy
                8348deaa70dfc95c46bd02984d28b873.npy
                ...
        patient_B/
            positive/
                3618e715e62a229aa78a7e373b49b888.npy
                3cf53cea7f4db1cfd101e06c366c9868.npy
                ...
            negative/
                84949e1eba7802b00d4a1755fa9af15e.npy
                852a1edbf5729fe8721e9e5404a8ad20.npy
                ...

                
The data can be used to train a model to classify image data as one of the available classes. The `sample` and `load` functions defined below select the images to use for training the model and generate the labels for the training images.

Suppose there is a large imbalance between the number of samples per class in each experiment. Undersampling across classes balances the data seen by the model during training. Additionally, `class_weights` are introduced to work together with undersampling to improve prediction accuracy in underrepresented classes. The `sample` function performs undersampling across classes separately for each experiment (i.e. for each top-level directory it is given).

In [None]:
def _shape(pathname):
    """
    Report the dimensions of the array stored in a single sample file.

    The file is read with ``numpy.load`` and only its shape is kept.

    :param pathname: Path to a sample.
    :return: Sample dimensions.
    """
    array = numpy.load(pathname)

    return array.shape


def load(pathnames, labels):
    """
    Load training and target data.

    Assumes each sample is stored in a directory named after its class label;
    the label of a sample is the name of the directory that contains it.

    Pathnames that do not refer to an existing file are skipped, so the
    returned arrays contain exactly one row per file that was actually loaded
    (no uninitialised rows).

    :param pathnames: List of image pathnames.
    :param labels: List of class labels. Class indices are assigned by the
                   position of each label in ``sorted(labels)``.
    :return: Tuple (training, target) data, as NumPy ``uint8`` arrays. Both are
             empty (length 0) when no pathname refers to an existing file.
    :raises KeyError: If a sample's parent directory name is not in ``labels``.
    """
    label_to_index = {label: index for index, label in enumerate(sorted(labels))}

    # Filter *before* allocating: rows of a numpy.empty buffer that are never
    # written hold arbitrary memory, which would otherwise be returned as data.
    existing = [pathname for pathname in pathnames if os.path.isfile(pathname)]

    if not existing:
        return numpy.empty((0,), dtype=numpy.uint8), numpy.empty((0,), dtype=numpy.uint8)

    # The first sample determines the per-sample shape; keep it so that it does
    # not have to be read from disk a second time.
    first = numpy.load(existing[0])

    x = numpy.empty((len(existing),) + first.shape, dtype=numpy.uint8)

    y = numpy.empty((len(existing),), dtype=numpy.uint8)

    for index, pathname in enumerate(existing):
        label = os.path.basename(os.path.dirname(pathname))

        x[index] = first if index == 0 else numpy.load(pathname)

        y[index] = label_to_index[label]

    return x, y

In [None]:
def sample(directories):
    """
    Sample pathnames from directories.

    For each directory, the same number of samples is drawn at random from
    every class subdirectory. That number is the size of the smallest class in
    that directory, so classes are balanced per directory (undersampling).

    Entries of a directory that are not themselves directories (stray files)
    are ignored, and a directory without any class subdirectory contributes
    nothing.

    :param directories: List of directories to select samples from. Assumes subdirectories of each directory
                        correspond to class labels. Contents of subdirectories are NPY files containing data
                        of that label.
    :return: List of sampled pathnames.
    """
    sampled = []

    for directory in directories:
        # Only real subdirectories are classes. A stray file here would glob to
        # an empty list and drag the per-class sample count down to zero.
        class_pathnames = [
            sorted(glob.glob(os.path.join(subdirectory, "*")))
            for subdirectory in sorted(glob.glob(os.path.join(directory, "*")))
            if os.path.isdir(subdirectory)
        ]

        if not class_pathnames:
            continue

        # Undersample every class to the size of the smallest one.
        nsamples = min(len(members) for members in class_pathnames)

        for members in class_pathnames:
            sampled.extend(numpy.random.permutation(members)[:nsamples].tolist())

    return sampled

In [None]:
# Directories to draw samples from; `sample` treats the subdirectories of each
# entry as class labels.
directories = ["/data/parsed/"]

# Disabled alternative that derived the labels from the directory tree instead
# of listing them by hand:
# labels = [os.path.split(x[0])[-1] for x in os.walk(directories[0])][1:]

In [None]:
# Class labels. `load` looks up each sample's parent directory name in this
# list and numbers the classes by their position in sorted(labels).
labels = ["unactivated", "siglecneg2", "activated", "siglecneg1"]

In [None]:
# Pick a class-balanced random subset of sample pathnames from each directory.
samples = sample(directories)

# Read the selected samples into memory: `x` holds the data (one row per
# sample), `y` the matching integer class indices.
x, y = load(samples, labels)

The training and target data (`x` and `y`, respectively) are next passed to the model for training. The model is configured to withhold 20% of the training data for validation. Use `validation_split` to adjust the size of the partition.

The model will iterate over the training data at most 512 times, specified by `epochs`. Training will terminate early if the validation loss fails to improve for 20 epochs. Training and validation data is provided to the model in batches of 32 samples. Use `batch_size` to configure the number of samples. A smaller `batch_size` requires less memory.

In [None]:
# Build a TensorFlow session restricted to the GPU with device id "3".
configuration = tensorflow.ConfigProto()
# Claim GPU memory on demand rather than reserving all of it up front.
configuration.gpu_options.allow_growth = True
# Only this device is exposed to the session; change the id to use another GPU.
configuration.gpu_options.visible_device_list = "3"
session = tensorflow.Session(config=configuration)

# Make Keras use the session configured above.
keras.backend.set_session(session)

In [None]:
# Build the model for the per-sample shape of `x` (everything after the leading
# sample axis) with one unit per entry in `labels`.
model = deepometry.model.Model(shape=x.shape[1:], units=len(labels))

model.compile()

# Train on the loaded data. The last 20% share given by `validation_split` is
# withheld for validation; `epochs` is the upper bound on passes over the data.
model.fit(
    x,
    y,
    batch_size=32,
    epochs=512,
    validation_split=0.2,
    verbose=1
)

In [None]:
%matplotlib inline

import pandas
import pkg_resources

import matplotlib.pyplot

In [None]:
# Read `data/training.csv` from the installed `deepometry` package into a
# DataFrame (presumably the per-epoch log written during training - confirm).
csv = pandas.read_csv(pkg_resources.resource_filename("deepometry", "data/training.csv"))

In [None]:
# Plot the training log: accuracy on the left, loss on the right.
# Red is the training curve, blue the validation curve.
_, (ax0, ax1) = matplotlib.pyplot.subplots(ncols=2, figsize=(16, 4))

ax0.plot(csv["acc"], c="r", label="training")
ax0.plot(csv["val_acc"], c="b", label="validation")
ax0.set_xlabel("epoch")
ax0.set_ylabel("accuracy")
ax0.legend()

# The first 30 rows are left out of the loss panel (presumably so the large
# early losses do not flatten the rest of the curve).
ax1.plot(csv["loss"][30:], c="r", label="training")
ax1.plot(csv["val_loss"][30:], c="b", label="validation")
ax1.set_xlabel("epoch")
ax1.set_ylabel("loss")
ax1.legend();