In [None]:
import glob
import os

import numpy

import deepometry.model

# Fit

Suppose we have the following directory structure where images from two experiments (`experiment_00` and `experiment_01`) have been classified as one of three classes (`class_A`, `class_B`, or `class_C`). Images are saved as NPY files:

    /data/
        experiment_00/
            class_A/
                32e88e1ac3a8f44bf8f77371155553b9.npy
                3dc56a0c446942aa0da170acfa922091.npy  
                ...
            class_B/
                8068ef7dcddd89da4ca9740bd2ccb31e.npy
                8348deaa70dfc95c46bd02984d28b873.npy
                ...
            class_C/  
                c1ecbca7bd98c01c1d3293b64cd6739a.npy
                c56cfb8e7e7121dd822e47c67d07e2d4.npy
                ...
        experiment_01/
            class_A/
                3618e715e62a229aa78a7e373b49b888.npy
                3cf53cea7f4db1cfd101e06c366c9868.npy
                ...
            class_B/
                84949e1eba7802b00d4a1755fa9af15e.npy
                852a1edbf5729fe8721e9e5404a8ad20.npy
                ...
            class_C/
                c037169cda28a9403ec73806c09f4813.npy
                c343d6f9fde4a9b55f6fa5464a1bbc3d.npy
                ...
                
The data can be used to train a model to classify image data as one of the three classes. The `sample` and `load` functions defined below will select images to use for training the model and generate the labels for the training images.

Suppose there is a large imbalance between the number of samples per class in each experiment. Undersampling across classes balances the data seen by the model during training. Additionally, undersampling can improve prediction accuracy in underrepresented classes. The `sample` function performs undersampling across classes (per-experiment).

In [None]:
def _shape(pathname):
    """
    Infer the shape of the sample data from a single sample.
    
    :param pathname: Path to a sample.
    :return: Sample dimensions.
    """
    return numpy.load(pathname).shape


def load(pathnames, labels):
    """
    Load training and target data.

    Assumes each sample is stored in a directory named after its class label. The target value of a sample is
    the position of that directory name within the sorted ``labels``.

    :param pathnames: List of image pathnames (NPY files). Every sample is expected to have the same shape as
                      the first one.
    :param labels: List of class labels.
    :return: Tuple (training, target) data, as NumPy arrays. Both arrays have dtype ``uint8``.
    :raises IndexError: If ``pathnames`` is empty (there is no sample to infer the data shape from).
    :raises KeyError: If a sample is stored in a directory whose name is not one of ``labels``.
    """
    # len() rather than truthiness so that NumPy arrays of pathnames are accepted as well.
    if len(pathnames) == 0:
        raise IndexError("pathnames is empty: at least one sample is required to infer the data shape")

    # Read the first sample once and reuse it, both to infer the sample shape and to fill the first row.
    first = numpy.load(pathnames[0])

    x = numpy.empty((len(pathnames),) + first.shape, dtype=numpy.uint8)

    y = numpy.empty((len(pathnames),), dtype=numpy.uint8)

    label_to_index = {label: index for index, label in enumerate(sorted(labels))}

    for index, pathname in enumerate(pathnames):
        # The class label is the name of the directory that directly contains the sample.
        label = os.path.basename(os.path.dirname(pathname))

        if label not in label_to_index:
            raise KeyError(
                "Sample {!r} is stored under {!r}, which is not one of the labels {!r}".format(
                    pathname, label, sorted(label_to_index)
                )
            )

        x[index] = first if index == 0 else numpy.load(pathname)

        y[index] = label_to_index[label]

    return x, y

In [None]:
def sample(directories):
    """
    Sample pathnames from directories.

    For each directory, the same number of samples is randomly selected from every subdirectory. That number
    is the size of the smallest subdirectory, so larger classes are undersampled (per directory).

    Selection uses ``numpy.random.permutation``; file listings are sorted first so the selection does not
    depend on the order in which the filesystem returns entries.

    :param directories: List of directories to select samples from. Assumes subdirectories of each directory
                        correspond to class labels. Contents of subdirectories are NPY files containing data
                        of that label.
    :return: Flat list of sampled pathnames (plain strings).
    :raises ValueError: If a directory contains no subdirectories.
    """
    sampled = []

    for directory in directories:
        # Only subdirectories represent classes. A plain file next to them (a README, a log, ...) would
        # otherwise count as an empty class and force the per-class sample size down to zero.
        subdirectories = sorted(
            entry for entry in glob.glob(os.path.join(directory, "*")) if os.path.isdir(entry)
        )

        if not subdirectories:
            raise ValueError("No class subdirectories found in {!r}".format(directory))

        class_pathnames = [
            sorted(glob.glob(os.path.join(subdirectory, "*"))) for subdirectory in subdirectories
        ]

        # Every class contributes as many samples as the smallest class of this directory has.
        nsamples = min(len(candidates) for candidates in class_pathnames)

        for candidates in class_pathnames:
            # tolist() converts the NumPy string scalars back to plain ``str`` objects.
            sampled.extend(numpy.random.permutation(candidates)[:nsamples].tolist())

    return sampled

In [None]:
# One directory per experiment; each is expected to hold one subdirectory per class.
directories = ["/data/experiment_00", "/data/experiment_01"]

labels = ["class_A", "class_B", "class_C"]

# Select an equal number of samples per class from each experiment.
samples = sample(directories)

# Read the selected samples and derive their integer targets from the class directory names.
x, y = load(samples, labels)

The training and target data (`x` and `y`, respectively) are next passed to the model for training. The model is configured to withhold 20% of the training data for validation. Use `validation_split` to adjust the size of the partition.

The model will iterate over the training data at most 512 times, specified by `epochs`. Training will terminate early if the validation loss fails to improve for 20 epochs. Training and validation data is provided to the model in batches of 32 samples. Use `batch_size` to configure the number of samples. A smaller `batch_size` requires less memory.

In [None]:
# The input shape is that of a single sample (the leading sample axis of `x` is dropped);
# `units` is set to the number of class labels.
model = deepometry.model.Model(units=len(labels), shape=x.shape[1:])

model.compile()

# Train in batches of 32 samples for at most 512 epochs, withholding 20% of the data for validation.
model.fit(x, y, validation_split=0.2, epochs=512, batch_size=32, verbose=1)