In [None]:
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline

In [None]:
import glob
import os

import keras
import matplotlib.pyplot
import numpy
import pandas
import pkg_resources
import tensorflow

import deepometry.model
import deepometry.utils

# Fit

After parsing the raw image data, we expect the following directory structure, in which the data arrays are saved as .NPY files in subdirectories named after the class labels (e.g. `positive`, `negative`, etc.).

    /data/parsed/
        patient_A/
            positive/
                patient_A__32e88e1ac3a8f44bf8f77371155553b9.npy
                patient_A__3dc56a0c446942aa0da170acfa922091.npy  
                ...
            negative/
                patient_A__8068ef7dcddd89da4ca9740bd2ccb31e.npy
                patient_A__8348deaa70dfc95c46bd02984d28b873.npy
                ...
        patient_B/
            positive/
                patient_B__3618e715e62a229aa78a7e373b49b888.npy
                patient_B__3cf53cea7f4db1cfd101e06c366c9868.npy
                ...
            negative/
                patient_B__84949e1eba7802b00d4a1755fa9af15e.npy
                patient_B__852a1edbf5729fe8721e9e5404a8ad20.npy
                ...

                
The data can be used to train a model to classify each image as one of the classes. The `deepometry.utils.load` function selects the images to use for training the model and generates the labels for those images.

Suppose there is a large imbalance between the number of samples per class in each experiment. Undersampling across classes balances the data seen by the model during training. Additionally, class weights (the `class_weight` argument of `fit`) are used together with undersampling to improve prediction accuracy for underrepresented classes. The `deepometry.utils.load` function performs undersampling across classes (per experiment) when called with `sample=True`.

In [None]:
# Collect one directory per experiment (e.g. "data/parsed/patient_A").
# glob returns its matches in arbitrary, filesystem-dependent order, so the
# result is sorted to keep the input order stable between runs. Anything that
# is not a directory (stray files next to the experiment folders) is dropped,
# because only experiment directories are expected here.
directories = sorted(
    path
    for path in glob.glob(os.path.join("data/parsed", "*"))
    if os.path.isdir(path)
)

# Load the training images (x) and their labels (y), undersampling across
# classes within each experiment (sample=True).
# NOTE(review): `units` is presumably the number of classes — confirm against
# deepometry.utils.load.
x, y, units = deepometry.utils.load(directories, sample=True)

The training and target data (`x` and `y`, respectively) are then passed to the model for training. The model is configured to withhold 20% of the training data for validation. Use `validation_split` to adjust the size of this partition.

The model will iterate over the training data at most 512 times, specified by `epochs`. Training will terminate early if the validation loss fails to improve for 20 epochs. Training and validation data is provided to the model in batches of 32 samples. Use `batch_size` to configure the number of samples. A smaller `batch_size` requires less memory.

In [None]:
# Create a TensorFlow session that only sees the GPU with device index "3"
# and is allowed to grow its GPU memory allocation as needed.
# NOTE(review): this comment used to say "GPU 1" while the code selects
# device "3" — confirm which device is intended.
configuration = tensorflow.ConfigProto()
configuration.gpu_options.visible_device_list = "3"
configuration.gpu_options.allow_growth = True
session = tensorflow.Session(config=configuration)

# Register the session as the one used by the Keras backend.
keras.backend.set_session(session)

In [None]:
# Build the model for the per-sample shape of the training data (every axis
# of `x` except the first) and the `units` value returned by the loader.
model = deepometry.model.Model(units=units, shape=x.shape[1:])
model.compile()

# Training options: batches of 32 samples, at most 512 epochs, 20% of the
# data withheld for validation, and progress output enabled.
fit_options = {
    "batch_size": 32,
    "class_weight": "auto",
    "epochs": 512,
    "validation_split": 0.2,
    "verbose": 1,
}

model.fit(x, y, **fit_options)

Visualize training accuracy and loss

In [None]:
# Locate the training log shipped inside the installed deepometry package
# and read it into a DataFrame.
training_log_path = pkg_resources.resource_filename("deepometry", "data/training.csv")
csv = pandas.read_csv(training_log_path)

In [None]:
# Plot the training history: accuracy on the left, loss on the right.
# Red curves are training metrics, blue curves are validation metrics.
_, (ax0, ax1) = matplotlib.pyplot.subplots(ncols=2, figsize=(16, 4))

ax0.plot(csv["acc"], c="r", label="training")
ax0.plot(csv["val_acc"], c="b", label="validation")
ax0.set_title("Accuracy")
ax0.set_xlabel("epoch")
ax0.set_ylabel("accuracy")
ax0.legend()

# The loss panel leaves out the first 30 rows of the log (presumably so the
# large initial loss values do not dominate the y-axis). Skip them only when
# the log is longer than that; otherwise a short run, e.g. one ended by early
# stopping, would produce an empty panel.
skipped_epochs = 30 if len(csv) > 30 else 0

ax1.plot(csv["loss"][skipped_epochs:], c="r", label="training")
ax1.plot(csv["val_loss"][skipped_epochs:], c="b", label="validation")
ax1.set_title("Loss" if skipped_epochs == 0 else "Loss (first {} epochs omitted)".format(skipped_epochs))
ax1.set_xlabel("epoch")
ax1.set_ylabel("loss")
ax1.legend();