In [1]:
import time
import datetime
from pathlib import Path
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

from model import conv_block
from data import example_to_tensor, normalize, add_channel_axis, train_test_split
from plot import plot_slice, plot_animated_volume
from config import data_root_dir, seed

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) applied to every plot in this notebook.
plt.rcParams["figure.figsize"] = [15, 7]

In [5]:
# Shape of one model input (without the batch axis).
# Presumably (depth, height, width, channels) — TODO confirm against `data.py`.
input_shape = (48, 256, 256, 1)
# Glob patterns, relative to `data_root_dir`, selecting the TFRecord files of
# the negative class and of the positive classes.
neg_tfrecord_glob = "CT-0/*.tfrecord"
pos_tfrecord_glob = "CT-[1-4]/*.tfrecord"

# Training hyperparameters.
epochs = 1000  # upper bound; early stopping normally ends training sooner
patience = 30  # early-stopping patience on the monitored metric
batch_size = 4
learning_rate = 0.001
dropout_rate = 0.0
# NOTE(review): this rebinds the `seed` imported from `config` in the import
# cell — confirm the override is intentional.
seed = 5
val_perc = 0.12  # fraction (not percent) of the data left after the test split
test_perc = 0.1  # fraction (not percent) of the full dataset held out for testing

In [6]:
# Collect the negative-class TFRecord files.
# The list is sorted because directory listing order is not guaranteed: the
# seeded train/val/test split is only reproducible when the records are always
# read in the same order.
neg_tfrecord_fnames = sorted(
    str(p) for p in Path(data_root_dir).glob(neg_tfrecord_glob)
)
# Decode each serialized example, normalize it and append the channel axis.
neg_x = (
    tf.data.TFRecordDataset(neg_tfrecord_fnames)
    .map(example_to_tensor, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(normalize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(add_channel_axis, num_parallel_calls=tf.data.experimental.AUTOTUNE)
)
# The count is hard-coded because counting by iteration reads the whole dataset.
# It must match the real number of records: it sizes the label dataset and the
# split cardinality. Uncomment the next line to recompute it after a data change.
# num_neg = sum(1 for _ in neg_x)
num_neg = 254
print(f"Number of negative samples: {num_neg}")
neg_x

Number of negative samples: 254


<ParallelMapDataset shapes: (None, None, None, 1), types: tf.float32>

In [7]:
# Collect the positive-class TFRecord files.
# The list is sorted because directory listing order is not guaranteed: the
# seeded train/val/test split is only reproducible when the records are always
# read in the same order.
pos_tfrecord_fnames = sorted(
    str(p) for p in Path(data_root_dir).glob(pos_tfrecord_glob)
)
# Decode each serialized example, normalize it and append the channel axis.
pos_x = (
    tf.data.TFRecordDataset(pos_tfrecord_fnames)
    .map(example_to_tensor, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(normalize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(add_channel_axis, num_parallel_calls=tf.data.experimental.AUTOTUNE)
)
# The count is hard-coded because counting by iteration reads the whole dataset.
# It must match the real number of records: it sizes the label dataset and the
# split cardinality. Uncomment the next line to recompute it after a data change.
# num_pos = sum(1 for _ in pos_x)
num_pos = 856
print(f"Number of positive samples: {num_pos}")
pos_x

Number of positive samples: 856


<ParallelMapDataset shapes: (None, None, None, 1), types: tf.float32>

In [8]:
# Label dataset for the negative class: the constant [0], one copy per sample.
neg_label = tf.constant([0], dtype=tf.int8)
neg_y = tf.data.Dataset.from_tensors(neg_label).repeat(num_neg)
# Pair every negative volume with its label.
neg_dataset = tf.data.Dataset.zip((neg_x, neg_y))
neg_dataset

<ZipDataset shapes: ((None, None, None, 1), (1,)), types: (tf.float32, tf.int8)>

In [9]:
# Label dataset for the positive class: the constant [1], one copy per sample.
pos_label = tf.constant([1], dtype=tf.int8)
pos_y = tf.data.Dataset.from_tensors(pos_label).repeat(num_pos)
# Pair every positive volume with its label.
pos_dataset = tf.data.Dataset.zip((pos_x, pos_y))
pos_dataset

<ZipDataset shapes: ((None, None, None, 1), (1,)), types: (tf.float32, tf.int8)>

In [None]:
# Merge both classes, hold out the test split first, then carve the
# validation split out of what remains.
dataset = neg_dataset.concatenate(pos_dataset)
dataset, test_dataset = train_test_split(
    dataset,
    test_perc=test_perc,
    cardinality=(num_pos + num_neg),
    seed=seed,
)
# The test split is evaluated one volume at a time.
test_dataset = test_dataset.batch(1)
train_dataset, val_dataset = train_test_split(
    dataset,
    test_perc=val_perc,
    cardinality=None,
    seed=seed,
)
val_dataset = (
    val_dataset.batch(batch_size)
    .cache()
    .prefetch(tf.data.experimental.AUTOTUNE)
)
# Train on every batch: there is deliberately no `.take(n)` in this pipeline.
# For a quick smoke test, temporarily insert `.take(n)` before `.cache()`.
# Shuffling happens after batching, so whole batches are reordered each epoch.
train_dataset = (
    train_dataset.batch(batch_size)
    .cache()  # must be called before shuffle
    .shuffle(buffer_size=64, reshuffle_each_iteration=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
train_dataset

In [30]:
def count_labels(dataset):
    """Count how many samples carry each label.

    `dataset` must yield batched `(input, label)` pairs; it is unbatched and
    the first element of every label is tallied.

    Returns a plain dict mapping label value -> number of occurrences.
    """
    tally = Counter()
    for _, label in dataset.unbatch():
        tally[label.numpy()[0]] += 1
    return dict(tally)


# Tally the class balance of each split, then report all three.
train_label_counts = count_labels(train_dataset)
val_label_counts = count_labels(val_dataset)
test_label_counts = count_labels(test_dataset)

print(f"Train labels:\n\t{train_label_counts}")
print(f"Validation labels:\n\t{val_label_counts}")
print(f"Test labels:\n\t{test_label_counts}")

Train labels:
	{1: 677, 0: 203}
Validation labels:
	{1: 98, 0: 21}
Test labels:
	{0: 30, 1: 81}


In [31]:
# Baseline 3D CNN: three conv blocks, global pooling, one SELU hidden layer
# and a single sigmoid output unit.
inputs = keras.layers.Input(input_shape)

# Feature extractor: the filter count doubles at every block.
x = inputs
for n_filters in (32, 64, 128):
    x = conv_block(x, filters=n_filters, dropout_rate=dropout_rate)

# Pool the feature maps before the dense head.
x = keras.layers.GlobalAveragePooling3D()(x)

# SELU hidden layer with LeCun-normal initialization, followed by AlphaDropout.
hidden_layer = keras.layers.Dense(
    512,
    activation="selu",
    kernel_initializer="lecun_normal",
    bias_initializer="lecun_normal",
)
x = hidden_layer(x)
x = keras.layers.AlphaDropout(dropout_rate)(x)

# Single sigmoid unit for the binary output.
output_layer = keras.layers.Dense(1, activation="sigmoid")
outputs = output_layer(x)

cnn = keras.Model(inputs, outputs, name="baseline-3dcnn")
cnn.summary()

Model: "baseline-3dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 48, 256, 256, 1)] 0         
_________________________________________________________________
conv3d_3 (Conv3D)            (None, 48, 256, 256, 32)  896       
_________________________________________________________________
alpha_dropout_4 (AlphaDropou (None, 48, 256, 256, 32)  0         
_________________________________________________________________
max_pooling3d_3 (MaxPooling3 (None, 24, 128, 128, 32)  0         
_________________________________________________________________
conv3d_4 (Conv3D)            (None, 24, 128, 128, 64)  55360     
_________________________________________________________________
alpha_dropout_5 (AlphaDropou (None, 24, 128, 128, 64)  0         
_________________________________________________________________
max_pooling3d_4 (MaxPooling3 (None, 12, 64, 64, 64) 

In [32]:
# Build the optimizer, loss and metrics up front, then compile the model.
optimizer = keras.optimizers.Adam(learning_rate)
loss_fn = keras.losses.BinaryCrossentropy()
# Confusion-matrix counts plus the usual binary-classification summaries.
# Keras reports the validation versions under a "val_" prefix (e.g. "val_auc").
tracked_metrics = [
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.BinaryAccuracy(name="accuracy"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
    keras.metrics.AUC(name="auc"),
]

cnn.compile(optimizer=optimizer, loss=loss_fn, metrics=tracked_metrics)

In [None]:
# Metric that drives both checkpointing and early stopping (higher is better).
monitor_metric = "val_auc"

# One timestamp ties the checkpoint file and the TensorBoard run together.
start_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Make sure the checkpoint directory exists before the first save.
Path("models").mkdir(parents=True, exist_ok=True)
best_checkpoint = f"models/baseline-{start_time}.h5"
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    best_checkpoint, monitor=monitor_metric, mode="max", verbose=1, save_best_only=True
)
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor=monitor_metric, patience=patience, mode="max"
)

log_dir = f"logs/baseline-{start_time}"
file_writer = tf.summary.create_file_writer(log_dir)
# Record the hyperparameters BEFORE training starts, so the run is still
# documented in TensorBoard if `fit` is interrupted or raises.
with file_writer.as_default():
    tf.summary.text(
        "Hyperparameters",
        f"{seed=}; "
        f"{input_shape=}; "
        f"{epochs=}; "
        f"{patience=}; "
        f"{batch_size=}; "
        f"{learning_rate=}; "
        f"{dropout_rate=}; "
        f"{val_perc=}; "
        f"{test_perc=}",
        step=0,
    )
file_writer.flush()

tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=False,
    profile_batch=0,
)
cnn.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=[checkpoint_cb, early_stopping_cb, tensorboard_cb],
)
# Continue with the best checkpoint rather than the weights of the last epoch.
cnn = keras.models.load_model(best_checkpoint)

Epoch 1/1000
    220/Unknown - 50s 229ms/step - loss: 0.6360 - tp: 613.0000 - fp: 181.0000 - tn: 22.0000 - fn: 64.0000 - accuracy: 0.7216 - precision: 0.7720 - recall: 0.9055 - auc: 0.5648
Epoch 00001: val_auc improved from -inf to 0.48494, saving model to models/baseline-20201026-155003.h5
Epoch 2/1000
Epoch 00002: val_auc did not improve from 0.48494
Epoch 3/1000
Epoch 00003: val_auc did not improve from 0.48494
Epoch 4/1000
Epoch 00004: val_auc improved from 0.48494 to 0.51628, saving model to models/baseline-20201026-155003.h5
Epoch 5/1000
Epoch 00005: val_auc did not improve from 0.51628
Epoch 6/1000
Epoch 00006: val_auc did not improve from 0.51628
Epoch 7/1000
Epoch 00007: val_auc did not improve from 0.51628
Epoch 8/1000
Epoch 00008: val_auc improved from 0.51628 to 0.53523, saving model to models/baseline-20201026-155003.h5
Epoch 9/1000
Epoch 00009: val_auc improved from 0.53523 to 0.56244, saving model to models/baseline-20201026-155003.h5
Epoch 10/1000
Epoch 00010: val_auc d

Epoch 14/1000
Epoch 00014: val_auc did not improve from 0.56244
Epoch 15/1000
Epoch 00015: val_auc did not improve from 0.56244
Epoch 16/1000

In [None]:
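# Optional: to skip training, uncomment the line below and point it at a
# previously saved checkpoint.
# cnn = keras.models.load_model("models/baseline-20201025-005657.h5")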

In [None]:
# Score the current `cnn` on the held-out test split; the metrics come back
# as a {name: value} dict and are displayed as the cell output.
test_metrics = cnn.evaluate(test_dataset, return_dict=True, verbose=1)
test_metrics

In [None]:
# Inspect a single test sample: raise `sample_offset` to look at a later one.
sample_offset = 0
sample_iterator = iter(test_dataset.skip(sample_offset))
# NOTE(review): `x` was previously bound to the model's intermediate tensor;
# it is rebound here to the sampled input batch.
x, y = next(sample_iterator)
prediction = cnn(x, training=False)
print(f"real: {y.numpy()}, prediction: {prediction.numpy()}")
# Drop the batch axis and animate the selected volume.
plot_animated_volume(x[0, :], fps=2)

In [None]:
def prediction_bias(dataset):
    """Measure how far the mean prediction is from the mean label.

    The prediction bias is the absolute difference
        |average_labels - average_predictions|
    and should be close to zero.

    `dataset` must yield batched `(input, label)` pairs. Predictions come from
    the notebook-level model `cnn`, with every input padded to `input_shape`.

    Return the tuple (label_avg, prediction_avg, prediction_bias).
    """
    # First pass over the dataset: average of the labels.
    labels = []
    for _, label in dataset.unbatch():
        labels.append(label.numpy()[0])
    label_avg = np.mean(labels)

    def iter_inputs():
        # Yield only the input half of every (input, label) batch.
        for batch_inputs, _ in dataset:
            yield batch_inputs

    # Second pass: re-wrap the inputs as single-sample batches padded to the
    # model's input shape.
    single_inputs = tf.data.Dataset.from_generator(iter_inputs, tf.float32)
    single_inputs = single_inputs.unbatch()
    single_inputs = single_inputs.padded_batch(1, input_shape)

    scores = []
    for sample in single_inputs:
        scores.append(cnn(sample, training=False).numpy()[0][0])
    prediction_avg = np.mean(scores)

    return label_avg, prediction_avg, np.abs(label_avg - prediction_avg)

In [None]:
# Unpack (label average, prediction average, bias) for the test split
# and print one line per value.
l, p, b = prediction_bias(test_dataset)
print(
    f"Labels average: {l}",
    f"Predictions average: {p}",
    f"Prediction bias: {b}",
    sep="\n",
)