In [1]:
import time
import datetime
from pathlib import Path
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

from model import conv_block
from data import example_to_tensor, normalize, add_channel_axis, train_test_split
from plot import plot_slice, plot_animated_volume
from config import data_root_dir, seed

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size as (width, height) for every plot in this notebook.
plt.rcParams["figure.figsize"] = [15, 7]

In [7]:
# Shape of a single model input; given to keras.layers.Input when the model is
# built and used as the padded shape inside prediction_bias.
input_shape = (48, 256, 256, 1)
# Glob patterns of the TFRecord files, resolved relative to data_root_dir.
neg_tfrecord_glob = "covid-neg/*.tfrecord"
pos_tfrecord_glob = "covid-pos/*.tfrecord"

epochs = 1000  # upper bound passed to fit(); early stopping may end training sooner
patience = 10  # EarlyStopping patience on the monitored validation metric
batch_size = 8  # batch size of the training and validation datasets
learning_rate = 0.0001  # learning rate of the Adam optimizer
dropout_rate = 0.0  # forwarded to every conv_block and to the AlphaDropout layer
val_perc = 0.12  # validation fraction, taken from what remains after the test split
test_perc = 0.1  # test fraction, taken from the full (negative + positive) dataset

In [3]:
# Every negative-class TFRecord file matched under the data root.
neg_tfrecord_fnames = list(map(str, Path(data_root_dir).glob(neg_tfrecord_glob)))

# Preprocessing pipeline, one stage per statement: example_to_tensor, then
# normalize, then add_channel_axis, each mapped with an autotuned parallelism.
neg_x = tf.data.TFRecordDataset(neg_tfrecord_fnames)
neg_x = neg_x.map(example_to_tensor, num_parallel_calls=tf.data.experimental.AUTOTUNE)
neg_x = neg_x.map(normalize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
neg_x = neg_x.map(add_channel_axis, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# The sample count is hard-coded instead of being measured. The alternatives
# kept below are a full pass over the dataset and the value 254 (presumably
# the real number of negative samples -- TODO confirm).
# num_neg = sum(1 for _ in neg_x)
num_neg = 250
# num_neg = 254
print(f"Number of negative samples: {num_neg}")
neg_x

Number of negative samples: 250


<ParallelMapDataset shapes: (None, None, None, 1), types: tf.float32>

In [4]:
# Every positive-class TFRecord file matched under the data root.
pos_tfrecord_fnames = list(map(str, Path(data_root_dir).glob(pos_tfrecord_glob)))

# Preprocessing pipeline, one stage per statement: example_to_tensor, then
# normalize, then add_channel_axis, each mapped with an autotuned parallelism.
pos_x = tf.data.TFRecordDataset(pos_tfrecord_fnames)
pos_x = pos_x.map(example_to_tensor, num_parallel_calls=tf.data.experimental.AUTOTUNE)
pos_x = pos_x.map(normalize, num_parallel_calls=tf.data.experimental.AUTOTUNE)
pos_x = pos_x.map(add_channel_axis, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# The sample count is hard-coded instead of being measured. The alternatives
# kept below are a full pass over the dataset and the value 856 (presumably
# the real number of positive samples -- TODO confirm).
# num_pos = sum(1 for _ in pos_x)
num_pos = 250
# num_pos = 856
print(f"Number of positive samples: {num_pos}")
pos_x

Number of positive samples: 250


<ParallelMapDataset shapes: (None, None, None, 1), types: tf.float32>

In [5]:
# Label stream for the negative class: the int8 tensor [0], emitted num_neg times.
neg_y = tf.data.Dataset.from_tensors(tf.constant([0], dtype=tf.int8))
neg_y = neg_y.repeat(num_neg)
# Pair every negative volume with its label as an (x, y) element.
neg_dataset = tf.data.Dataset.zip((neg_x, neg_y))
neg_dataset

<ZipDataset shapes: ((None, None, None, 1), (1,)), types: (tf.float32, tf.int8)>

In [6]:
# Label stream for the positive class: the int8 tensor [1], emitted num_pos times.
pos_y = tf.data.Dataset.from_tensors(tf.constant([1], dtype=tf.int8))
pos_y = pos_y.repeat(num_pos)
# Pair every positive volume with its label as an (x, y) element.
pos_dataset = tf.data.Dataset.zip((pos_x, pos_y))
pos_dataset

<ZipDataset shapes: ((None, None, None, 1), (1,)), types: (tf.float32, tf.int8)>

In [8]:
# Merge both classes, then set the test split aside before anything else.
dataset = neg_dataset.concatenate(pos_dataset)
dataset, test_dataset = train_test_split(
    dataset,
    test_perc=test_perc,
    cardinality=(num_pos + num_neg),
    seed=seed,
)
# The test set is consumed one volume at a time.
test_dataset = test_dataset.batch(1)
# Split the remaining samples into training and validation subsets.
train_dataset, val_dataset = train_test_split(
    dataset,
    test_perc=val_perc,
    cardinality=None,
    seed=seed,
)
val_dataset = (
    val_dataset.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
)
train_dataset = (
    train_dataset.cache()  # must be called before shuffle so each epoch is reshuffled
    # Shuffle individual samples *before* batching: shuffling after batch()
    # only permutes whole batches, so every batch would keep the same members
    # for the entire training run. The buffer holds as many samples as the
    # previous 64-batch buffer did (64 * batch_size).
    .shuffle(buffer_size=64 * batch_size, reshuffle_each_iteration=True)
    .batch(batch_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
train_dataset

<PrefetchDataset shapes: ((None, None, None, None, 1), (None, 1)), types: (tf.float32, tf.int8)>

In [9]:
def count_labels(dataset):
    """Tally how many samples of `dataset` carry each label.

    The dataset is unbatched and iterated as (features, label) pairs; the first
    entry of each label (after `.numpy()`) is used as the dictionary key.
    Returns a plain dict mapping label value -> number of occurrences, with
    keys in first-seen order.
    """
    tally = Counter()
    for _, label in dataset.unbatch():
        tally[label.numpy()[0]] += 1
    return dict(tally)


# Report the per-label sample counts of each split, e.g. to check that the
# classes stay reasonably balanced after splitting.
print(f"Train labels:\n\t{count_labels(train_dataset)}")
print(f"Validation labels:\n\t{count_labels(val_dataset)}")
print(f"Test labels:\n\t{count_labels(test_dataset)}")

Train labels:
	{0: 201, 1: 195}
Validation labels:
	{0: 23, 1: 31}
Test labels:
	{1: 24, 0: 26}


In [10]:
def build_and_compile_model(learning_rate, dropout_rate):
    """Assemble the "baseline-3dcnn" Keras model and compile it for training.

    Three `conv_block` stages (32, 64 and 128 filters) are stacked on an input
    of shape `input_shape` (notebook-level constant). Their output is
    flattened and fed to a 512-unit SELU dense layer, an alpha-dropout layer
    and a single sigmoid output unit.

    Parameters
    ----------
    learning_rate
        Learning rate handed to the Adam optimizer.
    dropout_rate
        Rate forwarded to every `conv_block` and to the AlphaDropout layer.

    Returns
    -------
    The compiled `keras.Model`, trained with binary cross-entropy and tracking
    tp / fp / tn / fn counts plus binary accuracy.
    """
    inputs = keras.layers.Input(shape=input_shape)

    # Convolutional stages, widening the filter count at each step.
    features = inputs
    for n_filters in (32, 64, 128):
        features = conv_block(features, filters=n_filters, dropout_rate=dropout_rate)

    # Dense head: SELU is paired with lecun_normal init and AlphaDropout.
    features = keras.layers.Flatten()(features)
    hidden_layer = keras.layers.Dense(
        units=512,
        activation="selu",
        kernel_initializer="lecun_normal",
        bias_initializer="zeros",
    )
    features = hidden_layer(features)
    features = keras.layers.AlphaDropout(dropout_rate)(features)
    outputs = keras.layers.Dense(1, activation="sigmoid")(features)

    model = keras.Model(inputs, outputs, name="baseline-3dcnn")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[
            keras.metrics.TruePositives(name="tp"),
            keras.metrics.FalsePositives(name="fp"),
            keras.metrics.TrueNegatives(name="tn"),
            keras.metrics.FalseNegatives(name="fn"),
            keras.metrics.BinaryAccuracy(name="accuracy"),
        ],
    )
    return model

In [11]:
cnn = build_and_compile_model(learning_rate, dropout_rate)

# Metric watched by both the checkpoint and the early-stopping callbacks.
monitor_metric = "val_accuracy"

# Timestamp that ties the checkpoint file and the log directory of this run together.
start_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
best_checkpoint = f"models/baseline-{start_time}.h5"
# Make sure the checkpoint directory exists before the first save is attempted.
Path(best_checkpoint).parent.mkdir(parents=True, exist_ok=True)
# Keep only the weights of the epoch with the highest monitored metric.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    best_checkpoint,
    monitor=monitor_metric,
    mode="max",
    verbose=1,
    save_best_only=True,
)
# Stop once the monitored metric has not improved for `patience` epochs.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor=monitor_metric, patience=patience, mode="max"
)
log_dir = f"logs/baseline-{start_time}"
file_writer = tf.summary.create_file_writer(log_dir)
# Record the hyperparameters *before* training starts and flush them right
# away, so they end up in the logs even when the run is interrupted or fails.
with file_writer.as_default():
    tf.summary.text(
        "Hyperparameters",
        f"{seed=}; "
        f"{input_shape=}; "
        f"{epochs=}; "
        f"{patience=}; "
        f"{batch_size=}; "
        f"{learning_rate=}; "
        f"{dropout_rate=}; "
        f"{val_perc=}; "
        f"{test_perc=}",
        step=0,
    )
file_writer.flush()
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=False,
    profile_batch=0,
)
# Assigning the result keeps the History object available for inspection and
# prevents its repr from being echoed as the cell output.
history = cnn.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=[checkpoint_cb, early_stopping_cb, tensorboard_cb],
)

Epoch 1/1000
     50/Unknown - 24s 479ms/step - loss: 9.4927 - tp: 10.0000 - fp: 19.0000 - tn: 182.0000 - fn: 185.0000 - accuracy: 0.4848
Epoch 00001: val_accuracy improved from -inf to 0.42593, saving model to models/baseline-20201031-132941.h5
Epoch 2/1000
Epoch 00002: val_accuracy did not improve from 0.42593
Epoch 3/1000
Epoch 00003: val_accuracy did not improve from 0.42593
Epoch 4/1000
Epoch 00004: val_accuracy improved from 0.42593 to 0.57407, saving model to models/baseline-20201031-132941.h5
Epoch 5/1000
Epoch 00005: val_accuracy did not improve from 0.57407
Epoch 6/1000
Epoch 00006: val_accuracy did not improve from 0.57407
Epoch 7/1000
Epoch 00007: val_accuracy did not improve from 0.57407
Epoch 8/1000
Epoch 00008: val_accuracy did not improve from 0.57407
Epoch 9/1000
Epoch 00009: val_accuracy did not improve from 0.57407
Epoch 10/1000
Epoch 00010: val_accuracy did not improve from 0.57407
Epoch 11/1000
Epoch 00011: val_accuracy did not improve from 0.57407
Epoch 12/1000
Ep

In [21]:
# Restore a previously saved checkpoint and score it on the held-out test set;
# the returned metrics dictionary is the value displayed by this cell.
cnn = keras.models.load_model(filepath="models/baseline-20201029-111058.h5")
cnn.evaluate(x=test_dataset, verbose=1, return_dict=True)



{'loss': 0.7026064395904541,
 'tp': 19.0,
 'fp': 16.0,
 'tn': 10.0,
 'fn': 5.0,
 'accuracy': 0.5799999833106995}

In [26]:
# Restore a previously saved checkpoint and score it on the held-out test set;
# the returned metrics dictionary is the value displayed by this cell.
cnn = keras.models.load_model(filepath="models/baseline-20201029-113438.h5")
cnn.evaluate(x=test_dataset, verbose=1, return_dict=True)



{'loss': 1.1716235876083374,
 'tp': 24.0,
 'fp': 26.0,
 'tn': 0.0,
 'fn': 0.0,
 'accuracy': 0.47999998927116394}

In [27]:
# Restore a previously saved checkpoint and score it on the held-out test set;
# the returned metrics dictionary is the value displayed by this cell.
# NOTE(review): the output stored with this cell is an OSError that mentions a
# "20201019" path, which does not match the "20201029" path loaded here --
# re-run the cell to confirm that this checkpoint file actually exists.
cnn = keras.models.load_model(filepath="models/baseline-20201029-115235.h5")
cnn.evaluate(x=test_dataset, verbose=1, return_dict=True)

OSError: SavedModel file does not exist at: models/baseline-20201019-115235.h5/{saved_model.pbtxt|saved_model.pb}

In [None]:
# Fetch the element that follows the first five of the test set; test_dataset
# is batched with size 1, so x and y hold a single sample each.
x, y = next(iter(test_dataset.skip(5)))
# Call the model directly in inference mode (training=False).
prediction = cnn(x, training=False)
print(f"real: {y.numpy()}, prediction: {prediction.numpy()}")
# Display the first item of the batch with the project's animation helper.
plot_animated_volume(x[0, :], fps=3)

In [None]:
def prediction_bias(dataset, model=None):
    """Measure how far the average prediction is from the average label.

    The prediction bias computed here is the absolute difference
        |average_labels - average_predictions|
    and should be near zero.

    Parameters
    ----------
    dataset
        Batched dataset of (x, label) pairs. It is traversed twice: once for
        the labels and once for the predictions.
    model
        Model used to predict; it is called as `model(x, training=False)`.
        Defaults to the notebook-level `cnn`.

    Returns
    -------
    The tuple (label_avg, prediction_avg, prediction_bias).
    """
    if model is None:
        # Same behavior as before the parameter existed: use the global model.
        model = cnn

    label_avg = np.mean([label.numpy()[0] for _, label in dataset.unbatch()])

    # Keep only the inputs, split the batches into single samples and feed
    # them to the model one at a time, padded to the model input shape.
    x_dataset = (
        dataset.map(lambda x, _: x)
        .unbatch()
        .padded_batch(1, input_shape)
    )
    prediction_avg = np.mean(
        [model(x, training=False).numpy()[0][0] for x in x_dataset]
    )
    return label_avg, prediction_avg, np.abs(label_avg - prediction_avg)

In [None]:
# prediction_bias returns (label_avg, prediction_avg, prediction_bias);
# here it is evaluated on the training set with the currently loaded model.
l, p, b = prediction_bias(train_dataset)
print(f"Labels average: {l}")
print(f"Predictions average: {p}")
print(f"Prediction bias: {b}")