In [None]:
# library dependencies
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import lzma
import pickle
from sklearn.model_selection import train_test_split
import keras
from keras import layers
import tensorflow as tf

In [None]:
def can_pickles(data, filename):
    """Pickle `data` and write it, LZMA-compressed, to `filename`.

    Args:
        data: any picklable Python object.
        filename: path of the `.xz` file to create (overwritten if present).
    """
    with lzma.open(filename, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.DEFAULT_PROTOCOL)

def uncan_pickles(filename):
    """Read an LZMA-compressed pickle file and return the object stored in it.

    Args:
        filename: path of a file holding an LZMA-compressed pickle stream.

    Returns:
        The unpickled Python object.
    """
    # NOTE: unpickling can execute arbitrary code; only open files from a trusted source.
    with lzma.open(filename, 'rb') as source:
        restored = pickle.load(source)
    return restored

baseline data

X holds a list of one-hot encoded sequences
y holds the matching PTR values (stored as strings; converted to floats below)

In [None]:
# Read the prepared inputs (X) and targets (y) back from their LZMA-compressed pickle files.
# NOTE: uncan_pickles uses pickle.load, which can execute arbitrary code — only read trusted files.
X = uncan_pickles('../data/onehot_x_lung.pickle.xz')
y = uncan_pickles('../data/onehot_y_lung.pickle.xz')

In [None]:
# Pad every sequence at its end (padding="post") so all samples share one length.
# NOTE(review): the model builders below hard-code an input shape of (7999, 4) — confirm that
# X_padded.shape[1:] matches it.
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X, padding="post")

In [None]:
# The target values are loaded as strings; convert them into a float NumPy array.
y = np.array(y).astype(float)

In [None]:
# peek at the first raw (not yet padded) input sequence
X[0]

In [None]:
# peek at the first target value
y[0]

In [None]:
# Get an idea of the range of the PTR targets in the selected sample:
# prints minimum, maximum, mean and standard deviation, in that order.
print(np.min(y), np.max(y), np.mean(y), np.std(y))

In [None]:
# Simple/dumb baseline: mean absolute error of always predicting the mean target.
# The constant is derived from the data instead of a hand-copied literal (previously 4.974),
# so the baseline stays correct when the dataset changes.
baseline_prediction = np.mean(y)
mae = np.mean(np.abs(np.asarray(y) - baseline_prediction))
mae

In [None]:
# number of input sequences in the raw dataset
len(X)

split data into train, validation and test subsets

verify that the split worked properly

In [None]:
# Split into train and test subsets: 20% of the samples are held out for testing.
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=1202)

In [None]:
# Split the train set again: 20% of it becomes the validation set
# (roughly 64% train / 16% validation / 20% test of the full dataset).
# NOTE: this rebinds X_train / y_train, so re-running only this cell shrinks the train set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1202)

In [None]:
# look at one sample of the training subset (index 1)
X_train[1]

In [None]:
# the target value belonging to the sample above (same index 1)
y_train[1]

In [None]:
# Find every position in the raw dataset whose target equals the sampled training target.
# The value is taken straight from y_train[1] rather than a hand-copied float literal
# (previously 3.746), so the lookup is exact and survives a change of data or random seed.
for i in np.flatnonzero(y == y_train[1]):
    print(i)

In [None]:
# Check that the training sample really is the raw dataset entry with the same target.
# `a.all() == b.all()` only compares two booleans ("are all elements non-zero?") and is
# true for almost any pair of arrays, so compare element-wise with np.array_equal instead.
# The candidate rows are looked up by target value rather than by a hard-coded index.
candidate_rows = np.flatnonzero(y == y_train[1])
if any(np.array_equal(X_padded[i], X_train[1]) for i in candidate_rows):
    print('OK')
else:
    print('MISMATCH: no raw entry with this target equals X_train[1]')

model building

In [None]:
def zrimec_model():
    """Build the uncompiled 'ZrimecModel' convolutional regressor.

    Three Conv1D -> BatchNormalization -> Dropout -> MaxPooling1D stages
    (32, 64 and 128 filters, kernel size 10, pool size 4) are followed by a
    64-unit dense layer with batch normalization and dropout, and a single
    linear output unit.

    Returns:
        keras.Model: model taking inputs of shape (7999, 4) and producing one value.
    """
    inputs = layers.Input(shape=(7999, 4))

    conv1 = layers.Conv1D(
        filters=32,
        kernel_size=10,
        strides=1,
        activation='relu',
        padding='valid'
    )(inputs)
    norm1 = layers.BatchNormalization()(conv1)
    drop1 = layers.Dropout(
        rate=0.1
    )(norm1)
    pool1 = layers.MaxPooling1D(
        pool_size=4,
        strides=4
    )(drop1)
    conv2 = layers.Conv1D(
        filters=64,
        kernel_size=10,
        strides=1,
        activation='relu',
        padding='valid'
    )(pool1)
    norm2 = layers.BatchNormalization()(conv2)
    drop2 = layers.Dropout(
        rate=0.1
    )(norm2)
    pool2 = layers.MaxPooling1D(
        pool_size=4,
        strides=4
    )(drop2)
    conv3 = layers.Conv1D(
        filters=128,
        kernel_size=10,
        strides=1,
        activation='relu',
        padding='valid'
    )(pool2)
    norm3 = layers.BatchNormalization()(conv3)
    drop3 = layers.Dropout(
        rate=0.1
    )(norm3)
    pool3 = layers.MaxPooling1D(
        pool_size=4,
        strides=4
    )(drop3)
    flat = layers.Flatten()(pool3)
    dense = layers.Dense(
        units=64,
        activation='relu'
    )(flat)
    # Normalize the dense layer's output. This was previously wired to `flat`,
    # which left the dense layer disconnected from the model output.
    norm4 = layers.BatchNormalization()(dense)
    drop4 = layers.Dropout(
        rate=0.1
    )(norm4)
    outputs = layers.Dense(
        units=1
    )(drop4)

    model = keras.Model(inputs=inputs, outputs=outputs, name='ZrimecModel')
    return model

In [None]:
def danq_model():
    """Build the uncompiled 'DanQModel': one convolution stage, a bidirectional LSTM, a dense head.

    Returns:
        keras.Model: model taking inputs of shape (7999, 4) and producing one value.
    """
    sequence_in = layers.Input(shape=(7999, 4))

    # convolution -> max pooling -> dropout
    x = layers.Conv1D(filters=320, kernel_size=26, padding='valid', activation='relu')(sequence_in)
    x = layers.MaxPooling1D(pool_size=13, strides=13)(x)
    x = layers.Dropout(rate=0.2)(x)

    # bidirectional LSTM with explicitly constructed forward and backward layers
    lstm_fwd = layers.LSTM(units=320, return_sequences=True)
    lstm_bwd = layers.LSTM(units=320, return_sequences=True, go_backwards=True)
    x = layers.Bidirectional(lstm_fwd, backward_layer=lstm_bwd)(x)
    x = layers.Dropout(rate=0.2)(x)

    # dense head ending in a single linear unit
    x = layers.Flatten()(x)
    x = layers.Dense(units=925, activation='relu')(x)
    prediction = layers.Dense(units=1)(x)

    return keras.Model(inputs=sequence_in, outputs=prediction, name='DanQModel')

In [None]:
def baseline_model():
    """Build the uncompiled 'BaselineModel': convolution, bidirectional LSTM, dense head.

    Returns:
        keras.Model: model taking inputs of shape (7999, 4) and producing one value.
    """
    # fixed shape, since the sequences are padded to the max length (threshold of preproc2)
    seq_in = layers.Input(shape=(7999, 4))

    # convolutional stage: 1D convolution -> batch normalization -> max pooling -> dropout
    conv_out = layers.Conv1D(filters=320, kernel_size=26, strides=1, activation='relu')(seq_in)
    conv_norm = layers.BatchNormalization()(conv_out)
    conv_pool = layers.MaxPooling1D(pool_size=13, strides=13)(conv_norm)
    conv_drop = layers.Dropout(rate=0.1)(conv_pool)

    # recurrent stage: bi-directional LSTM (both directions merged by multiplication)
    # -> max pooling -> dropout
    lstm_cell = layers.LSTM(units=320, dropout=0, return_sequences=True)
    rnn_out = layers.Bidirectional(lstm_cell, merge_mode='mul')(conv_drop)
    rnn_pool = layers.MaxPooling1D(pool_size=13, strides=13)(rnn_out)
    rnn_drop = layers.Dropout(rate=0.1)(rnn_pool)

    # head: flatten -> fully connected -> batch normalization -> dropout -> single linear unit
    flattened = layers.Flatten()(rnn_drop)
    hidden = layers.Dense(units=64, activation='relu')(flattened)
    hidden_norm = layers.BatchNormalization()(hidden)
    hidden_drop = layers.Dropout(rate=0.1)(hidden_norm)
    prediction = layers.Dense(units=1)(hidden_drop)

    return keras.Model(inputs=seq_in, outputs=prediction, name='BaselineModel')

In [None]:
def augur_model():
    """Build the uncompiled 'AugurModel': one 32-unit LSTM, dropout, and a single linear unit.

    Returns:
        keras.Model: model taking inputs of shape (7999, 4) and producing one value.
    """
    sequence_in = layers.Input(shape=(7999, 4))

    # recurrent layer applied directly to the input sequence
    recurrent = layers.LSTM(units=32, recurrent_dropout=0.25)(sequence_in)
    regularized = layers.Dropout(rate=0.2)(recurrent)
    prediction = layers.Dense(units=1)(regularized)

    return keras.Model(inputs=sequence_in, outputs=prediction, name='AugurModel')

In [None]:
# Select the architecture used by the manual experiments further down by (un)commenting one line.
# Observation so far: both zrimec and danq have a really bad performance (at least with
# 5 iterations); danq is also slow because it is big.
# model = zrimec_model()
# model = danq_model()
# model = baseline_model()
model = augur_model()

In [None]:
def model_test(model, epochs=5):
    """Compile, train and evaluate a model, then plot its loss curves.

    Training uses the notebook-level X_train / y_train, validation uses
    X_val / y_val, and one prediction on X_test is printed as a sanity check.

    Args:
        model: a built Keras model, or a zero-argument builder function
            (e.g. `zrimec_model`) that returns one.
        epochs: number of training epochs.
    """
    # Accept a builder as well as a built model. A function object has no
    # `fit`, so without this `model.summary()` fails when a builder is passed.
    if not hasattr(model, 'fit') and callable(model):
        model = model()

    model.summary()

    model.compile(
        loss=keras.losses.MeanSquaredError(),
        # optimizer=keras.optimizers.Adam(learning_rate=0.1, beta_1=0.999, beta_2=0.99, epsilon=1e-6),
        optimizer=keras.optimizers.Adam(),
        # Every entry is a metric object; MSE was previously passed as a loss object.
        metrics=[
            keras.metrics.MeanAbsolutePercentageError(),
            keras.metrics.RootMeanSquaredError(),
            keras.metrics.MeanSquaredError(),
            keras.metrics.MeanAbsoluteError(),
        ],
    )

    history = model.fit(
        X_train,
        y_train,
        batch_size=64,
        epochs=epochs,
        validation_data=(X_val, y_val),
        # callbacks=callbacks
    )

    y_pred = model.predict(X_test)
    print('Random prediction sample (truth, prediction):', y_test[0], y_pred[0])

    plot_loss(history)

In [None]:
def plot_loss(hist):
    """Plot training and validation loss per epoch.

    Args:
        hist: object whose `.history` dict holds the "loss" and "val_loss"
            series, such as the value returned by `model.fit`.
    """
    train_loss = hist.history["loss"]
    val_loss = hist.history["val_loss"]
    epoch_axis = range(1, len(train_loss) + 1)

    # blue dots for training, solid blue line for validation
    curves = (
        (train_loss, "bo", "Training loss"),
        (val_loss, "b", "Validation loss"),
    )
    for series, fmt, label in curves:
        plt.plot(epoch_axis, series, fmt, label=label)

    plt.title("Training and validation loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

In [None]:
# build the model first: model_test calls summary()/compile()/fit() on its argument
model_test(zrimec_model())

In [None]:
# build the model first: model_test calls summary()/compile()/fit() on its argument
model_test(danq_model())

In [None]:
# build the model first: model_test calls summary()/compile()/fit() on its argument
model_test(baseline_model())

In [None]:
# build the model first: model_test calls summary()/compile()/fit() on its argument
model_test(augur_model())

manual tests

In [None]:
# layer overview of the manually selected `model` (assigned in the model-selection cell above)
model.summary()

In [None]:
# Render the architecture of `model` to an image file.
# NOTE(review): the file is always named 'baseline.png', whichever model is currently selected.
keras.utils.plot_model(model, 'baseline.png')
# Optional: display the written image inline with matplotlib.
# img = plt.imread('baseline.png')
# plt.figure(figsize=(10,10))
# plt.imshow(img)
# plt.axis('off')
# plt.show()

In [None]:
# Compile the manually selected model for regression: MSE loss, with MAPE, RMSE, MSE and MAE
# tracked as metrics.
# NOTE(review): learning_rate=0.1, beta_1=0.999 and beta_2=0.99 are far from Adam's documented
# defaults (0.001, 0.9, 0.999), and model_test uses a plain Adam() — confirm they are intended.
model.compile(
    loss=keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.Adam(learning_rate=0.1, beta_1=0.999, beta_2=0.99, epsilon=1e-6),
    # Every entry is a metric object; MSE was previously passed as a loss object.
    metrics=[
        keras.metrics.MeanAbsolutePercentageError(),
        keras.metrics.RootMeanSquaredError(),
        keras.metrics.MeanSquaredError(),
        keras.metrics.MeanAbsoluteError(),
    ],
)

# LOSS
# works keras.losses.MeanSquaredError(),

# METRIC
# useless keras.metrics.Accuracy(),

In [None]:
# Callbacks passed to the manual model.fit call below; currently empty.
# Uncomment the checkpoint to save the best model seen during training.
callbacks = [
    # keras.callbacks.ModelCheckpoint("PTR_baseline.keras", save_best_only=True)
]

In [None]:
# alternative method to pre-padding the sequences
# X_train_ragged = tf.ragged.constant(X_train, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)
# dataset = tf.data.Dataset.from_tensor_slices(tensor)
# dataset = dataset.map(lambda x: x)

In [None]:
# Train the manually compiled model for 10 epochs in batches of 64, validating on the
# validation subset after each epoch; `history` is used by the plotting cells below.
history = model.fit(
    X_train, 
    y_train, 
    batch_size=64, 
    epochs=10, 
    validation_data=(X_val, y_val), 
    callbacks=callbacks
)

In [None]:
# predict the targets of the held-out test subset
y_pred = model.predict(X_test)

In [None]:
# compare the first test target (truth) with its prediction
print(y_test[0], y_pred[0])

In [None]:
# list which series (loss and metrics) were recorded during training
history_dict = history.history
history_dict.keys()

In [None]:
# training vs. validation loss of the manual run
plot_loss(history)

In [None]:
def full_model(seq_len=7999, n_channels=10, n_features=29):
    """Build the uncompiled two-input 'full_model'.

    A sequence branch (Conv1D -> BatchNormalization -> MaxPooling1D -> Dropout ->
    bidirectional LSTM -> BatchNormalization -> MaxPooling1D -> Dropout -> Flatten)
    is concatenated with a second, flat feature input and fed through a dense head
    ending in a single linear unit.

    Fixes relative to the first draft, which could not be built:
      * layers are taken from `keras.layers` (`keras.Conv1D` etc. do not exist);
      * the LSTM returns the full sequence, because MaxPooling1D needs a 3D input;
      * the flattened sequence features, not the raw sequence input, are
        concatenated with the second input;
      * the sequence length is fixed, because Flatten followed by Dense needs a
        known feature size (a length of None cannot work here).

    Args:
        seq_len: fixed length of the input sequences. NOTE(review): defaults to
            7999 to match the other builders in this notebook — confirm.
        n_channels: number of channels per sequence position.
        n_features: size of the second (flat) input vector.

    Returns:
        keras.Model: model with inputs [sequence, features] and one output value.
    """
    # sequence input
    inputs = keras.Input(shape=(seq_len, n_channels))
    # 1D convolution
    conv = layers.Conv1D(
        filters=128,
        kernel_size=10,
        strides=1,
        activation='relu'
    )(inputs)
    # batch normalization
    norm = layers.BatchNormalization()(conv)
    # maxpool
    pool = layers.MaxPooling1D(
        pool_size=4,
        strides=4
    )(norm)
    # dropout
    drop = layers.Dropout(rate=0.2)(pool)
    # bi-directional LSTM; return_sequences keeps the time axis for the pooling below
    bilstm = layers.Bidirectional(
        layers.LSTM(
            units=128,
            dropout=0,
            return_sequences=True
        ),
        merge_mode='concat'
    )(drop)
    # batch normalization
    norm = layers.BatchNormalization()(bilstm)
    # maxpool
    pool = layers.MaxPooling1D(
        pool_size=4,
        strides=4
    )(norm)
    # dropout
    drop = layers.Dropout(rate=0.2)(pool)
    # flatten
    flat = layers.Flatten()(drop)

    # second input
    inputs2 = keras.Input(shape=(n_features,))

    # concatenate the learned sequence features with the second input
    conc = layers.Concatenate(axis=1)([flat, inputs2])

    # fully connected
    dense = layers.Dense(
        units=64,
        activation='relu'
    )(conc)
    # batch normalization
    norm = layers.BatchNormalization()(dense)
    # dropout
    drop = layers.Dropout(rate=0.2)(norm)
    # dense
    outputs = layers.Dense(units=1)(drop)

    # model
    model = keras.Model(inputs=[inputs, inputs2], outputs=outputs, name='full_model')

    return model