In [1]:
# library dependencies
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import lzma
import pickle
from sklearn.model_selection import train_test_split
import keras
from keras import layers
import tensorflow as tf
import keras_tuner
import math

2024-01-17 23:19:41.507257: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-17 23:19:41.508602: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-17 23:19:41.526732: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-17 23:19:41.526751: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-17 23:19:41.527381: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
# store data as a pickle-serialized, LZMA-compressed binary file
def can_pickles(data, filename):
    """Pickle ``data`` and write it LZMA-compressed to ``filename``.

    Args:
        data: any picklable Python object.
        filename: path of the ``.xz`` file to create (overwritten if it exists).
    """
    with lzma.open(filename, 'wb') as sink:
        pickle.dump(data, sink, pickle.DEFAULT_PROTOCOL)

# retrieve data from a compressed pickle file (created with the method above)
def uncan_pickles(filename):
    """Read the LZMA-compressed pickle at ``filename`` and return the stored object.

    Args:
        filename: path of an ``.xz`` file containing a single pickled object.

    Returns:
        The unpickled Python object.
    """
    # NOTE: unpickling can execute code embedded in the file - only load trusted files
    with lzma.open(filename, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [3]:
# read the prepared data back (LZMA-compressed pickles, see uncan_pickles)
# X holds the input sequences, y the matching target values
# NOTE(review): the file names suggest one-hot encoded inputs - confirm against the preparation notebook
X = uncan_pickles('../data/onehot_x_lung.pickle.xz')
y = uncan_pickles('../data/onehot_y_lung.pickle.xz')

In [4]:
# wrap all loaded sequences (the full data set, not only a training part) in a numpy array
# dtype=object keeps every sequence as its own entry, which allows index-based reordering below
X = np.array(X, dtype=object)

In [5]:
# convert type of target values from string to float
y = np.array(y).astype(float)

In [6]:
len(X), len(y)

(8201, 8201)

### Rewrite to only use TF Dataset

In [7]:
# shuffle data before building a dataset
# helps to make sure that the input data is shuffled without getting knots in the brain due to lazy execution
# the fixed seed makes the permutation (and with it every later split) reproducible
rng = np.random.default_rng(1202)
# rand_idx starts as 0..len(X)-1 and is shuffled in place
rand_idx = np.arange(len(X))
rng.shuffle(rand_idx)

In [8]:
rand_idx

array([4301, 1552, 6386, ..., 3572, 5989, 6785])

In [9]:
# reorder X and y with the same shuffled indices so every sequence keeps its target value
X = X[rand_idx]
y = y[rand_idx]

In [None]:
# turn the object array into a ragged tensor (int8 values, int32 row splits)
# NOTE: this rebinds X - every later cell that uses X sees the RaggedTensor, not the numpy array
X = tf.ragged.constant(X, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)

In [None]:
# pair every sequence with its target value
# the two components have to be passed as a tuple: a list is interpreted as ONE tensor to
# slice, which cannot work for a ragged tensor plus a float array and would not produce
# the (data, label) elements that the map(reformat) step below expects
full_dataset = tf.data.Dataset.from_tensor_slices((X, y))

In [None]:
full_dataset.element_spec

In [None]:
# padded batches from ragged tensors are not supported (yet)
# it needs a work around creating a uniform tensor
# idea from : https://github.com/tensorflow/tensorflow/issues/39163
def reformat(data, label):
    """Identity map function: return the ``(data, label)`` pair unchanged.

    NOTE(review): following the linked issue, passing the elements through
    ``Dataset.map`` is what is expected to make ``padded_batch`` usable
    afterwards - confirm by comparing ``element_spec`` before and after the map.
    """
    pair = (data, label)
    return pair

In [None]:
full_dataset = full_dataset.map(reformat)

In [None]:
full_dataset.element_spec

In [None]:
# first 80% (rounded up) of the shuffled data is kept for training, the rest is the test set
_n_train_all = math.ceil(len(full_dataset) * 0.8)
train_all_dataset = full_dataset.take(_n_train_all)
test_dataset = full_dataset.skip(_n_train_all)

In [None]:
l_full = len(full_dataset)
l_train = len(train_all_dataset)
l_test = len(test_dataset)
l_full, l_train, l_test, l_train + l_test

In [None]:
batch_size = 32

In [None]:
# one shot training: split the training portion again, 80% (rounded up) train / rest validation
_n_train = math.ceil(len(train_all_dataset) * 0.8)
train_dataset = train_all_dataset.take(_n_train)
val_dataset = train_all_dataset.skip(_n_train)

In [None]:
# sort the training examples by sequence length so that every padded batch holds
# sequences of similar length (less padding)
# sorted() returns a plain Python list, which has no padded_batch(); the sorted examples
# are therefore wrapped in a dataset again before batching
_train_examples = sorted(train_dataset.as_numpy_iterator(), key=lambda x: len(x[0]))
train_dataset = tf.data.Dataset.from_generator(
    lambda: iter(_train_examples),
    output_signature=train_dataset.element_spec,
).padded_batch(batch_size)

In [None]:
def danq_model():
    """Build the (uncompiled) 'DanQModel' Keras network.

    Layer stack: Conv1D -> MaxPooling1D -> Dropout -> bidirectional LSTM ->
    Dropout -> GlobalMaxPool1D -> Dense(925, relu) -> Dense(1).
    The input accepts sequences of variable length with 4 channels per step;
    the single output unit has no activation.

    Returns:
        keras.Model: the assembled functional model.
    """
    inputs = layers.Input(shape=(None, 4))

    # convolution block
    x = layers.Conv1D(filters=320, kernel_size=26, padding='valid', activation='relu')(inputs)
    x = layers.MaxPooling1D(pool_size=13, strides=13)(x)
    x = layers.Dropout(rate=0.2)(x)

    # recurrent block: explicit forward/backward layers wrapped in Bidirectional
    lstm_fwd = layers.LSTM(units=320, return_sequences=True)
    lstm_bwd = layers.LSTM(units=320, return_sequences=True, go_backwards=True)
    x = layers.Bidirectional(lstm_fwd, backward_layer=lstm_bwd)(x)
    x = layers.Dropout(rate=0.2)(x)

    # global max pooling (instead of Flatten) collapses the variable-length time axis
    x = layers.GlobalMaxPool1D()(x)

    # dense head
    x = layers.Dense(units=925, activation='relu')(x)
    outputs = layers.Dense(units=1)(x)

    return keras.Model(inputs=inputs, outputs=outputs, name='DanQModel')

In [None]:
def run_model(model, tds, vds, epochs=100):
    """Compile and train a model, then return its mean absolute error on ``vds``.

    Args:
        model: a ``keras.Model`` instance, or a zero-argument factory such as
            ``danq_model`` that returns one (a fresh model is then built).
        tds: batched training dataset of (data, label) elements.
        vds: batched dataset used as validation data during training and for
            the final evaluation.
        epochs: number of training epochs.

    Returns:
        The mean absolute error reported by ``model.evaluate(vds)``.
    """
    # accept a model factory as well as a ready-built model
    if not isinstance(model, keras.Model):
        model = model()

    model.summary()

    model.compile(
        loss=keras.losses.MeanSquaredError(),
        optimizer=keras.optimizers.Adam(),
        # metrics is passed as a list; a bare metric object is not accepted by every Keras version
        metrics=[keras.metrics.MeanAbsoluteError()]
    )

    model.fit(
        tds,
        epochs=epochs,
        validation_data=vds,
        verbose=0
    )

    # evaluate on the dataset handed in by the caller (not on a notebook global);
    # evaluate() returns the loss followed by the compiled metrics
    _val_mse, val_mae = model.evaluate(vds)

    return val_mae

In [None]:
train_all_dataset.element_spec

In [None]:
# number of folds; this cell runs before the cross-validation cell that also sets it,
# so it has to be defined here to survive a fresh "Restart & Run All"
num_splits = 5
train_all_dataset.window(math.ceil(len(train_all_dataset) / num_splits))

In [None]:
# look at the individual windows
# every window of a (data, label) dataset is a tuple of two nested datasets and has no
# element_spec of its own, so the pair is zipped back into one dataset to inspect the spec;
# a bare expression inside a loop is not displayed either, hence the print
for w in train_all_dataset.window(math.ceil(len(train_all_dataset) / num_splits)):
    print(w)
    print(tf.data.Dataset.zip(w).element_spec)

In [None]:
# k-fold cross validation on the training portion (train_all_dataset)
all_scores = []
num_splits = 5
fold_size = math.ceil(len(train_all_dataset) / num_splits)

print('k-fold cross validation with', num_splits)

for i in range(num_splits):
    print('  split:', i)

    # fold i is held out for validation, everything before and after it is training data
    # take/skip keep regular (data, label) datasets; the elements produced by window() are
    # tuples of nested datasets, which can neither be concatenated nor batched directly
    val_ds = train_all_dataset.skip(i * fold_size).take(fold_size)
    train_ds = train_all_dataset.take(i * fold_size).concatenate(
        train_all_dataset.skip((i + 1) * fold_size)
    )

    # sort the training examples by sequence length before padding; sorted() yields a
    # Python list, so the examples are wrapped in a dataset again for padded_batch()
    train_examples = sorted(train_ds.as_numpy_iterator(), key=lambda x: len(x[0]))
    train_ds = tf.data.Dataset.from_generator(
        lambda examples=train_examples: iter(examples),
        output_signature=train_all_dataset.element_spec,
    ).padded_batch(batch_size)
    val_ds = val_ds.padded_batch(batch_size)

    # build a fresh model for every fold so that no weights carry over between folds
    score = run_model(danq_model(), train_ds, val_ds)
    print('  score:', score)

    all_scores.append(score)

# calculate score
mean_score = np.mean(all_scores)
print('mean score:', mean_score)

# rerun training with train_all_dataset
# train_all_dataset = sorted(train_all_dataset, key=lambda x: len(x[0]))
# train_all_dataset = train_all_dataset.padded_batch(batch_size)
# run training
# score = run_model(danq_model, train_all_dataset, test_dataset)

In [None]:
# toy example to inspect window(): range(7) is grouped into windows of up to 3 elements
dataset = tf.data.Dataset.range(7).window(3)
print(dataset.element_spec)
# every element of the windowed dataset is printed together with its own element_spec
for window in dataset:
    print(window)
    print(window.element_spec)

### Data Preparation

Split the data into train and test subsets, then split the train subset again into train and validation subsets.

A simple check then verifies that the pairing between X and y is preserved by the split.

In [None]:
# split in train and test sub sets (80% / 20%, fixed random_state for reproducibility)
# NOTE(review): X was rebound to a RaggedTensor in the "Rewrite to only use TF Dataset"
# section above - confirm that train_test_split receives the array type intended here
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1202)

In [None]:
# pad test input (variable input is not accepted)
# X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding="post")

In [None]:
# split the train set again in train and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1202)

In [None]:
# find the first unique PTR value that is also in y_train
# train_idx stays 0 when no value that is unique in y occurs in y_train
train_idx = 0
_, first_pos, counts = np.unique(y, return_index=True, return_counts=True)
# values that occur exactly once in y, visited in their original order
for candidate in y[np.sort(first_pos[counts == 1])]:
    hits = np.flatnonzero(y_train == candidate)
    if hits.size:
        # stop at the first hit so that really the FIRST unique value is used
        train_idx = int(hits[0])
        break
train_idx

In [None]:
# get a sample
X_train[train_idx]

In [None]:
# get the matching target
search_y = y_train[train_idx]
search_y

In [None]:
# locate the target value in the raw dataset; every matching position is printed
# and the last one is kept in full_idx (0 when nothing matches)
full_idx = 0
for pos in np.flatnonzero(y == search_y):
    full_idx = int(pos)
    print(full_idx)

In [None]:
# compare if the raw dataset entry matches the subset entry
# the comparison has to be element-wise: `a.all() == b.all()` only compares two booleans
# ("are all entries non-zero?") and therefore reports OK for practically any pair of rows
if np.array_equal(np.asarray(X[full_idx]), np.asarray(X_train[train_idx])):
    print('OK')
else:
    print('MISMATCH')

### Sort Training Data

In [None]:
# build an inhomogenous numpy array from the training set
X_train = np.array(X_train, dtype=object)

In [None]:
# collect the length of every training sequence (same order as X_train)
sequence_lengths = [len(seq) for seq in X_train]

In [None]:
# sort the array but only get the indices
sorted_indices = np.argsort(sequence_lengths)

In [None]:
sorted_indices

In [None]:
# now reorder the X and y train arrays with the indices sorted by sequence length
# (the same index array is applied to both, so sequences and targets stay paired)
X_train = X_train[sorted_indices]
y_train = y_train[sorted_indices]

In [None]:
# check if the previously found values still correlate
for i in range(len(y_train)):
    if y_train[i] == search_y:
        print(X_train[i])

### Ragged Tensor Tests

In [None]:
# this does not work since the sequences are of different length
# X_test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [None]:
X_train_tensor = tf.ragged.constant(X_train, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)
X_val_tensor = tf.ragged.constant(X_val, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)

In [None]:
X_train_dataset = tf.data.Dataset.from_tensor_slices((X_train_tensor, y_train))
X_val_dataset = tf.data.Dataset.from_tensor_slices((X_val_tensor, y_val))

In [None]:
X_train_dataset.element_spec

In [None]:
X_val_dataset.element_spec

In [None]:
# padded batches from ragged tensors are not supported (yet)
# it needs a work around creating a uniform tensor
# idea from : https://github.com/tensorflow/tensorflow/issues/39163
def reformat(data, label):
    """Pass a ``(data, label)`` element through unchanged (identity map function).

    NOTE(review): this repeats the definition from the "Rewrite to only use
    TF Dataset" section; keeping a single definition would be enough.
    """
    return (data, label)

In [None]:
X_train_dataset = X_train_dataset.map(reformat)
X_val_dataset = X_val_dataset.map(reformat)

In [None]:
bar = sorted(X_train_dataset, key=lambda x: len(x[0]))

In [None]:
bar = X_val_dataset.as_numpy_iterator()

In [None]:
# print the labels of the first elements delivered by the iterator
# the loop variables are deliberately NOT called x / y: using `y` as loop target would
# overwrite the global target array with a single label
count = 0
for features, label in bar:
    print(label)
    if count == 5:
        break
    count += 1

In [None]:
moo, boo = zip(*bar)

In [None]:
foo = tf.ragged.constant(moo, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)

In [None]:
len(X_val_dataset)

In [None]:
size = math.ceil(len(X_val_dataset) * 0.3)
size

In [None]:
foo = X_val_dataset.window(size)

In [None]:
for w in foo:
    print(w)
    print(len(w[0]))

In [None]:
# scratch experiment: iterate over the windows of the validation set and try to concatenate
# NOTE(review): the value returned by concatenate() is not stored, so this loop changes
# nothing; confirm what it was meant to build (s is one window, foo the windowed dataset)
for i, s in enumerate(foo):
    print(i, s)
    s.concatenate(foo)

In [None]:
# shuffle the dataset (again) and create padded batches
batch_size = 32
X_train_dataset = X_train_dataset.shuffle(buffer_size=len(X_train), seed=1202).padded_batch(batch_size)
X_val_dataset = X_val_dataset.shuffle(buffer_size=len(X_val), seed=1202).padded_batch(batch_size)

In [None]:
# optionally repeat the dataset multiple times -> WHY?
# rep = 3
# X_train_dataset = X_train_dataset.repeat(rep)
# X_val_dataset = X_val_dataset.repeat(rep)

In [None]:
# collect the padded sequence length of every batch
# the length is read from the batch shape (axis 1) instead of len(data[1]): indexing the
# second sample fails for a final batch that holds only a single sample
# NOTE(review): assumes batches are shaped (batch, time, channels) - confirm via element_spec
datalen = []
for data, label in X_train_dataset:
    datalen.append(int(data.shape[1]))

In [None]:
datalen[:5]

In [None]:
# testing if keras can use the dataset
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(None,4)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(X_train_dataset, epochs=1)