In [53]:
# library dependencies
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import lzma
import pickle
from sklearn.model_selection import train_test_split
import keras
from keras import layers
import tensorflow as tf
import keras_tuner
import math

In [2]:
# method to store data as serialized binary structure lzma compressed
def can_pickles(data, filename):
    with lzma.LZMAFile(filename, 'wb') as f:
        pickle.dump(data, f, pickle.DEFAULT_PROTOCOL)

# method to retrieve data from a compressed pickle file (created with the method above)
def uncan_pickles(filename):
    with lzma.LZMAFile(filename, 'rb') as f:
        return pickle.load(f)

In [128]:
# read the prepared data back
X = uncan_pickles('../data/onehot_x_lung.pickle.xz')
y = uncan_pickles('../data/onehot_y_lung.pickle.xz')

In [156]:
# build an inhomogenous numpy array from the training set
X = np.array(X, dtype=object)

In [129]:
# convert type of target values from string to float
y = np.array(y).astype(float)

In [157]:
len(X), len(y)

(8201, 8201)

### Rewrite to only use TF Dataset

In [158]:
# shuffle data before building a dataset
# helps to make sure that the input data is shuffled without getting knots in the brain due to lazy execution
rng = np.random.default_rng(1202)
rand_idx = np.arange(len(X))
rng.shuffle(rand_idx)

In [159]:
rand_idx

array([4301, 1552, 6386, ..., 3572, 5989, 6785])

In [160]:
# now sort the X and y train arrays according to the sorted indicds
X = X[rand_idx]
y = y[rand_idx]

In [161]:
X = tf.ragged.constant(X, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)

In [162]:
full_dataset = tf.data.Dataset.from_tensor_slices((X, y))

In [164]:
full_dataset.element_spec

(RaggedTensorSpec(TensorShape([None, 4]), tf.int8, 0, tf.int32),
 TensorSpec(shape=(), dtype=tf.float64, name=None))

In [165]:
# padded batches from ragged tensors are not supported (yet)
# it needs a work around creating a uniform tensor
# idea from : https://github.com/tensorflow/tensorflow/issues/39163
def reformat(data, label):
    return data, label

In [166]:
full_dataset = full_dataset.map(reformat)

In [167]:
full_dataset

<_MapDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.int8, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>

In [168]:
train_all_dataset = full_dataset.take(math.ceil(len(full_dataset) * 0.8))
test_dataset = full_dataset.skip(math.ceil(len(full_dataset) * 0.8))

In [173]:
l_full = len(full_dataset)
l_train = len(train_all_dataset)
l_test = len(test_dataset)
l_full, l_train, l_test, l_train + l_test

(8201, 6561, 1640, 8201)

In [174]:
batch_size = 32

In [None]:
# for one shot training sort before apply padding
train_dataset = train_all_dataset.take(math.ceil(len(train_all_dataset) * 0.8))
val_dataset = train_all_dataset.skip(math.ceil(len(train_all_dataset) * 0.8))

In [None]:
train_dataset = sorted(train_dataset, key=lambda x: len(x[0]))
train_dataset = train_dataset.padded_batch(batch_size)

In [175]:
def danq_model():
    inputs = layers.Input(shape=(None, 4))

    conv = layers.Conv1D(
        filters=320,
        kernel_size=26,
        padding='valid',
        activation='relu'
    )(inputs)
    pool = layers.MaxPooling1D(
        pool_size=13,
        strides=13
    )(conv)
    drop1 = layers.Dropout(
        rate=0.2
    )(pool)

    forward_layer = layers.LSTM(units=320, return_sequences=True)
    backward_layer = layers.LSTM(units=320, return_sequences=True, go_backwards=True)
    bilstm = layers.Bidirectional(
        forward_layer, backward_layer=backward_layer
    )(drop1)
    drop2 = layers.Dropout(
        rate=0.2
    )(bilstm)
    # flat = layers.Flatten()(drop2)
    gmp = layers.GlobalMaxPool1D()(drop2)
    dense1 = layers.Dense(
        units=925,
        activation='relu'
    )(gmp)
    outputs = layers.Dense(
        units=1
    )(dense1)
    
    # model
    model = keras.Model(inputs=inputs, outputs=outputs, name='DanQModel')
    
    return model

In [176]:
def run_model(model, tds, vds, epochs=100):
    model.summary()
    
    model.compile(
        loss=keras.losses.MeanSquaredError(),
        optimizer=keras.optimizers.Adam(),
        metrics=keras.metrics.MeanAbsoluteError()
    )
    
    history = model.fit(
        tds,
        epochs=epochs,
        validation_data=vds,
        verbose=0
    )
    
    val_mse, val_mae = model.evalute(test_dataset)

    return val_mae

In [177]:
# for cross validation split train
all_scores = []
num_splits = 5
splits = train_all_dataset.window(math.ceil(len(train_all_dataset) / num_splits))

print('k-fold cross validation with', num_splits)

for i in range(num_splits):
    print('  split:', i)
    val_ds = splits[i]
    train_split_id = (i + 1) % num_splits
    train_ds = splits[train_split_id]
    for l in range(num_splits):
        if l != i and l != train_split_id:
            train_ds.concatenate(splits[l])
    train_ds = sorted(train_ds, key=lambda x: len(x[0]))
    train_ds = train_ds.padded_batch(batch_size)

    score = run_model(danq_model, train_ds, val_ds)
    print('  score:', score)
    
    all_scores.append(score)

# calculate score
mean_score = np.mean(all_scores)
print('mean score:', mean_score)

# rerun training with train_all_dataset
# train_all_dataset = sorted(train_all_dataset, key=lambda x: len(x[0]))
# train_all_dataset = train_all_dataset.padded_batch(batch_size)
# run training

AttributeError: '_TakeDataset' object has no attribute 'windows'

### Data Preparation

Split data in train and test subsets and then split the train subset again in train and validation.

A simple verification if the X and y correlation are preserved on the split is done.

In [5]:
# split in train and test sub sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1202)

In [6]:
# pad test input (variable input is not accepted)
# X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding="post")

In [7]:
# split the train set again in train and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1202)

In [8]:
# find the first unique PTR value that is also in y_train
train_idx = 0
for i in range(len(y)):
    count = 0
    for l in range(len(y)):
        if i != l and y[i] == y[l]:
            count += 1
            continue
    if count == 0:
        for m in range(len(y_train)):
            if y[i] == y_train[m]:
                train_idx = m
                break
train_idx

390

In [9]:
# get a sample
X_train[train_idx]

array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0]])

In [10]:
# get the matching target
search_y = y_train[train_idx]
search_y

3.377

In [11]:
# find the target value in the raw dataset
full_idx = 0
for i in range(len(y)):
    if y[i] == search_y:
        print(i)
        full_idx = i

8194


In [12]:
# compare if the raw dataset entry matches the subset entry
if X[full_idx].all() == X_train[train_idx].all():
    print('OK')

OK


### Sort Training Data

In [13]:
# build an inhomogenous numpy array from the training set
X_train = np.array(X_train, dtype=object)

In [14]:
# build an array containing the sequence lengths
sequence_lengths = list(map(lambda x: len(x), X_train))

In [15]:
# sort the array but only get the indices
sorted_indices = np.argsort(sequence_lengths)

In [16]:
sorted_indices

array([1140,  657, 1659, ...,   71, 4128, 5096])

In [17]:
# now sort the X and y train arrays according to the sorted indicds
X_train = X_train[sorted_indices]
y_train = y_train[sorted_indices]

In [18]:
# check if the previously found values still correlate
for i in range(len(y_train)):
    if y_train[i] == search_y:
        print(X_train[i])

[[0 0 1 0]
 [1 0 0 0]
 [1 0 0 0]
 ...
 [0 0 1 0]
 [0 1 0 0]
 [0 0 1 0]]


### Ragged Tensor Tests

In [19]:
# this does not work since the sequences are of different length
# X_test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [34]:
X_train_tensor = tf.ragged.constant(X_train, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)
X_val_tensor = tf.ragged.constant(X_val, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)

In [35]:
X_train_dataset = tf.data.Dataset.from_tensor_slices((X_train_tensor, y_train))
X_val_dataset = tf.data.Dataset.from_tensor_slices((X_val_tensor, y_val))

In [36]:
X_train_dataset.element_spec

(RaggedTensorSpec(TensorShape([None, 4]), tf.int8, 0, tf.int32),
 TensorSpec(shape=(), dtype=tf.float64, name=None))

In [37]:
X_val_dataset.element_spec

(RaggedTensorSpec(TensorShape([None, 4]), tf.int8, 0, tf.int32),
 TensorSpec(shape=(), dtype=tf.float64, name=None))

In [38]:
# padded batches from ragged tensors are not supported (yet)
# it needs a work around creating a uniform tensor
# idea from : https://github.com/tensorflow/tensorflow/issues/39163
def reformat(data, label):
    return data, label

In [39]:
X_train_dataset = X_train_dataset.map(reformat)
X_val_dataset = X_val_dataset.map(reformat)

In [118]:
bar = sorted(X_train_dataset, key=lambda x: len(x[0]))

In [119]:
bar = X_val_dataset.as_numpy_iterator()

In [117]:
count = 0
for x, y in bar:
    print(y)
    if count == 5:
        break    
    count += 1

tf.Tensor(5.745, shape=(), dtype=float64)
tf.Tensor(4.487, shape=(), dtype=float64)
tf.Tensor(6.816, shape=(), dtype=float64)
tf.Tensor(3.835, shape=(), dtype=float64)
tf.Tensor(5.633, shape=(), dtype=float64)
tf.Tensor(5.207, shape=(), dtype=float64)


In [98]:
moo, boo = zip(*bar)

In [102]:
foo = tf.ragged.constant(moo, dtype=tf.int8, ragged_rank=1, row_splits_dtype=tf.int32)

In [60]:
len(X_val_dataset)

1312

In [78]:
size = math.ceil(len(X_val_dataset) * 0.3)
size

394

In [85]:
foo = X_val_dataset.window(size)

In [80]:
for w in foo:
    print(w)
    print(len(w[0]))

(<_VariantDataset element_spec=TensorSpec(shape=(None, 4), dtype=tf.int8, name=None)>, <_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.float64, name=None)>)
394
(<_VariantDataset element_spec=TensorSpec(shape=(None, 4), dtype=tf.int8, name=None)>, <_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.float64, name=None)>)
394
(<_VariantDataset element_spec=TensorSpec(shape=(None, 4), dtype=tf.int8, name=None)>, <_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.float64, name=None)>)
394
(<_VariantDataset element_spec=TensorSpec(shape=(None, 4), dtype=tf.int8, name=None)>, <_VariantDataset element_spec=TensorSpec(shape=(), dtype=tf.float64, name=None)>)
130


In [82]:
len(foo)

4

In [26]:
# shuffle the dataset (again) and create padded batches
batch_size = 32
X_train_dataset = X_train_dataset.shuffle(buffer_size=len(X_train), seed=1202).padded_batch(batch_size)
X_val_dataset = X_val_dataset.shuffle(buffer_size=len(X_val), seed=1202).padded_batch(batch_size)

In [27]:
# optinally repeat the dataset multiple times -> WHY?
# rep = 3
# X_train_dataset = X_train_dataset.repeat(rep)
# X_val_dataset = X_val_dataset.repeat(rep)

In [28]:
datalen = []
ds_iterator = iter(X_train_dataset)
for data, label in ds_iterator:
    datalen.append(len(data[1]))

In [29]:
datalen[:5]

[7958, 7615, 6848, 7088, 7937]

In [30]:
# testing if keras can use the dataset
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(None,4)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(X_train_dataset, epochs=1)



<keras.src.callbacks.History at 0x7f8f195dad10>