In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tfx==0.21.2
    print("You can safely ignore the package incompatibility errors.")
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Common imports
import numpy as np
import pandas as pd
import os
from pathlib import Path
from glob import glob

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

Num GPUs Available:  4


In [2]:
def calc_records_in_tfr_folder(tfr_dir):
    """ Count the examples stored across all tfrecords of a folder and print the total.

    Args:
        tfr_dir: folder to scan, given as a str or a path object; every entry
            directly inside it whose name matches '*.tfrec*' is read.
    """
    count = 0
    # Path(...) lets callers pass a plain string, like the single-file helper allows.
    for tfr_path in sorted(Path(tfr_dir).glob('*.tfrec*')):
        count += sum(1 for _ in tf.data.TFRecordDataset(str(tfr_path)))
    print('Number of examples in all tfrecords in the folder:', count)

def calc_examples_in_tfrecord(tfr_path):
    """ Count the examples stored in a single tfrecord file and print the total. """
    records = tf.data.TFRecordDataset(str(tfr_path))
    count = 0
    for _record in records:
        count += 1
    print('Number of examples in the tfrecord:', count)

def show_img(img, title=None):
    """ Show a single image tile.

    Args:
        img:   image passed to plt.imshow
        title: optional figure title; when left as None no title is set
    """
    plt.imshow(img)
    # Only set a title when one was given, so the None default is never handed to matplotlib.
    if title is not None:
        plt.title(title)
    plt.axis("off")
    plt.show()
    
def show_images(img_list, ncols=4):
    """ Show `ncols` randomly picked image tiles side by side in one row.

    Tiles are drawn with np.random.randint, i.e. with replacement, so the same
    tile can appear more than once.

    Args:
        img_list: sequence of dicts; each dict provides the tile under 'image'
            and the text used as its title under 'slide'
        ncols:    number of tiles (columns) to show
    Raises:
        ValueError: if img_list is empty
    """
    if len(img_list) == 0:
        raise ValueError('img_list is empty; there are no tiles to show.')

    # squeeze=False always returns a 2-D array of axes, so ncols=1 works as well.
    fig, ax = plt.subplots(nrows=1, ncols=ncols, figsize=(15, 20), squeeze=False)
    row_axes = ax[0]

    for i, img_id in enumerate(np.random.randint(0, len(img_list), ncols)):
        tile = img_list[img_id]
        row_axes[i].imshow(tile['image'])
        row_axes[i].axis("off")
        row_axes[i].set_title(tile['slide'])
        
def encode_type(df, label_name, label_value):
    """ 
    Build a lookup from label names to their numerical values.

    Args:
        df:          dataframe that contains both columns
        label_name:  name of the column holding the label names
        label_value: name of the column holding the numerical value assigned to each label
    Returns:
        dict {label_name: label_value} built from the unique (name, value) pairs,
        inserted in order of increasing label value
    """
    # Read from the `df` argument (not a notebook-level dataframe) so the function works on any frame.
    pairs = df[[label_name, label_value]].drop_duplicates().sort_values(label_value).reset_index(drop=True)
    return dict(zip(pairs[label_name], pairs[label_value]))

# PDX data

In [3]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import tensorflow as tf

In [5]:
# Load the merged table
data = pd.read_csv('../data/data_merged.csv')

# Lookups from label name to numerical label, one per label family
csite_enc = encode_type(df=data, label_name='csite', label_value='csite_label')
ctype_enc = encode_type(df=data, label_name='ctype', label_value='ctype_label')

# Number of distinct classes = number of entries in each lookup
CSITE_NUM_CLASSES = len(csite_enc)
CTYPE_NUM_CLASSES = len(ctype_enc)

In [12]:
# Split the df row-wise into n_tfrecords consecutive chunks
n_tfrecords = 5
d = int(data.shape[0]/n_tfrecords)  # rows per chunk (rounded down)

# Chunk boundaries; the closing boundary is None so the last chunk also takes the leftover rows
bounds = [k*d for k in range(n_tfrecords)] + [None]
dfs = [data.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

[df.shape for df in dfs]

[(403, 2607), (403, 2607), (403, 2607), (403, 2607), (405, 2607)]

In [13]:
# next(dfs[1].iterrows())[1]
# next(dfs[1].iterrows())[1].index
# next(dfs[1].iterrows())[1].values

# item = next(dfs[1].iterrows())[1]
# ge_vec = [value for col_name, value in zip(item.index, item.values) if col_name.startswith('ge_')]
# dd_vec = [value for col_name, value in zip(item.index, item.values) if col_name.startswith('dd_')]
# item['Sample']

In [14]:
# Create tfrecords
# ----------------
def _float_feature(values):
    """ Wrap a sequence of floats as a tf.train.Feature. """
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))

def _int64_feature(value):
    """ Wrap a single integer as a tf.train.Feature. """
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(text):
    """ Wrap a single string (encoded as utf-8) as a tf.train.Feature. """
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[bytes(text, 'utf-8')]))

# Outdir
tfr_out = Path.cwd()/'tfr_from_csv'
os.makedirs(tfr_out, exist_ok=True)

# Randomize the dfs
randomize = False
if randomize:
    import random
    random.shuffle(dfs)
    
n_tr_dfs = 4  # num of train files
n_vl_dfs = 1  # num of val files
tr_sz = 0
vl_sz = 0

# One tfrecord per df: the first n_tr_dfs go to train, the remaining ones to val.
# File numbering starts at 1.
for file_idx, df in enumerate(dfs, start=1):
    if file_idx <= n_tr_dfs:
        tfr_fname = tfr_out/f'train_{file_idx}.tfrecord'
        tr_sz += df.shape[0]
    else:
        tfr_fname = tfr_out/f'val_{file_idx}.tfrecord'
        vl_sz += df.shape[0]

    # Which columns belong to each feature vector is the same for every row,
    # so work it out once per df instead of once per row.
    is_ge_col = [col_name.startswith('ge_') for col_name in df.columns]
    is_dd_col = [col_name.startswith('dd_') for col_name in df.columns]

    # The context manager closes the file even if a row fails to serialize.
    with tf.io.TFRecordWriter(str(tfr_fname)) as writer:
        for row_label, item in df.iterrows():
            # Prefix expression and drug features
            # (ge_vec / dd_vec stay notebook-level names: later cells read their lengths)
            ge_vec = [value for value, keep in zip(item.values, is_ge_col) if keep]
            dd_vec = [value for value, keep in zip(item.values, is_dd_col) if keep]

            ex = tf.train.Example(features=tf.train.Features(
                feature={
                    'ge_vec': _float_feature(ge_vec),
                    'dd_vec': _float_feature(dd_vec),
                    'Sample': _bytes_feature(item['Sample']),
                    'ctype_label': _int64_feature(item['ctype_label']),
                    'csite_label': _int64_feature(item['csite_label']),
                    'ctype': _bytes_feature(item['ctype']),
                    'csite': _bytes_feature(item['csite'])
                }
            ))

            writer.write(ex.SerializeToString())

print(f'Train samples {tr_sz}')
print(f'Val samples   {vl_sz}')

calc_examples_in_tfrecord(str(tfr_out/'val_5.tfrecord'))

Train samples 1612
Val samples   405
Number of examples in the tfrecord: 405


In [15]:
# Parsing spec for one serialized example.
# The two vector features take their length from the last ge_vec / dd_vec built
# while writing the tfrecords; all other features are scalars.
vector_lengths = {'ge_vec': len(ge_vec), 'dd_vec': len(dd_vec)}
scalar_dtypes = {
    'Sample': tf.string,
    'ctype_label': tf.int64,
    'csite_label': tf.int64,
    'ctype': tf.string,
    'csite': tf.string,
}

fea_spec = {
    name: tf.io.FixedLenFeature(shape=(size,), dtype=tf.float32, default_value=None)
    for name, size in vector_lengths.items()
}
fea_spec.update({
    name: tf.io.FixedLenFeature(shape=[], dtype=dtype, default_value=None)
    for name, dtype in scalar_dtypes.items()
})

# Read back the first record of the first train file and compare it with the encoder
ds = tf.data.TFRecordDataset(str(tfr_out/'train_1.tfrecord'))
ex = tf.io.parse_single_example(next(iter(ds)), features=fea_spec)
print(ex['csite'].numpy().decode('UTF-8'))
print(ex['csite_label'].numpy())
print(csite_enc)

digestive/gastrointestinal
1
{'bladder/urothelial': 0, 'digestive/gastrointestinal': 1, 'endocrine and neuroendocrine': 2, 'gynecologic': 3, 'head & neck': 4, 'kidney': 5, 'lung': 6, 'musculoskeletal': 7, 'skin': 8}


### Create tf datasets

In [16]:
# Turn the csite names (the keys of csite_enc) into a tf dataset and print every element
tt = tf.data.Dataset.from_tensor_slices(tf.convert_to_tensor(list(csite_enc)))
for name_tensor in tt:
    print(name_tensor)

tf.Tensor(b'bladder/urothelial', shape=(), dtype=string)
tf.Tensor(b'digestive/gastrointestinal', shape=(), dtype=string)
tf.Tensor(b'endocrine and neuroendocrine', shape=(), dtype=string)
tf.Tensor(b'gynecologic', shape=(), dtype=string)
tf.Tensor(b'head & neck', shape=(), dtype=string)
tf.Tensor(b'kidney', shape=(), dtype=string)
tf.Tensor(b'lung', shape=(), dtype=string)
tf.Tensor(b'musculoskeletal', shape=(), dtype=string)
tf.Tensor(b'skin', shape=(), dtype=string)


In [17]:
# class Dataset():
    
#     def __init__(self, filenames):
#         self.filenames = filenames
    
#     def define_fea_spec(fea_spec):
#         self.fea_spec = fea_spec

# Lengths of the two feature vectors, counted from the column prefixes of `data`.
# These are the same prefixes used to pick the columns when the tfrecords were
# written, so the spec follows the data instead of hard-coded sizes.
ge_dim = sum(col_name.startswith('ge_') for col_name in data.columns)
dd_dim = sum(col_name.startswith('dd_') for col_name in data.columns)

# Parsing spec used by read_tfr_example; single-value features are read with shape [1].
fea_spec = {
    'ge_vec':      tf.io.FixedLenFeature(shape=(ge_dim,), dtype=tf.float32, default_value=None),
    'dd_vec':      tf.io.FixedLenFeature(shape=(dd_dim,), dtype=tf.float32, default_value=None),
    'Sample':      tf.io.FixedLenFeature(shape=[1], dtype=tf.string, default_value=None),
    'ctype_label': tf.io.FixedLenFeature(shape=[1], dtype=tf.int64, default_value=None),
    'csite_label': tf.io.FixedLenFeature(shape=[1], dtype=tf.int64, default_value=None),
    'ctype':       tf.io.FixedLenFeature(shape=[1], dtype=tf.string, default_value=None),
    'csite':       tf.io.FixedLenFeature(shape=[1], dtype=tf.string, default_value=None),
}

def read_tfr_example(ex):
    """ Read and parse example from a tfrecord.

    Args:
        ex: one serialized example, parsed with the notebook-level `fea_spec`
    Returns:
        (inputs, outputs) where
        inputs  = {'ge_vec': float32 tensor}
        outputs = {'csite_label': one-hot vector of length CSITE_NUM_CLASSES,
                   'ctype_label': one-hot vector of length CTYPE_NUM_CLASSES}
    """
    ex = tf.io.parse_single_example(ex, fea_spec)

    # Inputs (only ge_vec is returned; dd_vec is stored in the record but not used here)
    inputs = {'ge_vec': tf.cast(ex['ge_vec'], tf.float32)}

    # Outputs
    csite_label = tf.cast(ex['csite_label'], tf.int64)
    ctype_label = tf.cast(ex['ctype_label'], tf.int64)

    # One-hot
    csite_label = tf.one_hot(indices=csite_label, depth=CSITE_NUM_CLASSES, on_value=1.0, off_value=0.0)
    ctype_label = tf.one_hot(indices=ctype_label, depth=CTYPE_NUM_CLASSES, on_value=1.0, off_value=0.0)

    # Flatten each one-hot result to a plain vector of length NUM_CLASSES
    csite_label = tf.reshape(csite_label, [CSITE_NUM_CLASSES,])
    ctype_label = tf.reshape(ctype_label, [CTYPE_NUM_CLASSES,])

    # Put into dict
    outputs = {'csite_label': csite_label, 'ctype_label': ctype_label}
    return inputs, outputs


# Train and val filenames
# glob returns matches in arbitrary order; sort them so the files are always read in the same order.
train_filenames = sorted(glob(os.path.join(tfr_out, 'train*.tfrecord')))
val_filenames   = sorted(glob(os.path.join(tfr_out, 'val*.tfrecord')))

ds = tf.data.TFRecordDataset(filenames=train_filenames)
ds = ds.map(read_tfr_example)

# Take a sample: ii is the (inputs, outputs) pair of the first parsed example
ii = next(iter(ds.take(1)))
print('Inputs: ', ii[0].keys())
print('Outputs:', ii[1].keys())

# Shape of every input tensor
for k, tensor in ii[0].items():
    print(k, tensor.numpy().shape)

# Shape of every output tensor
for k, tensor in ii[1].items():
    print(k, tensor.numpy().shape)

Inputs:  dict_keys(['ge_vec'])
Outputs: dict_keys(['csite_label', 'ctype_label'])
ge_vec (976,)
csite_label (9,)
ctype_label (11,)


In [18]:
# read this:  https://www.tensorflow.org/datasets/performances
def get_tfr_dataset(filenames, batch_size=32, shuffle_buffer=500, shuffle=True, seed=None):
    """ Create tf dataset.

    The defaults reproduce the original pipeline: shuffle (buffer of 500,
    reshuffled on every iteration) -> parse with read_tfr_example -> batch of 32.

    Args:
        filenames:      tfrecord file name(s) to read
        batch_size:     number of examples per batch (the last batch may be smaller)
        shuffle_buffer: buffer size handed to Dataset.shuffle
        shuffle:        set to False to keep the records in file order
        seed:           optional seed handed to Dataset.shuffle
    Returns:
        dataset of batched (inputs, outputs) pairs as produced by read_tfr_example
    """
    ds = tf.data.TFRecordDataset(filenames=filenames)
    if shuffle:
        # Shuffle the serialized records, before they are parsed
        ds = ds.shuffle(shuffle_buffer, seed=seed, reshuffle_each_iteration=True)
    ds = ds.map(read_tfr_example)
    ds = ds.batch(batch_size)
    return ds

# Build the train and val pipelines from their file lists
ds_train, ds_val = (get_tfr_dataset(names) for names in (train_filenames, val_filenames))

# Take a sample: ii is the (inputs, outputs) pair of the first train batch
ii = next(iter(ds_train.take(1)))
print('Inputs: ', ii[0].keys())
print('Outputs:', ii[1].keys())

# Print the shape of every tensor, inputs first, then outputs
for tensors in ii:
    for k, tensor in tensors.items():
        print(k, tensor.numpy().shape)

Inputs:  dict_keys(['ge_vec'])
Outputs: dict_keys(['csite_label', 'ctype_label'])
ge_vec (32, 976)
csite_label (32, 9)
ctype_label (32, 11)


In [19]:
# def preprocess(ex):
#     ex = tf.io.parse_single_example(ex, features=fea_spec)
#     # sample = ex['Sample'][0].numpy().decode('utf-8')
#     sample = tf.cast(ex['Sample'], tf.string)
#     return sample

# # Train dataset
# ds = tf.data.TFRecordDataset(filenames=train_filenames)
# ds = ds.map(preprocess)
# ds = ds.shuffle(500, reshuffle_each_iteration=True)
# # ds = ds.batch(64)
# ii = next(ds.take(1).__iter__())
# print(ii[0].numpy().decode('utf-8'))

In [20]:
# Toy dataset
# -----------
# def power_two(x):
#     return x**2

# ds = tf.data.Dataset.range(10)
# ds = ds.map(power_two)
# ds = ds.repeat(1)
# ds = ds.shuffle(100, reshuffle_each_iteration=True)
# ds = ds.batch(2)
# for i, item in enumerate(ds):
#     tf.print(item, end=' ')

### Train Keras Type Classifier

In [21]:
import tensorflow as tf

# Input: one vector per sample, as long as the last ge_vec built while writing the tfrecords
input_ge = tf.keras.Input(shape=(len(ge_vec),), name='ge_vec')

# Hidden trunk: two identical Dense(128, relu) + Dropout(0.2) stages
fc = input_ge
for dense_name, dropout_name in (('dense_1', 'dropout_1'), ('dense_2', 'dropout_2')):
    fc = tf.keras.layers.Dense(128, activation='relu', name=dense_name)(fc)
    fc = tf.keras.layers.Dropout(0.2, name=dropout_name)(fc)

# Output heads: one softmax layer per label, both fed by the shared trunk.
# The layer names match the keys of the outputs dict returned by read_tfr_example.
ctype_out = tf.keras.layers.Dense(CTYPE_NUM_CLASSES, activation='softmax', name='ctype_label')(fc)
csite_out = tf.keras.layers.Dense(CSITE_NUM_CLASSES, activation='softmax', name='csite_label')(fc)

# Model
model = tf.keras.Model(inputs=[input_ge], outputs=[ctype_out, csite_out])
model.summary()

# ----------
# Compile
# ----------
# Same loss for both heads, keyed by output name; the same quantity is also tracked as a metric
xent_loss = tf.keras.losses.categorical_crossentropy
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss={'csite_label': xent_loss, 'ctype_label': xent_loss},
              metrics=[tf.keras.metrics.categorical_crossentropy])

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ge_vec (InputLayer)             [(None, 976)]        0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          125056      ge_vec[0][0]                     
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 128)          0           dense_1[0][0]                    
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 128)          16512       dropout_1[0][0]                  
_______________________________________________________________________________________

In [22]:
# Whether to use categorical_crossentropy or sparse_categorical_crossentropy depends on the shape of the labels
# (the labels produced by read_tfr_example are one-hot vectors).
# stackoverflow.com/questions/49161174/tensorflow-logits-and-labels-must-have-the-same-first-dimension
# ================================================================================================================
history = model.fit(ds_train, epochs=20, validation_data=ds_val, verbose=False)

In [23]:
# Evaluate on the val set and pair every reported value with its metric name
result = model.evaluate(ds_val, verbose=False)
{name: value for name, value in zip(model.metrics_names, result)}

{'loss': 0.9979113936424255,
 'ctype_label_loss': 0.5338507890701294,
 'csite_label_loss': 0.4640606641769409,
 'ctype_label_categorical_crossentropy': 0.5338507890701294,
 'csite_label_categorical_crossentropy': 0.4640606641769409}

In [24]:
# Predict on the val set and print the shape of the prediction(s)
rr = model.predict(ds_val.repeat(1))
if not isinstance(rr, list):
    # a single array of predictions
    print(rr.shape)
else:
    # a list of arrays: print the shapes of the first two
    first_out, second_out = rr[0], rr[1]
    print(first_out.shape)
    print(second_out.shape)

(405, 11)
(405, 9)


In [78]:
# Walk through ds_val once without using the items; after the loop `i` holds the
# index of the last item (number of items minus one), which is what gets printed.
for i, item in enumerate(ds_val):
    pass # print(item)
print(i)
# NOTE(review): `ii` is not assigned in this cell — the shapes printed below belong to
# whatever sample an earlier cell left in `ii`, so they may be stale. Confirm before relying on them.
print(ii[0]['ge_vec'].shape)
print(ii[1]['csite_label'].shape)
# print(ii[1]['ctype_label'].shape)

11
(32, 976)
(32, 1)


288