In [None]:
# Release/run tag for this analysis.
# NOTE(review): the data and results paths further down spell out 'r72' literally
# instead of reading this variable — keep them in sync if the tag ever changes.
r = 'r72'

In [None]:
import glob
import os

import pandas as pd

In [None]:
# List the input directory (IPython `ls` shell alias, resolved through automagic).
ls ../data/r72/

In [None]:
# Every entry under the r72 data directory, in descending lexicographic order.
n_2 = glob.glob('../data/r72/*')
n_2.sort(reverse=True)

In [None]:
import math
import collections

import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization, Activation, Layer
from tensorflow.keras import metrics, optimizers
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import Callback
import tensorflow.compat.v1.keras.backend as K
# Switch TensorFlow to graph (non-eager) mode; the rest of the notebook drives the
# model through the compat.v1 Keras backend (K.variable / K.get_value / K.set_value).
# NOTE(review): presumably required by the add_loss-based custom loss layer — confirm.
tf.compat.v1.disable_eager_execution()

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
# VAE functions
def compute_latent(x):
    """Draw a latent sample with the reparameterization trick.

    Parameters
    ----------
    x : sequence of two backend tensors
        ``(location, log_scale)``. The second element is halved and
        exponentiated before it scales the noise, i.e. it is used as a
        log-variance (despite the name ``sigma`` used historically).

    Returns
    -------
    Tensor equal to ``location + exp(log_scale / 2) * eps`` where ``eps`` is
    standard-normal noise with one row per batch element and one column per
    latent dimension (both sizes are read from the first tensor at run time).
    """
    z_mean, z_log_var = x
    mean_shape = K.shape(z_mean)
    # Noise shape is taken dynamically so any batch size works.
    noise = K.random_normal(
        shape=(mean_shape[0], mean_shape[1]), mean=0., stddev=1.0)
    return z_mean + K.exp(z_log_var / 2) * noise

class CustomVariationalLayer(Layer):
    """
    Pass-through layer whose only job is to register the VAE loss on the model.

    The layer receives ``[x, x_decoded]``, adds
    ``mean(reconstruction_loss + beta * kl_loss)`` via ``add_loss`` and returns
    ``x`` unchanged.

    The loss is built from module-level names that must exist when the layer
    is called: ``original_dim``, ``z_mean_encoded``, ``z_log_var_encoded`` and
    the backend variable ``beta``.
    """
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)

    def vae_loss(self, x_input, x_decoded):
        """Return the scalar VAE loss tensor for one batch.

        Parameters
        ----------
        x_input : tensor
            The data fed to the encoder.
        x_decoded : tensor
            The decoder's reconstruction of ``x_input``.
        """
        # Binary cross-entropy scaled by the module-level ``original_dim``.
        reconstruction_loss = original_dim * metrics.binary_crossentropy(
            x_input, x_decoded)
        # KL divergence of the encoded Gaussian from a standard normal,
        # summed over the latent axis.
        kl_loss = - 0.5 * K.sum(1 + z_log_var_encoded - K.square(z_mean_encoded) -
                                K.exp(z_log_var_encoded), axis=-1)
        # Multiply by the backend variable itself. The previous
        # ``K.get_value(beta)`` converted it to a NumPy number while the graph
        # was being built, freezing the KL weight at its initial value so that
        # later ``K.set_value(beta, ...)`` calls never reached the loss.
        return K.mean(reconstruction_loss + (beta * kl_loss))

    def call(self, inputs):
        """Register the loss for ``inputs = [x, x_decoded]`` and return ``x``."""
        x = inputs[0]
        x_decoded = inputs[1]
        loss = self.vae_loss(x, x_decoded)
        self.add_loss(loss, inputs=inputs)
        return x
    
class WarmUpCallback(Callback):
    """Raise the KL weight ``beta`` by ``kappa`` after every epoch, up to 1.

    Parameters
    ----------
    beta : backend variable
        Weight that is read with ``K.get_value`` and updated in place with
        ``K.set_value``.
    kappa : float
        Amount added to ``beta`` at the end of each epoch.
    """
    def __init__(self, beta, kappa):
        # Let the Keras base class set up its own bookkeeping attributes.
        super(WarmUpCallback, self).__init__()
        self.beta = beta
        self.kappa = kappa

    def on_epoch_end(self, epoch, logs=None):
        """Step ``beta`` towards 1; leave it alone once it has reached 1."""
        current = float(K.get_value(self.beta))
        if current < 1:
            # Clamp so the last step cannot overshoot 1 (the earlier
            # ``<= 1`` test followed by an unconditional ``+ kappa`` let beta
            # end up at ``1 + kappa``).
            K.set_value(self.beta, min(1.0, current + self.kappa))

In [None]:
# Loss plot visualization to determine degree of model fitting
def plot_loss(loss_dict, cancer, modality, systems, latent_dim, train_file,
              n_epochs=None, out_dir='../results/r72/tybalt/'):
    """Plot the per-epoch training loss and save it as a PNG.

    Parameters
    ----------
    loss_dict : dict
        Must contain the key ``'vs0'`` mapping to the sequence of loss values
        (one per epoch).
    cancer, modality, systems : str
        Labels used in the figure title and in the output file name.
    latent_dim : int
        Latent dimensionality; shown in the annotation and the file name.
    train_file : pandas.DataFrame
        Training matrix; its row and column counts are shown in the annotation.
    n_epochs : int, optional
        Epoch count written into the file name. Defaults to the number of loss
        values in ``loss_dict['vs0']`` so the function no longer depends on a
        notebook-level ``epochs`` variable.
    out_dir : str, optional
        Directory the PNG is written to; created if it does not exist.

    Returns
    -------
    None. The figure is intentionally left open so it renders inline.
    """
    losses = loss_dict['vs0']
    if n_epochs is None:
        n_epochs = len(losses)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Loss curve
    ax.plot(losses, marker='o', linestyle='-', color='b')
    ax.grid(True, linestyle='--', alpha=0.6)

    # Titles and labels
    ax.set_title(cancer+' '+modality+' '+systems+' VAE Loss',
                 fontsize=16, fontweight='bold')
    ax.set_xlabel('Epoch', fontsize=14)
    ax.set_ylabel('Loss', fontsize=14)

    # Summary box: sample count, latent size and raw feature count
    ax.annotate('Samples: {}\nLatent dim: {}\nRaw features: {}'.format(
                    len(train_file), latent_dim, len(train_file.columns)),
                xy=(0.7, 0.3), xycoords='axes fraction',
                bbox=dict(boxstyle='round, pad=0.5', fc='white', ec='black'),
                fontsize=12)

    # Make sure the target directory exists before saving
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(os.path.join(
        out_dir,
        cancer+'_'+modality+'_'+systems+'_'+str(latent_dim)+'-ltnt-dim_'+
        str(n_epochs)+'-epchs_loss.png'))
    # plt.close(fig)  # Close the figure - not for interactive devel / demo

In [None]:
# Train one Tybalt-style VAE per input file and write the latent matrix and a
# loss plot for each.
#
# NOTE: original_dim, z_mean_encoded, z_log_var_encoded and beta must stay
# module-level names: CustomVariationalLayer.vae_loss reads them as globals.
# plot_loss may likewise read the module-level `epochs`.

# Hyper-parameters (identical for every file)
epochs = 12
latent_dim = 250
batch_size = 50
learning_rate = 0.0005
kappa = 1
seed = 42

# Create the output directory up front so a long training run cannot fail at
# the final write step.
out_dir = '../results/r72/tybalt/'
os.makedirs(out_dir, exist_ok=True)

for fl_pth in n_2:
    print(fl_pth)

    # Start every model from an empty graph; otherwise the graph keeps the
    # ops of all previously built models and grows with each file.
    K.clear_session()
    # Seed after the reset because the graph-level seed belongs to the
    # (new) default graph.
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)

    # File names are parsed as <cancer>_<modality>_<systems>.<ext>; only the
    # base name is split so underscores in directory names cannot interfere.
    name_parts = os.path.basename(fl_pth).split('_')
    cancer = name_parts[0]
    modality = name_parts[1]
    systems = name_parts[2].split('.')[0]
    latent_header_prefix = modality[:4]

    train_file = pd.read_csv(fl_pth, sep='\t', index_col=0)
    sys_lbls = train_file.System
    cncr_lbls = train_file.Cancer_type
    # Drops the first two columns — assumes these are System and Cancer_type;
    # TODO confirm against the input files.
    train_file = train_file.iloc[:, 2:]

    # Normalize each feature to [0, 1]
    scaler = MinMaxScaler()
    train_file = pd.DataFrame(
        scaler.fit_transform(train_file),
        columns=train_file.columns,
        index=train_file.index)

    # Variational auto-encoder, Tybalt
    loss_dict = {}
    vs_list = ['vs0']
    validation_split = vs_list[0]

    features = train_file.columns

    original_dim = len(features)
    feature_dim = len(features)

    # Encoder: two parallel Dense -> BatchNorm -> ReLU branches
    encoder_inputs = keras.Input(shape=(feature_dim,))
    z_mean_dense_linear = layers.Dense(
        latent_dim, kernel_initializer='glorot_uniform', name="encoder_1")(encoder_inputs)
    z_mean_dense_batchnorm = layers.BatchNormalization()(z_mean_dense_linear)
    z_mean_encoded = layers.Activation('relu')(z_mean_dense_batchnorm)

    z_log_var_dense_linear = layers.Dense(
        latent_dim, kernel_initializer='glorot_uniform', name="encoder_2")(encoder_inputs)
    z_log_var_dense_batchnorm = layers.BatchNormalization()(z_log_var_dense_linear)
    z_log_var_encoded = layers.Activation('relu')(z_log_var_dense_batchnorm)

    # Sampling layer
    latent_space = layers.Lambda(
        compute_latent, output_shape=(
            latent_dim,), name="latent_space")([z_mean_encoded, z_log_var_encoded])

    # Decoder: a single sigmoid Dense layer back to feature space
    decoder_to_reconstruct = layers.Dense(
        feature_dim, kernel_initializer='glorot_uniform', activation='sigmoid')
    decoder_outputs = decoder_to_reconstruct(latent_space)

    # KL warm-up weight; updated by WarmUpCallback during training
    beta = K.variable(0)

    adam = optimizers.Adam(learning_rate=learning_rate)
    vae_layer = CustomVariationalLayer()([encoder_inputs, decoder_outputs])
    vae = Model(encoder_inputs, vae_layer)
    vae.compile(optimizer=adam, loss=None, loss_weights=[beta])

    history = vae.fit(train_file,
                      epochs=epochs,
                      batch_size=batch_size,
                      shuffle=True,
                      callbacks=[WarmUpCallback(beta, kappa)],
                      verbose=1)
    loss_dict[validation_split] = history.history['loss']

    # Latent representation = output of the mean branch of the encoder
    encoder = Model(encoder_inputs, z_mean_encoded)
    latent_object = pd.DataFrame(
        encoder.predict(train_file),
        index=train_file.index
    )
    latent_object.index.name = train_file.index.name

    # Convert latent object headers to dtype specific strings for input to transformer
    new_column_headers = [
        latent_header_prefix+'_' + str(column_header)
        for column_header in latent_object.columns
    ]
    latent_object.columns = new_column_headers
    latent_object.insert(0, 'Cancer_type', cncr_lbls)
    latent_object.insert(0, 'System', sys_lbls)

    latent_object.to_csv(out_dir+cancer+'_'+modality+'_'+
                         systems+'.'+str(latent_dim)+'-ltnt-dim_'+
                         str(epochs)+'-epchs.tsv', sep = '\t')
    plot_loss(loss_dict, cancer, modality, systems, latent_dim, train_file)
    print('VAE done, latent object and loss plot written to disk')