In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import pearsonr
import scipy.sparse as sp_sparse
import scanpy as sc
from math import log
from statistics import median
import os

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras import backend as K
import keras.losses

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Data Preparation

In [None]:
# Load the 10x Genomics count matrix from the directory holding the `.mtx` file,
# labelling the variables (genes) by gene symbol and caching the parsed result.
adata = sc.read_10x_mtx(
    path='/home/ahmadazim/data/filtered_gene_bc_matrices/hg19',
    var_names='gene_symbols',
    cache=True,
)

# De-duplicate the variable names.
adata.var_names_make_unique()

In [None]:
# Wrap the sparse matrix in a sparse-backed DataFrame and report its shape.
data = pd.DataFrame.sparse.from_spmatrix(adata.X)
print('Working on {} cells and {} genes'.format(data.shape[0], data.shape[1]))

In [None]:
# Remove every gene (column) whose total count over all cells is zero.
geneSum = data.sum(axis=0)
x = geneSum[geneSum == 0].index.tolist()
data = data.drop(columns=x)
data.shape


In [None]:
# Normalizing data (using method from Rao, et al.)
# For every cell j and gene i:
#     norm[j, i] = log( count[j, i] / total_j * median_total + 1 )
# where total_j is the summed count of cell j and median_total is the median
# of those per-cell totals.
cellSum  = data.sum(axis=1)
median_j = median(cellSum)
npData = np.asarray(data)

# Per-cell totals as a column vector so the division broadcasts across genes.
# Using the array's own shape (instead of hard-coded 2700 x 16634) keeps the
# cell valid for any dataset, and the vectorised form replaces a Python loop
# over every matrix entry.
cell_totals = np.asarray(cellSum, dtype=npData.dtype).reshape(-1, 1)
npData = np.log1p(npData / cell_totals * median_j)

In [None]:
# Wrap the normalised matrix in a DataFrame (default integer labels) and show it.
dataNorm = pd.DataFrame(data=npData)
dataNorm

# Implementing Variational Autoencoder


In [None]:
# class Sampling(layers.Layer): 
#     def call(self, inputs):
#         mean, log_var = inputs
#         return K.random_normal(tf.shape(log_var)) * K.exp(log_var / 2) + me

In [None]:
# Sampling layer used by the encoder
class Sampling(layers.Layer):
    """Draw a latent vector z from (z_mean, z_log_var).

    Computes ``z = z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon``
    is standard-normal noise with one row per sample and one column per
    latent dimension.
    """

    def call(self, inputs):
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        n_rows, n_cols = mean_shape[0], mean_shape[1]
        noise = tf.keras.backend.random_normal(shape=(n_rows, n_cols))
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

In [None]:
# Plain NumPy view of the normalised expression matrix, used as training data.
fullData = dataNorm.to_numpy()

In [None]:
# Encoder: maps an expression profile to the parameters of a diagonal Gaussian
# over the latent space, plus one sample drawn from it.
codings_size = 32

# Take the input width from the training matrix instead of hard-coding the
# gene count, so the model still matches the data if the filtering changes.
input_dim = fullData.shape[1]

inputs = layers.Input(shape=(input_dim,))
# NOTE(review): this first layer has no activation (linear), unlike the ReLU
# layers that follow — confirm this is intended.
z = layers.Dense(8000)(inputs)
z = layers.Dense(4000, activation= "relu")(z)
z = layers.Dense(1000, activation="relu")(z)
z = layers.Dense(256, activation="relu")(z)

codings_mean = layers.Dense(codings_size)(z) # μ
codings_log_var = layers.Dense(codings_size)(z) # γ
codings = Sampling()([codings_mean, codings_log_var])

# The model exposes (mean, log-variance, sampled codings), in that order.
variational_encoder = Model(
    inputs=[inputs], outputs=[codings_mean, codings_log_var, codings])
variational_encoder.summary()

In [None]:
# Decoder: mirrors the encoder, mapping a latent vector back to gene space.
decoder_inputs = layers.Input(shape=[codings_size])

x = layers.Dense(256, activation="relu")(decoder_inputs)
x = layers.Dense(1000, activation="relu")(x)
x = layers.Dense(4000, activation="relu")(x)
x = layers.Dense(8000, activation="relu")(x)
# Output width follows the training matrix rather than a hard-coded gene count.
outputs = layers.Dense(fullData.shape[1], activation= "relu")(x)

variational_decoder = Model(inputs=[decoder_inputs], outputs=[outputs])
variational_decoder.summary()

In [None]:
# _, _, codings = variational_encoder(inputs)
# reconstructions = variational_decoder(codings)
# variational_ae = Model(inputs=[inputs], outputs=[reconstructions])
# variational_ae.summary()

In [None]:
# Define the VAE as a Model with a custom train_step
class VAE(tf.keras.Model):
    """Variational autoencoder trained through a custom ``train_step``.

    Args:
        variational_encoder: model returning ``(mean, log_var, codings)`` for a
            batch of inputs.
        variational_decoder: model mapping ``codings`` back to input space.
        **kwargs: forwarded to ``tf.keras.Model``.

    The loss minimised in ``train_step`` is the sum of:
      * reconstruction loss — squared error multiplied by ``sign(input)`` (so
        entries whose input is zero contribute nothing) and averaged over all
        entries of the batch;
      * KL loss — ``-0.5 * mean(1 + log_var - mean**2 - exp(log_var))``.
    """

    def __init__(self, variational_encoder, variational_decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.variational_encoder = variational_encoder
        self.variational_decoder = variational_decoder

    def train_step(self, data):
        """Run one gradient step on ``data`` and return the loss values.

        Returns:
            dict with keys ``"loss"``, ``"reconstruction_loss"`` and
            ``"kl_loss"``.
        """
        # `fit` may deliver the batch wrapped in a tuple; only the inputs are used.
        if isinstance(data, tuple):
            data = data[0]

        with tf.GradientTape() as tape:
            # Use the posterior parameters computed for *this* batch by the
            # sub-models owned by this instance, so the KL term is connected
            # to the tape and gradients reach the encoder.
            codings_mean, codings_log_var, codings = self.variational_encoder(data)
            reconstruction = self.variational_decoder(codings)

            omega = tf.sign(data)  # 0 if 0, 1 if > 0
            reconstruction_loss = tf.reduce_mean(
                tf.multiply(tf.pow(data - reconstruction, 2), omega))

            kl_loss = 1 + codings_log_var - tf.square(codings_mean) - tf.exp(codings_log_var)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }

In [None]:
# Assemble the VAE from the encoder/decoder pair and attach the Adam optimizer.
variational_ae = VAE(
    variational_encoder=variational_encoder,
    variational_decoder=variational_decoder,
)
variational_ae.compile(optimizer='adam')

In [None]:
# Fit the VAE on the full normalised matrix (inputs only; no separate targets).
VAEresults = variational_ae.fit(x=fullData, epochs=10, batch_size=256)

This approach didn't work (possibly because the model is too large for the full gene set?). Trying a different approach below, restricted to the first 1,000 gene columns...

# Variational Autoencoder (again)
http://louistiao.me/posts/implementing-variational-autoencoders-in-keras-beyond-the-quickstart-tutorial/

In [None]:
# Settings for the second VAE attempt
original_dim = 1000   # number of leading gene columns kept as model input
latent_dim = 16
batch_size = 64
epochs = 10
epsilon_std = 1.0

# Restrict the training matrix to the first `original_dim` columns.
fullData = np.asarray(dataNorm)[:, :original_dim]

In [None]:
def nzMSE(y_true, y_pred):
    """Squared-error loss that masks out positions where ``y_true`` is zero.

    Each squared error is multiplied by ``sign(y_true)`` and the result is
    averaged over every entry, so zero-valued targets add nothing to the sum
    but are still counted in the mean's denominator.
    """
    nonzero_mask = tf.sign(y_true)  # 0 where the target is 0, 1 where it is > 0
    squared_error = tf.pow((y_pred - y_true), 2)
    return tf.reduce_mean(tf.multiply(squared_error, nonzero_mask))

In [None]:
class KLDivergenceLayer(layers.Layer):

    """ Identity transform layer that adds KL divergence
    to the final model loss.

    Takes ``[mu, log_var]`` and returns them unchanged. As a side effect it
    registers ``mean_over_batch(-0.5 * sum(1 + log_var - mu**2 - exp(log_var)))``
    (sum taken over the last axis) as a loss of the model containing the layer.
    """

    def __init__(self, *args, **kwargs):
        # Initialise the base layer first so attribute tracking is set up
        # before any attribute is assigned.
        super(KLDivergenceLayer, self).__init__(*args, **kwargs)
        self.is_placeholder = True

    def call(self, inputs):

        mu, log_var = inputs

        # TensorFlow ops are used directly so the layer does not depend on the
        # standalone `keras.backend` while subclassing a `tensorflow.keras` layer.
        kl_batch = -0.5 * tf.reduce_sum(
            1 + log_var - tf.square(mu) - tf.exp(log_var), axis=-1)

        # `add_loss` infers its dependencies; the deprecated `inputs=` argument
        # is not passed.
        self.add_loss(tf.reduce_mean(kl_batch))

        return inputs

In [None]:
# Decoder: latent_dim -> 128 -> 512 -> original_dim, ReLU on every layer
decoder = Sequential()
decoder.add(layers.Dense(128, input_dim=latent_dim, activation='relu'))
decoder.add(layers.Dense(512, activation="relu"))
decoder.add(layers.Dense(original_dim, activation="relu"))

decoder.summary()

In [None]:
# Encoder trunk: original_dim -> 512 -> 128, ReLU on both hidden layers.
x = layers.Input(shape= (original_dim,))
xh = layers.Dense(512, activation="relu")(x)
h = layers.Dense(128, activation="relu")(xh)

In [None]:
# Two linear heads on top of the encoder trunk: latent mean and log-variance.
z_mu = layers.Dense(latent_dim)(h)
z_log_var = layers.Dense(latent_dim)(h)

# KLDivergenceLayer returns its inputs unchanged and registers the KL loss.
z_mu, z_log_var = KLDivergenceLayer()([z_mu, z_log_var])
# Standard deviation from log-variance: sigma = exp(0.5 * log_var).
z_sigma = layers.Lambda(lambda t: K.exp(.5*t))(z_log_var)

In [None]:
# Reparameterization: z = z_mu + z_sigma * eps, with eps drawn from a normal
# distribution with standard deviation `epsilon_std`.
# NOTE(review): `eps` is declared as a second model Input wrapping a backend
# random tensor whose batch size is read from `x`, while only `fullData` is
# passed to `fit` — confirm this wiring works on the installed Keras/TF version.
eps = layers.Input(tensor=K.random_normal(stddev=epsilon_std,
                                          shape=(K.shape(x)[0], latent_dim)))
z_eps = layers.Multiply()([z_sigma, eps])
z = layers.Add()([z_mu, z_eps])

# Decode the sampled latent vector back to the input space.
x_pred = decoder(z)

In [None]:
# End-to-end model from (x, eps) to the reconstruction. The compiled loss is
# nzMSE; the KL term is added separately by KLDivergenceLayer via add_loss.
vae = Model(inputs=[x, eps], outputs=x_pred)
vae.compile(optimizer= 'adam', loss= nzMSE)
vae.summary()

In [None]:
# Train as an autoencoder: the input matrix is also the reconstruction target.
vae.fit(x=fullData, y=fullData, epochs=epochs, batch_size=batch_size)