In [16]:
# Build an autoencoder for dimensionality reduction
# 
# Model attributes: use dataset API to avoid feed dict
import tensorflow as tf
from tensorflow.data import Dataset as Ds
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [17]:
# Read the full table from disk and report its (rows, columns) dimensions.
csv_path = '/Users/dawnstear/desktop/chop_cellpred/data.csv'
data = pd.read_csv(csv_path)
print(data.shape)

(1078, 26596)


In [18]:
# Seed NumPy's global RNG before shuffling so the row permutation is repeatable
# (shuffle() is called without an explicit random_state).
np.random.seed(42)
data = shuffle(data)

# Set the annotation columns aside; the autoencoder is unsupervised, so its
# input is the table with those two columns removed.
celltypes = data['TYPE']    # cell type vector, kept in case it is needed later
labels = data['Labels']
data_ = data.drop(labels=['Labels', 'TYPE'], axis='columns')

cellcount, genecount = data_.shape
BUFFER = 55        # meant for .shuffle(BUFFER); unused because rows are already shuffled
BATCH_SIZE = 50

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    data_.values,
    labels.values,
    test_size=0.2,
    random_state=144)


def _repeating_batches(samples):
    """Return a dataset over `samples` that repeats indefinitely in BATCH_SIZE-row batches."""
    return Ds.from_tensor_slices(samples).repeat().batch(BATCH_SIZE)


# Feed the graph through the Dataset API instead of feed_dict (which is very slow).
train_dataset = _repeating_batches(X_train)
test_dataset = _repeating_batches(X_test)

# One structure-defined iterator serves both datasets, so the same graph can be
# pointed at training or test data by running the matching initializer op.
iterator = tf.data.Iterator.from_structure(
    train_dataset.output_types,
    train_dataset.output_shapes)

# Only the feature arrays were sliced into the datasets, so next_element is a
# single batch tensor; labels would have to be zipped in to get a (data, labels) pair.
next_element = iterator.get_next()

# Initializer ops that bind the shared iterator to one dataset or the other.
training_init_op = iterator.make_initializer(train_dataset)
testing_init_op = iterator.make_initializer(test_dataset)

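# TODO: decide whether the expression data needs normalization, regularization, or batch correction.
# Note: the decoder's output layer is a sigmoid, so reconstructions are bounded to (0, 1).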

In [24]:
# Vanilla AUTOENCODER model adapted from: 
#   Author: Aymeric Damien
#   Project: https://github.com/aymericdamien/TensorFlow-Examples/

# Training Parameters
learning_rate = 0.01      # step size passed to the optimizer
num_steps = 30000         # number of training iterations
batch_size = 256          # NOTE(review): not referenced in the visible cells; the datasets batch with BATCH_SIZE
display_step = 1000       # print the loss every this many steps
examples_to_show = 10     # NOTE(review): not referenced in the visible cells

# Network Parameters
num_input = genecount     # number of features per cell sample
num_hidden_1 = 256        # 1st layer num features
num_hidden_2 = 128        # 2nd layer num features (the latent space, i.e. the reduced dimensionality)

# Shapes of the trainable tensors, listed in the order the variables are created.
_weight_shapes = (
    ('encoder_h1', [num_input, num_hidden_1]),
    ('encoder_h2', [num_hidden_1, num_hidden_2]),
    ('decoder_h1', [num_hidden_2, num_hidden_1]),
    ('decoder_h2', [num_hidden_1, num_input]),
)
_bias_shapes = (
    ('encoder_b1', [num_hidden_1]),
    ('encoder_b2', [num_hidden_2]),
    ('decoder_b1', [num_hidden_1]),
    ('decoder_b2', [num_input]),
)

# Define weights & biases: every variable starts from tf.random_normal samples.
weights = {key: tf.Variable(tf.random_normal(shape)) for key, shape in _weight_shapes}
biases = {key: tf.Variable(tf.random_normal(shape)) for key, shape in _bias_shapes}

In [25]:
# Building the encoder
def encoder(expression_matrix):  # what to add to this fcn?
    """Encode a batch into the latent space.

    Applies two fully connected layers in sequence, each computing
    elu(X*W + b) with the 'encoder_*' entries of the module-level
    `weights` and `biases` dicts.

    Args:
        expression_matrix: float tensor whose last dimension matches the
            first dimension of weights['encoder_h1'].

    Returns:
        The activations of the second encoder layer.
    """
    activations = expression_matrix
    for w_key, b_key in (('encoder_h1', 'encoder_b1'), ('encoder_h2', 'encoder_b2')):
        pre_activation = tf.add(tf.matmul(activations, weights[w_key]), biases[b_key])
        activations = tf.nn.elu(pre_activation)
    return activations

# Building the decoder
def decoder(latent_space):
    """Decode latent activations back to input space.

    Applies two fully connected layers in sequence, each computing
    sigmoid(X*W + b) with the 'decoder_*' entries of the module-level
    `weights` and `biases` dicts.

    Args:
        latent_space: float tensor whose last dimension matches the first
            dimension of weights['decoder_h1'].

    Returns:
        The activations of the second decoder layer (the reconstruction).
    """
    activations = latent_space
    for w_key, b_key in (('decoder_h1', 'decoder_b1'), ('decoder_h2', 'decoder_b2')):
        pre_activation = tf.add(tf.matmul(activations, weights[w_key]), biases[b_key])
        activations = tf.nn.sigmoid(pre_activation)
    return activations

# Construct model: cast the iterator's batch to float32, encode it, then decode it.
_batch_f32 = tf.cast(next_element, tf.float32)
encoder_op = encoder(_batch_f32)
decoder_op = decoder(encoder_op)

In [27]:
# The reconstruction is the prediction; the target is the input batch itself.
y_pred = decoder_op
y_true = tf.cast(next_element, tf.float32)

# Mean squared reconstruction error, minimized with RMSProp.
reconstruction_error = y_true - y_pred
loss = tf.reduce_mean(tf.pow(reconstruction_error, 2))
rmsprop = tf.train.RMSPropOptimizer(learning_rate)
optimizer = rmsprop.minimize(loss)

# Op that assigns every global variable its initial value.
init = tf.global_variables_initializer()


In [None]:
# Start Training
with tf.Session() as sess:

    # Initialize all variables (weights and biases).
    sess.run(init)
    # Bind the shared iterator to the training dataset.
    sess.run(training_init_op)   # reset weights each time ? if train flag == 1

    # Training
    for i in range(1, num_steps+1):
        # Run optimization op (backprop) and cost op (to get loss value).
        # No feed_dict: the graph reads its batch from the dataset iterator, and
        # the placeholder `X` / array `batch_x` the old call fed are not defined
        # anywhere in this notebook, so passing them raised NameError.
        _, batch_loss = sess.run([optimizer, loss])
        # Display logs per step
        if i % display_step == 0 or i == 1:
            print('Step %i: Minibatch Loss: %f' % (i, batch_loss))