# Use py27 (Python 2.7)
Usage: `runipy scVI_cmd.ipynb`

In [None]:
# Show the kernel's current working directory. The CSVs written by the final
# cell use relative paths, so this is where they will land.
%pwd

# Import the scVI model

In [None]:
%load_ext autoreload
%autoreload 2
import scVI
import tensorflow as tf
from benchmarking import *
from helper import *
import numpy as np
import time
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
%matplotlib inline

# --- Paths -----------------------------------------------------------------
# Input count matrix, read with pandas using the first column as the index.
# The "cell_row" directory name is the author's tag for the matrix layout.
in_file = '~/data/cell_row/pbmc.g949_c10k.msk90.csv.gz'  # cell_row

# Prefix for the CSV files written by the final cell of the notebook.
out_name = 'pbmc.g949_c10k.msk90'

# Directory name reserved for figures (not referenced by the cells shown here).
plot_dir = 'plots'

# Loading data

In [None]:
# PBMC count matrix ("cell_row" layout, per the tag on the input path).
# The first CSV column becomes the row index.
df = pd.read_csv(in_file, index_col=0)
print('input.shape', df.shape)

# Split the frame into its raw pieces: row index, column names, value matrix.
labels = df.index
gene_names = df.columns
expression_data = df.values

# Hold out a test split; the fixed random_state keeps the split reproducible.
# The row labels are split alongside the matrix so they stay aligned.
expression_train, expression_test, c_train, c_test = train_test_split(
    expression_data,
    labels,
    random_state=0,
)

# Getting prior for scaling parameters

In [None]:
# Log of each training row's total count (sum over axis 1), then the empirical
# mean and variance of those log totals.
# NOTE(review): a row whose counts sum to zero would yield -inf here and
# propagate into mean/var - confirm the input never contains all-zero rows.
log_library_size = np.log(expression_train.sum(axis=1))
mean = log_library_size.mean()
var = log_library_size.var()

# Loading scVI into memory

In [None]:
# --- Optimizer settings (passed to tf.train.AdamOptimizer) ------------------
learning_rate = 0.001
epsilon = 0.01

# --- Model size (passed to the model as n_latent) ---------------------------
latent_dimension = 10

# --- Training loop settings (passed to train_model) -------------------------
epochs = 250
batch_size = 128

In [None]:
# Clear the default graph first so that re-running this cell builds the model
# from scratch instead of adding to whatever was built before.
tf.reset_default_graph()

# Empirical mean/variance of the per-row log total count on the training
# split; handed to the model below as library_size_mean / library_size_var.
log_library_size = np.log(np.sum(expression_train, axis=1))
mean = np.mean(log_library_size)
var = np.var(log_library_size)

# Graph inputs: the expression matrix (any number of rows, one column per
# column of the training matrix), a scalar weight named 'kl_scalar', and a
# boolean train/inference switch.
expression = tf.placeholder(
    tf.float32, shape=(None, expression_train.shape[1]), name='x'
)
kl_scalar = tf.placeholder(tf.float32, shape=(), name='kl_scalar')
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=epsilon)
training_phase = tf.placeholder(tf.bool, shape=(), name='training_phase')

# Assemble the scVI model on top of those inputs.
model = scVI.scVIModel(
    expression=expression,
    kl_scale=kl_scalar,
    optimize_algo=optimizer,
    phase=training_phase,
    library_size_mean=mean,
    library_size_var=var,
    n_latent=latent_dimension,
)

# Open the session that every later cell uses to run the graph.
sess = tf.Session()

In [None]:
# Initialize the graph and fit the training set
# this takes less than a minute on a Tesla K80
%time
sess.run(tf.global_variables_initializer())
result = train_model(model, (expression_train, expression_test), sess, epochs, batch_size=batch_size)

In [None]:
# Visualize what train_model returned for the run above.
# NOTE(review): plot_training_info arrives via one of the star imports
# (benchmarking or helper); its exact output is not visible from this notebook.
plot_training_info(result)

# Latent

Here we sample from the latent space. You can verify that the clustering metrics are robust with respect to different samples from the variational posterior.

In [None]:
# Feed every row of the input (train and test alike) with the phase switch
# set to inference.
dic_full = {
    expression: df.values,
    training_phase: False,
}

# Evaluate the model's latent tensor and wrap it in a DataFrame that carries
# the original row index, so each latent row can be traced back to its input row.
latent = pd.DataFrame(
    data=sess.run(model.z, feed_dict=dic_full),
    index=df.index,
)
print('latent.shape', latent.shape)

# Imputation

In the original submission, we rerun scVI's inference on corrupted data and evaluate how well different algorithms can recover the original information. In this notebook, we instead show how to sample the parameters of the generative model.

In [None]:
# Feed the full input matrix in inference mode, with the KL weight set to 1.
dic_full = {expression: df.values, training_phase: False, kl_scalar: 1.}

# Fetch both tensors in ONE session run. Two separate runs would push the data
# through the network twice, and any stochastic sampling inside the model would
# differ between them, so px_rate and px_scale would not describe the same pass.
px_rate, px_scale = sess.run([model.px_rate, model.px_scale], feed_dict=dic_full)

# Re-attach the input's row index and column names so the outputs line up
# with the original matrix.
px_rate = pd.DataFrame(data=px_rate, index=df.index, columns=df.columns)
px_scale = pd.DataFrame(data=px_scale, index=df.index, columns=df.columns)

+ px_scale: normalized expression level inside scVI (rho param in the paper, used for DE)
+ px_rate: mean of the negative binomial (used for imputation)
+ px_r: log of the dispersion parameter for every gene (used for posterior analysis)
+ px_dropout: logit of the zero inflation mixture weight

#  SAVE

In [None]:
# Write the three result tables next to each other, all sharing the out_name
# prefix: <out_name>.px_rate.csv, <out_name>.latent.csv, <out_name>.px_scale.csv
print('saving csv')
px_rate.to_csv('{}.px_rate.csv'.format(out_name))
latent.to_csv('{}.latent.csv'.format(out_name))
px_scale.to_csv('{}.px_scale.csv'.format(out_name))