In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

In [3]:
import scanpy.api as sc

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, color_map='viridis')  # low dpi (dots per inch) yields small inline figures
sc.logging.print_versions()

scanpy==1.3.2+25.g8ac9c03 anndata==0.6.11 numpy==1.14.6 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.0 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [3]:
import sys
sys.path.append('/home/ahsvargo/xvalid')
sys.path.append('/home/ahsvargo/.local/bin')

In [4]:
from picturedrocks import Rocks
from picturedrocks.performance import FoldTester, PerformanceReport, NearestCentroidClassifier

## Make the data files

Need a separate file for each fold in the data

In [5]:
rawdata = np.load("zeisel/zeisel-proc.npz")

In [12]:
# looking at the top 5k genes
rawdata['X'].shape

(3005, 4999)

In [6]:
test = Rocks(rawdata['X'], rawdata['y'])

In [7]:
ft = FoldTester(test)
ft.loadfolds('zeisel/zeisel14-5folds.npz')

In [12]:
ft.folds[0].shape

(601,)

In [11]:
np.all(ft.folds[0] == np.array(range(601)))

False

In [14]:
ft.folds[0][:20]

array([1833, 1632, 1444, 1008, 1163,  773, 1963,  308, 1809,  403, 2273,
       1107, 1237,  571, 2190, 2553,  354,  697, 1645,  490])

In [10]:
import anndata

In [11]:
# the full dataset
adata = anndata.AnnData(X = rawdata['X'])
adata.obs['zeisel_clusters'] = rawdata['y']
adata.write("ZeiselProc.h5ad")

In [12]:
# the folds
for i,fold in enumerate(ft.folds):
    mask = np.zeros(test.N, dtype=bool)
    mask[fold] = True
    fname = "zeisel-fold" + str(i) + ".h5ad"
    print("Writing fold {} to file {}".format(i,fname), flush=True)
    print(len(fold))
    
    currAdata = adata[~mask]
    print(currAdata.X.shape)
    currAdata.write(fname)

Writing fold 0 to file zeisel-fold0.h5ad
601
(2404, 4999)
Writing fold 1 to file zeisel-fold1.h5ad
601
(2404, 4999)
Writing fold 2 to file zeisel-fold2.h5ad
601
(2404, 4999)
Writing fold 3 to file zeisel-fold3.h5ad
601
(2404, 4999)
Writing fold 4 to file zeisel-fold4.h5ad
601
(2404, 4999)


## Start the scVI stuff and make the model

In [6]:
import scvi
from scvi.dataset import AnnDataset
from scvi.models import *
from scvi.inference import UnsupervisedTrainer

In [14]:
currFold = 1

In [14]:
data = AnnDataset("zeisel-fold" + str(currFold) + ".h5ad", save_path="zeisel/")

File zeisel/zeisel-fold4.h5ad already downloaded
Preprocessing dataset
Finished preprocessing dataset


In [15]:
np.all(np.array(adata.var.index) == data.gene_names)

True

In [16]:
data.labels.shape

(2404, 1)

In [17]:
mask = np.zeros(rawdata['X'].shape[0], dtype=bool)
mask[ft.folds[currFold]] = True
data.labels = rawdata['y'][~mask].reshape((2404,1))

In [18]:
rawdata['y'][~mask].shape

(2404,)

In [19]:
np.unique(rawdata['y'][~mask])

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [20]:
np.unique(rawdata['y'])

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [21]:
data.n_labels = 9

In [22]:
n_epochs=400
lr=1e-3
use_batches=False
use_cuda=True

In [None]:
import torch
torch.cuda.is_available()

In [None]:
torch.cuda.device_count()

In [25]:
vae = VAE(data.nb_genes, n_batch=data.n_batches * use_batches)
trainer = UnsupervisedTrainer(vae,
                              data,
                              train_size=0.75,
                              use_cuda=use_cuda,
                              frequency=5)
trainer.train(n_epochs=n_epochs, lr=lr)

training: 100%|██████████| 400/400 [01:19<00:00,  4.68it/s]


## find "p values" for their fold

In [26]:
%%time
stuff = trainer.train_set.differential_expression_stats()

CPU times: user 7.89 s, sys: 3.92 s, total: 11.8 s
Wall time: 15.4 s


In [27]:
stuff[0].shape

(180300, 4999)

In [28]:
np.unique(rawdata['y'], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([290, 390, 948, 820,  98, 175, 198,  26,  60]))

In [29]:
np.savez("zeisel-scvi-fold{}-stuff".format(currFold), stuff0=stuff[0], stuff1=stuff[1])

**Fold 0**

In [30]:
np.unique(rawdata['y'][~mask], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([230, 312, 772, 650,  83, 136, 150,  21,  50]))

In [31]:
np.unique(stuff[1], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([17800, 23000, 57100, 48900,  6500, 11200, 11000,  1900,  2900]))

**Fold 1**

In [31]:
np.unique(rawdata['y'][~mask], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([234, 316, 749, 665,  76, 142, 157,  21,  44]))

In [32]:
np.unique(stuff[1], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([18000, 23500, 55400, 49900,  6000, 11600, 11700,  1800,  2400]))

**Fold 2**

In [30]:
np.unique(rawdata['y'][~mask], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([232, 313, 766, 649,  78, 144, 156,  19,  47]))

In [31]:
np.unique(stuff[1], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([18000, 23100, 56500, 48800,  6300, 11700, 11600,  1600,  2700]))

**Fold 3**

In [30]:
np.unique(rawdata['y'][~mask], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([235, 309, 761, 656,  74, 135, 162,  20,  52]))

In [31]:
np.unique(stuff[1], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([18100, 22900, 56100, 49400,  5900, 11100, 11900,  1800,  3100]))

**Fold 4**

In [111]:
np.unique(rawdata['y'][ft.folds[4]], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([ 61,  80, 204, 160,  17,  32,  31,   3,  13]))

In [112]:
np.unique(stuff[1], return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([ 4900,  6400, 14400, 11400,  1600,  2300,  2600,   300,  1100]))

## Process the `stuff` to find the p-values and markers

This takes about 16GB but doesn't require GPU acceleration to complete fast enough

In [8]:
M_sampling = 100

M_permutation = 100000
permutation = False

In [32]:
currFold = 4

In [33]:
file = np.load("zeisel-scvi-fold{}-stuff.npz".format(currFold))
stuff = (file["stuff0"], file["stuff1"])

In [34]:
stuff[0].shape

(180300, 4999)

Markers for one cluster:

In [11]:
%%time
pvals0 = scvi.inference.posterior.get_bayes_factors(stuff[0], stuff[1], 0, M_permutation=M_permutation,
                                                   permutation=permutation)

CPU times: user 4.98 s, sys: 9.52 s, total: 14.5 s
Wall time: 14.5 s


Markers for all clusters:

In [35]:
%%time
allMarks = []
allPvals = []
for clust in np.unique(stuff[1]):
    print(clust, flush=True)
    
    pvals = scvi.inference.posterior.get_bayes_factors(stuff[0], stuff[1], clust, M_permutation=M_permutation,
                                                   permutation=permutation)
    markers = np.flipud(np.argsort(np.abs(pvals)))
    allMarks.append(markers)
    
    allPvals.append(pvals)
    
allMarks=np.array(allMarks)

0
1
2
3
4
5
6
7
8
CPU times: user 44.4 s, sys: 1min 26s, total: 2min 10s
Wall time: 2min 10s


In [36]:
np.savez("zeisel-scvi-marks-fold{}".format(currFold), allMarks)
np.savez("zeisel-scvi-pvals-fold{}".format(currFold), allPvals)

### Just look at the markers on one fold - extrapolate to full dataset

In [15]:
from picturedrocks import Rocks
from picturedrocks.performance import FoldTester, PerformanceReport, NearestCentroidClassifier

The `test` object is created at the start of this notebook

In [16]:
test.normalize(totalexpr=10000, log=True)

In [17]:
myMarks = [list(table[:2]) for table in allMarks]

NameError: name 'allMarks' is not defined

In [None]:
len(list(set().union(*myMarks)))

In [60]:
marks_list = [2,4,6,8,10,12,14,16,18,20,25,30,35,40,45,50,55,60]
xlist = []
ylist = []

for marks_per_clust in marks_list:
    
    print("markers per cluster: {}".format(marks_per_clust))
    myMarks = [list(table[:marks_per_clust]) for table in allMarks]
    myMarks = list(set().union(*myMarks))
    
    xlist.append(len(myMarks))
    
    cla = NearestCentroidClassifier()
    cla.train(Rocks(test.X[:, myMarks], test.y))
    yTest = cla.test(test.X[:, myMarks], False)
    
    ylist.append(np.sum( (rawdata['y'].flatten() != yTest)*1.0))

markers per cluster: 2
markers per cluster: 4
markers per cluster: 6
markers per cluster: 8
markers per cluster: 10
markers per cluster: 12
markers per cluster: 14
markers per cluster: 16
markers per cluster: 18
markers per cluster: 20
markers per cluster: 25
markers per cluster: 30
markers per cluster: 35
markers per cluster: 40
markers per cluster: 45
markers per cluster: 50
markers per cluster: 55
markers per cluster: 60


In [62]:
np.array(xlist)

array([ 18,  36,  54,  72,  90, 108, 126, 144, 162, 180, 225, 270, 313,
       358, 403, 448, 493, 538])

In [63]:
np.array(ylist)

array([523., 334., 264., 208., 204., 205., 196., 203., 196., 190., 183.,
       183., 188., 182., 175., 178., 185., 177.])

### Run the full cross-validation analysis

In [6]:
path="./"
pvalMarkerList = []
for fold in range(5):
    dayta = np.load(path + "zeisel-scvi-marks-fold" + str(fold) + ".npz")
    pvalMarkerList.append(dayta['arr_0'])

In [7]:
len(pvalMarkerList)

5

In [19]:
marks_list = [2,4,6,8,10,12,14,16,18,20,25,30,35,40,45,50,55,60]
xlist = []
ylist = []

for marks_per_clust in marks_list:
    
    print("markers per cluster: {}".format(marks_per_clust))
    myMarks = [list(set().union(*table[:,:marks_per_clust])) for table in pvalMarkerList]
    ft.markers = myMarks
    
    xlist.append(np.array([len(a) for a in ft.markers]).mean())
    
    ft.classify(NearestCentroidClassifier)
    ylist.append(np.sum( (rawdata['y'].flatten() != ft.yhat)*1.0))

markers per cluster: 2
markers per cluster: 4
markers per cluster: 6
markers per cluster: 8
markers per cluster: 10
markers per cluster: 12
markers per cluster: 14
markers per cluster: 16
markers per cluster: 18
markers per cluster: 20
markers per cluster: 25
markers per cluster: 30
markers per cluster: 35
markers per cluster: 40
markers per cluster: 45
markers per cluster: 50
markers per cluster: 55
markers per cluster: 60


In [20]:
np.array(xlist)

array([ 18. ,  36. ,  54. ,  72. ,  90. , 108. , 126. , 144. , 162. ,
       180. , 225. , 269.6, 314.4, 358.8, 403.6, 447.4, 491.4, 535.8])

In [21]:
np.array(ylist)

array([588., 379., 262., 246., 242., 226., 220., 205., 195., 201., 190.,
       188., 188., 193., 188., 190., 187., 184.])