# Generating and processing the 1bcs consts for (sparse) Zheng data

I've done this before, but I don't completely trust the scripts that I was using, and I want to test the framework developed with the 10x data.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import scanpy.api as sc

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [3]:
import anndata

In [4]:
import sys
sys.path.append('/home/ahsvargo/xvalid')

In [5]:
from picturedrocks import Rocks
from picturedrocks.performance import FoldTester, PerformanceReport, NearestCentroidClassifier

In [6]:
from scipy.stats import rankdata
import scipy.sparse as spsp

## Load data

Replace with your own data paths

In [7]:
path= "/home/ahsvargo/turbo/scData/zheng17/filtered_matrices_mex/hg19/"

In [46]:
adata = sc.read_h5ad(path + "write/raw_data_all_nz_genes_with_clusters.h5ad")

In [8]:
adata = sc.read_h5ad(path + "write/raw_data_5000_genes_with_clusters.h5ad")

In [9]:
sc.logging.print_memory_usage()

Memory usage: current 0.30 GB, difference +0.30 GB


Folds

In [10]:
bulkfolds = np.load(path + "zheng17-5folds.npz")
bulkfolds = [bulkfolds["fold{}".format(i)] for i in range(5)]

louvfolds = np.load(path + "zheng17-5folds-lvals.npz")
louvfolds = [louvfolds["fold{}".format(i)] for i in range(5)]

Lookups and y vectors for the two clusterings

In [11]:
bulk_lookup = list(np.unique(adata.obs['bulk_labels'].values))
louv_lookup = list(np.unique(adata.obs['louv_labels'].values))

In [12]:
# Encode every cell's bulk label as its position in the sorted list of unique
# labels (the same ordering used to build `bulk_lookup`).
# np.unique(..., return_inverse=True) produces exactly the index that
# bulk_lookup.index(label) would return for each cell, but without a
# Python-level loop over all cells and without positional `Series[i]` access
# on a label-indexed Series.
bulky = np.unique(np.asarray(adata.obs['bulk_labels'].values), return_inverse=True)[1]
# Label vector for the second clustering, read from the 'y' entry of the file.
louvy = np.load(path + 'zheng17_yVec_lvals.npz')['y']

Side note for the 5000 genes: add the louv labels into the dataset and save again.  Commented out because we don't need to do this again.  The original file - without the louv labels - is `raw_data_5000_genes.h5ad`.

In [46]:
#adata.obs['louv_labels'] = pd.Series( [str(a) for a in np.load(path + 'zheng17_yVec_lvals.npz')['y']], dtype="category", index=adata.obs.index)

In [50]:
#adata.write("./write/raw_data_5000_genes_with_clusters.h5ad")

## Run 1bcs on a fold

#### Collect the information about constants - the vector of dot products (essentially the rank correlation values)

In [47]:
type(adata.X)

scipy.sparse.csr.csr_matrix

In [48]:
adata.X.shape

(68579, 20387)

In [49]:
folds = bulkfolds
yVec = bulky

In [59]:
data = Rocks(adata.X, yVec)
data.X = data.X.tocsc()
data.cs_currX = data.X

In [60]:
type(data.X)

scipy.sparse.csc.csc_matrix

In [73]:
%%time
# Index of the fold whose constants are generated in this run.
foldNum = 4

# Boolean mask over all cells: True for the cells listed in folds[foldNum].
mask = np.zeros(data.N, dtype=bool)
mask[folds[foldNum]] = True
# Build a separate Rocks object from the cells that are NOT in this fold.
foldData = Rocks(data.X[~mask], data.y[~mask], verbose=1)
# clustConsts (below) reads gene columns from cs_currX, so point it at the fold matrix.
foldData.cs_currX = foldData.X

print("Loaded data for fold {}, starting generation of constants".format(foldNum), flush=True)

def clustConsts(clust):
    """Compute one constant per gene for cluster `clust` of the current fold.

    Relies on the notebook globals `foldData` (the Rocks object for the
    current fold) and `data` (the Rocks object for the full data set; only
    its gene count `data.P` is used).

    Parameters
    ----------
    clust : int
        Zero-based cluster index, used as a key into `foldData.clusterindices`.

    Returns
    -------
    numpy.ndarray
        The values returned by `Rocks.sparse_tau_dot`, one per gene, in gene
        order.
    """
    print("Working on cluster {}".format(clust), flush=True)
    members = foldData.clusterindices[clust]
    memberSet = set(members)

    # Mean-centered ranks of the cluster vector. Only the extreme values are
    # needed: they are passed as highval/lowval (instead of +1/-1).
    # NOTE(review): clust2vec is called with clust+1, presumably because it
    # expects one-based cluster ids -- confirm against Rocks.
    centered = rankdata(foldData.clust2vec(clust+1))
    centered = centered - centered.mean()
    top = centered.max()
    bottom = centered.min()

    # Per the original author's note, sparse_tau_dot ranks the gene column
    # it is given, so the raw column is passed in.
    consts = [
        Rocks.sparse_tau_dot(
            foldData,
            foldData.cs_currX.getcol(gene),
            members,
            highval=top,
            lowval=bottom,
            dim=foldData.N,
            setindices=memberSet
        )
        for gene in range(data.P)
    ]

    return np.array(consts)



Loaded data for fold 4, starting generation of constants
CPU times: user 224 ms, sys: 96.3 ms, total: 321 ms
Wall time: 318 ms


In [74]:
%%time
foldConsts = np.array(list( map(clustConsts, range(foldData.K)) ))

Working on cluster 0
Working on cluster 1
Working on cluster 2
Working on cluster 3
Working on cluster 4
Working on cluster 5
Working on cluster 6
Working on cluster 7
Working on cluster 8
Working on cluster 9
Working on cluster 10
CPU times: user 1min 41s, sys: 3.15 s, total: 1min 44s
Wall time: 1min 44s


In [75]:
np.savez("/home/ahsvargo/publicData/zheng/zhengFilt-fold4-rankConsts.npz", consts=foldConsts)

## Find the s values from these consts

These are the scores that we will be looking at - comparing to the input parameter $s$.

### Load the consts vectors

In [78]:
consts = [np.load("/home/ahsvargo/publicData/zheng/zhengFilt-fold{}-rankConsts.npz".format(i))['consts'] for i in range(5)]
consts = np.array(consts)

In [79]:
consts.shape

(5, 11, 5000)

### Define the methods

`stsScore` works by soft-thresholding the input and then finding the s value, `sScore` just finds the s value.  The former is good in a map, the latter could be used in a loop.

In practice, I have found that using a loop (over the entries of the consts vector in an increasing manner) has some numerical instability that leads to wrong s scores when the scores are close to 1 (i.e. for the largest entries).

In [17]:
def softThreshold(inVec, param):
    """Soft-threshold `inVec`: move every entry toward zero by `param`.

    Entries whose sign differs after the shift (including entries that land
    exactly on zero) are set to 0. The input array is not modified; a new
    array is returned.
    """
    originalSigns = np.sign(inVec)
    shrunk = inVec - param * originalSigns
    # Any entry whose sign no longer matches its original sign overshot zero.
    overshot = originalSigns != np.sign(shrunk)
    shrunk[overshot] = 0
    return shrunk

In [18]:
def stsScore(consts, ind):
    """Soft-threshold `consts` at |consts[ind]| and return its s score.

    The s score is the 1-norm of the soft-thresholded vector after scaling
    it to unit 2-norm. If the thresholded vector has no positive norm, it is
    divided by 1 instead.
    """
    threshold = np.abs(consts[ind])
    stVec = softThreshold(consts, threshold)

    norm = np.linalg.norm(stVec)
    if not norm > 0:
        norm = 1
    return np.abs( stVec / norm ).sum()

def sScore(stVec):
    """Return the 1-norm of `stVec` after scaling it to unit 2-norm.

    `stVec` is expected to be an already soft-thresholded vector. If its
    2-norm is not positive, the vector is divided by 1 instead.
    """
    scale = np.linalg.norm(stVec)
    if not scale > 0:
        scale = 1
    unitVec = stVec / scale
    return np.abs(unitVec).sum()

The map version

In [19]:
def sScores(consts):
    """Return the s score for every entry of the 1-D vector `consts`.

    Entry `ind` of the result is `stsScore(consts, ind)`, i.e. the score
    obtained when `consts` is soft-thresholded at |consts[ind]|.
    """
    numEntries = consts.shape[0]
    return np.array([stsScore(consts, ind) for ind in range(numEntries)])

Could speed this up by only looking at the largest values of the sScores. In other methods, for example, we only look at the top 1000 genes for each cluster.

In [80]:
%%time
fold0 = np.array(list(map( sScores, consts[0] )))

CPU times: user 7.71 s, sys: 42.1 ms, total: 7.75 s
Wall time: 7.8 s


In [81]:
%%time
fold1 = np.array(list(map( sScores, consts[1] )))

CPU times: user 7.77 s, sys: 31.3 ms, total: 7.8 s
Wall time: 7.87 s


In [82]:
%%time
fold2 = np.array(list(map( sScores, consts[2] )))

CPU times: user 7.95 s, sys: 693 ms, total: 8.64 s
Wall time: 8.64 s


In [83]:
%%time
fold3 = np.array(list(map( sScores, consts[3] )))

CPU times: user 8.09 s, sys: 40.9 ms, total: 8.13 s
Wall time: 8.13 s


In [84]:
%%time
fold4 = np.array(list(map( sScores, consts[4] )))

CPU times: user 7.96 s, sys: 12.5 ms, total: 7.98 s
Wall time: 7.97 s


In [85]:
np.savez("zhengFilt-rankCorr-sValues.npz", svals=np.array([fold0, fold1, fold2, fold3, fold4]))

In [90]:
svals=np.array([fold0, fold1, fold2, fold3, fold4])

In [91]:
svals.shape

(5, 11, 5000)

## Find the markers for a given value of lamb ($s$)

How to do a union of lists:

In [92]:
len(list(set().union(*[np.where(clust <= 1.2)[0] for clust in svals[0]])))

21

In [50]:
def findMarkers(svals, lamb):
    """Select marker genes per fold for a given value of `lamb`.

    Parameters
    ----------
    svals : array-like, folds x clusters x genes
        s scores for every gene in every cluster of every fold.
    lamb : float
        Squared threshold; its square root is compared against the scores
        (to stay consistent with the Rocks code).

    Returns
    -------
    list of list
        For each fold, the gene positions whose score is strictly below
        sqrt(lamb) in at least one cluster (union over clusters, in no
        particular order).
    """
    # Stay consistent with the Rocks code
    threshold = np.sqrt(lamb)

    marks = []
    for foldScores in svals:
        selected = set()
        for clustScores in foldScores:
            # only want the positions in each row
            selected.update(np.where(clustScores < threshold)[0])
        marks.append(list(selected))

    return marks

## The classifiers

In [51]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
class RandomForest:
    """Adapter giving sklearn's RandomForestClassifier the train/test
    interface that `classify` expects from its `classifier` argument."""

    def __init__(self):
        # 100 trees, fitted using all available cores, with progress output.
        self.RFC = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1)
        self.traindata = None

    def train(self, data, yVec):
        """Remember the training matrix and fit the forest on (data, yVec)."""
        self.traindata = data
        self.RFC.fit(data, yVec)

    def test(self, Xtest, sparse):
        """Return the predicted label for every row of `Xtest`.

        `sparse` is accepted for interface compatibility and is not used.
        """
        predictions = self.RFC.predict(Xtest)
        return predictions
    

In [53]:
import scipy.spatial.distance

In [54]:
# want to log normalize the data before including it here.

class NearestCentroid:
    """Nearest-centroid classifier with the train/test interface used by `classify`.

    `train` stores one centroid (mean row) per distinct label; `test` assigns
    each row the label of the closest centroid (scipy `cdist` default metric).

    Attributes
    ----------
    traindata : the matrix passed to the last `train` call.
    order : sorted array of the distinct labels seen in training.
    clusterindices : dict mapping each label to the row positions carrying it.
    xkibar : (n_labels, n_features) array of centroids, rows ordered like `order`.
    """

    def __init__(self):
        self.traindata = None
        self.xkibar = None
        self.order = None
        self.clusterindices = {}

    def train(self, data, yVec, sparse=True):
        """Compute the per-label centroids of `data`.

        Parameters
        ----------
        data : 2-D numpy array or scipy sparse matrix, rows are samples.
        yVec : numpy array of labels, one per row of `data`.
        sparse : bool
            Kept for interface compatibility. Dense and sparse inputs now go
            through the same code path, so the flag does not change the result.
        """
        self.traindata = data
        self.order = np.unique(yVec)

        # Rebuild the mapping from scratch so labels from an earlier call to
        # train() cannot linger.
        self.clusterindices = {
            ind: np.nonzero(yVec == ind)[0] for ind in self.order
        }

        # Index rows by the stored positions for each label (not by the label
        # itself). ravel() flattens the (1, P) matrix that sparse .mean()
        # returns without collapsing the feature axis when P == 1.
        self.xkibar = np.array([
            np.asarray(data[self.clusterindices[ind]].mean(axis=0)).ravel()
            for ind in self.order
        ])

    def test(self, Xtest, sparse):
        """Return the label of the nearest centroid for every row of `Xtest`.

        Parameters
        ----------
        Xtest : 2-D numpy array, or scipy sparse matrix when `sparse` is True.
        sparse : bool
            When True, `Xtest` is densified with `.toarray()` first.
        """
        if sparse:
            # toarray() always yields a 2-D ndarray, even for a single row or
            # a single column, which is what cdist requires.
            Xtest = Xtest.toarray()

        dxixk = scipy.spatial.distance.cdist(Xtest, self.xkibar)
        return self.order[dxixk.argmin(axis=1)]


## Normalize before classification

In [55]:
labels = "bulk"
sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000)
sc.pp.log1p( adata )
#data = Rocks(adata.X, bulky)
#data.normalize(totalexpr=10000, log=True)
#ft = FoldTester(data)

#ft.loadfolds(path + "zheng17-5folds.npz")

#ft.makerocks(1)

## Find the number of errors for a set of markers

In [56]:
def classify(adata, yVec, folds, marks, lookup=None, classifier=RandomForest, debug=True):
    """Cross-validated classification using a per-fold set of marker genes.

    For each fold, a fresh classifier is trained on the cells outside the
    fold, restricted to that fold's marker columns, and then used to predict
    the labels of the cells inside the fold.

    Parameters
    ----------
    adata : object with a sparse `.X` matrix (cells x genes).
        If `.X` is not already CSC it is converted and stored back on
        `adata`, so later calls skip the conversion.
    yVec : numpy array of true labels, one per cell.
    folds : sequence of index arrays; `folds[i]` lists the test cells of fold i.
    marks : sequence of marker lists; `marks[i]` holds the markers for fold i.
    lookup : list or None
        When given, each marker is translated to a column position with
        `lookup.index(marker)`; otherwise markers are used as column positions.
    classifier : class
        Instantiated with no arguments once per fold; must provide
        `train(data, yVec)` and `test(Xtest, sparse)`.
    debug : bool
        Print progress information.

    Returns
    -------
    numpy.ndarray
        Float array of length `yVec.shape[0]` with the predicted label of
        every cell; cells that belong to no fold keep the initial value 0.
    """
    yhat = np.zeros(yVec.shape[0])

    # adata.X should be sparse.csc_matrix so that column (marker) selection
    # is cheap. The check does not depend on the fold, so do it once up front.
    if not spsp.isspmatrix_csc(adata.X):
        adata.X = adata.X.tocsc()

    for i, fold in enumerate(folds):
        if debug:
            print("*** Working on fold {} ***".format(i), flush = True)

        mask = np.zeros(adata.X.shape[0], dtype=bool)
        mask[fold] = True

        markers = marks[i]
        # I haven't really tested this.  I don't think that it will save much time.
        if lookup is not None:
            markers = [lookup.index(mark) for mark in markers]

        # Select the marker columns once per fold and reuse the result for
        # both the training and the test rows.
        marker_cols = adata.X[:, markers]
        if debug:
            print("Type of training data: {}".format(type(marker_cols)))

        # convert to csr to quickly get the million rows
        marker_rows = marker_cols.tocsr()
        if debug:
            print("Type of training data: {}".format(type(marker_rows)))

        # convert back to csc for the classification
        train_data = marker_rows[~mask,:].tocsc()
        if debug:
            print("Type of training data: {}".format(type(train_data)))

        if debug:
            print("Size of training data: {}".format(train_data.shape))
            print("Training on fold {}".format(i), flush=True)

        fold_classifier = classifier()
        fold_classifier.train(train_data, yVec[~mask])

        if debug:
            print("Testing on fold {}".format(i), flush=True)

        test_data = marker_rows[mask,:].tocsc()

        if debug:
            print("Size of test data: {}".format(test_data.shape))

        # The test matrix is always sparse here, hence the literal True.
        yhat[mask] =  fold_classifier.test( test_data, True)

    return yhat

### Find one data point

In [59]:
%%time
marks = findMarkers(svals, 1.2**2)
yhat = classify(adata, yVec, folds, marks, lookup=None, debug=True, classifier=RandomForest)

*** Working on fold 0 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (54863, 21)
Training on fold 0


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.7s


Testing on fold 0


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.0s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s


Size of test data: (13716, 21)
*** Working on fold 1 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (54863, 23)
Training on fold 1


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    3.3s


Testing on fold 1


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.3s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


Size of test data: (13716, 23)
*** Working on fold 2 ***


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (54863, 21)
Training on fold 2


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.8s


Testing on fold 2


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    7.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s


Size of test data: (13716, 21)
*** Working on fold 3 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (54863, 23)
Training on fold 3


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.8s


Testing on fold 3


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.8s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s


Size of test data: (13716, 23)
*** Working on fold 4 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (54864, 22)
Training on fold 4


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.9s


Testing on fold 4


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.1s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s


Size of test data: (13715, 22)
CPU times: user 5min 41s, sys: 8.05 s, total: 5min 49s
Wall time: 46.7 s


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.2s finished


In [35]:
marks = findMarkers(svals, 1.5**2)
[len(a) for a in marks]

[60, 72, 67, 66, 77]

In [36]:
np.where(yhat == yVec)[0].shape[0]/yVec.shape[0]

0.8516601291940682

In [123]:
adata.X.shape[0] - np.where(yhat == yVec)[0].shape[0]

20999

In [109]:
adata.X.shape[0] - np.where(yhat == yVec)[0].shape[0]

33838

### Calculate data for a bunch of $s$ values using Nearest Centroids

In [72]:
sValFile = np.load("zheng-rankCorr-sValues-louv.npz")
svals = sValFile['svals']

In [73]:
folds = louvfolds
yVec = louvy

In [81]:
%%time

xvals = []
lvals = []
yhats = []

for val in range(21):
    lamb = 1.0 + 0.25*val
    lvals.append(lamb)
    print("Working on s = {}".format(lamb), flush=True)

    marks = findMarkers(svals, lamb**2)
    xvals.append([len(a) for a in marks])
    
    %time yhat = classify(adata, yVec, folds, marks, lookup=None, debug=False, classifier=NearestCentroid)
    yhats.append(yhat)


Working on s = 1.0
CPU times: user 219 ms, sys: 4.96 ms, total: 224 ms
Wall time: 223 ms
Working on s = 1.25
CPU times: user 368 ms, sys: 996 µs, total: 369 ms
Wall time: 369 ms
Working on s = 1.5
CPU times: user 633 ms, sys: 2.03 ms, total: 635 ms
Wall time: 636 ms
Working on s = 1.75
CPU times: user 974 ms, sys: 1e+03 µs, total: 975 ms
Wall time: 975 ms
Working on s = 2.0
CPU times: user 1.27 s, sys: 6.05 ms, total: 1.28 s
Wall time: 1.28 s
Working on s = 2.25
CPU times: user 1.91 s, sys: 44 ms, total: 1.96 s
Wall time: 1.96 s
Working on s = 2.5
CPU times: user 3.03 s, sys: 128 ms, total: 3.16 s
Wall time: 3.16 s
Working on s = 2.75
CPU times: user 3.75 s, sys: 463 ms, total: 4.21 s
Wall time: 4.22 s
Working on s = 3.0
CPU times: user 5.14 s, sys: 420 ms, total: 5.56 s
Wall time: 5.56 s
Working on s = 3.25
CPU times: user 5.26 s, sys: 568 ms, total: 5.83 s
Wall time: 5.83 s
Working on s = 3.5
CPU times: user 5.58 s, sys: 815 ms, total: 6.39 s
Wall time: 6.39 s
Working on s = 3.75
CPU

In [82]:
np.array([np.array(a).mean() for a in xvals])

array([  12. ,   36.6,   65.6,   90. ,  117.2,  159.6,  204.8,  250. ,
        292.6,  337.4,  386.8,  444.4,  499.8,  558.6,  618.4,  685.2,
        754.2,  824.2,  900.2,  988.4, 1082.8])

In [83]:
errs = np.array(list( map(lambda yhat: adata.X.shape[0] - np.where(yhat == louvy)[0].shape[0], yhats) ))

In [84]:
errs

array([19752, 11805, 10585,  9496,  8852,  8435,  7719,  7243,  7053,
        6887,  6819,  6663,  6517,  6436,  6422,  6380,  6362,  6360,
        6315,  6368,  6290])

In [43]:
np.array(list( map(lambda yhat: adata.X.shape[0] - np.where(yhat == louvy)[0].shape[0], yhats) ))

array([19876, 11391, 10148,  8996,  8225,  7596,  7372,  7112,  7043,
        6896,  6840,  6753,  6629,  6496,  6409,  6389,  6377,  6395,
        6368,  6365,  6371])

In [113]:
adata.X.shape

(68579, 5000)

In [85]:
# Persist the sweep results for plotting. The predictions in `yhats` were
# produced and scored against the louv labels (yVec = louvy; errs is computed
# against louvy), so the stored ground truth must be louvy, not bulky.
np.savez("zheng-rankCorr-louv-plotInfo.npz", xvals=xvals, yhats=yhats, ytrue=louvy, lvals=lvals, errs=errs)

In [70]:
stuff = np.load("zhengFilt-rankCorr-louv-plotInfo.npz")['yhats']

In [24]:
np.array([np.array(a).mean() for a in stuff])

array([ 11.6,  22.6,  34. ,  50.6,  70.4,  90.8, 113.4, 137. , 174.4,
       210.4, 239.8, 276.6, 310.8, 345.6, 384.4, 416.6, 453.2, 489.4,
       531.6, 568.6, 606. ])

In [71]:
np.array(list( map(lambda yhat: adata.X.shape[0] - np.where(yhat == bulky)[0].shape[0], stuff) ))

array([37401, 33718, 32278, 31628, 31312, 30575, 30256, 29732, 28903,
       28682, 28526, 28484, 28395, 28328, 28323, 28314, 28248, 28203,
       28087, 27830, 27679])