# Evaluating markers on the 1M mouse cell data set

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import scanpy.api as sc

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [3]:
import anndata

In [4]:
from scipy.stats import rankdata
import scipy.sparse as spsp

In [5]:
from sklearn.ensemble import RandomForestClassifier

## Load marker/pvalue/svalue information

For 1bcs:

In [6]:
svals = np.load("10x-rankCorr-sValues.npz")['svals']
svals = np.load("10x-ranks-sValues.npz")['svals']

In [7]:
stuff = np.load("rc-svals-map.npz")['svals']

In [8]:
np.sort(np.abs(stuff - svals).flatten())[::-1][:200]

array([1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
      

In [9]:
svals = stuff

For scanpy methods:

In [10]:
methods = ['wilcoxon', 't-test_overestim_var', 'logreg']
method = methods[1]
scmarks = []
pvals = []
names = []
for fold in range(5):
    foldInfo = np.load("./scanpy/fold{}/1M-fold{}-{}.npz".format(fold,fold,method))
    pvals.append(foldInfo['pvals'])
    scmarks.append(foldInfo['marks'])
    names.append(foldInfo['names'])
    
pvals = np.array(pvals)
scmarks = np.array(scmarks)
pvals.shape

(5, 39, 24015)

In [11]:
scmarks.shape

(5, 39, 24015)

compare the markers and look at some s values

In [38]:
scmarks[0,1,:10]

array([19404,  4404, 19159, 19948,  5997, 21572,  2593,  4075, 16006,
        6173])

In [39]:
np.argsort(svals[0,1])[:10]

array([21572, 13442, 17950, 18146,  5997, 19404, 19354,  2593, 23012,
       17333])

In [40]:
svals[0,1,13442]

1.0000000000001752

In [43]:
np.where(svals[0,0] <= 1.2)

(array([ 331, 4814]),)

In [10]:
np.sqrt(24000)

154.91933384829667

## Load the anndata object
For our classifiers, we want to use the normalized data

In [12]:
#adata = sc.read_h5ad("1M-nzGenes-clusts.h5ad")  # not normalized
adata = sc.read_h5ad("1M-10knorm-clusts.h5ad") # log transformed after setting rows to 10000
folds = np.load("10x-5folds.npz")
folds = [folds["fold{}".format(i)] for i in range(5)]

keep the index of the genes so that we know which one we are looking for if we do any subsetting

In [13]:
adata.var.loc[:,'ind'] = pd.Series(np.array(range(adata.var.index.shape[0])), index=adata.var.index)

In [14]:
adata

AnnData object with n_obs × n_vars = 1306127 × 24015 
    obs: 'louvain', 'graphclust', 'n_counts'
    var: 'gene_ids', 'n_counts', 'ind'

The y vector

In [15]:
yVec = np.array([int(val) for val in adata.obs['louvain']])

Make sure that the adata object is in the correct format - we really want csc to easily subset markers before folds (since we will only want ~1000 markers but ~1000000 cells).

In [18]:
type(adata.X)

scipy.sparse.csc.csc_matrix

In [17]:
adata.X = adata.X.tocsc()

## Scanpy: Find a number of markers per cluster

In [19]:
def scMarkers(scmarks, marksPerClust):
    
    marks = []
    for fold in scmarks:
        foldMarks = list(set().union(*[clust[:marksPerClust] for clust in fold]))
        marks.append(foldMarks)
    
    return marks

In [20]:
[len(a) for a in scMarkers(scmarks, 150)]

[2005, 2007, 2003, 2000, 2001]

In [23]:
[len(a) for a in findMarkers(svals, 4.0)]

[608, 686, 642, 712, 698]

## 1BCS: Find the markers for a given value of lamb

Also: make a way to get the columns of the data without copying the entire data set

How to do a union of lists:

In [61]:
len(list(set().union(*[np.where(clust <= 1.2)[0] for clust in svals[0]])))

95

In [21]:
# svals should be a folds x clusters x genes array
def findMarkers(svals, lamb):
    # Stay consistent with the Rocks code
    lamb = np.sqrt(lamb)
    
    # only want the positions in each row
    marks = []
    for fold in svals:
        foldMarks = list(set().union(*[np.where(clust < lamb)[0] for clust in fold]))
        marks.append(foldMarks)
    
    return marks

Use this to avoid getting all of the data for a given fold.  Not sure how much extra overhead time this will take.

In [22]:
def lambAdata(adata, svals, lamb, copy=False, debug=0):
    
    marks = findMarkers(svals, lamb)
    
    # all of the markers for all of the clusters in all of the folds
    totalMarks = list(set().union( *marks ))
    if (debug > 0): print("Total markers for all folds: {}".format(len(totalMarks)))
    
    return adata[:, totalMarks] if not copy else adata[:,totalMarks].copy(), totalMarks

A test: Almost certainly faster than making fold objects (nearly full copies of the data)

In [20]:
%%time
subset, lookup = lambAdata(adata, svals, 1.2**2, debug=1, copy=True)
subset.X = subset.X.tocsc()

Total markers for all folds: 98
CPU times: user 7.06 s, sys: 719 ms, total: 7.78 s
Wall time: 8.03 s


In [19]:
 print(subset.X.data.nbytes + subset.X.indptr.nbytes + subset.X.indices.nbytes)

193248072


In [21]:
adata.X

<1306127x24015 sparse matrix of type '<class 'numpy.float32'>'
	with 2624828308 stored elements in Compressed Sparse Column format>

I have no idea what data type this is and the internet doesn't either.  You need to convert it to a `scipy` sparse structure of some sort.

In [70]:
type(subset.X)

anndata.base.SparseCSRView

### Run this in order to select the markers that we use below

In [18]:
marks = findMarkers(svals, 2.5**2)

In [30]:
[len(a) for a in marks]

[1553, 1642, 1533, 1612, 1561]

In [158]:
yum = []
for clust in svals[0]:
    yum.append(np.sort(clust)[:2])

yum = np.array(yum)
yum.shape

(39, 2)

In [165]:
np.max(svals[0,1])

91.59735795977517

In [159]:
np.max(yum)

1.0

In [160]:
yum

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [161]:
np.unique(np.where(svals[0] < 1)[1], return_counts=True)

(array([  628,  2063,  2268,  4271,  4570,  4814,  5097,  6144,  9345,
        10116, 10533, 11174, 11522, 11695, 12928, 13564, 13792, 14133,
        14450, 14555, 14633, 15077, 15314, 15648, 15832, 15875, 16006,
        16436, 16479, 17467, 19263, 20297, 20938, 21511, 21572, 23092,
        23384]),
 array([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))

In [100]:
[len(a) for a in marks]

[54, 51, 56, 55, 56]

In [86]:
39*2

78

In [70]:
39-16 + 39

62

In [20]:
yum = [lookup.index(mark) for mark in marks[0]]
[thing for thing in list(range(95)) if thing not in yum]

[52, 63, 78]

In [19]:
len(marks[0])

95

In [169]:
%%time
testrange = np.array(range(1,51))/10

testmarks = []
for s in testrange:
    marks = findMarkers(svals, s**2)
    testmarks.append([len(a) for a in marks])

CPU times: user 499 ms, sys: 832 µs, total: 499 ms
Wall time: 497 ms


In [163]:
testrange

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. ])

In [170]:
for i, vals in enumerate(testmarks):
    print("{}: {}".format((i+1)/10, vals))

0.1: [37, 37, 37, 37, 37]
0.2: [37, 37, 37, 37, 37]
0.3: [37, 37, 37, 37, 37]
0.4: [37, 37, 37, 37, 37]
0.5: [37, 37, 37, 37, 37]
0.6: [37, 37, 37, 37, 37]
0.7: [37, 37, 37, 37, 37]
0.8: [37, 37, 37, 37, 37]
0.9: [37, 37, 37, 37, 37]
1.0: [37, 37, 37, 37, 37]
1.1: [79, 79, 79, 79, 82]
1.2: [95, 90, 91, 91, 94]
1.3: [117, 115, 111, 122, 118]
1.4: [141, 153, 149, 173, 160]
1.5: [183, 208, 194, 248, 221]
1.6: [234, 287, 251, 307, 294]
1.7: [310, 365, 328, 388, 366]
1.8: [398, 461, 422, 483, 461]
1.9: [494, 568, 518, 594, 566]
2.0: [608, 686, 642, 712, 698]
2.1: [732, 806, 763, 813, 805]
2.2: [885, 927, 899, 941, 902]
2.3: [1017, 1050, 1024, 1066, 1019]
2.4: [1156, 1187, 1132, 1213, 1144]
2.5: [1286, 1334, 1249, 1344, 1285]
2.6: [1416, 1493, 1394, 1469, 1423]
2.7: [1553, 1642, 1533, 1612, 1561]
2.8: [1698, 1787, 1674, 1761, 1718]
2.9: [1833, 1920, 1808, 1901, 1871]
3.0: [1976, 2054, 1965, 2017, 2002]
3.1: [2114, 2201, 2134, 2168, 2131]
3.2: [2277, 2359, 2288, 2330, 2293]
3.3: [2403, 2486, 

In [28]:
testSet = []
for val in range(1,11):
    lamb = 2 + val/10
    testMarks = findMarkers(svals, lamb**2)
    aveMarks = np.array([len(a) for a in testMarks]).mean()
    testSet.append(aveMarks)
    print("{}: {}".format(lamb, aveMarks))
    
np.array(testSet)

2.1: 783.8
2.2: 910.8
2.3: 1035.2
2.4: 1166.4
2.5: 1299.6
2.6: 1439.0
2.7: 1580.2
2.8: 1727.6
2.9: 1866.6
3.0: 2002.8


array([ 783.8,  910.8, 1035.2, 1166.4, 1299.6, 1439. , 1580.2, 1727.6,
       1866.6, 2002.8])

In [123]:
consts = [np.load("1M-fold{}-consts.npz".format(i))['consts'] for i in range(5)]
consts = np.array(consts)

In [109]:
# soft threshold inVec by decreasing all entries by param
def softThreshold(inVec, param):
    
    signs = np.sign(inVec)
    inVec = inVec - param * signs
    inVec[ np.invert(signs == np.sign(inVec)) ] = 0
    return inVec

In [110]:
# input the soft-thresholded vector to get the 1-norm of the unit version of the input
def sScore(stVec):
    norm = np.linalg.norm(stVec)
    norm = norm if norm > 0 else 1
    return np.abs( stVec / norm ).sum()

In [124]:
sScore(softThreshold(consts[0,1], np.max(consts[0,1])))

0.0

In [125]:
args = np.argsort(np.abs(consts[0,1]))

In [126]:
stVec = softThreshold(consts[0,1], np.abs(consts[0,1][args[-2]]))

In [127]:
stuff = softThreshold(stVec, np.abs(consts[0,1][args[-1]]) - np.abs(consts[0,1][args[-2]]))
np.nonzero(stuff)

(array([], dtype=int64),)

In [128]:
sScore(stuff)

0.0

In [None]:
total = 0
for arg in args:
    stVec = softThreshold(stVec, np.abs(consts[arg]) - prevParam)
    prevParam = np.abs(consts[arg])
    outputs[arg] = sScore(stVec)

In [120]:
np.nonzero(stVec)

(array([], dtype=int64),)

In [66]:
np.sort(consts[0,1])[::-1][:10]

array([195260.1495208 , 188196.89122723, 167751.41339037, 152395.09642644,
       152066.70798079, 145876.5638342 , 144644.96181145, 143007.57115349,
       141227.84460192, 129449.84156123])

In [167]:
for i in range(1,len(testmarks)):
    print(testmarks[i][0] - testmarks[i-1][0])

0
0
0
0
0
0
0
0
36
6
16
22
24
42
51
76
88
96
114
124
153
132
139
130
130
137
145
135
143
138
163
126
163
169
168
145
136
153
188
137
150
165
161
151
154
147
143
186
137


In [51]:
39*2

78

### IF you need to use the index when doing a subset
Here are some explorations about how to slice the adata object correctly.  I never got any of them to work.

In [90]:
adata[:,list(adata.var.loc[adata.var['ind'].isin(range(10))].index.values)][0]

KeyboardInterrupt: 

In [87]:
list(adata.var.loc[adata.var['ind'].isin(range(10))].index.values)

['Xkr4',
 'Gm1992',
 'Rp1',
 'Sox17',
 'Gm37323',
 'Mrpl15',
 'Lypla1',
 'Gm37988',
 'Tcea1',
 'Rgs20']

In [None]:
adata[:,['Xkr4',
 'Gm1992',
 'Rp1',
 'Sox17',
 'Gm37323',
 'Mrpl15',
 'Lypla1',
 'Gm37988',
 'Tcea1',
 'Rgs20']][0].X

## The classifiers

In [23]:
class RandomForest:
    def __init__(self):
        self.traindata = None
        self.RFC = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1)
        
    def train(self, data, yVec):
        self.traindata = data
        self.RFC.fit(data, yVec)
        
    def test(self, Xtest, sparse):
        return self.RFC.predict(Xtest)
    

In [24]:
import scipy.spatial.distance

In [25]:
# want to log normalize the data before including it here.

class NearestCentroid:
    def __init__(self):
        self.traindata = None
        self.xkibar = None
        self.clusterindices ={}
    
    def train(self, data, yVec, sparse=True):
        self.traindata = data
        
        self.order = np.unique(yVec)
        
        for ind in self.order:
            self.clusterindices[ind] = np.nonzero(yVec == ind)[0]
        
        if sparse:
            self.xkibar = np.array([ 
                np.squeeze(
                    np.asarray( data[self.clusterindices[ind]].mean(axis=0) )
                ) for ind in self.order
                ])
        else:
            self.xkibar = np.array([data[indices].mean(axis=0) for
                indices in self.clusterindices])
            
    def test(self, Xtest, sparse):
        if sparse:
            dxixk = scipy.spatial.distance.cdist(np.squeeze(np.asarray(Xtest.todense())), self.xkibar)
        else:
            dxixk = scipy.spatial.distance.cdist(Xtest, self.xkibar)

        return self.order[dxixk.argmin(axis=1)]


## Find the number of errors for a set of markers

In [1]:
def classify(adata, yVec, folds, marks, lookup=None, classifier=RandomForest, debug=True):
    
    yhat = np.zeros(yVec.shape[0])
    
    for i, fold in enumerate(folds):
        if debug:
            print("*** Working on fold {} ***".format(i), flush = True)
            
        mask = np.zeros(adata.X.shape[0], dtype=bool)
        mask[fold] = True
        
        markers = marks[i]
        # I haven't really tested this.  I don't think that it will save much time.
        if lookup is not None:
            markers = [lookup.index(mark) for mark in markers]
        
        # adata.X should be sparse.csc_matrix
        if not spsp.isspmatrix_csc(adata.X):
            adata.X.tocsc()
        
        
        train_data = adata.X[:, markers]
        if debug: 
            print("Type of training data: {}".format(type(train_data)))
        
        # convert to csr to quickly get the million rows
        train_data = train_data.tocsr()
        if debug: 
            print("Type of training data: {}".format(type(train_data)))
            
        train_data = train_data[~mask,:]
        
        # convert back to csc for the classification
        train_data = train_data.tocsc()
        if debug: 
            print("Type of training data: {}".format(type(train_data)))

        # could maybe test to see if this will help speed things up...
        #train_data = train_data.X.todense()
        
        if debug:
            print("Size of training data: {}".format(train_data.shape))
            print("Training on fold {}".format(i), flush=True)
            
        fold_classifier = classifier()
        fold_classifier.train(train_data, yVec[~mask])
        
        if debug:
            print("Testing on fold {}".format(i), flush=True)
        
        test_data = adata.X[:, markers]
        test_data = test_data.tocsr()
        test_data = test_data[mask,:]
        test_data = test_data.tocsc()

        if debug:
            print("Size of test data: {}".format(test_data.shape))

        
        yhat[mask] =  fold_classifier.test( test_data, True)
        
    return yhat

NameError: name 'RandomForest' is not defined

### Find one data point

In [99]:
%%time
marks = findMarkers(svals, 2.9**2)
marks = scMarkers(scmarks, 150)
yhat = classify(adata, yVec, folds, marks, lookup=None, debug=True, classifier=NearestCentroid)

*** Working on fold 0 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044901, 2005)
Training on fold 0
Testing on fold 0
Size of test data: (261226, 2005)
*** Working on fold 1 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044901, 2007)
Training on fold 1
Testing on fold 1
Size of test data: (261226, 2007)
*** Working on fold 2 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044902, 2003)
Training on fold 2
Testing on fold 2
Size of test data: (261225, 2003)
*** Working on fold 3 ***
Type of training 

In [100]:
method

't-test_overestim_var'

In [101]:
np.savez("NC-{}-150.npz".format(method), yhat=yhat)

### Calculate data for a bunch of $s$ values using Nearest Centroids

In [23]:
%%time

xvals = []
lvals = []
yhats = []

for val in range(21):
    lamb = 1.0 + 0.05*val
    lvals.append(lamb)
    print("Working on s = {}".format(lamb), flush=True)

    marks = findMarkers(svals, lamb**2)
    xvals.append([len(a) for a in marks])
    
    %time yhat = classify(adata, yVec, folds, marks, lookup=None, debug=False, classifier=NearestCentroid)
    yhats.append(yhat)


Working on s = 1.0
CPU times: user 1min 34s, sys: 13.7 s, total: 1min 48s
Wall time: 1min 56s
Working on s = 1.05
CPU times: user 1min 52s, sys: 19.1 s, total: 2min 11s
Wall time: 2min 21s
Working on s = 1.1
CPU times: user 1min 53s, sys: 19.5 s, total: 2min 13s
Wall time: 2min 22s
Working on s = 1.15
CPU times: user 1min 58s, sys: 20.5 s, total: 2min 18s
Wall time: 2min 28s
Working on s = 1.2
CPU times: user 2min, sys: 21.1 s, total: 2min 22s
Wall time: 2min 31s
Working on s = 1.25
CPU times: user 2min 8s, sys: 23.1 s, total: 2min 31s
Wall time: 2min 41s
Working on s = 1.3
CPU times: user 2min 12s, sys: 25.7 s, total: 2min 38s
Wall time: 2min 49s
Working on s = 1.35
CPU times: user 2min 21s, sys: 27.6 s, total: 2min 48s
Wall time: 3min
Working on s = 1.4
CPU times: user 2min 31s, sys: 30 s, total: 3min 1s
Wall time: 3min 13s
Working on s = 1.45
CPU times: user 2min 49s, sys: 33.8 s, total: 3min 23s
Wall time: 3min 37s
Working on s = 1.5
CPU times: user 3min 19s, sys: 42.9 s, total: 4m

In [64]:
method

'logreg'

In [66]:
%%time

xvals = []
lvals = []
yhats = []

for val in range(24):
    marksPerClust = 1 + 4*val
    lvals.append(marksPerClust)
    print("Working on {} markers per cluster".format(marksPerClust), flush=True)

    marks = scMarkers(scmarks, marksPerClust)
    xvals.append([len(a) for a in marks])
    
    %time yhat = classify(adata, yVec, folds, marks, lookup=None, debug=False, classifier=NearestCentroid)
    yhats.append(yhat)


Working on 1 markers per cluster
CPU times: user 1min 52s, sys: 27.7 s, total: 2min 20s
Wall time: 2min 20s
Working on 5 markers per cluster
CPU times: user 3min 59s, sys: 48 s, total: 4min 47s
Wall time: 4min 47s
Working on 9 markers per cluster
CPU times: user 5min 12s, sys: 1min 6s, total: 6min 19s
Wall time: 6min 19s
Working on 13 markers per cluster
CPU times: user 6min, sys: 1min 22s, total: 7min 23s
Wall time: 7min 23s
Working on 17 markers per cluster
CPU times: user 6min 53s, sys: 1min 40s, total: 8min 34s
Wall time: 8min 34s
Working on 21 markers per cluster
CPU times: user 7min 31s, sys: 1min 54s, total: 9min 25s
Wall time: 9min 25s
Working on 25 markers per cluster
CPU times: user 8min 15s, sys: 2min 7s, total: 10min 22s
Wall time: 10min 22s
Working on 29 markers per cluster
CPU times: user 9min 5s, sys: 2min 22s, total: 11min 27s
Wall time: 11min 27s
Working on 33 markers per cluster
CPU times: user 10min 4s, sys: 2min 35s, total: 12min 40s
Wall time: 12min 40s
Working on 

In [67]:
np.savez("NC-{}-small.npz".format(method), lvals=lvals, xvals=xvals, yhats=yhats)

In [27]:
%%time
marks = findMarkers(svals, 1.1**2)
yhat = classify(adata, yVec, folds, marks, lookup=None, debug=True, classifier=RandomForest)

*** Working on fold 0 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044901, 79)
Training on fold 0


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.8min


Testing on fold 0


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 30.3min finished


Size of test data: (261226, 79)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.5s


*** Working on fold 1 ***


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    6.6s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044901, 79)
Training on fold 1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 12.0min


Testing on fold 1


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 31.1min finished


Size of test data: (261226, 79)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.3s


*** Working on fold 2 ***


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    6.0s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044902, 79)
Training on fold 2


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.7min


Testing on fold 2


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 30.1min finished


Size of test data: (261225, 79)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.4s


*** Working on fold 3 ***


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    6.0s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044902, 79)
Training on fold 3


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.9min


Testing on fold 3


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 31.2min finished


Size of test data: (261225, 79)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.4s


*** Working on fold 4 ***


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    6.3s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044902, 82)
Training on fold 4


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 12.4min


Testing on fold 4


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 32.0min finished


Size of test data: (261225, 82)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    6.4s finished


CPU times: user 19h 37min 27s, sys: 11min 20s, total: 19h 48min 48s
Wall time: 2h 38min


In [27]:
%%time
marks = findMarkers(svals, 1.1**2)
yhat = classify(adata, yVec, folds, marks, lookup=None, debug=True, classifier=RandomForest)

*** Working on fold 0 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044901, 79)
Training on fold 0


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 23.7min


Testing on fold 0


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 54.6min finished


Size of test data: (261226, 79)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.4s


*** Working on fold 1 ***


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    9.9s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044901, 79)
Training on fold 1


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 24.5min


Testing on fold 1


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 56.2min finished


Size of test data: (261226, 79)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.3s


*** Working on fold 2 ***


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   10.0s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044902, 79)
Training on fold 2


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 23.1min


Testing on fold 2


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 54.3min finished


Size of test data: (261225, 79)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.4s


*** Working on fold 3 ***


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    9.9s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044902, 79)
Training on fold 3


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 24.2min


Testing on fold 3


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 57.1min finished


Size of test data: (261225, 79)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.3s


*** Working on fold 4 ***


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    9.8s finished


Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044902, 82)
Training on fold 4


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 24.8min


Testing on fold 4


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 58.2min finished


Size of test data: (261225, 82)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    9.7s finished


CPU times: user 18h 10min 14s, sys: 13min 28s, total: 18h 23min 42s
Wall time: 4h 43min 55s


In [30]:
np.savez("RF-1bcs-1.1.npz", yhat=yhat)

In [97]:
np.where(yhat == yVec)[0].shape[0]/yVec.shape[0]

0.757815281362379

In [85]:
np.where(yhat == yVec)[0].shape[0]/yVec.shape[0]

0.7538355764791632

In [30]:
np.savez("1M-yVec.npz", yVec = yVec)

A better way to do the markers function:

In [28]:
for mark in marks[0]:
    if mark not in set(np.where(svals[0] <= 1.2)[1]):
        print("Marker {} is shitty".format(mark))

In [29]:
len(list(set(np.where(svals[0] <= 1.2)[1])))

95

In [67]:
len(marks[0])

95

### Testing the classifier step by step

Load the correct training data

In [107]:
%%time

debug = True
i = 4
fold = folds[i]

if debug:
    print("*** Working on fold {} ***".format(i), flush = True)

mask = np.zeros(subset.X.shape[0], dtype=bool)
mask[fold] = True

markers = marks[i]
#if lookup is not None:
#    markers = [lookup.index(mark) for mark in markers]

# adata.X should be sparse.csc_matrix
print("Type of training data: {}".format(type(adata.X)))
#adata.X = adata.X.tocsc()
train_data = adata.X[:, markers]
print("Type of training data: {}".format(type(train_data)))
train_data = train_data.tocsr()
print("Type of training data: {}".format(type(train_data)))
train_data = train_data[~mask,:]
train_data = train_data.tocsc()
print("Type of training data: {}".format(type(train_data)))


# test to see if this will help speed things up...
#train_data = train_data.X.todense()

if debug:
    print("Size of training data: {}".format(train_data.shape))

fold_classifier1 = RandomForest()
#fold_classifier.train(train_data, yVec[~mask])

*** Working on fold 4 ***
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>
Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Size of training data: (1044902, 94)
CPU times: user 12.2 s, sys: 9.23 s, total: 21.5 s
Wall time: 21.4 s


Train the method

In [108]:
%%time
print("Training on fold {}".format(i), flush=True)
fold_classifier1.train(train_data, yVec[~mask])

Training on fold 4


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 16.2min


CPU times: user 5h 8min 47s, sys: 3min 8s, total: 5h 11min 56s
Wall time: 40min 45s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 40.7min finished


Load the test data and test

In [109]:
%%time

if debug:
    print("Testing on fold {}".format(i), flush=True)

test_data = adata.X[:, markers]
test_data = test_data.tocsr()
test_data = test_data[mask,:]
test_data = test_data.tocsc()

if debug:
    print("Size of test data: {}".format(test_data.shape))

yhat4 = fold_classifier1.test( test_data, True)

Testing on fold 4
Size of test data: (261225, 94)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.1s


CPU times: user 47 s, sys: 20.1 s, total: 1min 7s
Wall time: 25.8 s


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    8.0s finished


Evaluate how well we have done - the error rate

In [70]:
np.where(yhat0 == yVec[np.sort(folds[0])])[0].shape

(175096,)

In [75]:
np.where(yhat0 == yVec[mask])[0].shape

(202274,)

In [27]:
np.where(yhat == yVec)[0].shape[0]/yVec.shape[0]

0.830274544512134

In [76]:
202274/261226.

0.7743256796796644

In [111]:
yhat = np.zeros(yVec.shape[0])

In [117]:
yhat[np.sort(folds[4])] = yhat4

In [32]:
np.savez("NC-1bcs-2.1.npz", yhat=yhat)

In [100]:
fold_classifier.order

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38])

In [77]:
np.unique(yVec[folds[0]], return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38]),
 array([23600, 23167, 17939, 16581, 16320, 12797, 12668, 12089, 11059,
        10096,  9758,  9449,  9310,  8476,  8328,  7201,  7261,  5933,
         5320,  4933,  4538,  3664,  3566,  3486,  3208,  2201,  2149,
         1757,  1755,  1244,   306,   237,   167,   147,   134,   102,
          110,    98,    72]))

### Getting strange results - check that the nearest centroid classifier (on fold 0) is working properly

In [99]:
fold_classifier.xkibar[0]

array([1.3881541e-02, 0.0000000e+00, 1.6251558e-02, 4.4226246e-03,
       1.0538121e-02, 4.3062400e-03, 3.2986436e+00, 3.7243159e-03,
       1.1215268e-03, 7.8825784e+00, 2.4631267e-02, 2.1160886e-05,
       2.2705628e-02, 3.5973504e-02, 0.0000000e+00, 3.0990115e-01,
       0.0000000e+00, 0.0000000e+00, 2.1160886e-05, 9.3445408e-01,
       9.3033835e-02, 1.3079883e+01, 5.7144970e-02, 5.3192121e-01,
       9.9233963e-02, 4.5707505e-03, 5.0151288e-03, 3.0749603e+01,
       2.7645636e-01, 2.1160886e-05, 1.6822905e-03, 7.7352554e-01,
       1.9342901e+01, 1.3287342e+00, 3.5656092e-03, 1.0580443e-05,
       1.1035401e-02, 1.2696531e-04, 8.5701579e-03, 1.0180926e+00,
       1.3331354e-03, 1.2061702e-03, 1.1426877e-03, 2.7614955e-03,
       5.6605362e-03, 6.8557882e+00, 5.3484134e-02, 0.0000000e+00,
       3.5338681e-03, 1.0580443e-05, 2.1619104e+01, 0.0000000e+00,
       4.3219628e+00, 6.4180970e-02, 1.0580443e-05, 7.0888974e-04,
       2.8567194e-04, 1.8938993e-03, 1.0580443e-04, 7.6179206e

Load the training data and find the center of cluster 0

In [103]:
train_data = adata.X[:, markers]
print("Type of training data: {}".format(type(train_data)))
train_data = train_data.tocsr()
print("Type of training data: {}".format(type(train_data)))
train_data = train_data[~mask,:]
train_data = train_data[np.where(yVec[~mask] == 0)[0], :]
train_data = train_data.tocsc()

center = train_data.mean(axis = 0)

Type of training data: <class 'scipy.sparse.csc.csc_matrix'>
Type of training data: <class 'scipy.sparse.csr.csr_matrix'>


In [105]:
center.max()

78.40206