In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

In [8]:
# replace with path to the included version of picturedRocks on your system
import sys
sys.path.append('/home/ahsvargo/xvalid')

In [9]:
from picturedrocks import Rocks
from picturedrocks.performance import FoldTester, PerformanceReport, NearestCentroidClassifier

In [4]:
import plotly.graph_objs as go
from plotly.offline import plot, iplot, init_notebook_mode

In [5]:
import plotly.io as pio

In [6]:
init_notebook_mode(connected=True)

In [10]:
import scanpy.api as sc
import anndata



examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.



In [12]:
import loadData as ld

scanpy==1.3.7+56.gde16b79.dirty anndata==0.6.11 numpy==1.14.6 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.0 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


## Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
class RandomForest:
    def __init__(self):
        self.traindata = None
        self.RFC = RandomForestClassifier(n_estimators=100, n_jobs=-1)
        
    def train(self, data):
        self.traindata = data
        self.RFC.fit(data.X, data.y[:,0])
        
    def test(self, Xtest, sparse):
        return self.RFC.predict(Xtest)
    

## Load data

### Paul

In [15]:
rocks, ft = ld.load_paul('Rocks')
rocks.normalize(totalexpr=10000, log=True)

... 100%

... storing 'paul15_clusters' as categorical


Memory usage: current 0.32 GB, difference +0.32 GB


In [16]:
path = "paul/"
dataset = "paul15"

### Zeisel

In [12]:
rocks, ft, = ld.load_zeisel('Rocks')
rocks.normalize(totalexpr=10000, log=True)

In [13]:
path = "zeisel/"
dataset = "zeisel"

## Load markers

You will need to change these paths in order to load the markers.  See further on for loading the yhats.

In [14]:
methods = ['wilcoxon', 't-test_overestim_var', 'logreg']
meths = methods + ['edgeR', 'edgeRdet', 'MAST', 'MASTdet'] + ['enets', 'scvi', 'random'];

In [15]:
def load_markers(method, dataset, path, labels=""):
    
    # scanpy methods
    if method in ['wilcoxon', 't-test_overestim_var', 'logreg']:
        fname = path + dataset + "-" + method + "-markerList"
        if labels is "":
            fname += ".npz"
        else:
            fname += "-" + labels + ".npz"
        
        print("Loading markers from " + fname)
        
        pvalMarkerList = np.load(fname)['marks']
        pvalScoreList = np.load(fname)['pvals']
       
    # R based methods
    elif method in ['edgeR', 'edgeRdet', 'MAST', 'MASTdet']:
        if dataset is not "paul15" and dataset is not "zeisel":
            print("Error: R methods only run on datasets 'paul15' and 'zeisel'")
            return
        
        pmethods = ["edgeR/", 'edgeRdet/', 'MASTtpm/', 'MASTtpmDet/']
        zmethods = ["edgeR/", 'edgeRdet/', 'MAST/', 'MASTdet/']
        ind = ["edgeR", 'edgeRdet', 'MAST', 'MASTdet'].index(method)
        
        if dataset is "paul15":
            method  = pmethods[ind]
        else:
            method = zmethods[ind]
            
        pvalMarkerList = []
        for fold in range(5):
            dayta = np.load(path + method + "markerList-fold" + str(fold) + ".npz")
            pvalMarkerList.append(dayta['arr_0'])
            
    # random markers
    elif method is "random":
        pvalMarkerList = np.load(dataset + "-randomMarkers.npz")['arr_0']
        
    # scvi    
    elif method is "scvi":
        path="./scvi/"
        pvalMarkerList = []
        for fold in range(5):
            dayta = np.load(path + dataset + "-scvi-marks-fold" + str(fold) + ".npz")
            pvalMarkerList.append(dayta['arr_0'])
            
    # elastic nets
    elif method is "enets":
        path = "./enets/"
        pvalMarkerList = []
        for fold in range(5):
            dayta = np.load(path + dataset + "-nets-fold" + str(fold) + "-marks.npz")
            pvalMarkerList.append(dayta['arr_0'])
            
    else: 
        print("Error: Method must be one of the following:")
        print("'wilcoxon', 't-test_overestim_var', 'logreg', 'edgeR', 'edgeRdet', 'MAST', 'MASTdet', 'random', 'scvi', 'enets'")
        print("Return None")
        return None
    
    return pvalMarkerList

In [16]:
meths = methods + ['edgeR', 'edgeRdet', 'MAST', 'MASTdet'] + ['enets', 'scvi', 'random'];
method = methods[0]

In [17]:
pvalMarkerList = load_markers(method, dataset=dataset, path=path)

Loading markers from zeisel/zeisel-wilcoxon-markerList.npz


## Collect y hat data
For p value methods.  See the notebook `1bcs.ipynb` for more the y hats for RankCorr and Spa.  Again, see below to load these yhat data that has already been collected.

In [23]:
marks_list = [2,3,4,5,6,7,8,9,10,12,14,16,18,20,25,30,35,40,45,50,55,60,65,70,75]

def classify(ft, pvalMarkerList, marks_list=marks_list, classifier=NearestCentroidClassifier):
    xvals = []
    yhats = []

    for marks_per_clust in marks_list:

        print("markers per cluster: {}".format(marks_per_clust), flush=True)
        myMarks = [list(set().union(*[marks[:marks_per_clust] for marks in table])) for table in pvalMarkerList]
        ft.markers = myMarks

        print("Number of markers per fold: {}".format([len(a) for a in ft.markers]))
        xvals.append([len(a) for a in ft.markers])

        ft.classify(classifier)
        yhats.append(ft.yhat)
        
    return xvals, yhats

In [24]:
%%time
xvals, yhats = classify(ft, pvalMarkerList, classifier=NearestCentroidClassifier)

markers per cluster: 2
Number of markers per fold: [18, 18, 18, 18, 18]
markers per cluster: 3
Number of markers per fold: [27, 26, 27, 27, 27]
markers per cluster: 4
Number of markers per fold: [36, 35, 35, 36, 36]
markers per cluster: 5
Number of markers per fold: [44, 43, 43, 43, 44]
markers per cluster: 6
Number of markers per fold: [53, 52, 51, 51, 53]
markers per cluster: 7
Number of markers per fold: [62, 60, 60, 60, 61]
markers per cluster: 8
Number of markers per fold: [71, 69, 68, 69, 69]
markers per cluster: 9
Number of markers per fold: [79, 78, 77, 78, 78]
markers per cluster: 10
Number of markers per fold: [88, 86, 86, 87, 87]
markers per cluster: 12
Number of markers per fold: [104, 103, 102, 103, 104]
markers per cluster: 14
Number of markers per fold: [121, 118, 118, 120, 120]
markers per cluster: 16
Number of markers per fold: [138, 134, 136, 137, 135]
markers per cluster: 18
Number of markers per fold: [156, 148, 151, 155, 151]
markers per cluster: 20
Number of marke

In [25]:
np.array([np.array(a).mean() for a in xvals])

array([ 18. ,  26.8,  35.6,  43.4,  52. ,  60.6,  69.2,  78. ,  86.8,
       103.2, 119.4, 136. , 152.2, 166.6, 203.4, 237.2, 269.2, 303.4,
       337.8, 370.4, 399.2, 427. , 452.8, 476.4, 500.2])

In [26]:
errs = np.array(list( map(lambda yhat: rocks.X.shape[0] - np.where(yhat == rocks.y.flatten())[0].shape[0], yhats) ))
errs

array([334, 333, 284, 251, 228, 213, 200, 200, 191, 192, 193, 185, 181,
       174, 162, 157, 150, 151, 142, 141, 137, 136, 135, 136, 138])

In [27]:
rocks.X.shape

(3005, 4999)

Collect all of the data for all of the methods at once and save it.

In [29]:
%%time
for method in meths:
    print("######## Working on method {}".format(method), flush=True)
    pvalMarkerList = load_markers(method, dataset=dataset, path=path)
    
    xvals, yhats = classify(ft, pvalMarkerList, classifier=NearestCentroidClassifier)
    errs = np.array(list( map(lambda yhat: rocks.X.shape[0] - np.where(yhat == rocks.y.flatten())[0].shape[0], yhats) ))
    
    outname = dataset + "-" + method + "-plotInfo.npz"
    print('######## Saving data to {}'.format(outname))
    np.savez(path + outname, xvals=xvals, yhats=yhats, ytrue=rocks.y, marks=marks_list, errs=errs)

######## Working on method wilcoxon
Loading markers from zeisel/zeisel-wilcoxon-markerList.npz
markers per cluster: 2
Number of markers per fold: [18, 18, 18, 18, 18]
markers per cluster: 3
Number of markers per fold: [27, 26, 27, 27, 27]
markers per cluster: 4
Number of markers per fold: [36, 35, 35, 36, 36]
markers per cluster: 5
Number of markers per fold: [44, 43, 43, 43, 44]
markers per cluster: 6
Number of markers per fold: [53, 52, 51, 51, 53]
markers per cluster: 7
Number of markers per fold: [62, 60, 60, 60, 61]
markers per cluster: 8
Number of markers per fold: [71, 69, 68, 69, 69]
markers per cluster: 9
Number of markers per fold: [79, 78, 77, 78, 78]
markers per cluster: 10
Number of markers per fold: [88, 86, 86, 87, 87]
markers per cluster: 12
Number of markers per fold: [104, 103, 102, 103, 104]
markers per cluster: 14
Number of markers per fold: [121, 118, 118, 120, 120]
markers per cluster: 16
Number of markers per fold: [138, 134, 136, 137, 135]
markers per cluster: 1

Number of markers per fold: [18, 18, 18, 18, 18]
markers per cluster: 3
Number of markers per fold: [27, 27, 27, 27, 27]
markers per cluster: 4
Number of markers per fold: [36, 36, 36, 36, 36]
markers per cluster: 5
Number of markers per fold: [45, 45, 45, 45, 45]
markers per cluster: 6
Number of markers per fold: [54, 54, 54, 54, 54]
markers per cluster: 7
Number of markers per fold: [63, 63, 63, 63, 63]
markers per cluster: 8
Number of markers per fold: [72, 72, 72, 72, 72]
markers per cluster: 9
Number of markers per fold: [81, 81, 81, 81, 81]
markers per cluster: 10
Number of markers per fold: [90, 90, 90, 90, 90]
markers per cluster: 12
Number of markers per fold: [108, 108, 108, 108, 108]
markers per cluster: 14
Number of markers per fold: [126, 126, 126, 126, 126]
markers per cluster: 16
Number of markers per fold: [144, 144, 144, 144, 144]
markers per cluster: 18
Number of markers per fold: [162, 162, 162, 162, 162]
markers per cluster: 20
Number of markers per fold: [180, 180,

Number of markers per fold: [63, 63, 63, 63, 63]
markers per cluster: 8
Number of markers per fold: [72, 72, 72, 72, 72]
markers per cluster: 9
Number of markers per fold: [81, 81, 81, 81, 81]
markers per cluster: 10
Number of markers per fold: [90, 90, 90, 90, 90]
markers per cluster: 12
Number of markers per fold: [108, 108, 108, 108, 108]
markers per cluster: 14
Number of markers per fold: [126, 126, 126, 126, 126]
markers per cluster: 16
Number of markers per fold: [144, 144, 144, 144, 144]
markers per cluster: 18
Number of markers per fold: [162, 162, 162, 162, 162]
markers per cluster: 20
Number of markers per fold: [180, 180, 180, 180, 180]
markers per cluster: 25
Number of markers per fold: [225, 225, 225, 225, 225]
markers per cluster: 30
Number of markers per fold: [270, 270, 269, 270, 269]
markers per cluster: 35
Number of markers per fold: [315, 315, 314, 314, 314]
markers per cluster: 40
Number of markers per fold: [360, 359, 357, 359, 359]
markers per cluster: 45
Number o

## Aggregate the data and compute the values mentioned in the paper.

This point to the correct location to find the plotInfo information and can be run right now.

In [17]:
import sklearn.metrics as skm

In [18]:
methods = ['wilcoxon', 't-test_overestim_var', 'logreg']
meths = methods + ['edgeR', 'edgeRdet', 'MAST', 'MASTdet'] + ['enets', 'genzel', 'scvi',  'random'];

In [21]:
dataset = "paul15"
path = "../paul/plot/"

In [22]:
yTrue = rocks.y

accs = []
prec = []
recs = []
mccs = []
xvals = []

for method in ['rankCorr'] + meths:
    
    #if method is 
    fname = dataset + "-" + method + "-plotInfo-RF.npz"
    print("PROCESSING DATA FROM {}".format(path + fname))
    
    dayta = np.load(path + fname) 
    
    accs.append( list(map(lambda yhat: 1-skm.accuracy_score(yTrue, yhat), dayta['yhats'])) )
    prec.append( list(map(lambda yhat: skm.precision_score(yTrue, yhat, average='weighted'), dayta['yhats'])) )
    recs.append( list(map(lambda yhat: skm.recall_score(yTrue, yhat, average='weighted'), dayta['yhats'])) )
    mccs.append( list(map(lambda yhat: skm.matthews_corrcoef(yTrue, yhat), dayta['yhats'])) )
    
    xvals.append( np.array([np.array(a).mean() for a in dayta['xvals']]) )
    
    dayta.close()
    
outname = dataset +  "-class-scores-RF.npz"
print("Saving all processed data to {}".format(path + outname))

np.savez(path + outname, accs=accs, prec=prec, recs=recs, mccs=mccs, xvals=xvals)

PROCESSING DATA FROM ../paul/plot/paul15-rankCorr-plotInfo-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

PROCESSING DATA FROM ../paul/plot/paul15-wilcoxon-plotInfo-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

PROCESSING DATA FROM ../paul/plot/paul15-t-test_overestim_var-plotInfo-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

PROCESSING DATA FROM ../paul/plot/paul15-logreg-plotInfo-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

PROCESSING DATA FROM ../paul/plot/paul15-edgeR-plotInfo-RF.npz
PROCESSING DATA FROM ../paul/plot/paul15-edgeRdet-plotInfo-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

PROCESSING DATA FROM ../paul/plot/paul15-MAST-plotInfo-RF.npz
PROCESSING DATA FROM ../paul/plot/paul15-MASTdet-plotInfo-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

PROCESSING DATA FROM ../paul/plot/paul15-enets-plotInfo-RF.npz
PROCESSING DATA FROM ../paul/plot/paul15-genzel-plotInfo-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

PROCESSING DATA FROM ../paul/plot/paul15-scvi-plotInfo-RF.npz
PROCESSING DATA FROM ../paul/plot/paul15-random-plotInfo-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

Saving all processed data to ../paul/plot/paul15-class-scores-RF.npz



Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


Precision is ill-defined and being set to 0.0 in labels with no 

Precisions and accuracies are very similar.  Do a couple of tests to make sure that we are correct.

It seems that we are correct, at least on Paul...  Not sure how to explain this, since accuracies are generally pretty low.

In [23]:
fname = dataset + "-" + "rankCorr-plotInfo-RF.npz"
dayta = np.load(path + fname)
yhats = dayta['yhats']
ytrue = rocks.y.flatten()

In [24]:
len(list(set(np.where(yhats[0] == 1)[0]).intersection(set(np.where(ytrue == 1)[0]))))

245

In [25]:
(1 * np.logical_and(yhats[0]==1, ytrue==1)).sum()

245

In [26]:
def precAcc(yhat, ytrue):
    
    N = yhat.shape[0]
    
    classes = np.unique(ytrue)
    predCounts = np.array([
        np.where(yhat == clust)[0].shape[0] for clust in classes
    ])
    # deal with undefined cases
    predCounts[predCounts==0] = 1
    
    trueCounts = np.array([
        np.where(ytrue == clust)[0].shape[0] for clust in classes 
    ])
    
    corrCounts = np.array([
        (1 * np.logical_and(yhat==clust, ytrue==clust)).sum() for clust in classes
    ])
    
    precs = corrCounts / predCounts * trueCounts
    
    return precs.sum()/N, corrCounts.sum()/N#, corrCounts, predCounts, trueCounts

In [27]:
myPrecs = []
myAccs = []

for yhat in yhats:
    p, a = precAcc(yhat, ytrue)
    myPrecs.append(p)
    myAccs.append(a)

myPrecs = np.array(myPrecs)
myAccs = np.array(myAccs)


In [28]:
RFname = dataset + "-class-scores-RF.npz"
RF = np.load(path + RFname)
calcPrecs = RF['prec'][0]
calcAccs = RF['accs'][0]

In [29]:
np.nonzero(myPrecs - np.array(calcPrecs))

(array([], dtype=int64),)

In [30]:
np.nonzero(1 - myAccs - np.array(calcAccs))

(array([], dtype=int64),)

In [31]:
prec = 0
for i, numCorr in enumerate(corr):
    prec += numCorr/pred[i] * true[i]
    
prec/rocks.X.shape[0]

NameError: name 'corr' is not defined

In [473]:
p

0.6553252359753782

In [496]:
true/pred

array([2.15      , 0.80048662, 1.01234568, 2.10169492, 1.        ,
       0.98295455, 0.76958525, 1.17241379, 6.3       , 0.83606557,
       1.        , 1.11290323, 0.98039216, 0.86542923, 1.24832215,
       1.03144654, 3.14285714, 2.25      , 1.24      ])

In [497]:
true

array([ 43, 329, 246, 124, 180, 173, 167,  68,  63, 153,  30,  69, 300,
       373, 186, 164,  22,   9,  31])

In [498]:
pred

array([ 20, 411, 243,  59, 180, 176, 217,  58,  10, 183,  30,  62, 306,
       431, 149, 159,   7,   4,  25])

The trues an the preds aren't particularly close together for this point at least...  A couple of them are really quite bad in fact.  I guess this rewards not predicting many points for the smaller classes - predict the ones that you are sure about and that will get multiplied.  If you predict a lot in the big classes - it's hard to have too awful precision, and the effect will be mitigated (if you have more in the predicted than in the actual).

Doesn't explain why it's so close to accuracy though.

## Plot

In [32]:
meths

['wilcoxon',
 't-test_overestim_var',
 'logreg',
 'edgeR',
 'edgeRdet',
 'MAST',
 'MASTdet',
 'enets',
 'genzel',
 'scvi',
 'random']

In [33]:
score = 'prec'

daytaname = dataset + "-class-scores-RF.npz"
dayta = np.load(path + daytaname)


methNames = {
    'rankCorr': "RankCorr",
     methods[0]: 'Wilcoxon',
     methods[1]: 't-test',
     methods[2]: 'Log. Reg.',
    'edgeR': 'edgeR',
    'edgeRdet': 'edgeRdet',
    'MAST': 'MAST',
    'MASTdet': 'MASTdet',
    'enets': 'Elastic Nets',
    'genzel': 'Spa',
    'scvi': 'scVI',
    'random': 'Random'
}

traces = []
for i,method in enumerate(['rankCorr'] + meths):
    traces.append(
        go.Scatter(x=dayta['xvals'][i], y=dayta[score][i], name=methNames[method], mode="lines+markers")
    ) 

In [34]:
for i, meth in  enumerate(['rankCorr'] + meths):
    print(np.mean(np.abs(1-np.array(dayta['mccs'][i]) - np.array(dayta['accs'][i]))))

0.026426609913713078
0.02598825013464605
0.025981450006156218
0.026129251574490926
0.026285684702496943
0.02592680688208063
0.026723522144388424
0.02631665481055903
0.02705614959263609
0.026274243737947962
0.033796305223340754
0.05095203879593599


In [519]:
for i, meth in  enumerate(['rankCorr'] + meths):
    print(np.max(np.abs(1-np.array(dayta['mccs'][i]) - np.array(dayta['accs'][i]))))

0.03082814677580037
0.027917558811622833
0.028350079100208814
0.0293960135967839
0.032099050005695906
0.03075187168629423
0.03160452597835672
0.03145043160555783
0.04124540786104247
0.028523338968961087
0.052094301189398506
0.0686652104870028


In [567]:
np.max(np.abs(1-np.array(dayta['prec'][i]) - np.array(dayta['accs'][i])))

0.04446128098526847

In [35]:
iplot(traces)

In [517]:
iplot(traces)

In [91]:
traces = []
for i,method in enumerate(['rankCorr'] + meths):
    traces.append(
        go.Scatter(x=dayta['xvals'][i], y=dayta['recs'][i], name=methNames[method], mode="lines+markers")
    ) 
iplot(traces)

# Zeisel plots

Only show accuracy and precision since recall and mcc look very much like precison

In [41]:
methods = ['wilcoxon', 't-test_overestim_var', 'logreg']
meths = methods + ['edgeR', 'edgeRdet', 'MAST', 'MASTdet'] + ['enets', 'genzel', 'scvi',  'random'];

In [42]:
dataset = "zeisel"
path = "../zeisel/plot/"

In [43]:
NCname = dataset + "-class-scores.npz"
NC = np.load(path + NCname)

RFname = dataset + "-class-scores-RF.npz"
RF = np.load(path + RFname)

score1 = 'accs'
score2 = 'prec'

methNames = {
    'rankCorr': "$\mathrm{R{\small ANK}C{\small ORR}}$",
     methods[0]: 'Wilcoxon',
     methods[1]: 't-test',
     methods[2]: 'Log. Reg.',
    'edgeR': 'edgeR',
    'edgeRdet': 'edgeRdet',
    'MAST': 'MAST',
    'MASTdet': 'MASTdet',
    'enets': 'Elastic Nets',
    'genzel': '$\mathrm{ S{\small PA}}$',
    'scvi': 'scVI',
    'random': 'Random'
}

Color explorations

In [44]:
    # from https://sashat.me/2017/01/11/list-of-20-simple-distinct-colors/
    # also http://bl.ocks.org/aaizemberg/78bd3dade9593896a59d
    # also https://codepen.io/stevekearsley/pen/RRmdRP
    # dd4477
    # "#b610ba", "#66aa00",
    # 4  d62728 #2ca02c
#5  ff9896 #98df8a
    # 109618
#     colorway = ['#4363d8', '#3cb44b', '#fabebe', '#e6194B', '#ff7f0e', '#911eb4', '#42d4f4', '#f032e6',
#                 '#808000', '#800000', '#469990', '#e6beff', '#9A6324', '#fffac8', '#ffe119', '#aaffc3',
#                 '#bfef45', '#ffd8b1', '#000075', '#a9a9a9'],
    #colorway = ["#3366cc", "#ef3774", "#ff9900", "#22aa99", "#9467bd", "#c5b0d5", '#2ca02c', '#98df8a', "#8c564b",  "#0099c6", "#f032e6", '#808080', "#994499",  "#22aa99", "#aaaa11", "#6633cc", "#e67300", "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac"],

In [45]:
colorMap = {
    
    'rankCorr' : "#3366cc",
#    methods[0] : "#ef3774", # nice pink
    methods[0] : "#d62728", # red
    methods[1] : "#ff9900",
#    methods[2] : "#22aa99",  # green-blue (close to green)
#    methods[2] : "#576362",  #chalkboard
    methods[2] : "#17becf", # teal
    'edgeR' :  "#9467bd",
    'edgeRdet' : "#c5b0d5",
    'MAST' : '#2ca02c',
    'MASTdet': '#98df8a',
    'enets': "#8c564b",
#    'genzel' : "#0099c6", # blueish
    'genzel' : "#aec7e8", #categorical matching blue
    'scvi' : "#f032e6",
    'random' : '#808080'
    
}

In [46]:
colorMap = {
    
    'rankCorr' : "#3366cc",
#    methods[0] : "#ef3774", # nice pink
    methods[0] : "#d62728", # red
    methods[1] : "#ff9900",
#    methods[2] : "#22aa99",  # green-blue (close to green)
    methods[2] : "#2ca02c",  #chalkboard
    'edgeR' :  "#9467bd",
    'edgeRdet' : "#c5b0d5",
    'MAST' : '#17becf',
    'MASTdet': '#9edae5',
    'enets': "#8c564b",
#    'genzel' : "#0099c6", # blueish
    'genzel' : "#bcbd22", #vom
    'scvi' : "#f032e6",
    'random' : '#808080'
    
}

In [47]:
styles = []
for i, meth in enumerate(['rankCorr'] + meths):
    
    styles.append(
        dict(
            name = methNames[meth],
            legendgroup = methNames[meth],
            mode = "lines+markers",
            line = dict(
                color = colorMap[meth]
            ),
            marker = dict(
                color = colorMap[meth]
            )
        )
    )

In [48]:
traces11 = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    traces11.append(
        go.Scatter(x=NC['xvals'][i], y=NC[score1][i], **styles[i])
    )
    
traces11b = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    traces11b.append(
        go.Scatter(x=NC['xvals'][i], y=NC[score1][i], showlegend=False,**styles[i])
    )
    
traces12 = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    traces12.append(
        go.Scatter(x=NC['xvals'][i], y=NC[score2][i], name=methNames[method], mode="lines+markers")
    ) 

traces21 = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    traces21.append(
        go.Scatter(x=RF['xvals'][i], y=RF[score1][i], showlegend=False,  **styles[i]) # xaxis='x2',
    ) 
    
traces22 = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    traces22.append(
        go.Scatter(x=RF['xvals'][i], y=RF[score2][i], name=methNames[method], mode="lines+markers")
    ) 

In [49]:
layout1 = go.Layout(
    colorway = [colorMap[meth] for meth in ['rankCorr'] + meths],
    font = dict(
            family='CMU Serif'
    ),
    showlegend=False,
    xaxis=dict(
        title='Number of markers',
        showgrid=False,
        ticks='inside',
        showline=True,
        mirror='ticks',
        #range=[10,170],
        range=[0,420],
        titlefont=dict(
            #family='Computer Modern',
            size=30,
            color='#000'
        ),
        tickfont=dict(
            size=24,
            color='#000'
        )
    ),
    yaxis=dict(
        title='Classification error rate',
        showgrid=False,
        ticks='inside',
        showline=True,
        mirror='ticks',
        #range=[0.03,0.16],
        titlefont=dict(
            #family='Computer Modern',
            size=30,
            color='#000'
        ),
        tickfont=dict(
            size=24,
            color='#000'
        )
    ),
    legend=dict(
        #x=0,
        #y=1,
        #traceorder='normal',
        font=dict(
            #family='Computer Modern',
            size=20,
            #color='#000'
        ),
        #bgcolor='#E2E2E2',
        #bordercolor='#FFFFFF',
        #borderwidth=2
    ),
    margin=go.layout.Margin(
        l=100,
        r=10,
        b=90,
        t=10,
        #pad=4
    ),
)

layout2 = go.Layout(
    colorway = [colorMap[meth] for meth in ['rankCorr'] + meths],
    font = dict(
            family='CMU Serif'
    ),
    #showlegend=False,
    xaxis=dict(
        title='Number of markers',
        showgrid=False,
        ticks='outside',
        showline=True,
        range=[10,170],
        titlefont=dict(
            #family='Computer Modern',
            size=20,
        ),
        tickfont=dict(
            size=16,
        )
    ),
    yaxis=dict(
        title='Average precision',
        titlefont=dict(
            #family='Computer Modern',
            size=20,
        ),
        tickfont=dict(
            size=16,
        )
    ),
    legend=dict(
        #x=0,
        #y=1,
        #traceorder='normal',
        font=dict(
            #family='Computer Modern',
            size=16,
            color='#000'
        ),
        #bgcolor='#E2E2E2',
        #bordercolor='#FFFFFF',
        #borderwidth=2
    ),
    margin=go.layout.Margin(
        #l=50,
        #r=10,
        #b=50,
        #t=10,
        #pad=4
    ),
)

In [50]:
iplot({'data': traces21, 'layout': layout1})

In [423]:
pio.write_json({'data': traces21, 'layout': layout1}, "pics2/zeisel-rf-acc.json")

In [156]:
from plotly import tools

In [201]:
?tools.make_subplots

In [251]:
fig = tools.make_subplots(rows=1, cols=2, horizontal_spacing=0.12)

for trace in traces11:
    fig.append_trace(trace, 1, 1)
    
for trace in traces11b:
    fig.append_trace(trace, 1, 2)
    
fig['layout']['xaxis1'].update(
    title='Number of markers',
    showgrid=False,
    ticks='inside',
    showline=True,
    mirror='ticks',
    range=[0,420],
    titlefont=dict(
        #family='Computer Modern',
        size=20,
    ),
    tickfont=dict(
        size=20,
    )
)

fig['layout']['xaxis2'].update(
    title='Number of markers',
    showgrid=False,
    ticks='inside',
    showline=True,
    mirror='ticks',
    range=[10,130],
    titlefont=dict(
        #family='Computer Modern',
        size=20,
    ),
    tickfont=dict(
        size=20,
        color="#000"
    )
)

fig['layout']['yaxis1'].update(
    title='Classification error rate (NCC)',
    showgrid=False,
    ticks='inside',
    showline=True,
    mirror='ticks',
    titlefont=dict(
        #family='Computer Modern',
        size=20,
    ),
    tickfont=dict(
        size=20,
    )
)

fig['layout']['yaxis2'].update(
    title='Classification error rate (RFC)',
    showgrid=False,
    ticks='inside',
    showline=True,
    mirror='ticks',
    titlefont=dict(
        #family='Computer Modern',
        size=20,
    ),
    tickfont=dict(
        size=20,
        
    )
)


fig['layout'].update(
    font = dict(
            family='CMU Serif'
    ),
    margin=go.layout.Margin(
        #l=50,
        r=20,
        b=60,
        #t=10,
        #pad=4
    )
)
fig['layout']['legend'].update(
    orientation = 'h',
    y=1.3,
    font=dict(
        #family='Computer Modern',
        size=18,
        #color='#000'
    )
)

#fig['layout']['grid'].update(xgap=0.8)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



layout.Legend({
    'font': {'size': 18}, 'orientation': 'h', 'y': 1.3
})

In [252]:
iplot(fig)

In [253]:
pio.write_json(fig, "test.json")

In [85]:
iplot({'data': traces11, 'layout': layout1})

# Paul plots

In [526]:
dataset = "paul15"
path = "paul/"

In [527]:
NCname = dataset + "-class-scores.npz"
NC = np.load(path + NCname)

RFname = dataset + "-class-scores-RF.npz"
RF = np.load(path + RFname)

score1 = 'accs'
score2 = 'prec'

methNames = {
    'rankCorr': "$\mathrm{\small R{\scriptsize ANK}C{\scriptsize ORR}}$",
     methods[0]: 'Wilcoxon',
     methods[1]: 't-test',
     methods[2]: 'Log. Reg.',
    'edgeR': 'edgeR',
    'edgeRdet': 'edgeRdet',
    'MAST': 'MAST',
    'MASTdet': 'MASTdet',
    'enets': 'Elastic Nets',
    'genzel': '$\mathrm{\small S{\scriptsize PA}}$',
    'scvi': 'scVI',
    'random': 'Random'
}

In [528]:
traces11 = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    arglist = np.argsort(NC['xvals'][i])
    traces11.append(
        go.Scatter(x=NC['xvals'][i][arglist], y=np.array(NC[score1][i])[arglist], name=methNames[method], mode="lines+markers")
    ) 
    
traces12 = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    arglist = np.argsort(NC['xvals'][i])
    traces12.append(
        go.Scatter(x=NC['xvals'][i][arglist], y=np.array(NC[score2][i])[arglist], name=methNames[method], mode="lines+markers")
    ) 

traces21 = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    arglist = np.argsort(RF['xvals'][i])
    traces21.append(
        go.Scatter(x=RF['xvals'][i][arglist], y=np.array(RF[score1][i])[arglist], name=methNames[method], mode="lines+markers")
    ) 
    
traces22 = []
for i,method in enumerate(['rankCorr'] + meths[:-1]):
    arglist = np.argsort(RF['xvals'][i])
    traces22.append(
        go.Scatter(x=RF['xvals'][i][arglist], y=np.array(RF[score2][i])[arglist], name=methNames[method], mode="lines+markers")
    ) 

In [561]:
layout1 = go.Layout(
    
    colorway = [colorMap[meth] for meth in ['rankCorr'] + meths],
    font = dict(
            family='CMU Serif'
    ),
    showlegend=False,
    xaxis=dict(
        title='Number of markers',
        showgrid=False,
        ticks='inside',
        showline=True,
        mirror='ticks',
        #range=[20,230], # zoom ncc
        range=[20,350], # zoom rfc
        #range=[0,550],
        tick0=20,
        dtick=40,
        #tickmode='linear',
        titlefont=dict(
            #family='Computer Modern',
            size=30,
            color='#000'
        ),
        tickfont=dict(
            size=24,
            color='#000'
        )
    ),
    yaxis=dict(
        title='Classification error rate',
        showgrid=False,
        ticks='inside',
        showline=True,
        mirror='ticks',
        #range=[0.3,0.45], # zoom ncc
        #range=[0.27,0.55], # ncc
        #range=[0.25,0.47], # rfc
        range=[0.25,0.37], # zoom rfc
        titlefont=dict(
            #family='Computer Modern',
            size=30,
            color='#000'
        ),
        tickfont=dict(
            size=24,
            color='#000'
        )
    ),
    legend=dict(
        #x=0,
        #y=1,
        #traceorder='normal',
        font=dict(
            #family='Computer Modern',
            size=20,
            color='#000'
        ),
    ),
    margin=go.layout.Margin(
        l=100,
        r=10,
        b=90,
        t=10,
        #pad=4
    ),
)


layout2 = go.Layout(
    
    colorway = [colorMap[meth] for meth in ['rankCorr'] + meths],
    font = dict(
            family='CMU Serif'
    ),
    #showlegend=False,
    xaxis=dict(
        title='Number of markers',
        range=[0,550],
        titlefont=dict(
            #family='Computer Modern',
            size=20,
        ),
        tickfont=dict(
            size=16,
        )
    ),
    yaxis=dict(
        title='Average precision',
        titlefont=dict(
            #family='Computer Modern',
            size=20,
        ),
        tickfont=dict(
            size=16,
        )
    ),
    legend=dict(
        #x=0,
        #y=1,
        #traceorder='normal',
        font=dict(
            #family='Computer Modern',
            size=16,
            #color='#000'
        ),
        #bgcolor='#E2E2E2',
        #bordercolor='#FFFFFF',
        #borderwidth=2
    ),
    margin=go.layout.Margin(
        #l=50,
        #r=50,
        #b=100,
        t=20,
        #pad=4
    ),
)

In [562]:
iplot({'data': traces21, 'layout': layout1})

In [563]:
pio.write_json({'data': traces21, 'layout': layout1}, "pics2/paul-rf-acc-zoom.json")