In [7]:
%load_ext autoreload
%autoreload 2

In [1]:
import numpy as np

In [2]:
import sklearn

In [3]:
sklearn.__version__

'0.20.0'

In [4]:
import plotly.graph_objs as go
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.io as pio

In [5]:
init_notebook_mode(connected=True)

In [6]:
# Reference label vector: every set of predictions (yhat) in this notebook is compared against it.
yVec = np.load("1M-yVec.npz")['yVec']

### Load the 1bcs data

In [8]:
# Results archive for the 1bcs runs; its 'xvals' and 'yhats' entries are read in later cells.
cs_flink = np.load("NC-1bcs-1-2.npz")
# Hand-entered timings (21 values), plotted against cs_xvals in the timing section below.
# NOTE(review): the `a + b/60` form suggests minutes+seconds or hours+minutes, but the unit
# is not stated anywhere in this notebook — confirm.
times = [1+56/60, 2+21/60, 2+22/60, 2+28/60, 2+31/60, 2 + 41/60, 2 + 49/60, 3, 3 + 13/60, 
         3 + 37/60, 4 + 19/60, 4 + 54/60, 5 + 33/60, 6.5, 7 + 18/60, 8 + 11/60, 9.1, 9 + 52/60,
        11 + 2/60, 12 + 17/60, 13 + 7/60]

In [9]:
# Predicted labels from the files NC-1bcs-2.1 ... NC-1bcs-2.9, followed by NC-1bcs-3.0.
bigYhats = [
    np.load("NC-1bcs-2.{}.npz".format(val))['yhat']
    for val in range(1,10)
]
bigYhats += [np.load("NC-1bcs-3.0.npz")['yhat']]

In [10]:
svals = np.load("rc-svals-map.npz")['svals']

In [11]:
# svals should be a folds x clusters x genes array
def findMarkers(svals, lamb):
    """Collect, for every fold, the positions selected in at least one cluster.

    A position is selected for a cluster when that cluster's value at the
    position is strictly below ``sqrt(lamb)``.

    Parameters
    ----------
    svals : iterable of folds; each fold is an iterable of 1-D arrays
        (one array per cluster).
    lamb : squared threshold. The square root is taken here to stay
        consistent with the Rocks code.

    Returns
    -------
    list of lists: one list of positions per fold, in no particular order.
    """
    threshold = np.sqrt(lamb)

    def _foldMarkers(fold):
        # Union of the selected positions over all clusters of this fold.
        selected = set()
        for clust in fold:
            selected.update(np.where(clust < threshold)[0])
        return list(selected)

    return [_foldMarkers(fold) for fold in svals]

In [12]:
# Per-fold marker counts for the thresholds 2.1 ... 2.9, then 3.0.
# findMarkers expects the squared threshold (it takes the square root itself).
bigXvals = [
    [len(marks) for marks in findMarkers( svals, float("2.{}".format(val))**2 )]
    for val in range(1,10)
]
bigXvals.append([len(marks) for marks in findMarkers( svals, 3.0**2 )])

### Load the scanpy data

In [14]:
methods = ['wilcoxon', 't-test_overestim_var', 'logreg']

In [23]:
# One entry per scanpy method, in the order of `methods`.
sc_xvals, sc_errs, sc_yhats = [], [], []
for method in methods:
    mfile = np.load("NC-{}-small.npz".format(method))

    sc_yhats.append(mfile['yhats'])

    # Mean of each 'xvals' entry.
    sc_xvals.append([np.array(a).mean() for a in mfile['xvals']])

    # Fraction of positions where the prediction equals yVec
    # (turned into a percent error only at plotting time).
    sc_errs.append([
        np.where(yhat == yVec)[0].shape[0]/yVec.shape[0]
        for yhat in mfile['yhats']
    ])

In [21]:
len(sc_xvals)

3

In [26]:
cs_flink['xvals']

array([[ 37,  37,  37,  37,  37],
       [ 78,  77,  77,  77,  79],
       [ 79,  79,  79,  79,  82],
       [ 83,  84,  84,  83,  86],
       [ 95,  90,  91,  91,  94],
       [105, 103, 104, 107, 102],
       [117, 115, 111, 122, 118],
       [128, 131, 130, 148, 137],
       [141, 153, 149, 173, 160],
       [162, 178, 172, 209, 188],
       [183, 208, 194, 248, 221],
       [207, 248, 218, 280, 256],
       [234, 287, 251, 307, 294],
       [269, 319, 289, 347, 332],
       [310, 365, 328, 388, 366],
       [347, 417, 372, 436, 412],
       [398, 461, 422, 483, 461],
       [442, 505, 464, 541, 509],
       [494, 568, 518, 594, 566],
       [546, 624, 579, 656, 628],
       [608, 686, 642, 712, 698]])

### Analysis of the timing of the Nearest Centroid classifier on 1bcs data

The x-axis is the average number of markers and the y-axis is the running time.
The relationship looks roughly linear, so we fit a linear regression to predict the running times for larger numbers of markers.

In [20]:
# Mean marker count for each row of cs_flink['xvals'].
# Computed here so this cell does not depend on a definition that only appears
# further down the notebook.
cs_xvals = [np.array(a).mean() for a in cs_flink['xvals']]

# Timing curve: mean number of markers (x) against the recorded times (y).
traces = [
    go.Scatter(x = cs_xvals, y=times, name='Timing')
]

fig = go.Figure(data=traces)

In [21]:
iplot(fig)

In [12]:
from sklearn import linear_model

In [14]:
# Linear fit of the recorded times against the mean marker count per row of
# cs_flink['xvals']. The means are computed inline because `xvals` is not defined
# at this point in a top-to-bottom run (and later names a per-method list instead).
regr = linear_model.LinearRegression()
regr.fit(np.array([np.array(a).mean() for a in cs_flink['xvals']]).reshape(-1,1), times)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [17]:
slope = regr.coef_[0]

In [18]:
slope

0.018667386267288234

In [23]:

# Predicted time at 210.8 markers (the mean of the row [183, 208, 194, 248, 221]
# of cs_flink['xvals']). The operand had been dropped, leaving `slope * (+intercept)`.
slope*210.8 + regr.intercept_

4.619914990640517

In [25]:
# Extrapolate the fitted line to larger marker counts.
predTimes = [
    x*slope + regr.intercept_
    for x in [ 783.8,  910.8, 1035.2, 1166.4, 1299.6, 1439. , 1580.2, 1727.6,
               1866.6, 2002.8]
]

In [28]:
# Sum of the predicted times with the entries at positions 4, 6 and -1 left out.
# NOTE(review): presumably those settings were excluded on purpose — the reason is not
# recorded in this notebook; confirm.
np.array(predTimes).sum() - predTimes[4] - predTimes[6] - predTimes[-1]

171.48236869359664

In [27]:
predTimes

[15.316327321796674,
 17.687085377742278,
 20.00930822939294,
 22.458469307661154,
 24.944965158463944,
 27.547198804123926,
 30.183033745065025,
 32.93460648086331,
 35.52937317201637,
 38.071871181621034]

## Compute various classification metrics

In [18]:
import sklearn.metrics as skm

In [19]:
meths = ['rankCorr'] + methods

In [20]:
meths

['rankCorr', 'wilcoxon', 't-test_overestim_var', 'logreg']

In [33]:
methods.index('wilcoxon')

0

In [39]:
yTrue = yVec

In [40]:
# Score accumulators. Each list gets one entry per method in `meths`; each entry
# holds one value per prediction vector in that method's 'yhats'.
dayta = []
accs = []   # classification error rate (1 - accuracy), despite the name
prec = []   # weighted-average precision
recs = []   # weighted-average recall
mccs = []   # Matthews correlation coefficient
xvals = []  # mean of each 'xvals' entry

for method in meths:

    # all data is already loaded
    print("PROCESSING DATA FOR {}".format(method))

    # Compare by value. `is` tests object identity and only matched here because
    # the interpreter happened to intern the string literal.
    if method == 'rankCorr':
        dayta = cs_flink
    else:
        dayta = {
            'xvals' : sc_xvals[ methods.index(method) ],
            'yhats' : sc_yhats[ methods.index(method) ]
        }

    accs.append( list(map(lambda yhat: 1-skm.accuracy_score(yTrue, yhat), dayta['yhats'])) )
    prec.append( list(map(lambda yhat: skm.precision_score(yTrue, yhat, average='weighted'), dayta['yhats'])) )
    recs.append( list(map(lambda yhat: skm.recall_score(yTrue, yhat, average='weighted'), dayta['yhats'])) )
    mccs.append( list(map(lambda yhat: skm.matthews_corrcoef(yTrue, yhat), dayta['yhats'])) )

    xvals.append( np.array([np.array(a).mean() for a in dayta['xvals']]) )

# Persist everything so the plotting cells can reload the scores from disk.
outname = "10x-class-scores.npz"
print("Saving all processed data to {}".format(outname))

np.savez(outname, accs=accs, prec=prec, recs=recs, mccs=mccs, xvals=xvals)

PROCESSING DATA FOR rankCorr
PROCESSING DATA FOR wilcoxon
PROCESSING DATA FOR t-test_overestim_var
PROCESSING DATA FOR logreg
Saving all processed data to 10x-class-scores.npz


## Plot the 10x clustering error data

The scanpy data is loaded in the section above

In [43]:
meths

['rankCorr', 'wilcoxon', 't-test_overestim_var', 'logreg']

In [41]:
# Hex color per method key. methods[0], methods[1], methods[2] are 'wilcoxon',
# 't-test_overestim_var' and 'logreg'. In the visible cells only the entries for
# `meths` are consumed (by layout1's colorway); the other keys are not referenced here.
colorMap = {
    
    'rankCorr' : "#3366cc",
#    methods[0] : "#ef3774", # nice pink
    methods[0] : "#d62728", # red
    methods[1] : "#ff9900",
#    methods[2] : "#22aa99",  # green-blue (close to green)
    methods[2] : "#2ca02c",  #chalkboard
    'edgeR' :  "#9467bd",
    'edgeRdet' : "#c5b0d5",
    'MAST' : '#17becf',
    'MASTdet': '#9edae5',
    'enets': "#8c564b",
#    'genzel' : "#0099c6", # blueish
    'genzel' : "#bcbd22", #vom
    'scvi' : "#f032e6",
    'random' : '#808080'
    
}

In [102]:
# Shared figure layout: trace colors follow the order of `meths` (via colorMap),
# large serif fonts, no grid, boxed axes with inward ticks, and a fixed x-range.
# The commented-out titles/ranges are alternatives for plotting other scores.
layout1 = go.Layout(
    colorway = [colorMap[meth] for meth in meths],
    font = dict(
            family='CMU Serif'
    ),
    #showlegend=False,
    xaxis=dict(
        title='Number of markers',
        showgrid=False,
        ticks='inside',
        showline=True,
        mirror='ticks',
        #range=[10,170],
        range=[0,1620],
        titlefont=dict(
            #family='Computer Modern',
            size=30,
            color='#000'
        ),
        tickfont=dict(
            size=24,
            color='#000'
        )
    ),
    yaxis=dict(
        title='Classification error rate',
        #title='Average precision',
        #title='Matthews correlation coefficient',
        showgrid=False,
        ticks='inside',
        showline=True,
        mirror='ticks',
        #range=[0.03,0.16],
        titlefont=dict(
            #family='Computer Modern',
            size=30,
            color='#000'
        ),
        tickfont=dict(
            size=24,
            color='#000'
        )
    ),
    legend=dict(
        #x=0,
        #y=1,
        #traceorder='normal',
        font=dict(
            #family='Computer Modern',
            size=24,
            #color='#000'
        ),
        #bgcolor='#E2E2E2',
        #bordercolor='#FFFFFF',
        #borderwidth=2
    ),
    margin=go.layout.Margin(
        l=100,
        r=20,
        b=90,
        t=10,
        #pad=4
    ),
)

In [66]:
# Which saved score to plot; 'accs' holds the classification error rate.
scores = ['accs', 'prec', 'mccs']
score = 'accs'

# Reload the scores written by the metrics cell.
dayta = np.load("10x-class-scores.npz")

# Legend labels for the method keys.
methNames = {
    'rankCorr': "RankCorr",
    methods[0]: "Wilcoxon",
    methods[1]: "t-test",
    methods[2]: "Log. Reg.",
    'edgeR' : 'edgeR',
    'MAST' : 'MAST'
}

# One line per method, in the order of `meths`.
traces = [
    go.Scatter(x=dayta['xvals'][i], y=dayta[score][i], name=methNames[method], mode="lines+markers")
    for i, method in enumerate(meths)
]
    
    

In [67]:
iplot({'data':traces, 'layout': layout1})

In [68]:
pio.write_json({'data': traces, 'layout': layout1}, "/home/ahsvargo/publicData/pics2/10x/10x-ncc-acc.json")

### Old plots - before colorMaps

In [71]:
# Mean of each row of cs_flink['xvals'] (one value per row).
cs_xvals = [np.array(a).mean() for a in cs_flink['xvals']]

In [72]:
# Fraction of positions where each prediction vector equals yVec.
# Despite the name this is a match fraction; the plots convert it with 100-100*value.
cs_errs = [
    np.where(yhat == yVec)[0].shape[0]/yVec.shape[0]
    for yhat in cs_flink['yhats']
]

In [15]:
# Percent-error curves: RankCorr first, then one trace per scanpy method.
# cs_errs / sc_errs hold match fractions, hence the 100-100*value conversion.
traces = [
    go.Scatter(x = cs_xvals, y=list(100-100*np.array(cs_errs)), name='RankCorr', mode='lines+markers')
]

for i, method in enumerate(methods):
    traces.append(
        go.Scatter( x = sc_xvals[i], y=list(100-100*np.array(sc_errs[i])), name=method, mode='lines+markers' )
    )

# Plain layout with default colors and smaller fonts than layout1.
layout = go.Layout(
    font = dict(
            family='CMU Serif'
    ),
    xaxis=dict(
        title='Number of markers',
        titlefont=dict(
            #family='Computer Modern',
            size=18,
        ),
        tickfont=dict(
            #family='Computer Modern',
            size=14,
        )
    ),
    yaxis=dict(
        title='Percent error',
        titlefont=dict(
            #family='Computer Modern',
            size=18,
        ),
        tickfont=dict(
            #family='Computer Modern',
            size=14,
        )
    ),
    legend=dict(
        #x=0,
        #y=1,
        #traceorder='normal',
        font=dict(
            #family='Computer Modern',
            size=14,
            #color='#000'
        ),
        #bgcolor='#E2E2E2',
        #bordercolor='#FFFFFF',
        #borderwidth=2
    )
)

fig = go.Figure(data=traces, layout=layout)

In [16]:
iplot({'data': traces, 'layout': layout})

In [19]:
pio.orca.config.executable = '/home/ahsvargo/bin/orca'

In [42]:
pio.write_image({'data': traces, 'layout': layout}, '10x-NCerrs.png', format='png')

In [25]:
pio.orca.config

orca configuration
------------------
    executable: /home/ahsvargo/bin/orca
    port: None
    timeout: None
    default_width: None
    default_height: None
    default_scale: 1
    default_format: png
    mathjax: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js
    topojson: None
    mapbox_access_token: None

constants
---------
    plotlyjs: /home/ahsvargo/miniconda3/envs/r35py37/lib/python3.7/site-packages/plotly/package_data/plotly.min.js 
    config_file: /home/ahsvargo/.plotly/.orca


In [26]:
pio.orca.status

orca status
-----------
    state: running
    executable: /home/ahsvargo/bin/orca
    version: 1.2.1
    port: 41183
    pid: 6731
    command: ['/home/ahsvargo/bin/orca', 'serve', '-p', '41183', '--plotly', '/home/ahsvargo/miniconda3/envs/r35py37/lib/python3.7/site-packages/plotly/package_data/plotly.min.js', '--graph-only', '--mathjax', 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js']
    

## Plot the random forest clustering data along with the asymptotic NC 1bcs data

In [73]:
# Extend the 1bcs x-values with the mean marker count of each bigXvals entry.
cs_fullx = cs_xvals.copy()
cs_fullx.extend(np.array(stuff).mean() for stuff in bigXvals)

# Extend the 1bcs match fractions with those of the bigYhats predictions.
cs_fully = cs_errs.copy()
cs_fully.extend(
    np.where(yhat == yVec)[0].shape[0]/yVec.shape[0]
    for yhat in bigYhats
)


In [74]:
# Random-forest predictions and per-fold marker counts, one entry per threshold.
rf_yhats, rf_xlists = [], []
for sval in [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.7]:
    rf_yhats.append( np.load("RF-1bcs-{}.npz".format(sval))['yhat'] )
    # findMarkers expects the squared threshold.
    rf_xlists.append( [len(marks) for marks in findMarkers(svals, sval**2)] )

# Mean marker count per threshold, and the fraction of predictions equal to yVec.
rf_xvals = [np.array(counts).mean() for counts in rf_xlists]
rf_errs = [np.where(yhat == yVec)[0].shape[0]/yVec.shape[0] for yhat in rf_yhats]

In [75]:
rf_xvals

[37.0, 79.6, 92.2, 116.6, 155.2, 210.8, 274.6, 351.4, 445.0, 548.0, 1580.2]

In [103]:
# Percent-error curves for the nearest-centroid (NCC) and random-forest (RFC) classifiers.
# The NCC curve leaves out its last three points ([:-3]).
traces = [
    go.Scatter(x = cs_fullx[:-3], y=list(100-100*np.array(cs_fully[:-3])), name='NCC', mode='lines+markers'),
    go.Scatter(x = rf_xvals, y=list(100-100*np.array(rf_errs)), name='RFC', mode='lines+markers')
]

# NOTE(review): this `layout` is built but not used by the figure below, which is
# created with `layout1` instead. It does rebind the notebook-level name `layout`.
layout = go.Layout(
    xaxis=dict(
        title='Number of markers',
        titlefont=dict(
            family='Computer Modern',
            size=18,
        ),
        tickfont=dict(
            family='Computer Modern',
            size=14,
        )
    ),
    yaxis=dict(
        title='Percent error',
        titlefont=dict(
            family='Computer Modern',
            size=18,
        ),
        tickfont=dict(
            family='Computer Modern',
            size=14,
        )
    ),
    legend=dict(
        #x=0,
        #y=1,
        #traceorder='normal',
        font=dict(
            family='Computer Modern',
            size=14,
            #color='#000'
        ),
        #bgcolor='#E2E2E2',
        #bordercolor='#FFFFFF',
        #borderwidth=2
    )
)

# The figure uses layout1, whose y-axis title is 'Classification error rate'
# while the y-values plotted here are percentages.
fig = go.Figure(data=traces, layout=layout1)

In [104]:
iplot(fig)

In [105]:
pio.write_json(fig, "/home/ahsvargo/publicData/pics2/10x/10x-nc-vs-rf.json")

In [23]:
?pio.write_json

In [22]:
pio.write_json(fig, "10x-nc-vs-rf.json")