In [1]:
import numpy as np

import numba as nb
from numba import prange, njit

import scipy.optimize
import scipy.stats
import scipy.special as sc
import numba_scipy.special

from lbfgs import LBFGS, LBFGSError, fmin_lbfgs

import matplotlib.pyplot as plt
import datetime

exec(open("paracell/utils.py").read())
exec(open("paracell/poisson.py").read())
exec(open("paracell/simulate.py").read())

In [2]:
import os
print(os)

<module 'os' from '/home/ch6845/tools/miniconda3/envs/pytorch/lib/python3.6/os.py'>


In [3]:
from scipy.stats import pearsonr
import pandas as pd

from scipy.io import mmread
#import sys

# load data

## expression data

In [4]:
#exp_data=mmread('/data01/ch6845/single_cell/data/extract/HumanLiver.data.counts.mm').toarray()

In [5]:
exp_data=mmread('data/HumanLiver_extract/HumanLiver.data.counts.mm').toarray()
with open('data/HumanLiver_extract/HumanLiver.data.col','r') as f: exp_data_col=[i.strip().strip('"') for i in f.read().split()]
with open('data/HumanLiver_extract/HumanLiver.data.row','r') as f: exp_data_row=[i.strip().strip('"') for i in f.read().split()]
assert exp_data.shape==(len(exp_data_row),len(exp_data_col))
#mm2=mm.tocsc(copy=True)
#sys.getsizeof(mm)
#mm.data.nbytes
#a=mm.toarray()
#type(mm)

In [6]:
exp_data,exp_data.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]), (20007, 8444))

In [7]:
exp_data_row[:5],exp_data_col[:5]

(['RP11-34P13.7', 'FO538757.2', 'AP006222.2', 'RP4-669L17.10', 'RP5-857K21.4'],
 ['P1TLH_AAACCTGAGCAGCCTC_1',
  'P1TLH_AAACCTGTCCTCATTA_1',
  'P1TLH_AAACCTGTCTAAGCCA_1',
  'P1TLH_AAACGGGAGTAGGCCA_1',
  'P1TLH_AAACGGGGTTCGGGCT_1'])

In [8]:
exp_data_meta=pd.read_csv('data/HumanLiver_extract/HumanLiver.metadata.tsv',sep='\t')
exp_data_meta.head()

Unnamed: 0,total_counts,total_features,orig.ident,res.0.8,S.Score,G2M.Score,Phase
P1TLH_AAACCTGAGCAGCCTC_1,2943,1427,P1TLH,12,0.046089,0.000349,S
P1TLH_AAACCTGTCCTCATTA_1,10897,2522,P1TLH,17,-0.000357,0.009434,G2M
P1TLH_AAACCTGTCTAAGCCA_1,1914,1018,P1TLH,12,0.012811,-0.056561,S
P1TLH_AAACGGGAGTAGGCCA_1,5574,1798,P1TLH,10,-0.011324,-0.047102,G1
P1TLH_AAACGGGGTTCGGGCT_1,3700,1417,P1TLH,2,0.057467,-0.003861,S


`clusterid2name` is used to convert integers in `res.0.8` to cell-type name

In [9]:
clusterid2clustername=pd.read_csv('data/HumanLiver_extract/HumanLiver.clusterid_to_clustername.tsv',sep='\t',header=None,index_col=0)
len(clusterid2clustername[1].unique()),clusterid2clustername

(11,                          1
 0                         
 1              Hepatocytes
 2               ab T cells
 3              Hepatocytes
 4              Macrophages
 5              Hepatocytes
 6              Hepatocytes
 7             Plasma cells
 8                 NK cells
 9               gd T cells
 10             Macrophages
 11                   LSECs
 12                   LSECs
 13                   LSECs
 14             Hepatocytes
 15             Hepatocytes
 16          Mature B cells
 17          Cholangiocytes
 18              gd T cells
 19         Erythroid cells
 20  Hepatic Stellate Cells)

In [10]:
clusternameunique=clusterid2clustername[1].unique().tolist()
exp_data_meta_clusterid_clusteridunique=clusterid2clustername.loc[exp_data_meta['res.0.8'].values][1].apply(lambda x: clusternameunique.index(x))

In [11]:
clustername2markers={'Hepatocytes':['ALB','HAMP','ARG1','PCK1','AFP','BCHE'],
'LSECs':['CALCRL','CD32B','VWF'],
'Cholangiocytes':['KRT19','EPCAM','FXDY2','CLDN4','CLDN10','SOX9','MMP7','CXCL1','CFTR','TFF2','KRT7','CD24'],
'Hepatic_Stellate_Cells':['ACTA2','COL1A1','TAGLN','COL1A2','COL3A1','SPARC','RBP1','DCN','MYL9'],
'Macrophages':['CD68','MARCO'],
'ab_T_cells':['CD2','CD3D','TRAC','IL32','CD3E'],
'gd_T_cells':['NKG7','FCGR3A','HOPX','GNLY'],
'NK_cells':['GZMK','KLRF1','CCL3','CMC1'],
'Plasma_cells':['CD27','IGHG1'],
'Mature_B_cells':['MS4A1','LTB','CD52','IGHD'],
'Erythroid_cells':['HBB','SLC25A37','CA1','ALAS2']
}

In [12]:
clustername2markers_new={'Cholangiocytes':['KRT19','EPCAM','FXYD2','CLDN4','CLDN10','SOX9','MMP7','CXCL1','CFTR','TFF2','KRT7','CD24'],
'Mature_B_cells':['MS4A1','IGHD','CD79A','PTPRC','IGKC','CD19'],
'Hepatocytes':['ALB','HAMP','ARG1','PCK1','AFP','BCHE'],
'LSECs':['CALCRL','VWF','PECAM1','CLEC14A','EMCN'],
'Hepatic_Stellate_Cells':['ACTA2','COL1A1','TAGLN','COL1A2','COL3A1','SPARC','RBP1','DCN','MYL9'],
'Macrophages':['CD68','MARCO','FCGR3A','LYZ','PTPRC','AIF1'],
'ab_T_cells':['CD2','CD3D','TRAC','IL32','CD3E','PTPRC'],
'gd_T_cells':['NKG7','FCGR3A','HOPX','GNLY','CMC1','KLRF1','CCL3','PTPRC'],
'NK_cells':['GZMK','KLRF1','CCL3','CMC1','NKG7','PTPRC'],
'Plasma_cells':['CD27','IGHG1','IGHA1','IGHM','CD79A','PTPRC','IGKC'],
'Erythroid_cells':['HBB','SLC25A37','CA1','ALAS2']}

#for key,value in clustername2markers_new.items():
#    for i in value:
#        print(markers_db[(markers_db['official gene symbol']==i)].shape)
#markers_db[(markers_db['official gene symbol']=='CD32B') |(markers_db['nicknames'].str.contains('CD32B'))]

In [13]:
np.unique(clustername2markers_new.values())

array([dict_values([['KRT19', 'EPCAM', 'FXYD2', 'CLDN4', 'CLDN10', 'SOX9', 'MMP7', 'CXCL1', 'CFTR', 'TFF2', 'KRT7', 'CD24'], ['MS4A1', 'IGHD', 'CD79A', 'PTPRC', 'IGKC', 'CD19'], ['ALB', 'HAMP', 'ARG1', 'PCK1', 'AFP', 'BCHE'], ['CALCRL', 'VWF', 'PECAM1', 'CLEC14A', 'EMCN'], ['ACTA2', 'COL1A1', 'TAGLN', 'COL1A2', 'COL3A1', 'SPARC', 'RBP1', 'DCN', 'MYL9'], ['CD68', 'MARCO', 'FCGR3A', 'LYZ', 'PTPRC', 'AIF1'], ['CD2', 'CD3D', 'TRAC', 'IL32', 'CD3E', 'PTPRC'], ['NKG7', 'FCGR3A', 'HOPX', 'GNLY', 'CMC1', 'KLRF1', 'CCL3', 'PTPRC'], ['GZMK', 'KLRF1', 'CCL3', 'CMC1', 'NKG7', 'PTPRC'], ['CD27', 'IGHG1', 'IGHA1', 'IGHM', 'CD79A', 'PTPRC', 'IGKC'], ['HBB', 'SLC25A37', 'CA1', 'ALAS2']])],
      dtype=object)

In [14]:
marker_unique=np.unique([j for i in list(clustername2markers_new.values()) for j in i])
marker_unique_idx=[exp_data_row.index(marker) for marker in marker_unique]

In [15]:
pi_true=np.array([np.sum(exp_data_meta_clusterid_clusteridunique==i) for i in sorted(np.unique(exp_data_meta_clusterid_clusteridunique))])/exp_data_meta_clusterid_clusteridunique.shape[0]
M_true=np.array([np.mean(exp_data[marker_unique_idx,:][:,exp_data_meta_clusterid_clusteridunique==i],axis=1) for i in sorted(np.unique(exp_data_meta_clusterid_clusteridunique))])

In [16]:
M_true = M_true.T

In [17]:
zero_idx = (M_true == 0)
M_true[zero_idx] += 1e-8

In [18]:
Y=exp_data[marker_unique_idx].transpose().astype(float)
Y.shape

(8444, 63)

In [19]:
exp_data_col_patient=pd.Series(exp_data_col).str.slice(start=1,stop=2).astype(int).values
x_data_covariate=np.eye(len(np.unique(exp_data_col_patient)))[exp_data_col_patient-1]
x_data_intercept=np.array([np.ones(Y.shape[0])]).transpose()
x_data_null=np.concatenate([x_data_intercept,x_data_covariate[:,:]],axis=1)
x_data_null.shape

(8444, 6)

In [20]:
x_data_null

array([[1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1.]])

In [21]:
cell_size_factor=pd.read_csv('data/analysis/size_factor_cluster.tsv',sep='\t',header=None)[0].values.astype(float)#.reshape(-1,1)
#cell_size_factor=np.ones_like(cell_size_factor)
cell_size_factor.shape

(8444,)

In [22]:
#pd.Series(exp_data_row).reset_index().set_index(0).loc[marker_unique]
#np.eye(len(marker_unique_idx))
#np.eye(len(marker_unique_idx))
clustername2marker_new_index=[[marker_unique.tolist().index(j) for j in i] for i in list(clustername2markers_new.values())]
M_init=np.array([np.sum(np.eye(len(marker_unique_idx))[idx],axis=0) for idx in clustername2marker_new_index])

In [23]:
marker_onehot=np.array([np.sum(np.eye(len(marker_unique))[[marker_unique.tolist().index(marker) for marker in value]],axis=0) for key,value in clustername2markers_new.items()])
marker_onehot.T.shape

(63, 11)

In [24]:
g, t, p = 63, 11, 3

In [25]:
marker = marker_onehot.T.astype(bool)
M_true[~marker] = 1e-20

In [26]:
import cProfile

In [27]:
print('Start time:',datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
cProfile.run('probs, type_label, Beta, Gamma = classify(libbfgs_func_wrapper,\
                                                        np.log(M_true), \
                                                        np.zeros(shape=(g,p)),\
                                                        Y,\
                                                        x_data_null[:,:3],\
                                                        cell_size_factor,\
                                                        marker_onehot.T,\
                                                        epsilon=1e-3)')
print('End time:',datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

Start time: 2020-05-10 01:10:49




loss:  1419.3145933446788 grad:  143.743324078401
loss:  1277.8065873609346 grad:  138.29552861392568
loss:  1134.5752679331756 grad:  119.3106539469483
loss:  5981.537205970863 grad:  318.74319496034576
loss:  4641.586353789573 grad:  2287.824385922056
loss:  1009.9624865535075 grad:  381.4809917509337
loss:  837.4618623983663 grad:  99.83708576422941
loss:  838.4838616121742 grad:  68.44917750594956
loss:  828.1656271864008 grad:  67.35541239068986
loss:  785.474017759645 grad:  56.244669617247666
loss:  2165.6471926942513 grad:  620.4760994959123
loss:  685.9307174408461 grad:  142.03943410371022
loss:  665.6069458268158 grad:  42.76663131047703
loss:  639.7849409192529 grad:  23.422375121665457
loss:  623.1032585291601 grad:  41.095696073999676
loss:  600.6275296072015 grad:  36.798916517548925
loss:  646.78250364746 grad:  93.12214179924517
loss:  586.9482500435497 grad:  37.19821923861028
loss:  578.185570248811 grad:  25.911810238648595
loss:  574.7555997867861 grad:  10.4560776

In [43]:
np.unique(exp_data_meta_clusterid_clusteridunique.values,return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([3501,  961, 1192,  511,  488,  569,  844,  129,  119,   93,   37]))

In [47]:
np.unique(probs.argmax(axis=1),return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([ 144,  210, 3512,  630,  179, 1217,  425, 1368,  231,  416,  112]))

In [35]:
np.sum(probs.argmax(axis=1)==exp_data_meta_clusterid_clusteridunique.values)

49

In [46]:
#pd.Series(probs.argmax(axis=1)).values,exp_data_meta_clusterid_clusteridunique.values
#Beta.shape,Gamma.shape
probs.argmax(axis=1).shape

(8444,)

In [28]:
#M_output,pi_output,gamma_output =para_cell_poisson(exp_data[marker_unique_idx,:].transpose().astype(float),11)

In [48]:
pd.Series(probs.argmax(axis=1)).value_counts()

2     3509
5     1250
7     1170
8      642
3      626
6      293
1      250
9      247
4      198
0      146
10     113
dtype: int64

In [29]:
a = pd.Series(probs.argmax(axis=1)).value_counts().sort_index()
print(pi_true * 8444)

[3501.  961. 1192.  511.  488.  569.  844.  129.  119.   93.   37.]


In [37]:
pd.Series(probs.argmax(axis=1)).value_counts()

2     3509
5     1250
7     1170
8      642
3      626
6      293
1      250
9      247
4      198
0      146
10     113
dtype: int64

In [38]:
b= -np.sort(-pi_true * 8444)
b


array([3501., 1192.,  961.,  844.,  569.,  511.,  488.,  129.,  119.,
         93.,   37.])

In [39]:
c = pd.DataFrame({'infer':a.sort_values(ascending=False), 'true':b})
c

Unnamed: 0,infer,true
2,3512,3501.0
7,1368,1192.0
5,1217,961.0
3,630,844.0
6,425,569.0
9,416,511.0
8,231,488.0
1,210,129.0
4,179,119.0
0,144,93.0


In [40]:
np.abs(a-b).sum()

NameError: name 'a' is not defined

In [19]:
gamma_output_argmax=np.argmax(gamma_output,axis=1)

In [20]:
output_true_matching=[np.argmax([pearsonr(value,value2)[0] for value2 in M_true]) for value in M_output.transpose()]
output_true_matching

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [21]:
#output_true_matching[gamma_output_argmax]

In [22]:
gamma_output_argmax_match=[output_true_matching[idx] for idx in gamma_output_argmax]

In [23]:
np.sum(exp_data_meta_clusterid_clusteridunique==gamma_output_argmax_match)#cluster#gamma_output_argmax_match

3501

In [29]:
shape(500*M_init),500*M_init

NameError: name 'shape' is not defined

In [67]:
#np.sum(M_output,axis=0),500*np.sum(M_init,axis=0)

In [17]:
M_output,pi_output,gamma_output =para_cell_poisson(exp_data[marker_unique_idx,:].transpose().astype(float),11,M_init=500*M_init)

  M = max_M(N, gamma)


In [18]:
gamma_output_argmax=np.argmax(gamma_output,axis=1)

In [24]:
pi_output=exp_data

In [26]:
gamma_output_argmax,pi_output,gamma_output,M_output

(array([0, 0, 0, ..., 0, 0, 0]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]),
 array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]))

In [21]:
gamma_output_argmax_match=[output_true_matching[idx] for idx in gamma_output_argmax]

In [22]:
np.sum(exp_data_meta_clusterid_clusteridunique==gamma_output_argmax_match)#cluster#gamma_output_argmax_match

3501

In [25]:
pi_true*8444

array([3501.,  961., 1192.,  511.,  488.,  569.,  844.,  129.,  119.,
         93.,   37.])

In [23]:
clustername2marker_new_index

[[44, 29, 31, 20, 19, 57, 48, 26, 18, 60, 45, 12],
 [49, 38, 17, 54, 41, 10],
 [4, 34, 5, 52, 1, 6],
 [8, 62, 53, 21, 28],
 [0, 23, 59, 24, 25, 58, 55, 27, 50],
 [16, 47, 30, 46, 54, 2],
 [11, 14, 61, 42, 15, 54],
 [51, 30, 36, 32, 22, 43, 9, 54],
 [33, 43, 9, 22, 51, 54],
 [13, 39, 37, 40, 17, 54, 41],
 [35, 56, 7, 3]]

In [None]:
len(gamma_output_argmax_match)

In [None]:
exp_data

In [None]:
plt.hist(np.mean(exp_data,axis=0),bins=30)

In [79]:
import matplotlib.pyplot as plt

In [67]:
M_output_,exp_output_,gamma_output_ =para_cell_poisson(exp_data[marker_unique_idx,:].transpose().astype(float),11)

In [68]:
exp_output_

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [None]:
exp_M, exp_pi, exp_gamma =para_cell_poisson(exp_data[marker_unique_idx,:].transpose().astype(float),12,np.concatenate((cluster_M.transpose(),np.random.rand(63,1)),axis=1))

In [272]:
cluster_M.transpose().shape

(63, 11)

In [261]:
np.concatenate((cluster_M.transpose(),np.random.rand(63,1)),axis=1)

(63, 12)

In [286]:
exp_M

array([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, na

In [57]:
exp_data[marker_unique_idx,:].transpose()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 4, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [171]:
np.sum(cluster_M),np.sum(cluster_M,axis=0)#,np.sum(cluster_M,axis=1),

(4720.065861189582,
 array([3.59720846e+01, 4.05614016e-01, 7.64517451e+00, 9.35315611e+00,
        3.20051071e+02, 4.00558579e+00, 1.56640608e+00, 4.14873451e+01,
        1.69758061e+00, 1.38333436e+01, 3.50061236e-01, 3.54607039e+00,
        7.40204813e+00, 2.88848684e+00, 3.93058166e+00, 3.06951415e+00,
        2.88140458e+00, 3.38562697e+00, 3.36705719e-01, 1.95095426e+00,
        1.70622204e+00, 2.22294895e+00, 1.67108366e+01, 2.89717272e+01,
        1.92030051e+01, 1.71755649e+01, 2.54609710e+00, 1.61218396e+01,
        8.17576866e-01, 1.46335117e+00, 4.14096212e+00, 9.50879221e+00,
        1.63503873e+01, 3.84008165e+00, 1.12334049e+01, 2.69210491e+03,
        2.62252119e+00, 1.57487465e+02, 8.86014508e-01, 1.45369442e+02,
        1.22378325e+02, 7.71918302e+02, 1.18697742e+01, 4.91600293e+00,
        3.65661516e+00, 4.31083156e+00, 1.32389152e+01, 4.28280884e+00,
        5.89913146e-01, 1.58563987e+00, 1.07625337e+01, 3.09878849e+01,
        4.78435539e+00, 2.69366503e+00, 5.55

In [177]:
np.sum(cluster_M),np.sum(exp_M),cluster_M.shape,exp_M.shape

(4720.065861189582, 16668.865126492095, (11, 63), (63, 11))

(11, 63)

In [172]:
np.sum(exp_M),np.sum(exp_M,axis=0)

(16668.865126492095,
 array([  31.42786605,   74.82706233,  439.11767356,  553.47661701,
        7187.62      , 3919.05769231, 2446.        ,  618.10380304,
        1128.48333333,  231.02073773,   39.73034114]))

In [139]:
exp_data[marker_unique_idx,:].transpose().astype(float).shape

(8444, 63)

In [201]:
exp_M, exp_pi, exp_gamma =para_cell_poisson(exp_data[marker_unique_idx,:].transpose().astype(float),11)

In [203]:
#exp_M

In [220]:
exp_cluster_true_matching=[np.argmax([pearsonr(value,value2)[0] for value2 in cluster_true]) for value in exp_M.transpose()]
exp_cluster_true_matching
np.unique(exp_cluster_true_matching)

array([ 0,  1,  2,  3,  5,  9, 10])

In [228]:
np.array([exp_cluster_true_matching[i] for i in np.argmax(exp_gamma,axis=1)]).shape

(8444,)

In [233]:
#exp_data_meta_clusterid_unique.values.shape
is_match=exp_data_meta_clusterid_unique.values==np.array([exp_cluster_true_matching[i] for i in np.argmax(exp_gamma,axis=1)])

In [None]:
for i in enumerate(is_match):
    print(i[1])
    if i[1]>100:
        break

In [237]:
#exp_cluster[np.argmax(exp_gamma,axis=1).tolist()]
#np.argmax(exp_gamma,axis=1).tolist()
#exp_data_meta_clusterid_unique.values.shape
print(np.sum(is_match))

6488


In [205]:
np.unique(exp_cluster)

array([ 0,  1,  2,  3,  5,  9, 10])

In [24]:
np.argmax(exp_gamma,axis=1)

array([0, 2, 0, ..., 2, 0, 0])

In [156]:
exp_cluster#.cluster_count

In [241]:
clusterid2clustername.loc[exp_data_meta['res.0.8'].values][1].unique()

array(['LSECs', 'Cholangiocytes', 'Macrophages', 'ab_T_cells', 'NK_cells',
       'gd_T_cells', 'Hepatocytes', 'Mature_B_cells',
       'Hepatic_Stellate_Cells', 'Plasma cells', 'Erythroid_cells'],
      dtype=object)

In [145]:
cluster_M=np.array([np.mean(exp_data[marker_unique_idx,:][:,exp_data_meta_clusterid_unique==i],axis=1) for i in sorted(np.unique(exp_data_meta_clusterid_unique))])
cluster_count=np.array([np.sum(exp_data_meta_clusterid_unique==i) for i in sorted(np.unique(exp_data_meta_clusterid_unique))])

In [77]:
len(clusterid2clustername[1].unique().tolist()),len(np.unique(exp_data_meta_clusterid_unique))

(11, 11)

In [75]:
len(exp_data_meta['res.0.8'].unique())

20

In [113]:
len([pearsonr(value,value2)[0] for value2 in cluster_true])

11

[2, 0, 3, 0, 3, 9, 3, 3, 9, 0, 5]

In [109]:
[np.max([pearsonr(value,value2)[0] for value2 in cluster_true]) for value in exp_M.transpose()]

[0.9888974013249603,
 0.9425146290225916,
 0.9518499770546153,
 0.938134175344367,
 0.9999251553873691,
 0.9999983180014328,
 0.9765514667808816,
 0.9999617569098788,
 0.8502483265742401,
 0.9316606534561576,
 0.9824628222065825]

In [101]:
cluster_true.shape#[0].shape

(11, 63)