1. import libraries

In [1]:
import sys
# Append this user's local site-packages directory to the import search path.
# NOTE(review): this is an absolute, user- and Python-version-specific path;
# on any other machine the entry is useless — prefer a managed environment.
sys.path.append('/home/yq238/.local/lib/python3.11/site-packages')

import os
import pandas as pd
import re
import numpy as np
import glob
from copy import deepcopy
import sys
import phenograph
import scanpy as sc
import itertools
from sklearn.metrics import pairwise_distances

2. read the file

In [2]:
# Load the AnnData object from 'sce_obj.h5ad' (path is relative to the working directory).
adata = sc.read_h5ad('sce_obj.h5ad')

In [3]:
# Show the AnnData summary: dimensions plus the available obs/uns/obsm/layers keys.
adata

AnnData object with n_obs × n_vars = 48028 × 21926
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'mnn_clusters', 'seurat_clusters', 'coarse_annotation', 'ne_subtype', 'tumorstatus', 'cancer_type', 'ident'
    uns: 'X_name'
    obsm: 'INTEGRATED.MNN', 'PCA', 'UMAP.MNN'
    layers: 'logcounts'

In [4]:
# Print the top-left 10 x 10 corner of X, densified for display
# (the .toarray() call suggests X is stored as a sparse matrix).
print(adata.X[:10, :10].toarray())
# Full dimensions of X.
adata.X.shape

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [4. 0. 0. 3. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 3. 2. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 2. 0. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 2. 0. 0. 0. 0. 0. 0.]
 [5. 0. 0. 0. 0. 3. 2. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]


(48028, 21926)

adata.X layout — rows: cells; columns: genes

3. load DEG from bulk seq

In [5]:
# Directory holding one differential-expression table (CSV) per SCLC subtype.
SCLC_subtype_dir = 'bulk_ref/'

# Map each subtype to its signature: among genes with positive logFC,
# the 30 with the smallest adjusted p-value.
ct_dict = {}
for subtype in ['SCLC-A','SCLC-N','SCLC-P']:
    deg_table = pd.read_csv(SCLC_subtype_dir + '%s_DEG.csv' % subtype, sep=',')
    upregulated = deg_table[deg_table.logFC > 0]
    ranked = upregulated.sort_values(by='adj.P.Val')
    ct_dict[subtype] = ranked['Gene'].values[:30]

ct_dict

{'SCLC-A': array(['ASCL1', 'RGS17', 'SLC36A4', 'FOXA2', 'DLL3', 'LRP11', 'DDC',
        'DNALI1', 'COX17', 'ICA1', 'WSB2', 'RIMKLA', 'COX8A', 'DYNLT3',
        'TMEM30B', 'BEX1', 'DEDD2', 'EPCAM', 'SCN3A', 'RUNDC3A', 'SEC11C',
        'PSMG1', 'SYP', 'TMEM198', 'FAM174A', 'CYP51A1', 'SMYD3', 'RIMS2',
        'SYT1', 'NR0B2'], dtype=object),
 'SCLC-N': array(['NEUROD1', 'EDA', 'NHLH1', 'NHLH2', 'HPCA', 'CHRNB4', 'ST7',
        'SLC17A6', 'GLCE', 'SSTR2', 'CNTN2', 'NGB', 'NFIA', 'FNDC5',
        'KIAA1614', 'FAM212B', 'ZDHHC22', 'EYA2', 'DACH1', 'CDC42EP2',
        'SGCD', 'NEUROD4', 'PRDM8', 'GKAP1', 'PTX3', 'TCP10L', 'ALK',
        'LRFN5', 'MAMSTR', 'RTN4R'], dtype=object),
 'SCLC-P': array(['IMP4', 'POU2F3', 'BARX2', 'PVT1', 'ASCL2', 'CPSF4L', 'MOCOS',
        'FOXI1', 'COLCA2', 'GALNT14', 'LINC00639', 'PTPN18', 'EFNA4',
        'VSNL1', 'PLEKHG7', 'DYRK4', 'LINC00675', 'LINC01503', 'RUNX2',
        'FAM124A', 'MYB', 'AZGP1', 'ACADSB', 'LRMP', 'GFI1B', 'GJC3',
        'KYNU', 'ZNF69'

4. scale the counts

In [6]:
# Work on an independent copy so the matrix in `adata` is left untouched.
adata_tmp = adata.copy()

# Standardize each gene to zero mean and unit variance (scanpy's default for
# sc.pp.scale); the copy is modified in place.
# NOTE(review): X appears to contain raw counts while a 'logcounts' layer
# exists — confirm that scaling un-normalized counts is intended.
sc.pp.scale(adata_tmp)

# Dimensions are unchanged by scaling.
adata_tmp.X.shape

(48028, 21926)

5. calculate the signature scores and the distance between each cell and the ideal subtype profiles

In [7]:
# Score every cell for each subtype signature; each call writes a
# 'limma_<subtype>' column into adata_tmp.obs.
for subtype, marker_genes in ct_dict.items():
    sc.tl.score_genes(
        adata_tmp,
        # Pass an ordered, de-duplicated list rather than a set: a set of
        # strings iterates in a different order in every interpreter session,
        # and score_genes expects a sequence of gene names.
        gene_list=list(dict.fromkeys(marker_genes)),
        score_name='limma_' + subtype,
        use_raw=False,
    )

In [8]:
# Collect the three per-cell signature scores into one table (cells x subtypes).
score_columns = ['limma_SCLC-A','limma_SCLC-N','limma_SCLC-P']
SCLCtype_score = adata_tmp.obs[score_columns]
SCLCtype_score

Unnamed: 0,limma_SCLC-A,limma_SCLC-N,limma_SCLC-P
HTA8_1003_1_120726943881004,-0.456687,-0.339013,-0.062342
HTA8_1003_1_126886686124853,-0.372377,-0.191499,2.154015
HTA8_1003_1_131310517143398,-0.507107,-0.191060,-0.430686
HTA8_1003_1_134376318458780,-0.380996,-0.186198,0.318695
HTA8_1003_1_134540478868771,-0.388584,-0.347435,0.603652
...,...,...,...
HTA8_2019_1_241054849649590,-0.260305,-0.001645,0.068433
HTA8_2019_1_241054862980981,-0.266896,-0.021599,0.027428
HTA8_2019_1_241055686643099,-0.754215,-0.124929,0.100250
HTA8_2019_1_241109193116452,-0.546074,-0.050153,0.511456


In [9]:
# Idealized reference profiles: each subtype scores 10 on its own signature
# and 0 on the other two.
subtype_names = ['SCLC-A', 'SCLC-N', 'SCLC-P']
ideal_score = 10
extrema = pd.DataFrame(
    [[ideal_score if row == col else 0 for col in range(len(subtype_names))]
     for row in range(len(subtype_names))],
    index=subtype_names,
    columns=['limma_SCLC-A','limma_SCLC-N','limma_SCLC-P'],
)

extrema

Unnamed: 0,limma_SCLC-A,limma_SCLC-N,limma_SCLC-P
SCLC-A,10,0,0
SCLC-N,0,10,0
SCLC-P,0,0,10


In [10]:
# Euclidean distance from each idealized subtype profile (rows) to every
# cell's score vector (columns).
extrema_to_cells = pairwise_distances(extrema, SCLCtype_score, metric='euclidean')
dists = pd.DataFrame(
    extrema_to_cells,
    index=extrema.index,
    columns=SCLCtype_score.index,
)
dists

Unnamed: 0,HTA8_1003_1_120726943881004,HTA8_1003_1_126886686124853,HTA8_1003_1_131310517143398,HTA8_1003_1_134376318458780,HTA8_1003_1_134540478868771,HTA8_1003_1_166402847061748,HTA8_1003_1_169794778126622,HTA8_1003_1_204816853883636,HTA8_1003_1_230800332613357,HTA8_1003_1_231711851047220,...,HTA8_2019_1_239939129592540,HTA8_2019_1_239997870623156,HTA8_2019_1_240152516680093,HTA8_2019_1_240550042024886,HTA8_2019_1_241046930769837,HTA8_2019_1_241054849649590,HTA8_2019_1_241054862980981,HTA8_2019_1_241055686643099,HTA8_2019_1_241109193116452,HTA8_2019_1_241176079944430
SCLC-A,10.462367,10.595408,10.517666,10.387556,10.411906,10.472252,10.674453,10.425773,10.714415,10.310719,...,10.182955,10.442293,10.417947,10.368033,10.171593,10.260534,10.266956,10.755408,10.558588,10.365257
SCLC-N,10.349282,10.423296,10.212755,10.198301,10.372309,10.710809,10.449693,10.161073,10.747264,10.281263,...,9.97547,10.158429,10.041254,10.251951,9.898795,10.005266,10.02519,10.153477,10.077964,10.156956
SCLC-P,10.078404,7.85715,10.444754,9.690588,9.410795,9.104195,8.18273,9.831016,7.24092,9.804842,...,10.028911,9.977708,10.043589,10.328028,9.934513,9.934977,9.976166,9.929225,9.504377,9.937179


6. select the top 100 cells per subtype for training, based on the distance

In [11]:
# Subtype names, in the row order of the distance table.
cell_types = dists.index

# Training set per subtype: the 100 cells with the smallest distance to that
# subtype's idealized profile.
train_labels = {
    cell_type: dists.loc[cell_type, :].sort_values().iloc[:100].index
    for cell_type in cell_types
}

# Sanity check: prints True if any cell was selected for more than one subtype.
all_training_cells = pd.Index(itertools.chain(*train_labels.values()))
print((all_training_cells.value_counts() > 1).any())

False


7. construct a Markov graph

In [12]:
# Inspect the 'INTEGRATED.MNN' entry of obsm (held as a DataFrame at this point).
adata.obsm['INTEGRATED.MNN']

Unnamed: 0,mnn_1,mnn_2,mnn_3,mnn_4,mnn_5,mnn_6,mnn_7,mnn_8,mnn_9,mnn_10,...,mnn_41,mnn_42,mnn_43,mnn_44,mnn_45,mnn_46,mnn_47,mnn_48,mnn_49,mnn_50
HTA8_1003_1_120726943881004,0.208995,-0.051135,0.010350,-0.083555,0.004949,-0.102148,-0.151026,0.107580,0.075929,-0.047318,...,-0.001851,-0.009781,0.014244,-0.003969,-0.003411,-0.084930,0.022380,-0.036099,0.000050,-0.090091
HTA8_1003_1_126886686124853,0.190186,-0.205525,0.234104,0.065813,0.065314,-0.066170,0.015803,0.122887,0.149292,-0.018248,...,0.020086,0.010152,0.038750,0.070343,0.038891,-0.088280,0.004534,0.033058,0.002011,-0.088219
HTA8_1003_1_131310517143398,0.263072,-0.042293,0.005757,-0.037254,0.018395,-0.104763,-0.130438,0.137244,0.084946,-0.062313,...,-0.004456,-0.013692,0.040073,0.011264,0.010997,-0.084357,0.015468,-0.007043,-0.010693,-0.081617
HTA8_1003_1_134376318458780,0.180632,-0.081223,-0.002808,0.012161,0.049420,-0.076865,-0.113554,0.129909,0.076138,-0.036847,...,-0.041713,-0.014694,0.002115,-0.010777,0.021292,-0.077130,0.013104,0.003589,-0.021248,-0.081510
HTA8_1003_1_134540478868771,0.226421,-0.040659,-0.012340,0.010819,0.026155,-0.080275,-0.128173,0.136410,0.054649,-0.053557,...,-0.019069,0.007202,0.017209,-0.003094,0.015629,-0.069047,0.016159,-0.014650,-0.020240,-0.081859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HTA8_2019_1_241054849649590,0.168213,-0.214266,0.088872,0.071504,0.076461,-0.070074,-0.071109,0.100440,0.096767,-0.040789,...,-0.065870,0.012651,0.021160,0.018745,0.023497,-0.044266,0.039844,-0.026704,-0.045575,-0.102644
HTA8_2019_1_241054862980981,0.189166,-0.158267,0.021934,0.136567,0.068673,-0.041429,-0.101259,0.100517,0.091631,-0.057058,...,-0.038944,-0.040910,0.054538,0.022402,-0.000683,-0.040297,0.052865,-0.045071,-0.036864,-0.131540
HTA8_2019_1_241055686643099,0.119395,-0.162810,0.040153,-0.117382,0.043490,-0.075757,-0.124231,0.123902,0.142309,-0.041625,...,-0.013144,0.035998,0.001817,0.000165,-0.012845,-0.065431,0.027012,0.016129,0.033443,-0.042919
HTA8_2019_1_241109193116452,0.098949,-0.242575,0.140568,-0.116895,0.096338,-0.068814,-0.078199,0.108482,0.183923,-0.040607,...,-0.020956,-0.012554,0.001675,-0.029079,-0.005868,-0.044896,0.025031,0.016938,0.019530,-0.039404


In [13]:
# Store the 'INTEGRATED.MNN' embedding as a plain NumPy array instead of a DataFrame.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely; `.values` raised AttributeError once the entry was already an array.
adata.obsm['INTEGRATED.MNN'] = np.asarray(adata.obsm['INTEGRATED.MNN'])

In [14]:
# Confirm that the entry is now a plain array.
adata.obsm['INTEGRATED.MNN']

array([[ 2.08995473e-01, -5.11349846e-02,  1.03498489e-02, ...,
        -3.60991742e-02,  4.95355499e-05, -9.00909404e-02],
       [ 1.90185619e-01, -2.05525457e-01,  2.34104405e-01, ...,
         3.30576505e-02,  2.01124478e-03, -8.82192970e-02],
       [ 2.63071707e-01, -4.22929267e-02,  5.75748002e-03, ...,
        -7.04347051e-03, -1.06931377e-02, -8.16174843e-02],
       ...,
       [ 1.19395405e-01, -1.62810491e-01,  4.01531470e-02, ...,
         1.61292555e-02,  3.34426114e-02, -4.29193441e-02],
       [ 9.89489492e-02, -2.42574728e-01,  1.40567934e-01, ...,
         1.69379470e-02,  1.95303119e-02, -3.94041459e-02],
       [ 1.77466116e-01, -1.12423447e-01,  7.68444027e-03, ...,
        -4.33064552e-02, -2.51342617e-03, -1.01130658e-01]])

In [15]:
# Build the nearest-neighbour graph on the 'INTEGRATED.MNN' representation
# (use_rep) rather than on X; the result is stored on `adata` in place
# (uns['neighbors'], obsp['distances'], obsp['connectivities']).
sc.pp.neighbors(adata, n_pcs=50, use_rep = 'INTEGRATED.MNN')

In [16]:
# Check which fields the neighbour computation added.
adata

AnnData object with n_obs × n_vars = 48028 × 21926
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'mnn_clusters', 'seurat_clusters', 'coarse_annotation', 'ne_subtype', 'tumorstatus', 'cancer_type', 'ident'
    uns: 'X_name', 'neighbors'
    obsm: 'INTEGRATED.MNN', 'PCA', 'UMAP.MNN'
    layers: 'logcounts'
    obsp: 'distances', 'connectivities'

In [17]:
# Compute the diffusion map from the neighbour graph; adds obsm['X_diffmap']
# and uns['diffmap_evals'] to `adata`.
sc.tl.diffmap(adata)

In [18]:
# Check which fields the diffusion map added.
adata

AnnData object with n_obs × n_vars = 48028 × 21926
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'mnn_clusters', 'seurat_clusters', 'coarse_annotation', 'ne_subtype', 'tumorstatus', 'cancer_type', 'ident'
    uns: 'X_name', 'neighbors', 'diffmap_evals'
    obsm: 'INTEGRATED.MNN', 'PCA', 'UMAP.MNN', 'X_diffmap'
    layers: 'logcounts'
    obsp: 'distances', 'connectivities'

In [19]:
num_dcs = 15
dm_ev = pd.DataFrame(adata.obsm['X_diffmap'], index = adata_tmp.obs.index).loc[:,:num_dcs]
train = np.empty((len(cell_types),),dtype=object)

In [20]:
# Inspect the diffusion-component table (cells x components).
dm_ev

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
HTA8_1003_1_120726943881004,-0.004832,0.003585,-0.003168,0.000936,-0.000292,0.001368,0.000475,0.001802,-0.003681,-0.000105,0.000922,-0.001239,-0.002221,0.000414,-0.001327
HTA8_1003_1_126886686124853,-0.004916,-0.008731,0.000420,0.002469,0.010724,-0.000467,0.003946,-0.004520,-0.005564,0.008015,0.000032,-0.005315,0.005795,0.008690,0.004902
HTA8_1003_1_131310517143398,-0.005115,0.004998,0.000645,0.000057,0.001557,0.000314,0.000823,0.001174,-0.004233,-0.002412,0.004242,-0.000697,-0.005589,0.000371,0.002239
HTA8_1003_1_134376318458780,-0.004836,-0.000230,0.000969,0.001171,-0.004414,-0.000788,0.001361,0.002519,0.000496,0.003058,0.000117,0.000961,0.001323,-0.000218,-0.000790
HTA8_1003_1_134540478868771,-0.004822,0.004601,0.002282,0.000196,0.001182,-0.001070,0.001405,0.001553,-0.001833,-0.001169,0.005879,-0.001027,-0.005447,-0.000028,0.003439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HTA8_2019_1_241054849649590,-0.004460,-0.003679,0.002728,0.004830,-0.005367,-0.001414,0.006166,-0.003716,0.002928,-0.008420,-0.008756,-0.005423,0.001043,0.001581,0.002951
HTA8_2019_1_241054862980981,-0.004839,-0.002440,0.002566,0.004347,-0.003598,-0.001131,0.009438,0.003129,0.003751,0.000260,0.005359,-0.000148,0.010591,-0.003784,-0.002507
HTA8_2019_1_241055686643099,-0.004736,-0.004275,-0.001242,-0.003296,-0.002554,-0.000942,-0.002822,-0.000649,-0.000689,-0.002066,-0.000079,-0.004747,-0.000586,-0.004336,-0.000633
HTA8_2019_1_241109193116452,-0.004988,-0.006301,-0.000820,-0.002830,-0.002899,-0.001488,-0.008404,-0.003972,-0.002339,-0.002198,0.006646,-0.005188,0.000916,-0.004659,-0.001242


In [21]:
# Fill each slot with the diffusion-component rows of that subtype's selected
# training cells.
for slot, cell_type in enumerate(cell_types):
    train[slot] = dm_ev.loc[train_labels[cell_type], :]
# Every cell (training cells included) is classified.
test = dm_ev

In [22]:
# Inspect the table of cells to classify (same object as dm_ev).
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
HTA8_1003_1_120726943881004,-0.004832,0.003585,-0.003168,0.000936,-0.000292,0.001368,0.000475,0.001802,-0.003681,-0.000105,0.000922,-0.001239,-0.002221,0.000414,-0.001327
HTA8_1003_1_126886686124853,-0.004916,-0.008731,0.000420,0.002469,0.010724,-0.000467,0.003946,-0.004520,-0.005564,0.008015,0.000032,-0.005315,0.005795,0.008690,0.004902
HTA8_1003_1_131310517143398,-0.005115,0.004998,0.000645,0.000057,0.001557,0.000314,0.000823,0.001174,-0.004233,-0.002412,0.004242,-0.000697,-0.005589,0.000371,0.002239
HTA8_1003_1_134376318458780,-0.004836,-0.000230,0.000969,0.001171,-0.004414,-0.000788,0.001361,0.002519,0.000496,0.003058,0.000117,0.000961,0.001323,-0.000218,-0.000790
HTA8_1003_1_134540478868771,-0.004822,0.004601,0.002282,0.000196,0.001182,-0.001070,0.001405,0.001553,-0.001833,-0.001169,0.005879,-0.001027,-0.005447,-0.000028,0.003439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HTA8_2019_1_241054849649590,-0.004460,-0.003679,0.002728,0.004830,-0.005367,-0.001414,0.006166,-0.003716,0.002928,-0.008420,-0.008756,-0.005423,0.001043,0.001581,0.002951
HTA8_2019_1_241054862980981,-0.004839,-0.002440,0.002566,0.004347,-0.003598,-0.001131,0.009438,0.003129,0.003751,0.000260,0.005359,-0.000148,0.010591,-0.003784,-0.002507
HTA8_2019_1_241055686643099,-0.004736,-0.004275,-0.001242,-0.003296,-0.002554,-0.000942,-0.002822,-0.000649,-0.000689,-0.002066,-0.000079,-0.004747,-0.000586,-0.004336,-0.000633
HTA8_2019_1_241109193116452,-0.004988,-0.006301,-0.000820,-0.002830,-0.002899,-0.001488,-0.008404,-0.003972,-0.002339,-0.002198,0.006646,-0.005188,0.000916,-0.004659,-0.001242


In [23]:
# Inspect the object array holding one training DataFrame per subtype.
train

array([                                   0         1         2         3         4   \
       HTA8_2009_1_170728683956509 -0.004582 -0.003428 -0.001724 -0.003752  0.000468
       HTA8_2006_1_156994702593244 -0.004683 -0.003021  0.001384  0.002025 -0.003161
       HTA8_2018_1_197280153327411 -0.004963 -0.003038 -0.002157 -0.004485 -0.004206
       HTA8_2016_1_195548226734886 -0.003987 -0.000675  0.000066 -0.000743 -0.004050
       HTA8_2018_1_231365389270428 -0.004220 -0.003609 -0.001600 -0.004712 -0.003284
       ...                               ...       ...       ...       ...       ...
       HTA8_2011_1_165380330309356 -0.004294 -0.002847 -0.002249 -0.004866 -0.001243
       HTA8_2016_1_197137720629596 -0.003814 -0.000191  0.001143  0.001930 -0.004121
       HTA8_2009_1_239589205063925 -0.004601  0.000647 -0.000251  0.000645 -0.003298
       HTA8_2004_1_169156975913379 -0.004313 -0.002250  0.002398  0.003896 -0.004916
       HTA8_2009_1_165180794846621 -0.004545 -0.001238 -0.0014

In [24]:
# Classify every row of `test` against the per-subtype training sets, using
# k=30 neighbours and the Euclidean metric.
# NOTE(review): element [1] of the result is used later as a cells x classes
# matrix. Despite the "pval" name, the saved values sum to ~1 per cell, so they
# look like class probabilities — confirm against the phenograph.classify docs.
SCLCtype_pval = phenograph.classify(train, test, k=30, metric='euclidean')

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm


In [76]:
#adata.obs = adata.obs.drop(adata.obs.columns[adata.obs.columns.str.contains('pval_')], axis=1)

In [77]:
# Inspect the per-cell metadata before the classification columns are added.
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,mnn_clusters,seurat_clusters,coarse_annotation,ne_subtype,tumorstatus,cancer_type,ident
HTA8_1003_1_120726943881004,HTA8_1003_1,17782.0,3547,4.105275,41,41,Epithelial,neuroendocrine,cancer,SCLC,41
HTA8_1003_1_126886686124853,HTA8_1003_1,16854.0,3411,0.130533,25,25,Epithelial,neuroendocrine,cancer,SCLC,25
HTA8_1003_1_131310517143398,HTA8_1003_1,23000.0,3789,3.160870,41,41,Epithelial,neuroendocrine,cancer,SCLC,41
HTA8_1003_1_134376318458780,HTA8_1003_1,10301.0,2611,8.300165,13,13,Epithelial,neuroendocrine,cancer,SCLC,13
HTA8_1003_1_134540478868771,HTA8_1003_1,19531.0,3630,6.297681,9,9,Epithelial,neuroendocrine,cancer,SCLC,9
...,...,...,...,...,...,...,...,...,...,...,...
HTA8_2019_1_241054849649590,HTA8_2019_1,2762.0,1184,2.208545,22,22,Epithelial,neuroendocrine,cancer,SCLC,22
HTA8_2019_1_241054862980981,HTA8_2019_1,4898.0,1056,6.941609,25,25,Epithelial,neuroendocrine,cancer,SCLC,25
HTA8_2019_1_241055686643099,HTA8_2019_1,50367.0,7479,2.253460,32,32,Epithelial,neuroendocrine,cancer,SCLC,32
HTA8_2019_1_241109193116452,HTA8_2019_1,29610.0,6316,0.385005,11,11,Epithelial,neuroendocrine,cancer,SCLC,11


In [25]:
# Attach the per-class matrix returned by phenograph.classify to adata.obs,
# one 'pval_<subtype>' column per 'limma_<subtype>' score column.
pval_columns = [re.sub('limma_','pval_',i) for i in SCLCtype_score.columns]
pval_frame = pd.DataFrame(SCLCtype_pval[1], index=dm_ev.index, columns=pval_columns)
# Remove 'pval_*' columns left by an earlier execution first: a plain concat
# appends duplicates (or keeps stale columns) each time this cell is re-run.
stale_columns = adata.obs.columns[adata.obs.columns.str.startswith('pval_')]
adata.obs = pd.concat([adata.obs.drop(columns=stale_columns), pval_frame], axis=1)

In [79]:
# Inspect the metadata with the new 'pval_*' columns.
# NOTE(review): the saved output of this cell lists a 'pval_SCLC-Y' column that
# none of the visible code creates — it looks stale; restart the kernel and re-run.
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,mnn_clusters,seurat_clusters,coarse_annotation,ne_subtype,tumorstatus,cancer_type,ident,pval_SCLC-A,pval_SCLC-N,pval_SCLC-P,pval_SCLC-Y
HTA8_1003_1_120726943881004,HTA8_1003_1,17782.0,3547,4.105275,41,41,Epithelial,neuroendocrine,cancer,SCLC,41,0.427305,0.095148,0.002589,0.474958
HTA8_1003_1_126886686124853,HTA8_1003_1,16854.0,3411,0.130533,25,25,Epithelial,neuroendocrine,cancer,SCLC,25,0.158880,0.031020,0.709454,0.100645
HTA8_1003_1_131310517143398,HTA8_1003_1,23000.0,3789,3.160870,41,41,Epithelial,neuroendocrine,cancer,SCLC,41,0.233895,0.118090,0.001127,0.646888
HTA8_1003_1_134376318458780,HTA8_1003_1,10301.0,2611,8.300165,13,13,Epithelial,neuroendocrine,cancer,SCLC,13,0.729246,0.165876,0.005084,0.099794
HTA8_1003_1_134540478868771,HTA8_1003_1,19531.0,3630,6.297681,9,9,Epithelial,neuroendocrine,cancer,SCLC,9,0.225343,0.134849,0.001144,0.638665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HTA8_2019_1_241054849649590,HTA8_2019_1,2762.0,1184,2.208545,22,22,Epithelial,neuroendocrine,cancer,SCLC,22,0.119230,0.217972,0.401795,0.261002
HTA8_2019_1_241054862980981,HTA8_2019_1,4898.0,1056,6.941609,25,25,Epithelial,neuroendocrine,cancer,SCLC,25,0.979658,0.006836,0.001525,0.011981
HTA8_2019_1_241055686643099,HTA8_2019_1,50367.0,7479,2.253460,32,32,Epithelial,neuroendocrine,cancer,SCLC,32,0.191768,0.127223,0.584351,0.096657
HTA8_2019_1_241109193116452,HTA8_2019_1,29610.0,6316,0.385005,11,11,Epithelial,neuroendocrine,cancer,SCLC,11,0.140693,0.162819,0.644861,0.051627


In [27]:
# Label each cell with the subtype whose 'pval_' column holds the largest value.
subtype_pval_cols = ['pval_SCLC-A', 'pval_SCLC-N', 'pval_SCLC-P']
best_column = adata.obs.loc[:, subtype_pval_cols].idxmax(axis=1)
adata.obs.loc[:,'SCLC_subtype'] = best_column.str.replace('pval_','')

In [28]:
# Inspect the metadata including the assigned 'SCLC_subtype' label.
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,mnn_clusters,seurat_clusters,coarse_annotation,ne_subtype,tumorstatus,cancer_type,ident,pval_SCLC-A,pval_SCLC-N,pval_SCLC-P,SCLC_subtype
HTA8_1003_1_120726943881004,HTA8_1003_1,17782.0,3547,4.105275,41,41,Epithelial,neuroendocrine,cancer,SCLC,41,0.755753,0.238982,0.005265,SCLC-A
HTA8_1003_1_126886686124853,HTA8_1003_1,16854.0,3411,0.130533,25,25,Epithelial,neuroendocrine,cancer,SCLC,25,0.182187,0.039463,0.778350,SCLC-P
HTA8_1003_1_131310517143398,HTA8_1003_1,23000.0,3789,3.160870,41,41,Epithelial,neuroendocrine,cancer,SCLC,41,0.678634,0.317393,0.003973,SCLC-A
HTA8_1003_1_134376318458780,HTA8_1003_1,10301.0,2611,8.300165,13,13,Epithelial,neuroendocrine,cancer,SCLC,13,0.796868,0.197050,0.006081,SCLC-A
HTA8_1003_1_134540478868771,HTA8_1003_1,19531.0,3630,6.297681,9,9,Epithelial,neuroendocrine,cancer,SCLC,9,0.656599,0.339486,0.003915,SCLC-A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HTA8_2019_1_241054849649590,HTA8_2019_1,2762.0,1184,2.208545,22,22,Epithelial,neuroendocrine,cancer,SCLC,22,0.149894,0.273870,0.576236,SCLC-P
HTA8_2019_1_241054862980981,HTA8_2019_1,4898.0,1056,6.941609,25,25,Epithelial,neuroendocrine,cancer,SCLC,25,0.986605,0.011512,0.001884,SCLC-A
HTA8_2019_1_241055686643099,HTA8_2019_1,50367.0,7479,2.253460,32,32,Epithelial,neuroendocrine,cancer,SCLC,32,0.220229,0.148160,0.631611,SCLC-P
HTA8_2019_1_241109193116452,HTA8_2019_1,29610.0,6316,0.385005,11,11,Epithelial,neuroendocrine,cancer,SCLC,11,0.156255,0.174982,0.668763,SCLC-P


In [31]:
# Write the names (obs index) of all cells labelled SCLC-P to a text file,
# one name per line.
is_sclc_p = adata.obs['SCLC_subtype'] == 'SCLC-P'
cell_names = adata.obs.index[is_sclc_p]

with open('SCLC-P_cells.txt', 'w') as out_file:
    out_file.writelines(cell_name + '\n' for cell_name in cell_names)