In [19]:
# Author: Antti Kiviaho
# Date: 10.5.2023

import os 
# Work from the project root: the relative paths used further down ('<ctype>.h5ad',
# 'sc_modules/data/...', the integrated pickle and the output .h5ad) are resolved
# against this directory.
# NOTE(review): hard-coded, machine-specific absolute path; adjust when running elsewhere.
os.chdir('/lustre/scratch/kiviaho/prostate_spatial')
import numpy as np
import scanpy as sc
import pandas as pd
# NOTE(review): this import follows os.chdir, presumably because `scripts` is a
# project-local package that is found via the working directory; confirm before
# moving it above the chdir call.
from scripts.utils import load_from_pickle

## Function definitions

In [20]:

def calculate_matrix_orders(arr):
    """Rank the entries of each row of a 2-D array in descending order.

    Parameters
    ----------
    arr : numpy.ndarray
        Two-dimensional numeric array.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``arr``. Entry ``[i, j]`` is the
        position of ``arr[i, j]`` when row ``i`` is ordered from largest to
        smallest value (0 marks the largest entry of the row).
    """
    # Column indices that would put every row into descending order.
    descending = np.argsort(-arr, axis=1)

    # Each row of `descending` is a permutation of the column indices, so a second
    # argsort inverts it: the result tells, for every column, where it landed.
    ranks = np.argsort(descending, axis=1)

    return ranks.astype(int)



def get_genes_by_factor(res):
    """Collect, for every NMF factor, the leading genes that are specific to it.

    Parameters
    ----------
    res : mapping
        Must provide ``res['gene_weights']`` (a 2-D weight array that is transposed
        here so that rows index genes and columns index factors, which is how the
        loops below address it) and ``res['genes']``, whose ``.index`` supplies the
        gene names in row order.

    Returns
    -------
    dict
        Maps ``'factor<k>'`` to a list of gene names. Genes are taken in order of
        decreasing weight within factor ``k`` and the list ends at the first gene
        whose highest-ranked factor is not ``k``.
    """
    weights = res['gene_weights'].T
    gene_names = list(res['genes'].index)

    # Rank of every gene within each factor (0 = heaviest gene of that factor) ...
    rank_within_factor = calculate_matrix_orders(weights.T).T
    # ... and rank of every factor within each gene (0 = the gene's top factor).
    rank_within_gene = calculate_matrix_orders(weights)

    genes_by_factors = {}
    n_factors = weights.shape[1]
    for factor in range(n_factors):
        # Gene indices ordered from the heaviest to the lightest gene of this factor.
        heaviest_first = np.argsort(rank_within_factor[:, factor])

        specific_genes = []
        for gene_idx in heaviest_first:
            # Stop at the first gene that some other factor weights more strongly.
            if rank_within_gene[gene_idx, factor] != 0:
                break
            specific_genes.append(gene_names[gene_idx])

        genes_by_factors['factor' + str(factor)] = specific_genes

    return genes_by_factors

def score_modules(ctype, n_comps):
    """Score the NMF gene modules of one cell type and label every cell with its top module.

    Parameters
    ----------
    ctype : str
        Cell type name; used to build both the input file name ``<ctype>.h5ad``
        and the name of the pickled NMF results.
    n_comps : int
        Number of NMF components; part of the pickled results' file name.

    Returns
    -------
    tuple
        ``(obs, genes_dict)`` where ``obs`` is a copy of the cell metadata with one
        score column per factor plus a ``'max_factor'`` column naming the
        highest-scoring factor, and ``genes_dict`` is the factor -> gene list
        mapping produced by ``get_genes_by_factor``.
    """
    adata_ctype = sc.read_h5ad(ctype + '.h5ad')
    # Score on the 'counts' layer, whatever X held in the file.
    adata_ctype.X = adata_ctype.layers['counts'].copy()

    results_path = 'sc_modules/data/'+ctype+'_'+str(n_comps)+'_modules_nsnmf_results.pickle'
    genes_dict = get_genes_by_factor(load_from_pickle(results_path))

    # One score column per factor, stored in obs under the factor's key.
    for factor_name, factor_genes in genes_dict.items():
        sc.tl.score_genes(adata_ctype, gene_list=factor_genes, score_name=factor_name)

    # For every cell, the name of the score column holding its highest score.
    factor_columns = list(genes_dict)
    adata_ctype.obs['max_factor'] = adata_ctype.obs[factor_columns].idxmax(axis=1)

    # Only the metadata is handed back; the AnnData object is dropped on return.
    return adata_ctype.obs.copy(), genes_dict

def summarize_factor(df, factor, g_dict):
    """Print a short report about the cells whose top-scoring factor is `factor`.

    Parameters
    ----------
    df : pandas.DataFrame
        Cell metadata with the columns 'max_factor', 'sample', 'dataset' and
        'phenotype'. 'sample' must be categorical, because the total number of
        samples is read from its categories.
    factor : str
        Value of 'max_factor' to report on; also the key looked up in `g_dict`.
    g_dict : dict
        Maps factor names to ordered gene lists.

    Returns
    -------
    None
        The phenotype counts, sample/dataset coverage and up to 20 leading genes
        are printed.
    """
    in_factor = df[df['max_factor'] == factor]

    n_present = len(np.unique(in_factor['sample']))
    n_dsets = len(np.unique(in_factor['dataset']))
    # Denominator: every sample category known to the frame, not only those present.
    n_total = len(df['sample'].cat.categories)

    print('Phenotype distribution:')
    print(in_factor['phenotype'].value_counts())
    print('')

    coverage = ('Present in '+str(n_present)+' out of '+str(n_total)+
    ' samples and ' + str(n_dsets) + ' datasets.')
    print(coverage)
    print('')

    print('Top 20 contributing genes:')
    top_genes = g_dict[factor][:20]
    for gene in top_genes:
        print(gene)
    


## Loading the NMF gene lists and scoring them

In [21]:
# Number of NMF components per cell type. For each cell type the value is the highest
# n that yields at least 20 unique genes in each component.
# Very similar to the criterion used in Barkley et al. Nat Genetics 2022.
# The keys are passed to score_modules() as `ctype`, where they become part of the
# '<ctype>.h5ad' and NMF-results file names; the values are passed as `n_comps`.

nmf_components_by_ctype = {
    'T_cell': 4,
    'SMC': 7,
    'Myeloid': 6,
    'Mast': 2,
    'Fibroblast': 8,
    'Epithelial': 5,
    'Endothelial': 7,
    'B_cell': 4
    }

# Create an (initially empty) dataframe that collects the final annotations;
# each cell-type section below appends its relabelled 'max_factor' values with pd.concat.
final_annotations = pd.DataFrame()

### Epithelial

In [22]:
scores, genes_dict = score_modules('Epithelial',nmf_components_by_ctype['Epithelial'])
scores['max_factor'].value_counts()

factor4    32370
factor3    21460
factor1    13243
factor2     9810
factor0     5203
Name: max_factor, dtype: int64

In [23]:
summarize_factor(scores,'factor0',genes_dict)

Phenotype distribution:
PCa       3297
normal    1676
CRPC       230
Name: phenotype, dtype: int64

Present in 95 out of 98 samples and 7 datasets.

Top 20 contributing genes:
GPX3
TGM2
FLRT3
C1R
CYR61
SERPINF1
SFRP1
DKK3
SCUBE2
SLC14A1
SDC2
C1S
HLA-DPB1
EGR1
MAP1B
IFITM3
HLA-DRA
CFTR
NNMT
C3


In [24]:
summarize_factor(scores,'factor1',genes_dict)

Phenotype distribution:
PCa       7586
CRPC      5282
normal     375
Name: phenotype, dtype: int64

Present in 47 out of 98 samples and 7 datasets.

Top 20 contributing genes:
PPP1R14B
S100A14
S100A2
S100P
SDCBP2
SERPINB1
PLIN2
TINAGL1
FOSL1
KRT17
LAMC2
CYP1B1
PIM1
ODC1
CAV1
GPX2
PHLDA1
TXNRD1
EMP3
MYC


In [25]:
summarize_factor(scores,'factor2',genes_dict)

Phenotype distribution:
PCa       9142
normal     388
CRPC       280
Name: phenotype, dtype: int64

Present in 80 out of 98 samples and 7 datasets.

Top 20 contributing genes:
TMC5
FNIP2
GULP1
MUC3A
ERG
TMEM238
COL9A2
PCA3
VSTM2L
AMACR
TMEM178A
PDLIM3
PEX10
PRSS23
PTP4A3
ASRGL1
HPN
GLRX
CRLS1
EEF1A2


In [26]:
summarize_factor(scores,'factor3',genes_dict)

Phenotype distribution:
PCa       14263
normal     4541
CRPC       2656
Name: phenotype, dtype: int64

Present in 95 out of 98 samples and 7 datasets.

Top 20 contributing genes:
CLDN4
CRABP2
DDIT4
IER2
ASS1
WFDC2
RHOV
CYP4B1
MTRNR2L12
ID3
CXCL17
MUC20
GABRP
HES1
SLPI
NFKBIA
FOS
ARRDC2
MDK
RARRES3


In [27]:
summarize_factor(scores,'factor4',genes_dict)

Phenotype distribution:
PCa       19664
normal    12102
CRPC        604
Name: phenotype, dtype: int64

Present in 94 out of 98 samples and 7 datasets.

Top 20 contributing genes:
KLK3
KLK2
ACPP
IDH1
RAMP1
NAAA
DEGS1
PAK1IP1
DPP4
KLK4
LCP1
P4HB
NIPAL3
ANPEP
SCD
ABCC4
LIFR
IQGAP2
ANTXR2
MSMB


In [28]:
# Relabel each cell's top-scoring factor with its curated epithelial subtype name
# (these are values of the 'max_factor' column, not column names).
factor_labels = {
    'factor0': 'secreting epithelial',
    'factor1': 'advanced malignant epithelial',
    'factor2': 'early malignant epithelial',
    'factor3': 'lipid response epithelial',
    'factor4': 'healthy epithelial',
}
scores['max_factor'] = scores['max_factor'].replace(factor_labels)
# Append this cell type's labels to the running annotation table.
# Note: running this cell a second time appends the same cells again.
final_annotations = pd.concat([final_annotations,scores['max_factor']],axis=0)

### T cell

In [29]:
scores, genes_dict = score_modules('T_cell',nmf_components_by_ctype['T_cell'])
scores['max_factor'].value_counts()

factor0    52596
factor2    46625
factor3     6596
factor1     2380
Name: max_factor, dtype: int64

In [30]:
summarize_factor(scores,'factor0',genes_dict)

Phenotype distribution:
normal    31826
PCa       20485
CRPC        285
Name: phenotype, dtype: int64

Present in 86 out of 95 samples and 7 datasets.

Top 20 contributing genes:
CCL5
KLRD1
CTSW
ID2
GZMH
CCL4
GZMB
NKG7
KLRC1
GZMA
PRF1
GNLY
CXCR4
CLIC3
PLAC8
LGALS1
SPON2
CD69
GADD45B
PFN1


In [31]:
summarize_factor(scores,'factor1',genes_dict)

Phenotype distribution:
PCa       1858
normal     327
CRPC       195
Name: phenotype, dtype: int64

Present in 76 out of 95 samples and 7 datasets.

Top 20 contributing genes:
DOK2
IL32
YBX1
HLA-DRB1
ODC1
GBP5
ITM2A
NAMPT
JUND
HSPA5
ANXA1
DUSP2
HSPE1
CD81
COTL1
CARD16
GAPDH
GSPT1
PRNP
UBE2S


In [32]:
summarize_factor(scores,'factor2',genes_dict)

Phenotype distribution:
PCa       30588
normal    14109
CRPC       1928
Name: phenotype, dtype: int64

Present in 94 out of 95 samples and 7 datasets.

Top 20 contributing genes:
FOS
DNAJB1
JUN
HSPA1B
HSPA1A
DUSP1
NR4A1
RPLP0
NFKBIA
ZFAND2A
HSPA8
PPP1R15A
CDKN1A
HSPA6
DNAJA1
UBC
HSP90AB1
HSPB1
RNASEK
SLBP


In [33]:
summarize_factor(scores,'factor3',genes_dict)

Phenotype distribution:
PCa       3371
normal    3026
CRPC       199
Name: phenotype, dtype: int64

Present in 87 out of 95 samples and 7 datasets.

Top 20 contributing genes:
MRPL1
ATP1B3
TUBA1B
HLA-DRA
NEAT1
ERRFI1
IL7R
JMY
SMC4
MTRNR2L1
CREM
CD27
HMGB2
MACF1
MT2A
CHN1
DUSP4
ZFAND5
BIRC3
AIF1


In [34]:
# Relabel each cell's top-scoring factor with its curated T cell / NK subtype name
# (these are values of the 'max_factor' column, not column names).
factor_labels = {
    'factor0': 'NK cells',
    'factor1': 'cytokine signalling T cells',
    'factor2': 'activated T cells',
    'factor3': 'helper T cells',
}
scores['max_factor'] = scores['max_factor'].replace(factor_labels)
# Append this cell type's labels to the running annotation table.
# Note: running this cell a second time appends the same cells again.
final_annotations = pd.concat([final_annotations,scores['max_factor']],axis=0)

### Myeloid

In [35]:
scores, genes_dict = score_modules('Myeloid',nmf_components_by_ctype['Myeloid'])
scores['max_factor'].value_counts()

factor3    10718
factor1     8424
factor2     6255
factor0     6215
factor5     1588
factor4      434
Name: max_factor, dtype: int64

In [36]:
summarize_factor(scores,'factor0',genes_dict)

Phenotype distribution:
normal    3846
PCa       2257
CRPC       112
Name: phenotype, dtype: int64

Present in 52 out of 98 samples and 6 datasets.

Top 20 contributing genes:
LTB
CD79A
CD69
BCAS2
CCL5
CD3D
CD3E
HIST1H1C
SPIB
IL32
CACYBP
MYC
CD2
HOPX
FKBP11
CD7
ID3
CTSW
MARCKSL1
NKG7


In [37]:
summarize_factor(scores,'factor1',genes_dict)

Phenotype distribution:
PCa       5024
normal    2445
CRPC       955
Name: phenotype, dtype: int64

Present in 98 out of 98 samples and 7 datasets.

Top 20 contributing genes:
C1QA
C1QC
C1QB
TMEM176B
CD14
HLA-DRB5
PLD3
TMEM176A
FOLR2
SLC40A1
LGMN
CD163
PSAP
HLA-DRA
LIPA
TREM2
HLA-DRB1
PLTP
CPM
CTSB


In [38]:
summarize_factor(scores,'factor2',genes_dict)

Phenotype distribution:
PCa       3374
normal    2664
CRPC       217
Name: phenotype, dtype: int64

Present in 89 out of 98 samples and 7 datasets.

Top 20 contributing genes:
COTL1
BCL2A1
RNASEK
GABARAP
ACTB
NFKBIA
IL1B
LGALS2
ATP6V0C
VIM
S100A6
FCN1
G0S2
SOD2
PPA1
LYZ
IFI30
PPIF
S100A10
AREG


In [39]:
summarize_factor(scores,'factor3',genes_dict)

Phenotype distribution:
PCa       7483
normal    2731
CRPC       504
Name: phenotype, dtype: int64

Present in 94 out of 98 samples and 7 datasets.

Top 20 contributing genes:
HSPA1B
BAG3
STMN1
IER5
YWHAH
PCNA
DNAJB1
HSPA1A
C3
TUBB
HIST1H4C
RHOB
EGR1
HSPH1
JUN
HSP90AA1
DNAJA4
HSPA6
TUBA1B
HMGB2


In [40]:
summarize_factor(scores,'factor4',genes_dict)

Phenotype distribution:
PCa       282
CRPC       83
normal     69
Name: phenotype, dtype: int64

Present in 54 out of 98 samples and 7 datasets.

Top 20 contributing genes:
APOC1
ACP5
NUPR1
FTL
CHCHD6
MYL9
CRIP2
ELF3
APOE
LGALS1
CTSD
TPM2
ADIRF
KRT19
CSTB
CALD1
OTOA
IGFBP4
IFI27
SPARC


In [41]:
summarize_factor(scores,'factor5',genes_dict)

Phenotype distribution:
normal    756
PCa       730
CRPC      102
Name: phenotype, dtype: int64

Present in 84 out of 98 samples and 7 datasets.

Top 20 contributing genes:
MT2A
FTH1
MT1X
ANPEP
MT1E
SLC16A10
CXCL3
CXCL1
EMP1
IL6
SDC2
MGST1
SLC7A11
MT1G
TMEM107
MARCO
CD36
FABP5
LINC00910
FDX1


In [42]:
# Relabel each cell's top-scoring factor with its curated myeloid subtype name
# (these are values of the 'max_factor' column, not column names).
factor_labels = {
    'factor0': 'lymphoid-like myeloid cells',
    'factor1': 'antigen presenting macrophages',
    'factor2': 'inflammatory monocytes',
    'factor3': 'tumor associated myeloid cells',
    'factor4': 'macrophages',
    'factor5': 'metallothionein myeloid cells',
}
scores['max_factor'] = scores['max_factor'].replace(factor_labels)
# Append this cell type's labels to the running annotation table.
# Note: running this cell a second time appends the same cells again.
final_annotations = pd.concat([final_annotations,scores['max_factor']],axis=0)

### Fibroblasts

In [43]:
scores, genes_dict = score_modules('Fibroblast',nmf_components_by_ctype['Fibroblast'])
scores['max_factor'].value_counts()

factor7    1633
factor2    1464
factor6    1265
factor3     606
factor1     451
factor0     393
factor4     353
factor5     105
Name: max_factor, dtype: int64

In [44]:
summarize_factor(scores,'factor0',genes_dict)

Phenotype distribution:
CRPC      253
normal    117
PCa        23
Name: phenotype, dtype: int64

Present in 18 out of 52 samples and 4 datasets.

Top 20 contributing genes:
REL
RGS5
PHLDA1
GPRC5A
SGK1
ZFAND2A
ID4
GADD45A
INHBA
FABP4
PMAIP1
TNFAIP3
TNFAIP6
ISG15
CXCL2
VCAM1
LST1
PKNOX1
MLLT11
IFIT3


In [45]:
summarize_factor(scores,'factor1',genes_dict)

Phenotype distribution:
PCa       244
normal    120
CRPC       87
Name: phenotype, dtype: int64

Present in 19 out of 52 samples and 4 datasets.

Top 20 contributing genes:
MT2A
SOD2
NAMPT
KRT18
HMGA1
FOSL1
IER3
PTGS2
LIF
CREM
FXYD3
ISG20
GCLM
MT1X
ELF3
ADAMTS4
MT1G
AKR1C1
TNFRSF12A
KRT8


In [46]:
summarize_factor(scores,'factor2',genes_dict)

Phenotype distribution:
CRPC      1113
normal     189
PCa        162
Name: phenotype, dtype: int64

Present in 34 out of 52 samples and 5 datasets.

Top 20 contributing genes:
CTHRC1
HTRA3
ANKH
PDGFRL
MTRNR2L12
POSTN
TIMP1
ASPSCR1
THBS2
APOD
FN1
F2R
MTRNR2L8
DCLK1
HOPX
FOXS1
CST2
PTHLH
COL1A1
NOTCH3


In [47]:
summarize_factor(scores,'factor3',genes_dict)

Phenotype distribution:
PCa       437
normal    168
CRPC        1
Name: phenotype, dtype: int64

Present in 33 out of 52 samples and 4 datasets.

Top 20 contributing genes:
TCF21
IGF1
SPOCK3
C7
SFRP1
IGFBP2
PAGE4
MYLK
TGM2
PDE5A
RORB
HSPB6
PEMT
CES1
HSD17B4
APCDD1
A2M
STEAP2
CYP1B1
RNASEK


In [48]:
summarize_factor(scores,'factor4',genes_dict)

Phenotype distribution:
normal    180
CRPC      140
PCa        33
Name: phenotype, dtype: int64

Present in 24 out of 52 samples and 5 datasets.

Top 20 contributing genes:
TPM2
TAGLN
PPP1R14A
ACTA2
IGFBP3
PCSK1N
LAMP5
ADRA2A
CMSS1
ACTG2
LEPR
HHIP
SLC14A1
HSD17B6
DPEP1
RRAD
CNN1
CXCL14
BNIP3L
PLN


In [49]:
summarize_factor(scores,'factor5',genes_dict)

Phenotype distribution:
normal    60
PCa       39
CRPC       6
Name: phenotype, dtype: int64

Present in 20 out of 52 samples and 4 datasets.

Top 20 contributing genes:
CRYAB
TSC22D4
HSPA2
CLU
HLA-DRA
SNCG
VWA1
TSPAN8
SBSPON
ARC
CD74
GPM6B
LGI4
RGCC
HLA-DMA
INSIG1
PLP1
STMN1
LAPTM5
HLA-DPA1


In [50]:
summarize_factor(scores,'factor6',genes_dict)

Phenotype distribution:
normal    1069
CRPC       111
PCa         85
Name: phenotype, dtype: int64

Present in 17 out of 52 samples and 5 datasets.

Top 20 contributing genes:
MFAP5
CD55
CFD
CLEC3B
PCOLCE2
SPRY1
PI16
C1QTNF3
CILP
CRABP2
IFI6
ADH1B
ATF5
TNXB
MTRNR2L1
RAMP2
KLK1
NBEAL1
SFRP2
MPC2


In [51]:
summarize_factor(scores,'factor7',genes_dict)

Phenotype distribution:
PCa       685
normal    607
CRPC      341
Name: phenotype, dtype: int64

Present in 46 out of 52 samples and 5 datasets.

Top 20 contributing genes:
JUNB
ZFP36
FOS
DUSP1
IER2
NR4A1
DNAJB1
HSPA8
HSPA1B
BAG3
JUN
EGR1
CDKN1A
HSPA1A
CSRNP1
BTG2
KLF10
MAFF
HSPD1
SLC2A3


In [52]:
# Relabel each cell's top-scoring factor with its curated fibroblast subtype name
# (these are values of the 'max_factor' column, not column names).
# NOTE(review): 'myofibroblasts' is also the label given to SMC factor3 further down,
# so both groups end up under one name in the final annotation; confirm this is intended.
factor_labels = {
    'factor0': 'NF-kB fibroblasts',
    'factor1': 'inflammatory fibroblasts',
    'factor2': 'crpc fibroblasts',
    'factor3': 'tumor suppressor fibroblasts',
    'factor4': 'myofibroblasts',
    'factor5': 'antigen-presenting fibroblasts',
    'factor6': 'normal fibroblasts',
    'factor7': 'stressed fibroblasts',
}
scores['max_factor'] = scores['max_factor'].replace(factor_labels)
# Append this cell type's labels to the running annotation table.
# Note: running this cell a second time appends the same cells again.
final_annotations = pd.concat([final_annotations,scores['max_factor']],axis=0)

### Smooth muscle cells

In [53]:
scores, genes_dict = score_modules('SMC',nmf_components_by_ctype['SMC'])
scores['max_factor'].value_counts()

factor1    3767
factor4    3162
factor0    3126
factor2    2883
factor3    1679
factor5     128
factor6      72
Name: max_factor, dtype: int64

In [54]:
summarize_factor(scores,'factor0',genes_dict)

Phenotype distribution:
PCa       1509
normal    1081
CRPC       536
Name: phenotype, dtype: int64

Present in 83 out of 85 samples and 6 datasets.

Top 20 contributing genes:
TAGLN
CSRP1
VIM
FHL1
CNN1
ACTA2
SBSPON
ACTG2
SORBS2
CLU
SOD3
SNCG
CCDC3
NOV
TNC
TMEM176A
SYNM
PDLIM4
CRIM1
HSPG2


In [55]:
summarize_factor(scores,'factor1',genes_dict)

Phenotype distribution:
PCa       2094
normal    1271
CRPC       402
Name: phenotype, dtype: int64

Present in 75 out of 85 samples and 6 datasets.

Top 20 contributing genes:
FOSB
FOS
TUBB4B
DNAJA1
JUN
NR4A1
MAFF
RHOB
CEBPD
EGR1
HSPA8
YBX3
CEBPB
H2AFX
IGFBP4
RASL11A
EIF4A3
ATF3
IER2
RNASEK


In [56]:
summarize_factor(scores,'factor2',genes_dict)

Phenotype distribution:
PCa       1626
normal     744
CRPC       513
Name: phenotype, dtype: int64

Present in 79 out of 85 samples and 6 datasets.

Top 20 contributing genes:
COL4A2
COL18A1
COL4A1
NDUFA4L2
THY1
EVA1B
UACA
SPARC
COL5A2
CAMK2N1
CD81
CYGB
MARCKS
PPP1R14B
ASPN
CA4
CD248
GNAS
RGS5
NR2F2


In [57]:
summarize_factor(scores,'factor3',genes_dict)

Phenotype distribution:
PCa       1096
normal     583
CRPC         0
Name: phenotype, dtype: int64

Present in 51 out of 85 samples and 5 datasets.

Top 20 contributing genes:
SERPINF1
PTN
DCN
LUM
IGF1
LTBP4
MMP2
C7
OLFML3
MATN2
TCF21
MGST1
CFD
RGS2
FBLN1
CCDC80
SAT1
ALDH1A3
SPOCK3
FBLN5


In [58]:
summarize_factor(scores,'factor4',genes_dict)

Phenotype distribution:
PCa       2085
normal    1062
CRPC        15
Name: phenotype, dtype: int64

Present in 60 out of 85 samples and 6 datasets.

Top 20 contributing genes:
PCP4
MTRNR2L8
NET1
MTRNR2L12
GADD45B
MT1E
PHLDA2
MT1X
MAP3K7CL
SLC38A11
MT1M
ZNF331
CREM
GBP2
MT2A
MTRNR2L1
NEAT1
PDK4
PLN
ADAMTS9


In [59]:
summarize_factor(scores,'factor5',genes_dict)

Phenotype distribution:
normal    63
PCa       61
CRPC       4
Name: phenotype, dtype: int64

Present in 29 out of 85 samples and 4 datasets.

Top 20 contributing genes:
CXCR4
HLA-DRA
CD52
LAPTM5
HLA-DPA1
CD69
HLA-DPB1
CD3D
CD74
HLA-DRB1
CORO1A
CCL5
HCST
HLA-DQB1
SRGN
PTPRC
CYTIP
EGFL7
CD48
CD2


In [60]:
summarize_factor(scores,'factor6',genes_dict)

Phenotype distribution:
normal    44
PCa       28
CRPC       0
Name: phenotype, dtype: int64

Present in 11 out of 85 samples and 4 datasets.

Top 20 contributing genes:
KLK2
FXYD3
RDH11
SPINT2
SORD
NKX3-1
TSPAN1
NPY
KLK4
KLK3
TSTD1
CLDN3
PRAC1
GOLM1
SLC45A3
ZG16B
KRT8
TRPM4
TMPRSS2
SPDEF


In [61]:
# Relabel each cell's top-scoring factor with its curated smooth muscle subtype name
# (these are values of the 'max_factor' column, not column names).
# NOTE(review): 'myofibroblasts' is also the label given to Fibroblast factor4 above,
# so both groups end up under one name in the final annotation; confirm this is intended.
factor_labels = {
    'factor0': 'pericytes',
    'factor1': 'stressed SMCs',
    'factor2': 'ECM secreting SMCs',
    'factor3': 'myofibroblasts',
    'factor4': 'neuronal SMCs',
    'factor5': 'antigen-presenting SMCs',
    'factor6': 'luminal-like SMCs',
}
scores['max_factor'] = scores['max_factor'].replace(factor_labels)
# Append this cell type's labels to the running annotation table.
# Note: running this cell a second time appends the same cells again.
final_annotations = pd.concat([final_annotations,scores['max_factor']],axis=0)

### B cells

In [62]:
scores, genes_dict = score_modules('B_cell',nmf_components_by_ctype['B_cell'])
scores['max_factor'].value_counts()

factor1    4498
factor3    1843
factor2     225
factor0     101
Name: max_factor, dtype: int64

In [63]:
summarize_factor(scores,'factor0',genes_dict)

Phenotype distribution:
PCa       90
normal    11
CRPC       0
Name: phenotype, dtype: int64

Present in 19 out of 73 samples and 2 datasets.

Top 20 contributing genes:
JUNB
ZFP36
BRD2
NFKBIA
BTG2
FOS
JUN
TSC22D3
TXNIP
IER2
FOSB
GADD45B
DUSP1
EIF4A3
HERPUD1
PIM2
XBP1
ZNF331
ATF3
RPN2


In [64]:
summarize_factor(scores,'factor1',genes_dict)

Phenotype distribution:
PCa       2991
normal    1135
CRPC       372
Name: phenotype, dtype: int64

Present in 69 out of 73 samples and 6 datasets.

Top 20 contributing genes:
B2M
SEC61G
RPL27A
RABAC1
RPL13A
RPL41
MALAT1
MANF
RPS27L
HLA-DQA2
SDF2L1
CD63
TMEM258
SSR3
SPCS1
KIAA1551
HLA-C
CYBA
SUB1
FTL


In [65]:
summarize_factor(scores,'factor2',genes_dict)

Phenotype distribution:
normal    126
PCa        97
CRPC        2
Name: phenotype, dtype: int64

Present in 28 out of 73 samples and 3 datasets.

Top 20 contributing genes:
SEC61B
NPC2
TMSB4X
IRF7
PLAC8
PPP1R14B
GPR183
FTH1
TYROBP
S100A6
CST3
CLN8
CXCR3
CTSB
LILRA4
TSPAN13
ALOX5AP
FOSL2
SERPINF1
FCER1G


In [66]:
summarize_factor(scores,'factor3',genes_dict)

Phenotype distribution:
normal    1424
PCa        418
CRPC         1
Name: phenotype, dtype: int64

Present in 14 out of 73 samples and 2 datasets.

Top 20 contributing genes:
HSPA1A
HSPE1
HSPH1
HSPD1
CORO1A
MARCKSL1
HSPA1B
HSPA8
HSPB1
GAPDH
LCP1
ACTR3
DNAJB1
TUBA1B
ACTG1
ACTB
TUBB
BCL2A1
DOK2
STMN1


In [67]:
# Relabel each cell's top-scoring factor with its curated B cell subtype name
# (these are values of the 'max_factor' column, not column names).
factor_labels = {
    'factor0': 'stressed B cells',
    'factor1': 'plasma cells',
    'factor2': 'dendritic cells',
    'factor3': 'unfolded protein response B cells',
}
scores['max_factor'] = scores['max_factor'].replace(factor_labels)
# Append this cell type's labels to the running annotation table.
# Note: running this cell a second time appends the same cells again.
final_annotations = pd.concat([final_annotations,scores['max_factor']],axis=0)

### Endothelial

In [68]:
scores, genes_dict = score_modules('Endothelial',nmf_components_by_ctype['Endothelial'])
scores['max_factor'].value_counts()

factor4    11310
factor5     3611
factor0     3603
factor6     2224
factor1      500
factor2      365
factor3      211
Name: max_factor, dtype: int64

In [69]:
summarize_factor(scores,'factor0',genes_dict)

Phenotype distribution:
PCa       2394
normal    1139
CRPC        70
Name: phenotype, dtype: int64

Present in 81 out of 87 samples and 6 datasets.

Top 20 contributing genes:
SPARC
INSR
FLT1
COL4A1
RGCC
EDNRB
COL4A2
ID2
MLEC
VWA1
ANGPT2
PDGFD
KDR
DDIT4
MGP
RBP7
TMEM233
THY1
LBH
BTNL9


In [70]:
summarize_factor(scores,'factor1',genes_dict)

Phenotype distribution:
PCa       361
normal    138
CRPC        1
Name: phenotype, dtype: int64

Present in 58 out of 87 samples and 6 datasets.

Top 20 contributing genes:
MKL2
SLFN5
RARRES3
SSFA2
LINC00910
KIAA1551
SAMHD1
TGFB2
FAM84B
IFI44L
SYT15
HLA-DQA2
PRKAR2B
CCDC80
NEURL1B
ZNF521
LHX6
SESN3
GIMAP8
PARP14


In [71]:
summarize_factor(scores,'factor2',genes_dict)

Phenotype distribution:
PCa       229
normal    130
CRPC        6
Name: phenotype, dtype: int64

Present in 38 out of 87 samples and 6 datasets.

Top 20 contributing genes:
KLK2
KLF2
TSPAN1
FXYD3
SORD
KLK4
NPY
AGR2
RDH11
KLK3
NKX3-1
SPON2
SPINT2
TSTD1
PRAC1
TMPRSS2
AZGP1
CLDN3
SPDEF
STEAP2


In [72]:
summarize_factor(scores,'factor3',genes_dict)

Phenotype distribution:
PCa       180
normal     26
CRPC        5
Name: phenotype, dtype: int64

Present in 33 out of 87 samples and 6 datasets.

Top 20 contributing genes:
TPM2
MYL9
CCL5
RGS5
CALD1
CCL4
CD69
TAGLN
MTRNR2L1
CXCR4
HIGD1B
RARRES2
NDUFA4L2
CRISPLD2
ACTA2
NOTCH3
CD3D
MTRNR2L8
COX4I2
COL3A1


In [73]:
summarize_factor(scores,'factor4',genes_dict)

Phenotype distribution:
PCa       6585
normal    4059
CRPC       666
Name: phenotype, dtype: int64

Present in 84 out of 87 samples and 6 datasets.

Top 20 contributing genes:
HSPA8
CDKN1A
PNP
PPP1R15A
HSPE1
ZFP36
H2AFX
NR4A1
HSPD1
HSPA1A
DNAJB1
FOS
FOSL1
DNAJA1
BAG3
SOD2
LITAF
DNAJB4
ZC3H12A
TUBA1C


In [74]:
summarize_factor(scores,'factor5',genes_dict)

Phenotype distribution:
PCa       2882
normal     698
CRPC        31
Name: phenotype, dtype: int64

Present in 79 out of 87 samples and 6 datasets.

Top 20 contributing genes:
CLU
OLFM1
CCL14
VWF
MATN2
IFITM1
CEBPD
SELP
MET
PLA1A
MMRN1
PKP4
CCL23
IL1R1
TMTC1
AHNAK
IGFBP7
SYNE2
GTF2B
HLA-DRB5


In [75]:
summarize_factor(scores,'factor6',genes_dict)

Phenotype distribution:
PCa       1501
normal     665
CRPC        58
Name: phenotype, dtype: int64

Present in 83 out of 87 samples and 6 datasets.

Top 20 contributing genes:
CLDN5
SRP14
KCTD12
SAT1
PLLP
ENPP2
VEGFC
STMN1
RHOB
PPP1R14A
FN1
GLUL
FGL2
EMP3
LTBP4
TSPAN2
IGFBP3
FBLN5
JAG1
HEY1


In [76]:
# Relabel each cell's top-scoring factor with its curated endothelial subtype name
# (these are values of the 'max_factor' column, not column names).
factor_labels = {
    'factor0': 'PI3K-Akt active endothelial cells',
    'factor1': 'myocardin endothelial cells',
    'factor2': 'epithelial-like endothelial cells',
    'factor3': 'SMC-like endothelial cells',
    'factor4': 'unfolded protein response endothelial cells',
    'factor5': 'endothelial cells',
    'factor6': 'angiogenesis-promoting endothelial cells',
}
scores['max_factor'] = scores['max_factor'].replace(factor_labels)
# Append this cell type's labels to the running annotation table.
# Note: running this cell a second time appends the same cells again.
final_annotations = pd.concat([final_annotations,scores['max_factor']],axis=0)

### Mast cells

In [77]:
scores, genes_dict = score_modules('Mast',nmf_components_by_ctype['Mast'])
scores['max_factor'].value_counts()

factor0    6466
factor1    1295
Name: max_factor, dtype: int64

In [78]:
summarize_factor(scores,'factor0',genes_dict)

Phenotype distribution:
normal    2937
PCa       2556
CRPC       973
Name: phenotype, dtype: int64

Present in 67 out of 80 samples and 6 datasets.

Top 20 contributing genes:
RPS29
HINT1
BTG1
RGS1
TPSAB1
B2M
ZFP36
TMSB4X
HLA-C
HIST1H4C
SOCS1
HLA-A
RGS2
HMGB2
LY6E
LGALS1
MANF
DUSP1
HLA-DPB1
CD69


In [79]:
summarize_factor(scores,'factor1',genes_dict)

Phenotype distribution:
PCa       901
normal    335
CRPC       59
Name: phenotype, dtype: int64

Present in 58 out of 80 samples and 6 datasets.

Top 20 contributing genes:
YWHAZ
LMNA
VIM
HIF1A
MS4A2
CPA3
ELL2
MYADM
MAP3K8
TUBB4B
SFPQ
NFKBIZ
RPL17
ACTB
GLUL
FOSB
SLC2A3
HSP90B1
NFKB1
SQSTM1


In [80]:
# Relabel each cell's top-scoring factor with its curated mast cell subtype name
# (these are values of the 'max_factor' column, not column names).
factor_labels = {
    'factor0': 'mast cells',
    'factor1': 'interleukin mast cells',
}
scores['max_factor'] = scores['max_factor'].replace(factor_labels)
# Append this cell type's labels to the running annotation table.
# Note: running this cell a second time appends the same cells again.
final_annotations = pd.concat([final_annotations,scores['max_factor']],axis=0)

## Merging annotations with data

In [83]:
# Give the collected labels a descriptive column name. The rename targets the column
# labelled 0, which is where the repeated pd.concat calls above evidently leave the
# appended 'max_factor' values (NOTE(review): confirm on the pandas version in use).
final_annotations = final_annotations.rename(columns={0:'final_annotation'})

In [84]:
# Load the integrated object that holds all cells of the seven single-cell datasets.
adata = load_from_pickle('all-scvi-integrated-7-sc-datasets.pickle')
# Make the raw counts the main matrix. pop() returns the stored 'counts' layer and
# removes it from adata.layers in one step, so the full cells x genes matrix is moved
# into X instead of being copied first and deleted afterwards.
adata.X = adata.layers.pop('counts')
adata

AnnData object with n_obs × n_vars = 327771 × 14819
    obs: 'sample', 'patient', 'celltype_orig', 'phenotype', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'n_genes', 'doublet_score', 'predicted_doublet', 'size_factors', 'clusters', 'broad_celltypes', '_scvi_batch', '_scvi_labels', 'VI_clusters'
    uns: '_scvi_uuid', '_scvi_manager_uuid', 'neighbors', 'umap', 'leiden'
    obsm: 'X_scVI', 'X_umap'
    layers: 'counts'
    obsp: 'distances', 'connectivities'

In [89]:
# Attach the NMF-derived annotation to every cell by matching on the cell index.
# Cells that were not labelled in any of the cell-type sections above receive NaN,
# which is the same outcome as a left join on the index.
# The column is assigned directly rather than merged into adata.obs so that the cell
# can be re-run safely: a second pd.merge would produce 'final_annotation_x' and
# 'final_annotation_y' columns instead of refreshing 'final_annotation'.
adata.obs['final_annotation'] = final_annotations['final_annotation'].reindex(adata.obs.index)

In [95]:
# Keep only the cells that received a final annotation (drop those left as NaN)
has_annotation = adata.obs['final_annotation'].notna()
adata = adata[has_annotation]


In [105]:
# Finished, annotated data: number of cells per final annotation,
# followed by the number of distinct annotation labels
annotation_col = adata.obs['final_annotation']
print(annotation_col.value_counts())
print('')
n_celltypes = len(annotation_col.unique())
print('The total number of different celltypes: ' + str(n_celltypes))

NK cells                                       52596
activated T cells                              46625
healthy epithelial                             32370
lipid response epithelial                      21460
advanced malignant epithelial                  13243
unfolded protein response endothelial cells    11310
tumor associated myeloid cells                 10718
early malignant epithelial                      9810
antigen presenting macrophages                  8424
helper T cells                                  6596
mast cells                                      6466
inflammatory monocytes                          6255
lymphoid-like myeloid cells                     6215
secreting epithelial                            5203
plasma cells                                    4498
stressed SMCs                                   3767
endothelial cells                               3611
PI3K-Akt active endothelial cells               3603
neuronal SMCs                                 

In [None]:
# Save the annotated reference as an .h5ad file; the path is relative to the working
# directory and the date in the file name is hard-coded.
adata.write('single-cell-reference-with-nmf-derived-annotations-20230511.h5ad')