In [None]:
import numpy as np
import anndata as ad
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

import os
os.chdir('/lustre/scratch/kiviaho/prostate_spatial/')

from scripts.utils import load_from_pickle, save_to_pickle
import warnings
warnings.filterwarnings('ignore')



## Annotating single-cell datasets using marker genes
In this section we load a dictionary of quality-controlled and normalized
single-cell datasets. Each dataset is clustered individually, and the clusters
are assigned to broad cell types using marker genes.

In [None]:
# Seed passed to the neighbor-graph and UMAP steps in do_processing
random_seed = 745634

# Default figure dimensions handed to scanpy's figure settings
figwidth, figheight = 8, 6

sc.set_figure_params(dpi=120, figsize=(figwidth, figheight))

def do_processing(adata):
    """Run the clustering workflow used for every dataset in this notebook.

    Steps, in order: highly variable gene selection (top 2000), PCA with
    20 components, neighbor graph, UMAP embedding, and Leiden clustering at
    resolution 0.5 with labels stored under the key "clusters".

    The neighbor graph and UMAP steps use the module-level ``random_seed``.
    The scanpy calls are made without ``copy=True``, so the passed object is
    presumably modified in place (verify against the scanpy version in use).

    Parameters
    ----------
    adata
        Dataset object accepted by the scanpy functions called below.

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    seed = random_seed
    sc.pp.highly_variable_genes(adata, n_top_genes=2000)
    sc.pp.pca(adata, n_comps=20)
    sc.pp.neighbors(adata, random_state=seed)
    sc.tl.umap(adata, random_state=seed)
    sc.tl.leiden(adata, resolution=0.5, key_added="clusters")
    return adata

def drop_missing_markers(marker_list,missing_vals):
    """Return a copy of a marker dictionary with the given gene names removed.

    Unlike an in-place ``list.remove`` loop, this does not touch the caller's
    dictionary or its lists, and it removes every occurrence of a name rather
    than only the first one. Re-running a cell that calls this function is
    therefore safe.

    Note that a caller which rebinds the result to the same variable it passed
    in still accumulates removals from one call to the next.

    Parameters
    ----------
    marker_list : dict
        Maps a cell type label to a list of marker gene names.
    missing_vals : iterable of str
        Gene names to drop from every list. Names that do not occur in a list
        are ignored.

    Returns
    -------
    dict
        New dictionary with the same keys in the same order; each value is a
        new list preserving the original gene order.
    """
    to_drop = set(missing_vals)
    return {
        cell_type: [gene for gene in genes if gene not in to_drop]
        for cell_type, genes in marker_list.items()
    }

# Row-wise helper for DataFrame.apply(axis=1): builds the final cell type label
def combine_phenotype_epithelial(row):
    """Return the cell type label for one row, phenotype-prefixed if epithelial.

    Parameters
    ----------
    row
        Mapping-like object with a 'broad_celltypes' entry and, for epithelial
        rows, a 'phenotype' entry.

    Returns
    -------
    ``'<phenotype>_Epithelial'`` when the broad cell type is exactly
    'Epithelial'; otherwise the broad cell type unchanged.
    """
    cell_type = row['broad_celltypes']
    if cell_type != 'Epithelial':
        return cell_type
    return row['phenotype'] + '_' + cell_type

#### Marker genes

In [None]:
""" marker_genes_chen = {'Fibroblast':'ACTA2',
                    'Endothelial':['PECAM1','VWF','ENG'],
                    'Mast':['CMA1','MS4A2','TPSAB1','TPSB2'],
                    'Epithelial':['AR','KRT19','KRT18','KRT8','TP63','KRT14','KRT5'],
                    'Monocytic':['LYZ','FCGR3A','CSF1R','CD68','CD163','CD14','UCHL1','HAVCR2'],
                    'T_cell':['PDCD1','CTLA4','CD8A','SELL','PTPRC','CD4','BTLA','IL2RA','IL7R','CCR7','CD28','CD27','SLAMF1','DPP4','CD7','CD2','CD3G','CD3E','CD3D']} """

# This is a set of marker genes that is a combination of markers from Chen et al. 2020 and Hirz et al. 2020
# 'Fibroblast':['DCN','LUM','PTN','IGF1','APOD','COL1A2','FBLN1','MEG3','CXCL12'],

# These do not make a difference
#'mDC':['CD1C','PKIB','INSIG1','CLEC10A','C15orf48','PPA1'],
#'pDC':['IRF7','IRF4','LILRA4','PPP1R14B','SOX4','TSPAN13','KIAA0226','PTCRA','RAB11FIP1','CXCR3''IL3RA'],
#'Plasma_cell':['SEC11C','XBP1','PRDX4','SPCS2','SSR3','SDF2L1','C19orf10','MANF','TMEM258','DNAJB9']


""" broad_marker_genes_epithelial_split = {'Fibroblast':['DCN','LUM','IGF1','APOD','COL1A2','FBLN1'],
                    'Myofibroblast':['RGS5','ACTA2','MYH11','FRZB','NDUFA4L2','PPP1R14A','MYLK'],
                    'Endothelial':['VWF','ENG','CLDN5'],
                    'Epithelial':['ACPP','AR','KLK3','KLK4','KRT19','KRT18','KRT8'],
                    'Basal/intermediate':['TP63','KRT14','KRT5'],
                    'Mast':['MS4A2','TPSAB1','CPA3'],
                    'Monocyte':['LYZ','FCGR3A','CSF1R','CD68','CD163','CD14','C1QA','C1QC','C1QB','GPR34','MS4A4A'],
                    'T_cell':['PTPRC','CD4','IL7R','CCR7','DPP4','CD7','CD2','CD3G','CD3E','CD3D'],
                    'B_cell':['MS4A1','CD79A','CD79B','CD19','VPREB3','BANK1']}

broad_marker_genes = {'Epithelial':['ACPP','AR','KLK3','KLK4','KRT19','KRT18','KRT8','TP63','KRT14','KRT5'],
                    'Fibroblast':['DCN','LUM','IGF1','APOD','COL1A2','FBLN1'],
                    'Myofibroblast':['RGS5','ACTA2','MYH11','FRZB','NDUFA4L2','PPP1R14A','MYLK'],
                    'Endothelial':['VWF','ENG','CLDN5'],
                    'Mast':['MS4A2','TPSAB1','CPA3'],
                    'Monocyte':['LYZ','FCGR3A','CSF1R','CD68','CD163','CD14','C1QA','C1QC','C1QB','GPR34','MS4A4A'],
                    'T_cell':['PTPRC','CD4','IL7R','CCR7','DPP4','CD7','CD2','CD3G','CD3E','CD3D'],
                    'B_cell':['MS4A1','CD79A','CD79B','CD19','VPREB3','BANK1']} """


# Marker genes per broad cell type, copy-pasted from an Excel table that combines
# the marker lists of several publications. A gene is repeated once for every
# time it was listed, so the lists intentionally contain duplicates at this point.
# NOTE(review): 'IF127', 'IF130' and 'MK167' look like digit-for-letter typos of
# IFI27, IFI30 and MKI67 -- confirm against the source table. All three are later
# passed to drop_missing_markers, so they never reach the dotplots as written.
refined_markers = {'Epithelial':['KRT5','KRT7','KRT19','MMP7','EPCAM','PRAC1','HOXB13','LRRC26','SPDEF','NKX3-1',
                                 'KLK3','FOLH1','SCHLAP1','PLA2G2A','MSMB','CETN2','ZMYND10','EPCAM','PCA3','KLK3',
                                 'KRT18','KRT8','ACPP','MSMB','S100P','SCGB3A1','SCGB1A1','SLC4A4','CLDN3','SAA1',
                                 'SAA2','KRT17','S100A2','ACPP','KLK3','MSMB','KLK2','NEFH','AZGP1','RDH11','PLA2G2A',
                                 'TMPRSS2','NKX3-1','AR','KRT19','KRT18','KRT8','TP63','KRT14','KRT5','DPP4'],
                    'Endothelial':['ACKR1','CCL14','PECAM1','IFI27','CLEC14A','ENPP2','DEPP1','RAMP2','VWF',
                                   'IF127','RNASE1','ACKR1','TM4SF1','VWF','SELE','IFI27','FLT1','SPARCL1',
                                   'SDPR','PTPRB','DARC','PLVAP','A2M','PECAM1','VWF','ENG'],
                    'Fibroblast':['LUM','DCN','IGF1','DCN','LUM','MGP','DCN','PTGDS','APOD',
                                  'STC1','AREG','DCN','FBLN1','COL1A2','IGF1','C7','IGFBP5','CCDC80','CFD','LTBP4','SFRP1','ACTA2'],
                    'SMC':['RGS5','ACTA2','TAGLN','BGN','RGS5','MT1M','ACTA2','ACTG2','DES','RGS5','NDUFA4L2','RERGL','MYH11','RGS5',
                           'ACTA2','TAGLN','MYL9','MYLK','C11orf96','MCAM','CALD1','LMOD1'],
                    'Mast':['KIT','TPSB2','TPSAB1','CPA3','TPSAB1','CPA3','CPA3','TPSAB1','KIT',
                            'VWA5A','IL1RL1','CTSG','SLC18A2','ACSL4','MS4A2','GATA2','CMA1','MS4A2','TPSAB1','TPSB2'],
                    'T cell':['CD3D','CD3E','CD3G','CD8A','CD8B','IL7R','CD2','CCL5','IFNG','CD8B','CD8A','SMC4','MK167',
                              'CTLA4','IL2RA','FOXP3','CD3D','CCL5','CD69','IL7R','TRBC2','CCL5','IFNG','CD8A','CXCR4',
                              'PTPRC','ETS1','CCL4','CD69','PDCD1','CTLA4','CD8A','SELL','PTPRC','BTLA','IL2RA','IL7R',
                              'CCR7','CD28','CD27','SLAMF1','CD7','CD2','CD3G','CD3E','CD3D'],
                        'B cell':['CD79A','MS4A1','MS4A1','CD79B','IGJ','MZB1','XBP1','TPSB2',
                                  'TPSAB1','IGKC','IGHA1','IGJ','IGHA2','AC096579.7','MZB1',
                                  'IGHG3','SLAMF7','IGHG4','IGHG1','MS4A1','IGHM','CXCR5','CD79A','CD22','BANK1','LY9','CCR7','IRF8','CD83'],
                        'Myeloid':['CD4','C1QA','C1QB','AIF1','CD68','LYZ','IL1B','FCN1','S100A12','C1QC','C1QA','S100A9','S100A8','LYZ',
                                   'HLA-DPB1','HLA-DRA','IL1B','HLA-DRA','HLA-DPA1','HLA-DPB1','HLA-DRB1','CD74','IL8','HLA-DQA1','IF130',
                                   'LYZ','LYZ','FCGR3A','CSF1R','CD68','CD163','CD14','UCHL1','HAVCR2','CD4'],
                        'Neuronal':['PLP1','MPZ','S100A8','MT1H'],
                        'Dendritic':['IRF7','IRF4','FCER1A','CD1C']  
                                 }

# Collapse every marker list to unique gene names, ordered by how many times the
# gene was listed (i.e. in how many publications it appears), most frequent first.
for cell_type, genes in refined_markers.items():
    refined_markers[cell_type] = pd.Series(genes).value_counts().index.tolist()


In [None]:
# Load the dictionary of per-dataset objects, keyed by dataset name.
# NOTE(review): load_from_pickle presumably unpickles the file -- only use it on
# files from a trusted source.
adata_dict = load_from_pickle('normalized_sc_7_datasets.pickle')
datasets = [*adata_dict.keys()]

### Dong et al. 2020

In [None]:
# Work on a copy so the stored dataset stays untouched until write-back
adata = do_processing(adata_dict['dong_2020'].copy())


In [None]:
# Drop the listed gene names from the marker dictionary before plotting
# (presumably genes absent from this dataset -- verify)
refined_markers = drop_missing_markers(
    refined_markers,
    ['AC096579.7', 'DARC', 'IF127', 'IF130', 'IGJ', 'IL8', 'MK167'],
)
sc.tl.dendrogram(adata, groupby='clusters')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='clusters',
    layer='counts',
    dendrogram=True,
    log=True,
)

In [None]:
# Translate Leiden cluster ids into broad cell type labels, as read off the dotplot
broad_celltypes = adata.obs['clusters'].copy()

broad_celltypes = broad_celltypes.replace(['1','0','13','7','8','10','12'],'Epithelial')
broad_celltypes = broad_celltypes.replace(['9'],'Endothelial')
broad_celltypes = broad_celltypes.replace(['3'],'Fibroblast')
broad_celltypes = broad_celltypes.replace(['5'],'SMC')
broad_celltypes = broad_celltypes.replace(['11'],'Mast')
broad_celltypes = broad_celltypes.replace(['2'],'T cell')
broad_celltypes = broad_celltypes.replace(['14','16'],'B cell')
broad_celltypes = broad_celltypes.replace(['15','6'],'Myeloid')
broad_celltypes = broad_celltypes.replace(['4'],'Epi_Endothelial_dong2020')

# Store the labels BEFORE combining them with the phenotype. Previously this
# assignment came after the combination step, so the combination read a missing
# or stale 'broad_celltypes' column and its result was then overwritten.
adata.obs['broad_celltypes'] = broad_celltypes

# Add the phenotype to distinguish between healthy / malignant epithelial cell populations
df = adata.obs.copy()
df['broad_celltypes'] = df.apply(combine_phenotype_epithelial, axis=1)
if (df.index == adata.obs.index).all():
    adata.obs = df

sc.set_figure_params(figsize=(figwidth,figheight),dpi=120)
sc.pl.umap(adata,color=['broad_celltypes','sample'],size=10)
sc.tl.dendrogram(adata,groupby='broad_celltypes')
sc.pl.dotplot(adata,refined_markers,groupby='broad_celltypes',layer='counts',dendrogram=True, log=True)


In [None]:
# Copy the annotated obs table back into the dictionary, but only when the cell
# names of the working copy and the stored dataset agree element by element
if (adata_dict['dong_2020'].obs_names == adata.obs_names).all():
    print('Adding broad cell type annotations...')
    adata_dict['dong_2020'].obs = adata.obs.copy()

### Chen et al. 2021

In [None]:
# Work on a processed copy of the stored dataset
adata = do_processing(adata_dict['chen_2021'].copy())

# Drop the listed gene name from the marker dictionary before plotting
# (presumably absent from this dataset -- verify)
refined_markers = drop_missing_markers(refined_markers, ['DEPP1'])
sc.tl.dendrogram(adata, groupby='clusters')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='clusters',
    layer='counts',
    dendrogram=True,
    log=True,
)

In [None]:
# Cluster ids -> broad cell type, as read off the dotplot. Applied in this order;
# an empty list means no cluster was assigned to that cell type.
cluster_assignments = [
    (['0','1','3','17','5','12','13','6','7','9','16'], 'Epithelial'),
    (['4','15'], 'Endothelial'),
    ([], 'Fibroblast'),
    (['10'], 'SMC'),
    (['11'], 'Mast'),
    (['2'], 'T cell'),
    (['14'], 'B cell'),
    (['8'], 'Myeloid'),
    ([], 'Neuronal'),
    ([], 'Dendritic'),
]

broad_celltypes = adata.obs['clusters'].copy()
for cluster_ids, label in cluster_assignments:
    broad_celltypes = broad_celltypes.replace(cluster_ids, label)

adata.obs['broad_celltypes'] = broad_celltypes

# Prefix epithelial labels with the phenotype so healthy / malignant
# epithelial populations stay distinguishable
df = adata.obs.copy()
df['broad_celltypes'] = df.apply(combine_phenotype_epithelial, axis=1)
if (df.index == adata.obs.index).all():
    adata.obs = df

sc.pl.umap(adata, color=['broad_celltypes', 'sample'], size=10)
sc.tl.dendrogram(adata, groupby='broad_celltypes')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='broad_celltypes',
    layer='counts',
    dendrogram=True,
    log=True,
)

In [None]:
# Copy the annotated obs table back into the dictionary, but only when the cell
# names of the working copy and the stored dataset agree element by element
if (adata_dict['chen_2021'].obs_names == adata.obs_names).all():
    print('Adding broad cell type annotations...')
    adata_dict['chen_2021'].obs = adata.obs.copy()

### Song et al. 2022

In [None]:
# Work on a processed copy of the stored dataset
adata = do_processing(adata_dict['song_2022'].copy())

# Drop the listed gene names from the marker dictionary before plotting
# (presumably absent from this dataset -- verify)
refined_markers = drop_missing_markers(
    refined_markers,
    ['ACKR1', 'PECAM1', 'SCHLAP1', 'TPSB2'],
)
sc.tl.dendrogram(adata, groupby='clusters')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='clusters',
    layer='counts',
    dendrogram=True,
    log=True,
)


In [None]:
# Cluster ids -> broad cell type, as read off the dotplot. Applied in this order;
# an empty list means no cluster was assigned to that cell type.
cluster_assignments = [
    (['11','0','8','13','3','5','12','4','6'], 'Epithelial'),
    (['7'], 'Endothelial'),
    (['10'], 'Fibroblast'),
    (['9'], 'SMC'),
    (['15'], 'Mast'),
    (['1'], 'T cell'),
    (['16'], 'B cell'),
    (['2','14'], 'Myeloid'),
    ([], 'Neuronal'),
    ([], 'Dendritic'),
]

broad_celltypes = adata.obs['clusters'].copy()
for cluster_ids, label in cluster_assignments:
    broad_celltypes = broad_celltypes.replace(cluster_ids, label)

adata.obs['broad_celltypes'] = broad_celltypes

# Prefix epithelial labels with the phenotype so healthy / malignant
# epithelial populations stay distinguishable
df = adata.obs.copy()
df['broad_celltypes'] = df.apply(combine_phenotype_epithelial, axis=1)
if (df.index == adata.obs.index).all():
    adata.obs = df

sc.pl.umap(adata, color=['broad_celltypes', 'sample'], size=10)
sc.tl.dendrogram(adata, groupby='broad_celltypes')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='broad_celltypes',
    layer='counts',
    dendrogram=True,
    log=True,
)

In [None]:
# Copy the annotated obs table back into the dictionary, but only when the cell
# names of the working copy and the stored dataset agree element by element
if (adata_dict['song_2022'].obs_names == adata.obs_names).all():
    print('Adding broad cell type annotations...')
    adata_dict['song_2022'].obs = adata.obs.copy()

### Cheng et al. 2022

In [None]:
# Work on a processed copy of the stored dataset
adata = do_processing(adata_dict['cheng_2022'].copy())

# Drop the listed gene names from the marker dictionary before plotting
# (presumably absent from this dataset -- verify)
refined_markers = drop_missing_markers(
    refined_markers,
    ['CD1C', 'CMA1', 'CTSG', 'FCER1A', 'MS4A1', 'SDPR'],
)
sc.tl.dendrogram(adata, groupby='clusters')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='clusters',
    layer='counts',
    dendrogram=True,
    log=True,
)


In [None]:
# Cluster ids -> broad cell type, as read off the dotplot. Applied in this order;
# an empty list means no cluster was assigned to that cell type.
cluster_assignments = [
    (['4','0','5','3','10','6','9','1','2','8','11'], 'Epithelial'),
    ([], 'Endothelial'),
    (['12'], 'Fibroblast'),
    ([], 'SMC'),
    ([], 'Mast'),
    (['15'], 'T cell'),
    ([], 'B cell'),
    (['13'], 'Myeloid'),
    ([], 'Neuronal'),
    ([], 'Dendritic'),
    (['7','14'], 'Epi_Endothelial_cheng2022'),
]

broad_celltypes = adata.obs['clusters'].copy()
for cluster_ids, label in cluster_assignments:
    broad_celltypes = broad_celltypes.replace(cluster_ids, label)

adata.obs['broad_celltypes'] = broad_celltypes

# Prefix epithelial labels with the phenotype so healthy / malignant
# epithelial populations stay distinguishable
df = adata.obs.copy()
df['broad_celltypes'] = df.apply(combine_phenotype_epithelial, axis=1)
if (df.index == adata.obs.index).all():
    adata.obs = df

sc.pl.umap(adata, color=['broad_celltypes', 'sample'], size=10)
sc.tl.dendrogram(adata, groupby='broad_celltypes')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='broad_celltypes',
    layer='counts',
    dendrogram=True,
    log=True,
)

In [None]:
# Copy the annotated obs table back into the dictionary, but only when the cell
# names of the working copy and the stored dataset agree element by element
if (adata_dict['cheng_2022'].obs_names == adata.obs_names).all():
    print('Adding broad cell type annotations...')
    adata_dict['cheng_2022'].obs = adata.obs.copy()

### Chen (Gonghong) et al. 2022

In [None]:
# Work on a processed copy of the stored dataset
adata = do_processing(adata_dict['chen_2022'].copy())

# Drop the listed gene names from the marker dictionary before plotting
# (presumably absent from this dataset -- verify)
refined_markers = drop_missing_markers(
    refined_markers,
    ['AC096579.7', 'DARC', 'IF127', 'IF130', 'IGJ', 'IL8', 'MK167', 'SDPR'],
)

sc.tl.dendrogram(adata, groupby='clusters')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='clusters',
    layer='counts',
    dendrogram=True,
    log=True,
)


In [None]:
# Cluster ids -> broad cell type, as read off the dotplot. Applied in this order;
# an empty list means no cluster was assigned to that cell type.
cluster_assignments = [
    (['11','0','3','4','7'], 'Epithelial'),
    (['13','6','10'], 'Endothelial'),
    (['5','12'], 'Fibroblast'),
    (['2','8'], 'SMC'),
    (['14'], 'Mast'),
    (['1'], 'T cell'),
    (['16'], 'B cell'),
    (['15','9'], 'Myeloid'),
    ([], 'Neuronal'),
    ([], 'Dendritic'),
]

broad_celltypes = adata.obs['clusters'].copy()
for cluster_ids, label in cluster_assignments:
    broad_celltypes = broad_celltypes.replace(cluster_ids, label)

adata.obs['broad_celltypes'] = broad_celltypes

# Prefix epithelial labels with the phenotype so healthy / malignant
# epithelial populations stay distinguishable
df = adata.obs.copy()
df['broad_celltypes'] = df.apply(combine_phenotype_epithelial, axis=1)
if (df.index == adata.obs.index).all():
    adata.obs = df

sc.pl.umap(adata, color=['broad_celltypes', 'sample'], size=10)
sc.tl.dendrogram(adata, groupby='broad_celltypes')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='broad_celltypes',
    layer='counts',
    dendrogram=True,
    log=True,
)

In [None]:
# Copy the annotated obs table back into the dictionary, but only when the cell
# names of the working copy and the stored dataset agree element by element
if (adata_dict['chen_2022'].obs_names == adata.obs_names).all():
    print('Adding broad cell type annotations...')
    adata_dict['chen_2022'].obs = adata.obs.copy()

### Wong et al. 2022

In [None]:
# Work on a processed copy of the stored dataset
adata = do_processing(adata_dict['wong_2022'].copy())
sc.tl.dendrogram(adata, groupby='clusters')

# Drop the listed gene names from the marker dictionary before plotting
# (presumably absent from this dataset -- verify)
refined_markers = drop_missing_markers(
    refined_markers,
    ['AC096579.7', 'C11orf96', 'DARC', 'IF127', 'IF130', 'IGJ', 'IL8', 'MK167', 'SDPR'],
)
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='clusters',
    layer='counts',
    dendrogram=True,
    log=True,
)


In [None]:

# Cluster ids -> broad cell type, as read off the dotplot. Applied in this order;
# an empty list means no cluster was assigned to that cell type.
cluster_assignments = [
    (['8','3','14','4','7'], 'Epithelial'),
    (['2'], 'Endothelial'),
    (['13'], 'Fibroblast'),
    (['10'], 'SMC'),
    (['12'], 'Mast'),
    (['0','1','9'], 'T cell'),
    (['5'], 'B cell'),
    (['6','11'], 'Myeloid'),
    ([], 'Neuronal'),
    ([], 'Dendritic'),
]

broad_celltypes = adata.obs['clusters'].copy()
for cluster_ids, label in cluster_assignments:
    broad_celltypes = broad_celltypes.replace(cluster_ids, label)

adata.obs['broad_celltypes'] = broad_celltypes

# Prefix epithelial labels with the phenotype so healthy / malignant
# epithelial populations stay distinguishable
df = adata.obs.copy()
df['broad_celltypes'] = df.apply(combine_phenotype_epithelial, axis=1)
if (df.index == adata.obs.index).all():
    adata.obs = df

sc.pl.umap(adata, color=['broad_celltypes', 'sample'], size=10)
sc.tl.dendrogram(adata, groupby='broad_celltypes')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='broad_celltypes',
    layer='counts',
    dendrogram=True,
    log=True,
)

In [None]:
# Copy the annotated obs table back into the dictionary, but only when the cell
# names of the working copy and the stored dataset agree element by element
if (adata_dict['wong_2022'].obs_names == adata.obs_names).all():
    print('Adding broad cell type annotations...')
    adata_dict['wong_2022'].obs = adata.obs.copy()

### Hirz et al. 2023

In [None]:
""" adata = adata_dict['hirz_2023'].copy()
adata = do_processing(adata)
 """
sc.tl.dendrogram(adata,groupby='clusters')
refined_markers = drop_missing_markers(refined_markers,['AC096579.7', 'ACKR1', 'DEPP1', 'IF127', 'IF130', 'IGHA1', 'IGHA2', 'IGHG1', 'IGHG3', 'IGHG4', 'IGHM', 'IGKC', 'MK167', 'PECAM1', 'SCHLAP1', 'TPSB2', 'TRBC2'])
sc.pl.dotplot(adata,refined_markers,groupby='clusters',layer='counts',dendrogram=True, log=True)


In [None]:

# Cluster ids -> broad cell type, as read off the dotplot. Applied in this order;
# an empty list means no cluster was assigned to that cell type.
# NOTE(review): clusters '3', '16', '1' and '2' are listed under both 'T cell'
# and 'Myeloid'. Because 'T cell' is applied first, those four end up as
# 'T cell' and their 'Myeloid' entries have no effect -- confirm which was meant.
cluster_assignments = [
    (['9','11','13'], 'Epithelial'),
    (['8'], 'Endothelial'),
    ([], 'Fibroblast'),
    (['6'], 'SMC'),
    (['10'], 'Mast'),
    (['4','0','14','3','16','1','2'], 'T cell'),
    (['15'], 'B cell'),
    (['5','12','7','3','16','1','2'], 'Myeloid'),
    ([], 'Neuronal'),
    ([], 'Dendritic'),
]

broad_celltypes = adata.obs['clusters'].copy()
for cluster_ids, label in cluster_assignments:
    broad_celltypes = broad_celltypes.replace(cluster_ids, label)

adata.obs['broad_celltypes'] = broad_celltypes

# Prefix epithelial labels with the phenotype so healthy / malignant
# epithelial populations stay distinguishable
df = adata.obs.copy()
df['broad_celltypes'] = df.apply(combine_phenotype_epithelial, axis=1)
if (df.index == adata.obs.index).all():
    adata.obs = df

sc.pl.umap(adata, color=['broad_celltypes', 'sample'], size=10)
sc.tl.dendrogram(adata, groupby='broad_celltypes')
sc.pl.dotplot(
    adata,
    refined_markers,
    groupby='broad_celltypes',
    layer='counts',
    dendrogram=True,
    log=True,
)

In [None]:
# Copy the annotated obs table back into the dictionary, but only when the cell
# names of the working copy and the stored dataset agree element by element
if (adata_dict['hirz_2023'].obs_names == adata.obs_names).all():
    print('Adding broad cell type annotations...')
    adata_dict['hirz_2023'].obs = adata.obs.copy()

In [None]:
# Display the dataset dictionary as a final visual check before saving
adata_dict

In [None]:
# Save the dictionary under a new file name, so the input pickle loaded at the
# top of the notebook is not overwritten
save_to_pickle(adata_dict,'normalized_sc_7_datasets_with_annot.pickle')