In [None]:
import os
import sys
# Record the interpreter version, working directory and executable so the
# environment this notebook was run in can be identified later.
print("Python version" + sys.version)
# A bare `os.getcwd()` in the middle of a cell is evaluated and thrown away;
# print it so the working directory actually shows up in the output.
print(os.getcwd())
print(sys.executable)

In [2]:
import numpy as np
np.random.seed(123)
import pandas as pd
import scipy
import itertools

import umap
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scanpy as sc
import anndata as ad
import scvelo as scv
from tqdm.notebook import tqdm

from pathlib import Path

In [None]:
# Global scanpy configuration: logging verbosity level 1, print scanpy's
# header, and default figure parameters (80 dpi, white figure background).
sc.settings.verbosity = 1
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [4]:
# Turn off the axes grid globally in matplotlib (it otherwise appears in the
# scvelo plots below).
plt.rcParams['axes.grid'] = False

In [5]:
# Revised from Stefan's cell type signatures.
# Folder with the signature collection, relative to the working directory.
signatures_path_ = '../cell_type_from_stefan/scrnaseq_signature_collection/'
# NOTE(review): the star import hides where names come from. This notebook calls
# `score_genes` and `score_tumor_immune_cells`, which presumably come from this
# module — confirm, and consider importing them explicitly.
from score_and_classify import *

In [6]:
# Root folder of the original datasets. Defaults to the cluster path used so
# far; set the CRC_DATA_FOLDER environment variable to run the notebook on
# another machine without editing this cell.
data_folder = os.environ.get('CRC_DATA_FOLDER', '/fast/users/twei_m/work/crc/datasets')

In [7]:
# Folder with the re-preprocessed datasets (also used for this notebook's
# output). Defaults to the cluster path used so far; set the
# CRC_NEW_DATA_FOLDER environment variable to point it somewhere else.
new_data_folder = os.environ.get('CRC_NEW_DATA_FOLDER',
                                 '/fast/users/twei_m/work/crc/datasets_new_preprocessing')

In [8]:
# Load the object with all cells; it is expected to carry the coarse
# 'celltype_1a' annotation in .obs, which the cells below rely on.
adata_all = sc.read(Path(new_data_folder)/'202305_CB_all_cells.h5')

In [11]:
# Load the separately processed epithelial object; its 'Uhlitz_scANVI' and
# 'cell_type_epi_custom' annotations are used further down.
adata_epi = sc.read(Path(new_data_folder)/'202306_CB_epi_Numbat_Scitcem_inferCNV_icms_Uhlitz_scanvi.h5')

In [13]:
# Independent copies of the stromal and immune compartments, selected by the
# coarse 'celltype_1a' annotation.
adata_str, adata_imm = (
    adata_all[adata_all.obs['celltype_1a'] == compartment].copy()
    for compartment in ('str', 'imm')
)

In [None]:
# UMAP of all cells: coarse annotation next to the three compartment scores.
scv.pl.scatter(
    adata_all,
    basis='umap',
    color=['celltype_1a', 'epi_score', 'str_score', 'imm_score'],
    ncols=2,
    dpi=300,
    legend_loc='right margin',
    size=1,
)

In [17]:
# Excel workbook with the marker tables used below; its first three sheets are
# read as the epithelial, stromal and immune tables.
# NOTE(review): presumably supplementary table S2 of Smillie et al. — confirm.
smillie_path = Path(new_data_folder)/'1-s2.0-S0092867419307329-mmc2.xlsx'

In [18]:
def score_from_smillie_TableS2(adata, signatures_path=smillie_path):
    """Score cells for the epithelial, stromal and immune marker sets of a workbook.

    The first three sheets of the Excel workbook are taken as the epithelial,
    stromal and immune marker tables. For each of them the unique, non-missing
    entries of the ``gene`` column are passed to ``score_genes`` under the
    score names ``'epi'``, ``'str'`` and ``'imm'``.

    Parameters
    ----------
    adata
        Annotated data object handed to ``score_genes``.
    signatures_path
        Path of the Excel workbook. The default is the value ``smillie_path``
        had when this function was defined.

    Returns
    -------
    None
        Whatever ``score_genes`` does to ``adata`` is the only effect.
    """
    # Sheet position in the workbook -> name of the resulting score.
    score_name_by_sheet = {0: 'epi', 1: 'str', 2: 'imm'}
    # Read from the path that was passed in. Previously the global
    # `smillie_path` was read and the `signatures_path` argument was ignored.
    tables = pd.read_excel(signatures_path, sheet_name=list(score_name_by_sheet))
    for sheet, score_name in score_name_by_sheet.items():
        genes = np.unique(np.array(tables[sheet]['gene'].dropna(), dtype='str'))
        score_genes(adata, gene_list=genes, score_name=score_name)


In [None]:
# Score all cells with the three compartment marker sets; the 'epi', 'str' and
# 'imm' columns of adata_all.obs are read in the next cell.
score_from_smillie_TableS2(adata_all)

In [20]:
# Assign each cell the compartment with the highest score and keep that score.
major_labels = np.array(['epi', 'str', 'imm'])
major_scores = adata_all.obs[['epi', 'str', 'imm']].values
adata_all.obs['celltype_major'] = major_labels[major_scores.argmax(axis=1)]
adata_all.obs['celltype_major_score'] = major_scores.max(axis=1)


In [None]:
# Fixed colours for the 'celltype_1a' annotation, then the UMAP coloured by it.
adata_all.uns['celltype_1a_colors'] = ['#144FAC', '#DD6B80', '#4EAC57']

scv.pl.scatter(
    adata_all,
    basis='umap',
    color=['celltype_1a'],
    ncols=2,
    dpi=300,
    legend_loc='right margin',
    size=2,
    title=['major cell types'],
)

In [None]:
# Side-by-side comparison of the existing '*_score' / 'celltype_1a' columns
# with the scores and labels computed above.
scv.pl.scatter(
    adata_all,
    basis='umap',
    color=[
        'epi_score', 'epi',
        'str_score', 'str',
        'imm_score', 'imm',
        'celltype_1a', 'celltype_major',
        'celltype_1a_score', 'celltype_major_score',
    ],
    ncols=2,
    dpi=300,
    legend_loc='right margin',
    size=1,
)

In [None]:
# Expression of selected genes on the UMAP of all cells.
scv.pl.scatter(
    adata_all,
    basis='umap',
    color=['PDGFRA', 'CD81', 'BMP4', 'GREM1', 'CD34', 'FOXL1', 'GLI1', 'PDPN'],
    ncols=3,
    dpi=300,
    legend_loc='right margin',
    size=2,
)

In [None]:
# Blood and lymphatic endothelium markers on the UMAP of all cells.
scv.pl.scatter(
    adata_all,
    basis='umap',
    color=['PECAM1', 'LYVE1'],
    ncols=3,
    dpi=300,
    legend_loc='right margin',
    size=2,
)

In [None]:
# Immune markers on the UMAP of all cells.
# The list used to end with an empty-string key (''), which is neither a gene
# nor an .obs column, and listed 'CD3G' twice; both entries are removed.
# NOTE(review): the second 'CD3G' was possibly meant to be 'CD3D' — confirm
# before adding it.
scv.pl.scatter(adata_all, basis='umap',
               color=['CD3G', 'CD3E', 'CD8A', 'CD8B', 'CD4',
                      'CD19', 'TNFRSF8', 'CD34', 'CD38', 'CD14', 'ITGAM', 'PTPRC',
                      'FCGR2A'],
               ncols=3, dpi=300, legend_loc='right margin', size=2)

### score the whole thing

In [23]:
# Load the first three sheets of the workbook: epithelial, stromal, immune.
epi_tab, str_tab, imm_tab = (
    pd.read_excel(smillie_path, sheet_name=sheet) for sheet in range(3)
)

In [24]:
# For each table: a Series indexed by 'ident' whose values are the lists of
# entries in the 'gene' column for that ident.
epi_list, str_list, imm_list = (
    table.groupby('ident')['gene'].apply(list)
    for table in (epi_tab, str_tab, imm_tab)
)

In [25]:
# Sorted unique 'ident' values (cell type names) of each table.
epi_celltype, str_celltype, imm_celltype = (
    np.unique(table[['ident']]) for table in (epi_tab, str_tab, imm_tab)
)

In [26]:
# All cell type names in one array: epithelial first, then stromal, then immune.
epi_and_str_celltype = np.append(epi_celltype, str_celltype)
all_cell_type = np.append(epi_and_str_celltype, imm_celltype)

In [None]:
# Score all cells once per epithelial cell type, using that type's gene list;
# each score is stored under the cell type's name.
for ctype in np.unique(epi_tab[['ident']]):
    # Look the gene list up by label. The previous `epi_list[[ctype]][0]`
    # relied on integer-position fallback on a label-indexed Series, which
    # pandas has deprecated.
    score_genes(adata_all, gene_list=epi_list.loc[ctype],
                score_name=ctype)

In [None]:
# Score all cells once per stromal cell type, using that type's gene list;
# each score is stored under the cell type's name.
for ctype in np.unique(str_tab[['ident']]):
    # Look the gene list up by label. The previous `str_list[[ctype]][0]`
    # relied on integer-position fallback on a label-indexed Series, which
    # pandas has deprecated.
    score_genes(adata_all, gene_list=str_list.loc[ctype],
                score_name=ctype)

In [None]:
# Score all cells once per immune cell type, using that type's gene list;
# each score is stored under the cell type's name.
for ctype in np.unique(imm_tab[['ident']]):
    # Look the gene list up by label. The previous `imm_list[[ctype]][0]`
    # relied on integer-position fallback on a label-indexed Series, which
    # pandas has deprecated.
    score_genes(adata_all, gene_list=imm_list.loc[ctype],
                score_name=ctype)

In [30]:
# Fine label per cell: the cell type whose score column is highest.
fine_scores = adata_all.obs[all_cell_type].values
adata_all.obs['celltype_fine'] = np.array(all_cell_type)[fine_scores.argmax(axis=1)]


In [None]:
# UMAP of all cells coloured by the fine, score-derived label.
scv.pl.scatter(
    adata_all,
    basis='umap',
    color=['celltype_fine'],
    ncols=2,
    dpi=300,
    legend_loc='right margin',
    size=1,
)

### score each compartment separately

In [None]:
# Score only the epithelial object, once per epithelial cell type; each score
# is stored under the cell type's name.
for ctype in np.unique(epi_tab[['ident']]):
    # Look the gene list up by label. The previous `epi_list[[ctype]][0]`
    # relied on integer-position fallback on a label-indexed Series, which
    # pandas has deprecated.
    score_genes(adata_epi, gene_list=epi_list.loc[ctype],
                score_name=ctype)

In [34]:
# Epithelial label per cell: the epithelial cell type with the highest score.
epi_type_scores = adata_epi.obs[epi_celltype].values
adata_epi.obs['celltype_direct_smillie'] = np.array(epi_celltype)[epi_type_scores.argmax(axis=1)]


In [None]:
# Number of cells per (sample_origin, score-derived label) combination,
# ordered by index.
adata_epi.obs[['sample_origin' , 'celltype_direct_smillie']].value_counts().sort_index()

In [37]:
# Reuse the colours stored for 'cell_type_epi_custom' for the new annotation.
# NOTE(review): assumes both annotations have the same number and order of
# categories — confirm, otherwise the colours will not line up.
adata_epi.uns['celltype_direct_smillie_colors'] = adata_epi.uns['cell_type_epi_custom_colors']

In [None]:
# Expression of selected genes on the epithelial UMAP.
scv.pl.scatter(
    adata_epi,
    basis='umap',
    color=['BRAF', 'KRAS', 'TP53', 'APC', 'NOTCH1', 'CTNNB1'],
    ncols=3,
    dpi=300,
    legend_loc='right margin',
    size=2,
)

In [None]:
# Expression of further selected genes on the epithelial UMAP.
scv.pl.scatter(
    adata_epi,
    basis='umap',
    color=['LGR5', 'P2RX4', 'MTOR', 'AKT1', 'TOP2A', 'LRP6', 'BMP4', 'BMP2'],
    ncols=3,
    dpi=300,
    legend_loc='right margin',
    size=2,
)

In [None]:
# Compare the score-derived epithelial labels with the two existing annotations.
scv.pl.scatter(
    adata_epi,
    basis='umap',
    color=['celltype_direct_smillie', 'Uhlitz_scANVI', 'cell_type_epi_custom'],
    ncols=1,
    dpi=300,
    legend_loc='right margin',
    size=2,
)

In [None]:
# Score only the immune object, once per immune cell type; each score is
# stored under the cell type's name.
for ctype in np.unique(imm_tab[['ident']]):
    # Look the gene list up by label. The previous `imm_list[[ctype]][0]`
    # relied on integer-position fallback on a label-indexed Series, which
    # pandas has deprecated.
    score_genes(adata_imm, gene_list=imm_list.loc[ctype],
                score_name=ctype)

In [42]:
# Immune label per cell: the immune cell type with the highest score.
imm_type_scores = adata_imm.obs[imm_celltype].values
adata_imm.obs['celltype_direct_smillie'] = np.array(imm_celltype)[imm_type_scores.argmax(axis=1)]

In [None]:
# Immune UMAP coloured by the score-derived label.
scv.pl.scatter(
    adata_imm,
    basis='umap',
    color=['celltype_direct_smillie'],
    ncols=1,
    dpi=300,
    legend_loc='right margin',
    size=2,
)

## Keep Uhlitz epi, score the immune and stromal

In [421]:
# Re-run highly-variable-gene selection, PCA, the neighbour graph and
# louvain/leiden clustering on the immune subset for better annotation.
# The existing UMAP is deliberately left untouched so plots stay comparable.
# NOTE(review): setting uns['log1p']['base'] is presumably a workaround for
# scanpy expecting that key on data reloaded from disk — confirm. It raises
# KeyError if uns has no 'log1p' entry.
adata_imm.uns['log1p']['base'] = None
sc.pp.highly_variable_genes(adata_imm, n_top_genes=2000, batch_key='sample') 
sc.tl.pca(adata_imm, svd_solver='arpack', n_comps = 50, use_highly_variable=True)
sc.pp.neighbors(adata_imm, n_neighbors=20, n_pcs=15)

# Two clusterings per algorithm: resolution 1 and a finer one at resolution 2.
sc.tl.louvain(adata_imm, key_added='louvain', resolution=1)
sc.tl.leiden(adata_imm, key_added='leiden', resolution=1)
sc.tl.louvain(adata_imm, key_added='louvain_highres', resolution=2)
sc.tl.leiden(adata_imm, key_added='leiden_highres', resolution=2)

In [429]:
# Score the immune subset with the signatures found under `signatures_path_`.
# NOTE(review): presumably this adds the score columns named in `imm_basic`
# (defined in the next cell) to adata_imm.obs — confirm in score_and_classify.
score_tumor_immune_cells(adata_imm, signatures_path_)

In [427]:
# Score columns of adata_imm.obs used for the "basic" immune annotation.
imm_basic = [
    'B cells',
    'T cells',
    'Macrophages',
    'Monocytes',
    'Neutrophils',
    'NK cells',
    'Plasma cells',
]

In [None]:
# Immune subset: T-cell, Treg and B-cell markers.
# Marker choice follows https://www.nature.com/articles/s41467-022-29366-6
# NOTE(review): 'CD3G' appears twice in the first row (duplicate panel);
# possibly 'CD3D' was intended — confirm.

scv.pl.scatter(adata_imm, basis='umap', color=['CD3G','CD3E','CD3G','CD8A','CD8B','CD4', 'CD28', # T cells
                                               'FOXP3', 'IL2RA', 'CTLA4', 'TNFRSF18', 'ENTPD1', # Treg, CD25, CD39, CD73 
                                               'CD19', 'MS4A1', 'CD79A', 'CD79B' # CD20, B cells
                                               ], 
               ncols=4, dpi=150, legend_loc='right margin', size = 2)

In [None]:
# Immune subset: NK-cell markers (NCAM1 = CD56, FCGR3A = CD16).
scv.pl.scatter(
    adata_imm,
    basis='umap',
    color=['NCAM1', 'FCGR3A', 'KLRF1', 'NCR3', 'NCR2', 'NCR1'],
    ncols=4,
    dpi=150,
    legend_loc='right margin',
    size=2,
)

In [None]:
# Immune subset: myeloid markers (monocytes, macrophages, neutrophils, DCs).
# Marker choice follows https://www.nature.com/articles/s41467-022-29366-6

scv.pl.scatter(adata_imm, basis='umap', color=['FCGR3A', # CD16, NK, Monocytes, Macrophages, Neutrophil
                                               'CD14', # Mono, Macro
                                               'ITGAM', # CD11b, Mono, Macro, Neu
                                               'CD68', # Mye, Macro, DC, Mono
                                               'CD163', # M2, Mono, DC(subset)
                                               'CEACAM8', 'FCGR3B',# CD66b, Neu unique
                                               'ITGAX', 'CXCL8', 'CD86', 'CD209', 'CD33', 'CCR2',#CD11c, DC
                                               'PTPRC', # CD45, HSC
                                               'FCGR2A', # CD32a, Mono, Macro, DC, Mast, Neu
                                               'SPP1' # Macro
                                               ], 
               ncols=4, dpi=150, legend_loc='right margin', size = 2)

In [None]:
# Immune subset: monocyte markers from Pelka et al.
# https://doi.org/10.1016/j.cell.2021.08.003
scv.pl.scatter(
    adata_imm,
    basis='umap',
    color=['FCN1', 'VCAN', 'CD300E', 'S100A12'],  # Mono
    ncols=4,
    dpi=150,
    legend_loc='right margin',
    size=2,
)

In [None]:
# Immune subset: macrophage and dendritic-cell markers from Pelka et al.
# https://doi.org/10.1016/j.cell.2021.08.003 
# Commented-out genes are kept as a record of markers that were considered
# but are not plotted.
scv.pl.scatter(adata_imm, basis='umap', color=['CD163', 'APOC1', 'LIPA',  # Macro
                                               'C1QA', 'CSF1R', # Macro + DC
                                               #'CLEC9A', 'XCR1', 
                                               'BATF3',
                                              'FCER1A', 'CD1C', 'CLEC10A', 'CD1E', 'PAK1', #'IL22RA2',
                                              #'CLEC4C', 'IRF7', 'CXCR3', 'LILRA4',
                                              #'SIGLEC6', 'AXL','CCL19',
                                               'CSF3R',
                                               'LAMP3', 'CCL22', 'CCR7','CD274', 'FCGR3B', 'HCAR2'], # different DCs
               ncols=4, dpi=150, legend_loc='right margin', size = 2)

In [None]:
# Immune subset: plasma-cell and mast-cell markers.
# Marker choice follows https://www.nature.com/articles/s41467-022-29366-6
scv.pl.scatter(
    adata_imm,
    basis='umap',
    color=[
        'SDC1', 'MZB1', 'XBP1',                      # plasma (SDC1 = CD138)
        'KIT', 'IL1RL1', 'MS4A2', 'TPSAB1', 'CPA3',  # Mast
    ],
    ncols=4,
    dpi=150,
    legend_loc='right margin',
    size=2,
)

In [574]:
# Hand-picked marker signatures, keyed by the score name they are stored under.
# NOTE(review): 'CD3G' is listed twice in each of the three T-cell signatures;
# possibly 'CD3D' was intended — confirm (left unchanged to keep scores as is).
custom_lymphoid_signatures = {
    'CD4+ T cells': ['CD3G','CD3E','CD3G','CD4', 'PTPRC'],
    'CD8+ T cells': ['CD3G','CD3E','CD3G','CD8A','CD8B', 'PTPRC'],
    'Treg cells': ['CD3G','CD3E','CD3G','CD4','FOXP3', 'PTPRC'],
    'B cells (my)': ['CD19', 'MS4A1', 'CD79A', 'CD79B', 'PTPRC'],
    'NK cells (my)': ['NCAM1','FCGR3A','KLRF1','NCR3', 'NCR2', 'NCR1'],
    'Mast cells': ['KIT','IL1RL1', 'MS4A2', 'TPSAB1', 'CPA3'],
    'Plasma cells (my)': ['SDC1','MZB1', 'XBP1'],
}
for signature_name, signature_genes in custom_lymphoid_signatures.items():
    score_genes(adata_imm, signature_genes, score_name=signature_name)

#score_genes(adata_imm, ['CD68', 'PTPRC'], score_name='Myeloid cells')

In [None]:
# Hand-picked myeloid marker signatures, keyed by the score name they are
# stored under.
custom_myeloid_signatures = {
    'Neutrophils (my)': ['FCGR3B', 'CEACAM8','ITGAM', 'PTPRC'],
    'Dendritic cells': ['ITGAX','C1QA', 'CSF1R'],
    'Monocytes (my)': ['CD14','FCN1', 'VCAN', 'CD300E', 'S100A12'],
    'Macrophages (my)': ['CD68', 'CD14','APOC1', 'LIPA','C1QA', 'CSF1R'],
}
for signature_name, signature_genes in custom_myeloid_signatures.items():
    score_genes(adata_imm, signature_genes, score_name=signature_name)


In [None]:
# Score columns of adata_imm.obs used for the finer ("higher") immune annotation.
imm_higher = [
    'B cells (my)',
    'CD4+ T cells',
    'CD8+ T cells',
    'Treg cells',
    'NK cells (my)',
    'Mast cells',
    'Dendritic cells',
    'Monocytes (my)',
    'Macrophages (my)',
    'Plasma cells (my)',
    'Neutrophils (my)',
    # 'Myeloid cells',
]

In [None]:
# basic
# Mean of every basic immune score within each Leiden cluster
# (rows: score names, columns: clusters). `df` is reused by the next cells.
df = adata_imm.obs[imm_basic + ['leiden']].groupby(by=["leiden"], dropna=False).mean().T
plt.figure(figsize = (16,4))
ax = sns.heatmap(df, square=True)
ax.set_xticklabels(ax.get_xticklabels(),rotation = 0)
# The trailing semicolon hides the tick-label repr. It has to sit on the
# statement itself: a `;` on a line of its own is not valid Python.
ax.set_yticklabels(ax.get_yticklabels(),rotation = 0);

In [None]:
# One-row table: for each cluster (column of `df`), the score name (row of
# `df`) with the highest mean.
pd.DataFrame(np.array(df.index[np.argmax(df.values, axis=0)]), index=df.columns).T

In [482]:
# Label every cell with the basic immune type whose mean score is highest in
# the cell's Leiden cluster (`df`: rows = score names, columns = clusters).
celltypes = df.index[np.argmax(df.values, axis=0)]
clusters = df.columns
# Map cluster -> label in one step. The previous per-cluster chained assignment
# (`obs[col][mask] = value`) writes through an intermediate object, triggers
# SettingWithCopyWarning, and has no effect under pandas copy-on-write.
cluster_to_celltype = dict(zip(clusters, celltypes))
imm_basic_labels = adata_imm.obs['leiden'].map(cluster_to_celltype)
# Store as categorical (categories sorted), the dtype used for annotations.
adata_imm.obs['imm_basic'] = pd.Categorical(imm_basic_labels.astype(object))

In [None]:
# higher
# Mean of every finer immune score within each Leiden cluster
# (rows: score names, columns: clusters). `df` is reused by the next cells.
df = adata_imm.obs[imm_higher + ['leiden']].groupby(by=["leiden"], dropna=False).mean().T
plt.figure(figsize = (16,7))
ax = sns.heatmap(df, square=True)
ax.set_xticklabels(ax.get_xticklabels(),rotation = 0)
# The trailing semicolon hides the tick-label repr. It has to sit on the
# statement itself: a `;` on a line of its own is not valid Python.
ax.set_yticklabels(ax.get_yticklabels(),rotation = 0);

In [None]:
# One-row table: for each cluster (column of `df`), the score name (row of
# `df`) with the highest mean.
pd.DataFrame(np.array(df.index[np.argmax(df.values, axis=0)]), index=df.columns).T

In [579]:
# Label every cell with the finer immune type whose mean score is highest in
# the cell's Leiden cluster (`df`: rows = score names, columns = clusters).
celltypes = df.index[np.argmax(df.values, axis=0)]
clusters = df.columns
# Map cluster -> label in one step. The previous per-cluster chained assignment
# (`obs[col][mask] = value`) writes through an intermediate object, triggers
# SettingWithCopyWarning, and has no effect under pandas copy-on-write.
cluster_to_celltype = dict(zip(clusters, celltypes))
imm_higher_labels = adata_imm.obs['leiden'].map(cluster_to_celltype)
# Store as categorical (categories sorted) so that `.cat` operations on this
# column work without relying on a later conversion.
adata_imm.obs['imm_higher'] = pd.Categorical(imm_higher_labels.astype(object))

### Florian's annotation

In [333]:
# Load the reference object whose .obs carries the 'cell_type_imm' and
# 'cell_type_str' annotations used for comparison below.
adata_florian_all = sc.read(Path(data_folder)/'anno/CRC/3p/seu_all_final.h5')

In [335]:
# Copy the reference immune annotation onto the immune subset, aligned by cell
# name; cells absent from the reference get a missing value (via reindex).
# The right-hand side is a one-column DataFrame assigned to a single column.
adata_imm.obs['flo_imm'] = adata_florian_all.obs.reindex(adata_imm.obs.index)[['cell_type_imm']]

In [None]:
# Immune subset coloured by three of the clusterings computed above.
scv.pl.scatter(
    adata_imm,
    color=['louvain_highres', 'leiden', 'leiden_highres'],
    legend_loc='right',
    size=2,
    ncols=1,
    dpi=150,
)

In [None]:
# Compare both score-based immune annotations with the reference annotation.
scv.pl.scatter(
    adata_imm,
    color=['imm_basic', 'imm_higher', 'flo_imm'],
    legend_loc='right',
    size=2,
    ncols=1,
    dpi=150,
)

In [585]:
# Drop the " (my)" suffix from the labels of the hand-made signatures.
# `.cat` only exists on categorical columns, so convert explicitly first
# (a no-op if the column is already categorical) instead of relying on an
# earlier step having converted it.
adata_imm.obs['imm_higher'] = (
    adata_imm.obs['imm_higher']
    .astype('category')
    .cat.rename_categories({
        'B cells (my)':'B cells',
        'Macrophages (my)':'Macrophages',
        'Monocytes (my)':'Monocytes',
        'Neutrophils (my)':'Neutrophils',
        'Plasma cells (my)':'Plasma cells'
    })
)

In [None]:
# Immune subset coloured by the renamed finer annotation.
scv.pl.scatter(
    adata_imm,
    color=['imm_higher'],
    legend_loc='right',
    size=2,
    ncols=2,
    dpi=150,
)

### Stromal cells: endothelial, fibroblast, other

In [587]:
# Re-run highly-variable-gene selection, PCA, the neighbour graph and
# louvain/leiden clustering on the stromal subset for better annotation
# (same steps and parameters as for the immune subset).
# The existing UMAP is deliberately left untouched so plots stay comparable.
# NOTE(review): setting uns['log1p']['base'] is presumably a workaround for
# scanpy expecting that key on data reloaded from disk — confirm. It raises
# KeyError if uns has no 'log1p' entry.
adata_str.uns['log1p']['base'] = None
sc.pp.highly_variable_genes(adata_str, n_top_genes=2000, batch_key='sample') 
sc.tl.pca(adata_str, svd_solver='arpack', n_comps = 50, use_highly_variable=True)
sc.pp.neighbors(adata_str, n_neighbors=20, n_pcs=15)

# Two clusterings per algorithm: resolution 1 and a finer one at resolution 2.
sc.tl.louvain(adata_str, key_added='louvain', resolution=1)
sc.tl.leiden(adata_str, key_added='leiden', resolution=1)
sc.tl.louvain(adata_str, key_added='louvain_highres', resolution=2)
sc.tl.leiden(adata_str, key_added='leiden_highres', resolution=2)

In [588]:
# Copy the reference stromal annotation onto the stromal subset, aligned by
# cell name; cells absent from the reference get a missing value (via reindex).
# The right-hand side is a one-column DataFrame assigned to a single column.
adata_str.obs['flo_str'] = adata_florian_all.obs.reindex(adata_str.obs.index)[['cell_type_str']]

In [None]:
# Stromal subset coloured by two clusterings and the reference annotation.
scv.pl.scatter(
    adata_str,
    color=['louvain', 'leiden', 'flo_str'],
    legend_loc='right',
    size=2,
    ncols=2,
    dpi=150,
)

In [None]:
# Stromal subset: marker genes per stromal population (Phillips).
# endothelial/Platelet: PECAM1
# muscle cells: ACTA2
# Marker choice follows https://www.nature.com/articles/s41467-022-29366-6
scv.pl.scatter(adata_str, color=['PECAM1', 'VWF', 'MCAM', 'ENG', 'CDH5',# Endo
                                 'FAP', 'TNC', # CAF
                                 'IL13RA2','IL11', # infl. FB
                                 'COL1A1', 'COL3A1', # FB 
                                 'S100B', 'CDH2', # glial
                                 'ACTA2', # Pericytes/Smooth muscle
                                  'VIM', # mesenchymal
                                  'DES', 'S100A4', 'FN1', # potential, not unique myofib/Peri
                                 'PDGFRB', 'CSPG4' # Peri
                                 ],
               legend_loc='right', 
               size = 2, ncols =5, dpi=150)

In [622]:
# Hand-picked stromal marker signatures, keyed by the score name they are
# stored under.
custom_stromal_signatures = {
    'Endothelial cells': ['PECAM1', 'VWF', 'MCAM', 'ENG', 'CDH5'],
    'CAFs': ['FAP'],
    'Fibroblasts': ['COL1A1', 'COL3A1','ACTA2'],
    'Pericytes': ['ACTA2','PDGFRB', 'CSPG4','DES'],
}
for signature_name, signature_genes in custom_stromal_signatures.items():
    score_genes(adata_str, signature_genes, score_name=signature_name)
#score_genes(adata_str, ['FN1'], score_name='Myofibroblasts')


In [623]:
# Score columns of adata_str.obs used for the stromal annotation
# ('Myofibroblasts' is currently not scored, hence left out).
str_basic = ['Endothelial cells', 'CAFs','Fibroblasts', 'Pericytes'] #,'Myofibroblasts'

In [None]:
# basic
# Mean of every stromal score within each Leiden cluster
# (rows: score names, columns: clusters). `df` is reused by the next cells.
df = adata_str.obs[str_basic + ['leiden']].groupby(by=["leiden"], dropna=False).mean().T
plt.figure(figsize = (16,7))
ax = sns.heatmap(df, square=True)
ax.set_xticklabels(ax.get_xticklabels(),rotation = 0)
# The trailing semicolon hides the tick-label repr. It has to sit on the
# statement itself: a `;` on a line of its own is not valid Python.
ax.set_yticklabels(ax.get_yticklabels(),rotation = 0);

In [625]:
# One-row table: for each cluster (column of `df`), the score name (row of
# `df`) with the highest mean.
pd.DataFrame(np.array(df.index[np.argmax(df.values, axis=0)]), index=df.columns).T

leiden,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Endothelial cells,Fibroblasts,Fibroblasts,Pericytes,Endothelial cells,Fibroblasts,Fibroblasts,Endothelial cells,Fibroblasts,Fibroblasts,Fibroblasts,Fibroblasts,Fibroblasts,Pericytes,Endothelial cells,Fibroblasts


In [626]:
# Label every cell with the stromal type whose mean score is highest in the
# cell's Leiden cluster (`df`: rows = score names, columns = clusters).
celltypes = df.index[np.argmax(df.values, axis=0)]
clusters = df.columns
# Map cluster -> label in one step. The previous per-cluster chained assignment
# (`obs[col][mask] = value`) writes through an intermediate object, triggers
# SettingWithCopyWarning, and has no effect under pandas copy-on-write.
cluster_to_celltype = dict(zip(clusters, celltypes))
str_basic_labels = adata_str.obs['leiden'].map(cluster_to_celltype)
# Store as categorical (categories sorted), the dtype used for annotations.
adata_str.obs['str_basic'] = pd.Categorical(str_basic_labels.astype(object))

In [None]:
# Stromal subset coloured by the score-based annotation.
scv.pl.scatter(
    adata_str,
    color=['str_basic'],
    legend_loc='right',
    size=2,
    ncols=2,
    dpi=150,
)

### Put all cell types together

In [648]:
# Combine the per-compartment annotations into one second-level label per cell.
# For every coarse compartment in 'celltype_1a', take the label from the
# matching sub-object: epi -> 'Uhlitz_scANVI', str -> 'str_basic',
# imm -> 'imm_higher'. Cells of any other compartment keep None.
#
# One masked assignment per compartment replaces the former per-cell loop,
# which used chained assignment (`obs[col][i] = ...`), integer positions on a
# label-indexed Series, and one `.loc` lookup per cell.
level2_sources = {
    'epi': (adata_epi, 'Uhlitz_scANVI'),
    'str': (adata_str, 'str_basic'),
    'imm': (adata_imm, 'imm_higher'),
}
cell_type_level2 = pd.Series([None] * adata_all.shape[0],
                             index=adata_all.obs.index, dtype=object)
for compartment, (adata_sub, label_key) in level2_sources.items():
    in_compartment = (adata_all.obs['celltype_1a'] == compartment).values
    cells = adata_all.obs.index[in_compartment]
    # `.loc` raises KeyError if a cell is missing from the sub-object, exactly
    # as the per-cell lookup did.
    cell_type_level2[in_compartment] = adata_sub.obs.loc[cells, label_key].astype(object).values
adata_all.obs['cell_type_level2'] = cell_type_level2
    

In [None]:
# All cells coloured by the combined second-level annotation.
scv.pl.scatter(
    adata_all,
    color=['cell_type_level2'],
    legend_loc='right',
    size=2,
    ncols=1,
    dpi=300,
)

In [None]:
# Save the fully annotated object (including all score and label columns added
# to adata_all.obs above) next to the re-preprocessed input data.
adata_all.write(Path(new_data_folder)/'adata_all_full_cell_type_annotation.h5')