# Annotating Cell Types

This workbook is run after the standard workflow on the PBMC 3k dataset.



In [None]:
import besca as bc
import numpy as np
import pandas as pd
# `scanpy.api` was deprecated and later removed from scanpy; the top-level
# package provides the `pl`, `logging`, `settings` and `read_h5ad` names
# this notebook uses.
import scanpy as sc
import matplotlib.pyplot as plt
from scipy import sparse, io
import os
import time
import logging
import seaborn as sns

# Print the versions of the loaded packages for reproducibility.
sc.logging.print_versions()

# for standard processing, set verbosity to minimum
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
version = '2.8'
# Wall-clock timestamp taken when the notebook starts.
start0 = time.time()

In [None]:
# Define the standardized file paths used throughout this notebook.
root_path = os.getcwd()
# Folder under which the besca dataset files ('besca/datasets/...') are looked up;
# adjust this to your local setup.
bescapath = "./Code/Besca/besca_dev/"

# Name of the analysis and the folder holding its results
# (<working dir>/analyzed/<analysis_name>).
analysis_name = 'pbmc3k_filtered'
results_folder = os.path.join(root_path, 'analyzed', analysis_name)
# Clustering key used for plotting and for locating the labelings/<clusters>/ files.
clusters = 'leiden'


In [None]:

# Load <results_folder>/<analysis_name>.h5ad and display a summary of the object.
h5ad_path = os.path.join(results_folder, analysis_name + '.h5ad')
adata = sc.read_h5ad(h5ad_path)
adata


In [None]:
# UMAP coloured by the clustering, with the legend drawn on the data.
sc.pl.umap(
    adata,
    color=[clusters],
    legend_loc='on data',
)

In [None]:
# Besca-provided immune signatures can be loaded with the call below
# (here the non-refined set); the result is displayed for inspection.
signature_dict = bc.datasets.load_immune_signatures(refined=False)
signature_dict

In [None]:

# Compute combined signature scores on adata from the signatures in Immune.gmt.
# os.path.join is used instead of string concatenation, which produced a
# doubled '/' when bescapath already ends with a separator.
gmt_file = os.path.join(bescapath, 'besca', 'datasets', 'genesets', 'Immune.gmt')
bc.tl.sig.combined_signature_score(
    adata,
    gmt_file,
    UP_suffix='_UP',
    DN_suffix='_DN',
    method='scanpy',
    overwrite=False,
    verbose=False,
    use_raw=True,
    conversion=None,
)

In [None]:
# Collect every adata.obs column whose name contains 'scanpy'.
scores = [column for column in adata.obs.columns if 'scanpy' in column]

In [None]:
# One UMAP panel per collected score column.
sc.pl.umap(
    adata,
    color=scores,
)

# Immune signatures for specific sub-populations

In [None]:
 ## PROVIDED WITH BESCA
# Signature file used for annotation. os.path.join replaces string
# concatenation, which produced a doubled '/' when bescapath ends with '/'.
gmt_file_anno = os.path.join(
    bescapath, 'besca', 'datasets', 'genesets', 'CellNames_scseqCMs6_sigs.gmt'
)
# Score adata for these signatures using the function's default settings.
bc.tl.sig.combined_signature_score(adata, gmt_file_anno)


In [None]:
# Re-collect the 'scanpy' score columns (more were added by the scoring
# call on the annotation signatures) and plot them all.
scores = [column for column in adata.obs.columns if 'scanpy' in column]
sc.pl.umap(
    adata,
    color=scores,
    color_map='viridis',
)

# Automated annotation

A decision-tree-based annotation that reads signatures from a provided .gmt file and hierarchy as well as cutoffs and signature ordering from a configuration file and attributes each cell to a specific type according to signature enrichment. 

This is an aid to start the annotation; the annotation can then be further refined by adding further signatures or adjusting the configuration files. It was tested mainly on PBMCs and oncology (tumor biopsies) related samples.


In [None]:
# Read the signatures (undirected) and remove genes not present in the
# dataset as well as empty signatures.
mymarkers = bc.tl.sig.filter_siggenes(
    adata,
    bc.tl.sig.read_GMT_sign(gmt_file_anno, directed=False),
)
# 'Ubi' is used for cutoff adjustment to the individual dataset; the gene
# list can be modified.
mymarkers['Ubi'] = ['B2M', 'ACTB', 'GAPDH']

In [None]:
# Plot the genes of the 'NClassMonocyte' signature on the UMAP.
sc.pl.umap(
    adata,
    color=mymarkers['NClassMonocyte'],
)

We read the configuration file, containing hierarchy, cutoff and signature priority information. 
A new version of this file should be created and maintained with each annotation. 
The included example is optimised for the annotation of the 6.6k PBMC dataset. 

In [None]:
# Annotation config shipped with besca -- replace this with your own config.
# os.path.join replaces string concatenation, which produced a doubled '/'
# when bescapath ends with a separator.
configfile = os.path.join(
    bescapath, 'besca', 'datasets', 'genesets', 'CellNames_scseqCMs6_config.tsv'
)

In [None]:
# sigconfig holds the per-signature settings (its 'Cutoff' column is used
# later); levsk holds the signature ordering per level.
sigconfig, levsk = bc.tl.sig.read_annotconfig(configfile)

`fract_pos` was exported by BESCA in the standard workflow; 
it contains the fraction of positive cells per gene per cluster.

We use these values as the basis for a Wilcoxon test per signature per cluster. 

In [None]:

# Read the fract_pos table exported for the current clustering. The path is
# built with os.path.join, consistent with how results_folder is constructed.
f = pd.read_csv(
    os.path.join(results_folder, 'labelings', clusters, 'fract_pos.gct'),
    sep="\t",
    skiprows=2,  # skip the first two lines of the .gct file
)
# Score each signature in mymarkers against the table.
df = bc.tl.sig.score_mw(f, mymarkers)
# Set a cutoff based on Ubi; it is scaled with the values from the config file.
myc = np.median(df.loc['Ubi', :] * 0.5)


In [None]:
# Peek at the first 3 rows and first 7 columns of the score table.
df.iloc[:3, :7]

For each signature, positive and negative clusters are determined. Only positive clusters are maintained. Cutoffs can be individualised based on the config file (scaling factor) and myc, which is determined based on ubiquitously expressed genes. 

In [None]:
# 'Ubi' was only needed to derive myc; remove its row before attribution.
df = df.drop('Ubi', axis=0)

In [None]:
# Cluster attribution based on cutoff: for each signature (row of df) the
# cutoff is its 'Cutoff' entry in sigconfig multiplied by myc.
# (A fixed value, e.g. 10, can be passed instead to experiment.)
sigscores = {
    signature: bc.tl.sig.getset(
        df, signature, sigconfig.loc[signature, 'Cutoff'] * myc
    )
    for signature in df.index
}

One can inspect the cluster attribution per cell type in the signature list and adjust cutoffs as required. 

In [None]:
#sigscores

In [None]:
# Show the clustering again as a reference for the attribution.
sc.pl.umap(
    adata,
    color=[clusters],
    legend_loc='on data',
)

Now each cluster gets annotated, according to the distinct levels specified in the config file. 
Note that in case a cluster is positive for multiple identities, only the first one is taken, 
in the order specified in the "Order" column in the config file. 

To check the given order per level, you can inspect `levsk`.

In [None]:
#levsk

In [None]:

# Override the cutoff scaling factor of the CD8Tcell signature.
sigconfig.loc["CD8Tcell", "Cutoff"] = 1.2

# Recompute the cluster attribution with the new cutoff.
# (A fixed value, e.g. 10, can be passed instead to experiment.)
sigscores = {
    signature: bc.tl.sig.getset(
        df, signature, sigconfig.loc[signature, 'Cutoff'] * myc
    )
    for signature in df.index
}

In [None]:
# Derive the per-cluster annotation from the scores, the positive
# clusters per signature, the config and the per-level signature order.
cnames = bc.tl.sig.make_anno(
    df,
    sigscores,
    sigconfig,
    levsk,
)

We have now obtained, for each cluster, a cell type attribution at distinct levels. 

In [None]:
# Display the annotation returned by make_anno.
cnames

Only short names were used in the signature naming convention in this case. 
One can easily transform these to EFO terms if preferred; a conversion table comes with besca. 

In [None]:
# Convert the short names in cnames using the nomenclature table shipped
# with besca. os.path.join replaces string concatenation, which produced a
# doubled '/' when bescapath ends with a separator.
cnamesDBlabel = bc.tl.sig.obtain_dblabel(
    os.path.join(bescapath, 'besca', 'datasets', 'nomenclature', 'CellTypes_v1.tsv'),
    cnames,
)
cnamesDBlabel

Finally, one can add the new labels to adata.obs as annotation. 

In [None]:
# Add one adata.obs column per annotation level, derived from the
# clustering stored under `clusters`.
for level in ('celltype0', 'celltype2', 'celltype3'):
    adata.obs[level] = bc.tl.sig.add_anno(adata, cnamesDBlabel, level, clusters)

In [None]:
# Compare the clustering and the celltype2 annotation with marker genes,
# one panel per row.
sc.pl.umap(
    adata,
    color=[
        'leiden',
        'celltype2',
        # CD8 T cell markers
        'CD8A',
        'CD8B',
        # CD4 T cell markers
        'CD4',
        'GPR183',
        'CMTM8',
    ],
    ncols=1,
)

In [None]:
# UMAP coloured by the celltype0 annotation.
sc.pl.umap(
    adata,
    color=['celltype0'],
)

In [None]:
# UMAP coloured by the celltype3 annotation.
sc.pl.umap(
    adata,
    color=['celltype3'],
)

### Export labelling

Chosen labels can also be exported as a new folder in labelings/

In [None]:
### Save labelling
#adata = bc.st.additional_labeling(adata, 'celltype3', 'celltype3', 'Major cell types attributed based on HumanCD45p_scseqCMs8', 'schwalip', results_folder)


# Reclustering 

In [None]:
# Keep copies of the current clustering and annotations before reclustering.
# NOTE(review): the column name 'leiden_orginal' is misspelled ("orginal");
# it is kept as-is here because it is a stored column name -- confirm nothing
# reads it before renaming it to 'leiden_original'.
adata.obs['leiden_orginal'] = adata.obs['leiden'].copy()
for label in ('celltype2', 'celltype3'):
    adata.obs[label + '_original'] = adata.obs[label].copy()

In [None]:
# Recluster the cells whose 'celltype2_original' label is one of the
# three listed cell types.
adata_rc = bc.tl.rc.recluster(
    adata,
    celltype_label='celltype2_original',
    celltype=(
        'CD8-positive, alpha-beta T cell',
        'CD4-positive, alpha-beta T cell',
        'cytotoxic CD56-dim natural killer cell',
    ),
    resolution=1.3,
)

In [None]:
#adata_rc

In [None]:
# Reclustered subset: original annotation, new clustering and some markers.
sc.pl.umap(
    adata_rc,
    color=[
        'celltype2_original',
        'leiden',
        # Some markers
        'CD3G',
        'CD8A',
        'CD4',
        'IL7R',
        'NKG7',
        'GNLY',
    ],
)

Exporting the new labelling is required to obtain the files needed in the next steps (e.g. `fract_pos.gct`).

In [None]:
# Export the 'leiden' column of adata_rc as the labelling
# 'Leiden_Reclustering' into results_folder.
adata_rc = bc.st.additional_labeling(
    adata_rc,
    'leiden',
    'Leiden_Reclustering',
    'Leiden Reclustering on Lymphocytes',
    'Julienla',
    results_folder,
)

In [None]:
# Signature scores on the reclustered subset.
sc.pl.umap(
    adata_rc,
    color=scores,
)

In [None]:
# (Re)define the ubiquitous genes used to derive the cutoff.
mymarkers['Ubi'] = ['B2M', 'ACTB', 'GAPDH']
# From here on `clusters` points at the exported reclustering labelling.
clusters = 'Leiden_Reclustering'
# Read the fract_pos table of that labelling. The path is built with
# os.path.join, consistent with how results_folder is constructed.
f = pd.read_csv(
    os.path.join(results_folder, 'labelings', clusters, 'fract_pos.gct'),
    sep="\t",
    skiprows=2,  # skip the first two lines of the .gct file
)
df = bc.tl.sig.score_mw(f, mymarkers)
# Set a cutoff based on Ubi; it is scaled with the values from the config file.
myc = np.median(df.loc['Ubi', :] * 0.5)


In [None]:
# Recompute the cluster attribution for the reclustered data.
# The 'Ubi' row was only needed for myc, so it is removed first.
df = df.drop('Ubi', axis=0)
# (A fixed value, e.g. 10, can be passed instead to experiment.)
sigscores = {
    signature: bc.tl.sig.getset(
        df, signature, sigconfig.loc[signature, 'Cutoff'] * myc
    )
    for signature in df.index
}

In [None]:
#levsk

In [None]:
# Every entry of levsk[1] except 'Tcell' and 'NKcell'.
toExclude = [name for name in levsk[1] if name not in ('Tcell', 'NKcell')]

In [None]:
# toExclude

In [None]:
# Annotation of the reclustered data without excluding any signature.
cnames = bc.tl.sig.make_anno(
    df,
    sigscores,
    sigconfig,
    levsk,
)
cnames

In [None]:
# Same annotation, this time passing the toExclude list built earlier.
cnames = bc.tl.sig.make_anno(
    df,
    sigscores,
    sigconfig,
    levsk,
    toexclude=toExclude,
)
cnames

In [None]:
# Convert the short names in cnames using the nomenclature table shipped
# with besca. os.path.join replaces string concatenation, which produced a
# doubled '/' when bescapath ends with a separator.
cnamesDBlabel = bc.tl.sig.obtain_dblabel(
    os.path.join(bescapath, 'besca', 'datasets', 'nomenclature', 'CellTypes_v1.tsv'),
    cnames,
)
cnamesDBlabel

In [None]:
# Add one adata_rc.obs column per annotation level, derived from the
# 'leiden' clustering of the reclustered object.
for level in ('celltype0', 'celltype2', 'celltype3'):
    adata_rc.obs[level] = bc.tl.sig.add_anno(adata_rc, cnamesDBlabel, level, 'leiden')

In [None]:
# New annotation of the reclustered subset next to marker genes, one
# panel per row. 'IL7R' was listed twice in the original list, which drew
# the same panel twice; it is now listed once.
sc.pl.umap(
    adata_rc,
    color=[
        'celltype3',
        'celltype2',
        'CD3G',
        'CD8A',
        'CD4',
        'IL7R',
        'NKG7',
        'GNLY',
        'SELL',
    ],
    ncols=1,
)

In [None]:
# New versus original annotation on the reclustered subset.
sc.pl.umap(
    adata_rc,
    color=[
        'celltype3',
        'celltype3_original',
        'celltype2',
        'celltype2_original',
    ],
    ncols=1,
)

In [None]:
# Display the annotation of the reclustered data.
cnames

In [None]:
# Collect the celltype2 / celltype3 names in cluster-id order '0', '1', ...
# (numeric order of the ids, one per row of cnames), looking each row up by
# its string id.
# NOTE(review): presumably this is the order annotate_new_cellnames expects
# for `names` -- confirm against its documentation.
cluster_ids = [str(i) for i in range(cnames.shape[0])]
names_2 = [cnames.loc[cluster_id, 'celltype2'] for cluster_id in cluster_ids]
names_3 = [cnames.loc[cluster_id, 'celltype3'] for cluster_id in cluster_ids]

In [None]:
# Write the names obtained on the reclustered subset into adata, for
# celltype2 and then celltype3.
for new_label, new_names in (('celltype2', names_2), ('celltype3', names_3)):
    bc.tl.rc.annotate_new_cellnames(
        adata,
        adata_rc,
        names=new_names,
        new_label=new_label,
        method='leiden',
    )


In [None]:
# Score columns to plot, spelled 'score_<signature>_scanpy' and built from
# the signature names below. Commented-out names are not included.
scores_selection = [
    'score_{}_scanpy'.format(signature)
    for signature in (
        # 'lymphocyte',
        'myeloid',
        'Bcell',
        'Tcells',
        'CD4',
        'CD8',
        'NKcell',
        'monocyte',
        'CD4Tcell',
        'CD8Tcell',
        'Tcell',
        'CD8Tcell_IL7Rmax',
        'CD4Tcell_IL7Rmax',
        'RegTcell',
        # 'cDC1',
        # 'cDC2',
        'cDC',
        # 'NClassMonocyte',
        'Blymphocyte',
        'CD56dimNK',
        'CD56brightNK',
        'NaiCD4Tcell',
        'NaiCD8Tcell',
        'CMCD4Tcell',
        'EMCD8Tcell',
        'EMCD4Tcell',
        'ClassMonocyte',
        'NKTcell',
        'gdTcell',
        'CytotoxCD4Tcell',
        'CytotoxCD8Tcell',
        'ExhCD8Tcell',
        'cDC_CCR7',
        'ActTcell',
        'NaiTcell',
    )
]

In [None]:
# Final annotation levels on the reclustered subset, one panel per row.
sc.pl.umap(
    adata_rc,
    color=['celltype3', 'celltype2'],
    ncols=1,
)

In [None]:
# Selected signature scores on the reclustered subset.
sc.pl.umap(
    adata_rc,
    color=scores_selection,
    color_map='viridis',
)

In [None]:
# Selected signature scores on the full dataset.
sc.pl.umap(
    adata,
    color=scores_selection,
    color_map='viridis',
)

In [None]:


# Updated annotation on the full dataset, one panel per row.
sc.pl.umap(
    adata,
    color=['celltype2', 'celltype3'],
    ncols=1,
    alpha=0.9,
    size=30,
)

In [None]:


# Updated versus original annotation on the full dataset.
sc.pl.umap(
    adata,
    color=[
        'celltype2',
        'celltype2_original',
        'celltype3',
        'celltype3_original',
    ],
    ncols=1,
)

In [None]:
# Save the annotated object into the analysis results folder. The path is
# built from results_folder (<start dir>/analyzed/<analysis_name>) instead of
# a hard-coded relative path, so it matches the folder the input was read
# from even if the analysis name or working directory changes.
adata.write_h5ad(os.path.join(results_folder, 'pbmc3k_processed.h5ad'))