# Annotating Cell Types

This workbook can be run after the standard workflow.
It shows how to use the annotation function to assign the clusters of the dataset generated by the standard workflow to cell types.

We also demonstrate signature-scoring functions and data exploration that can further facilitate annotation or the evaluation thereof.

If an annotated training dataset already exists, an alternative is to use the auto-annot module. Please refer to the corresponding tutorial.

In [None]:
import besca as bc
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from scipy import sparse, io
import os
import time
import logging
import seaborn as sns
sc.logging.print_header()

# for standard processing, set verbosity to minimum
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
version = '2.8'
start0 = time.time()

In [None]:
# Standard locations: the current working directory and the directory that
# contains the installed besca package (used to find bundled genesets/configs)
root_path = os.getcwd()
bescapath_full = os.path.dirname(bc.__file__)
bescapath = os.path.dirname(bescapath_full)

# Name of the standard-workflow analysis to annotate
analysis_name = 'standard_workflow_besca2'
#analysis_name = 'standard_workflow_besca2_CLR' #use _CLR or _DSB for citeseq analysis

# 'mouse' selects the '.mouse' variants of the bundled signature/config files;
# any other value (e.g. 'human') uses the unsuffixed files
species = 'mouse'
sigsuffix = '.mouse' if species == 'mouse' else ''

# Gene symbol conversion table; stays None unless it is filled in further below
conversion = None

annot_author = 'annot_author' ### replace with userid

# Clustering the annotation is based on: leiden (RNA) or citeseq (protein)
clusters = 'leiden'

# Embedding used for visualization: umap or umap_citeseq
umap_basis = 'umap'

# Set to True for Cite-seq data
citeseq = False

# Set to True to export the derived signatures to gmtx
export_sigs = False

# Set to True if you are connected to a mongoDB that you want to use to explore similarity with previous data
mongodb_explore = False


The chunk of code below is useful if this is the initial installation of besca and you are running this notebook as a test. It will download the test dataset if needed and export the labelling. 
This export is usually done at the end of the standard workflow. The exported files are necessary for the annotation.

In [None]:
# Load the AnnData object to annotate and set results_folder, the folder the
# exported labelings are read from and written to.
# Set use_example_dataset to True to run the notebook on the bundled pbmc3k example.
use_example_dataset = False
if use_example_dataset:
    # The example overrides the settings chosen in the configuration cell
    analysis_name='pbmc3k_processed'
    results_folder = os.path.split(os.getcwd())[0] + '/besca/datasets/data/'
    clusters='leiden'
    umap_basis='umap'
    citeseq = False
    # This line will either download, or load the datasets
    adata = bc.datasets.pbmc3k_processed()
    # Export the cluster labelling into results_folder so the annotation steps can read it
    # (this export is normally done at the end of the standard workflow)
    adata = bc.st.additional_labeling(adata, labeling_to_use= clusters, labeling_name = clusters, 
                                      labeling_description = 'Exporting a postori the labels for annotation',
                                      labeling_author = 'Testing', 
                                      results_folder= results_folder)
else:
    if clusters == 'leiden':
        results_folder = os.path.join(root_path, 'analyzed')
        if citeseq:
            # Cite-seq run: the h5ad file is read from the '<results_folder>_merged' directory
            results_folder = os.path.join(results_folder, analysis_name, 'citeseq' , 'citeseq') 
            adata = sc.read_h5ad(os.path.join(results_folder + '_merged' ,analysis_name + '.h5ad') ) 
        else:
            # RNA-only run: the h5ad file sits directly in analyzed/<analysis_name>
            results_folder = os.path.join(results_folder, analysis_name)
            adata = sc.read_h5ad(os.path.join(results_folder, analysis_name + '.h5ad') )
    else:
        # Any other clustering choice: results live under citeseq/<clusters>,
        # the h5ad file again comes from the '<results_folder>_merged' directory
        results_folder = os.path.join(root_path, 'analyzed', analysis_name, 'citeseq', clusters)
        adata = sc.read_h5ad(os.path.join(results_folder + '_merged' ,analysis_name + '.h5ad') )
        # From here on the clustering column is referred to as 'leiden_citeseq'
        clusters='leiden_citeseq'

In [None]:
### Create export file and folder names
results_file = os.path.join(results_folder, analysis_name + '.annotated.h5ad')
# Keep the trailing separator: later cells build figure paths as figdir + filename
figdir=os.path.join(root_path, 'analyzed', analysis_name+'/figures/')
sc.settings.figdir = figdir
# exist_ok makes this a no-op when the folder already exists and avoids the
# check-then-create race of testing os.path.exists first
os.makedirs(figdir, exist_ok=True)

In [None]:
sc.pl.embedding(adata, color = [clusters], basis = umap_basis )

In [None]:
if citeseq:
    # Expression pattern of the antibodies
    # Can be usefull to refine the annotations, if the clusters are RNA-based
    ab_names = adata.var[adata.var.feature_type=='Antibody Capture']
    sc.pl.embedding(adata, basis = umap_basis, color=ab_names.index.tolist(), color_map = 'viridis')

### Explore top marker genes per cluster 

In [None]:
DEgenes=bc.tl.dge.get_de(adata,clusters,demethod='wilcoxon',topnr=5000, logfc=1,padj=0.05)


In [None]:
### Take the 20 genes with the highest Log2FC from the DE results of clusters '0' and '1'
### (cluster '0' genes first) and plot their expression per cluster
tops = [
    gene
    for cluster_id in ('0', '1')
    for gene in DEgenes[cluster_id].sort_values('Log2FC', ascending=False)['Name'][0:20]
]
sc.pl.dotplot(adata, var_names=tops, groupby=clusters)

##### Explore similarity to other data in MongoDB (only works if connected to MongoDB with studies)

In [None]:
# Optional: correlate clusters of this dataset with studies stored in MongoDB.
# Only runs when mongodb_explore was set to True in the configuration cell.
if mongodb_explore:
    import sys
    # NOTE(review): pathtomongoloader is not defined anywhere in this notebook;
    # assign it before enabling mongodb_explore, otherwise this line raises NameError.
    sys.path.append(pathtomongoloader) #Pls adjust e.g. /bioinfo/scseq/mongoloader/
    from scCorr import clusterCorrelation

    # Fraction-positive table exported for the chosen clustering
    fract_pos=results_folder+'/labelings/'+clusters+'/fract_pos.gct'
    
    ### Select clusters of interest e.g. 0|1 or "ALL" for all clusters; 
    ### Select studies of interest for correlating: dbstudies=studyID|analysisID|labelingID or dbstudies=ALL; species=mouse (or human) or species=ALL
    
    ### Select general studies first (use more stringent cutoff e.g. 0.3 to reduce correlations)
    cordf=clusterCorrelation(fract_pos,  "-clusters=1|5", "mouse", "-dbstudies=ALL", "-dbspecies=mouse", 0.3)
    cordf[0] # These are all the top hits
    # Row labels are split on three spaces; fields 0 and 3 are kept and joined with '_'
    cordf[1].index=[x.split('   ')[0]+'_'+x.split('   ')[3] for x in list(cordf[1].index)] #Simplify labeling for heatmap
    ### Plot top results as heatmap
    sns.set_context("paper", font_scale=1.1)     
    # Keep only rows whose maximum value reaches 0.45
    sns.clustermap(cordf[1].loc[cordf[1].transpose().max()>=0.45,:], figsize=(4,10), annot=True,  cmap="YlGnBu")
    
    
    ### Second example - select a specific study
    cordf=clusterCorrelation(fract_pos,  "-clusters=0|1", "mouse", "-dbstudies=TabulaMuris_droplet|TM_droplet_all_besca2_7|celltype", "-dbspecies=mouse", 0.1)
    cordf[0] ### This is the top hit per cluster
    # Here only field 3 of each row label is kept
    cordf[1].index=[x.split('   ')[3] for x in list(cordf[1].index)] #Simplify labeling for heatmap
    ### Plot top results as heatmap
    sns.set_context("paper", font_scale=1.1)     
    # Keep only rows whose maximum value reaches 0.3
    sns.clustermap(cordf[1].loc[cordf[1].transpose().max()>=0.3,:], figsize=(5,6), annot=True,  cmap="YlGnBu")

### Explore distribution of various cell populations

In [None]:
# One can load besca-provided signatures using the function below
signature_dict = bc.datasets.load_immune_signatures(refined=False)

signature_dict

Additionally, it is possible to read signatures from a gmt file and compute scanpy scores using the function below.

If the gmt file is composed of combined signature (UP and DN), a common score will be computed: 
$$Total\_SCORE= Score_{UP} - Score_{DN}$$

In [None]:
# Score the signatures of the Immune.gmt file bundled with besca
gmt_file= bescapath + '/besca/datasets/genesets/Immune.gmt'

if species=='mouse':
    # Genes converted to mouse homologs
    # The homolog table is tab-separated despite its .csv extension
    mousehuman_file = bescapath + '/besca/datasets/homologs/MGItoHGNC.csv'
    mousehuman=pd.read_csv(mousehuman_file,sep='\t',header='infer', encoding="unicode_escape")
    # Build a lookup of the 'HGNC' column indexed by the 'MGI' column
    mousehuman.index=mousehuman['MGI']
    conversion=pd.Series(data=mousehuman['HGNC'], index=mousehuman.index)

# conversion is still None unless species is 'mouse'
bc.tl.sig.combined_signature_score(adata, gmt_file,
                             UP_suffix='_UP', DN_suffix='_DN', method='scanpy',
                             overwrite=False, verbose=False,
                             use_raw=True, conversion=conversion)

In [None]:
scores = [x for x in adata.obs.columns if 'scanpy' in x]

In [None]:
sc.pl.embedding(adata, basis = umap_basis, color= scores)

## Signatures for specific sub-populations

In [None]:
## Provided with besca; change this for own gmt file
gmt_file_anno= bescapath + '/besca/datasets/genesets/CellNames_scseqCMs6_sigs'+sigsuffix+'.gmt'
bc.tl.sig.combined_signature_score(adata, gmt_file_anno) #optional conversion argument , conversion=conversion


In [None]:
### Plot all signatures containing "scanpy" in name
scores = [x for x in adata.obs.columns if 'scanpy' in x]
sc.pl.embedding(adata, basis = umap_basis, color= scores, color_map = 'viridis')

In [None]:
## An extra set of signatures (less specific but informative) is also provided 
gmt_file_anno_extra= bescapath + '/besca/datasets/genesets/CellNames_scseqCMs6_Extrasigs'+sigsuffix+'.gmt'
bc.tl.sig.combined_signature_score(adata, gmt_file_anno_extra) #optional conversion argument , conversion=conversion


In [None]:
### Plot all signatures containing "_scv" in name
scores = [x for x in adata.obs.columns if '_scv' in x]
sc.pl.embedding(adata, basis = umap_basis, color= scores, color_map = 'viridis')

In [None]:
### Plot only selected signatures
sc.pl.embedding(adata, basis = umap_basis, color= ['score_Myeloid_scanpy','score_Bcell_scanpy','score_Tcell_scanpy','score_NKcell_scanpy'], color_map = 'viridis')

# Automated annotation

A decision-tree-based annotation that reads signatures from a provided .gmt file and hierarchy as well as cutoffs and signature ordering from a configuration file and attributes each cell to a specific type according to signature enrichment. 

This is an aid to start the annotation; the annotation can then be further refined by adding further signatures or adjusting the configuration files. It was tested mainly on PBMCs and oncology (tumor biopsies) related samples.


## Loading markers and signature

In [None]:
from itertools import repeat
mymarkers = bc.tl.sig.read_GMT_sign(gmt_file_anno,directed=False)

#optional conversion - if human-based signatures are read
#if species=='mouse':
#    for signature in mymarkers.keys():
#        mymarkers[signature] = [i for i in map(bc.tl.sig._helper._to_geneid, repeat(conversion), mymarkers[signature]) if i is not None]

mymarkers = bc.tl.sig.filter_siggenes(adata, mymarkers) ### remove genes not present in dataset or empty signatures

In [None]:
mymarkers

### Select ubiquitously expressed genes for cutoff adjustment to individual datasets

In [None]:
## The cutoffs in the configuration file are applied relative to the enrichment of this ubiquitous signature.
## For best performance, pick genes that are uniformly distributed across all clusters of the dataset,
## ideally with moderate rather than high expression.
if species == 'mouse':
    mymarkers['Ubi'] = ['B2m', 'Actb', 'Hnrnpk', 'Hnrnpu']  # alternative: 'Znf207'
else:
    mymarkers['Ubi'] = ['ZNF207', 'HNRNPU', 'SNRPD3', 'SRRM1']  # alternatives: ['B2M','ACTB', 'HNRNPK']

In [None]:
### Inspect gene expression for ubi genes, checking for uniform distribution across clusters
sc.pl.embedding(adata, basis = umap_basis, color= mymarkers['Ubi'])

In [None]:
sc.pl.dotplot(adata, var_names= mymarkers['Ubi'], groupby=clusters)

In [None]:
### Inspect gene expression for an example signature
sc.pl.embedding(adata, basis = umap_basis, color= mymarkers['Hematopoietic'])

## Configuration of the annotation

We read the configuration file, containing hierarchy, cutoff and signature priority information. 
A new version of this file should be created and maintained with each annotation. 
The included example is optimised for the annotation of the 6.6k PBMC dataset. 

In [None]:
configfile=bescapath + '/besca/datasets/genesets/CellNames_scseqCMs6_config'+sigsuffix+'.tsv' ### replace this with your config

In [None]:
sigconfig,levsk=bc.tl.sig.read_annotconfig(configfile)

In [None]:
# Optional configuration: the order of cell types at each level can be changed manually if needed
#levsk[0]=['ColorectalCancer', 'Epithelial','Fibroblast','Endothelial','Erythrocyte','HematoStem',
#          'Hepatocyte','MelMelanoma','Neural','Adipocyte','Hematopoietic','Schwann','Chondrocyte','Glial']

#### Get an overview of the cell type hierarchy included in the configuration file

In [None]:
# Draw the cell type hierarchy described in the configuration file and save it as SVG.
# The result gets its own name: binding it to `plt` would replace the
# matplotlib.pyplot alias imported at the top of the notebook for every later cell.
nomenclature_plot = bc.pl.nomenclature_network(configfile, font_size=8)
nomenclature_plot.savefig(figdir+"Nomenclatureplot.svg", format="svg")

Fract_pos was exported by BESCA in the standard workflow; 
it contains the fraction of positive cells per gene per cluster.

We use these values as a basis for a wilcoxon test per signature per cluster. 

In [29]:
## Optional: For an alternative clustering to be used as annotation, one needs to export the corresponding fract_pos

#adata = bc.st.clustering(adata, results_folder, myres=2, method = clusters) ## higher clustering resolution
#bc.export.clustering(adata, outpath = os.path.join(results_folder, 'labelings', clusters+'_r2'), method = clusters)
#bc.export.labeling_info(outpath=os.path.join(results_folder, 'labelings', clusters+'_r2'), description=clusters+' clustering with r=2', method='leiden')

In [None]:
### Score each signature per cluster from the exported fraction-positive table
### (the first two lines of the .gct file are skipped)
fract_pos_file = results_folder + '/labelings/' + clusters + '/fract_pos.gct'
f = pd.read_csv(fract_pos_file, sep="\t", skiprows=2)
df = bc.tl.sig.score_mw(f, mymarkers)

### Derive the base cutoff from the Ubi signature; it is later scaled with the values from the config file.
### Raise the factor to be systematically more stringent (e.g. 1), lower it to be more lenient (e.g. 0.25).
myfactor = 0.3 if species == 'mouse' else 0.5
myc = np.median(df.loc['Ubi', :] * myfactor)

In [None]:
### Check the cutoff 
myc

In [None]:
### Check the ubiquitous signature score (should be as uniform as possible across clusters)
df.loc["Ubi",:]

In [None]:
df.iloc[0:3,0:7]

#### For an overview of highest scoring signatures, one can generate a heatmap

In [None]:
# Heatmap of all signatures whose best cluster score exceeds 1.5 x the Ubi-derived cutoff
signature_heatmap = sns.clustermap(df.loc[df.max(axis=1)>myc*1.5,:].astype(float),figsize=(8, 12))
# Save through the grid returned by clustermap instead of `plt.savefig`: the name
# `plt` is rebound in the nomenclature-plot cell, so it is not guaranteed to
# refer to this figure. ClusterGrid.savefig uses bbox_inches='tight' by default.
signature_heatmap.savefig(figdir+"SignatureHeatmap_all.svg", format="svg")

For each signature, positive and negative clusters are determined. Only positive clusters are maintained. Cutoffs can be individualised based on the config file (scaling factor) and myc, which is determined based on ubiquitously expressed genes. 

In [97]:
### Remove ubiquitous signature for the scoring part
df=df.drop('Ubi')

In [98]:
# Optional: Cutoffs can also be manually adjusted if needed 
# Always adjust from lowest to highest level and remember to check priorities as well (order)
# sigconfig.loc['Epithelial','Cutoff']=1.5 ### Increase for being more stringent, decrease for more leniant


#### Cluster attribution based on cutoff (all clusters above cutoff will be attributed to a cell type)

In [99]:
### One entry per signature, obtained with that signature's own cutoff:
### the 'Cutoff' scaling factor from the config file times the Ubi-derived myc
sigscores = {
    signame: bc.tl.sig.getset(df, signame, sigconfig.loc[signame, 'Cutoff'] * myc)
    for signame in df.index
}
# To apply one fixed cutoff to every signature instead: bc.tl.sig.getset(df, signame, 10)

One can inspect the cluster attribution per cell type in the signature list and adjust cutoffs as required. 

In [None]:
sigscores

In [None]:
### Check to a specific cell type 
sigscores['Hematopoietic']

Now each cluster gets annotated, according to the distinct levels specified in the config file. 
Note that in case a cluster is positive for multiple identities, only the first one is taken, 
in the order specified in the "Order" column in the config file. 

To check the given order, per levels, you can inspect levsk, and adjust above as needed

In [None]:
levsk

#### Cell types that are not expected in the dataset or that are too fine-grained can be explicitly excluded from the annotation

In [103]:
### For instance, if no erythrocytes or pancreatic cells are expected, they can be specified here
### For instance, if plasma cells should not be subclassified, they can can be specified here
toexclude=['Erythrocyte','AlphaPancreatic', 'BetaPancreatic', 
           'DeltaPancreatic','IgGPlasma','IgAPlasma','IgMPlasma','PancreaticDuctal','Cholangiocyte']


### Obtain cluster assignment

In [104]:
cnames=bc.tl.sig.make_anno(df,sigscores,sigconfig,levsk, toexclude=toexclude)

We have now obtained, for each cluster, a cell type attribution at distinct levels. 

In [None]:
cnames

Export the used annotation parameters, for future reference

In [106]:
bc.tl.sig.export_annotconfig(sigconfig, levsk, results_folder, analysis_name)

## Using dblabel convention

Only short names were used in the signature naming convention in this case. 
One can easily transform this to EFO terms if preferred; a conversion table comes with besca. 

This nomenclature is quite extended, and the function 
**obtain_dblabel** can perform the conversion.

In [None]:
### Transform these short forms to dblabel - EFO standard nomenclature
cnamesDBlabel = bc.tl.sig.obtain_dblabel(bescapath+'/besca/datasets/nomenclature/CellTypes_v1.tsv', cnames )
cnamesDBlabel

Finally, one can add the new labels to adata.obs as annotation. 

In [108]:
# Write the dblabel annotation of each level into adata.obs, mapped through the chosen clustering
for level in ('celltype0', 'celltype1', 'celltype2', 'celltype3'):
    adata.obs[level] = bc.tl.sig.add_anno(adata, cnamesDBlabel, level, clusters)

Inspect the labels on the umap from lowest to highest resolution

In [None]:
sc.pl.embedding(adata,color=['celltype1'], basis = umap_basis) 

In [None]:
sc.pl.embedding(adata,color=['celltype2'], basis = umap_basis) 

In [None]:
sc.pl.embedding(adata,color=['celltype3'], basis = umap_basis) 

### Helper functions for additional checks

In [None]:
### Which classification has cluster 7? 
# Use the clustering chosen in the configuration cell (`clusters`) rather than a
# hard-coded 'leiden', since celltype3 was derived from that clustering
bc.tl.sig.match_cluster(adata,clusters,'7','celltype3',0.3)

In [None]:
### What cluster corresponds to CD1c-positive myeloid dendritic cell? 
# Report against the clustering chosen in the configuration cell (`clusters`)
# rather than a hard-coded 'leiden', since celltype3 was derived from that clustering
bc.tl.sig.match_cluster(adata,'celltype3','CD1c-positive myeloid dendritic cell',clusters,0.3) ## lowered cutoff

## Manual refinement

In some cases, the annotation does not produce the optimal result. Manual adjustments can be made, 
by replacing cell type names or by manually labeling clusters. Note that the second option is not 
stable across reruns if any adjustments are made to the clustering. 

In [114]:
### Example 1: Classical monocytes contains both classical and non-classical monocytes, adjust to monocytes
#adata.obs['celltype2']=list(adata.obs['celltype2'].replace('classical monocyte', 'monocyte'))
#adata.obs['celltype3']=list(adata.obs['celltype3'].replace('classical monocyte', 'monocyte'))

In [115]:
### Example 2: non-classical monocytes need to be added to a specific cluster
#adata.obs['celltype2']=adata.obs['celltype2'].cat.add_categories('non-classical monocyte')
#adata.obs['celltype3']=adata.obs['celltype3'].cat.add_categories('non-classical monocyte')
#adata.obs.loc[adata.obs[clusters].isin(['4']),'celltype2']='non-classical monocyte'
#adata.obs.loc[adata.obs[clusters].isin(['4']),'celltype3']='non-classical monocyte'

In [116]:
### Example 3: if some epithelial cells are not recognised as cancer cells
#adata.obs.loc[adata.obs['celltype3']=='epithelial cell','celltype0']='colorectal cancer cell'
#adata.obs.loc[adata.obs['celltype3']=='epithelial cell','celltype1']='colorectal cancer cell'


In [117]:
#### Drop categories that no cell carries any more after the manual adjustments
for level in ('celltype3', 'celltype2', 'celltype1', 'celltype0'):
    adata.obs[level] = adata.obs[level].cat.remove_unused_categories()

## Reclustering sub-clusters 

Sometimes, single clusters can contain a mix of cell types. For example, for the PBMC3K data example, the lymphocyte clusters are mixed. In this case, one can try to increase clustering resolution (as above) or recluster specifically on those clusters.
Below we show an example.

The main steps are:
+ Saving previous clustering and annotation for comparison purpose (advised)
+ Recluster 
+ Export the new labelling (see function additional_labeling)
+ Read the new labelling information including fract_pos files
+ Recompute signatures/markers values
+ Reannotate
+ Convert annotation to dblabel
+ Export all for the data subset to the larger adata object

In [118]:
recluster = False

if use_example_dataset:
    recluster = True
    celltype_label = 'celltype2_original'
    to_recluster =  ('CD8-positive, alpha-beta T cell','CD4-positive, alpha-beta T cell',
                                         'cytotoxic CD56-dim natural killer cell')

In [None]:
if recluster:
    # Keep the pre-reclustering clusters and celltype2 labels for later comparison
    for original_column in ('leiden', 'celltype2'):
        adata.obs[original_column + '_original'] = adata.obs[original_column].copy()

    # Recluster the cell types listed in to_recluster (see bc.tl.rc.recluster)
    adata_rc = bc.tl.rc.recluster(adata, celltype_label=celltype_label,
                                  celltype=to_recluster, resolution=1.3)

    # The new leiden clustering has to be exported before the annotation functions can use it
    cluster_renamed = 'Leiden_Reclustering'
    adata_rc = bc.st.additional_labeling(adata_rc, 'leiden', cluster_renamed,
                                         'Leiden Reclustering on Lymphocytes', 'author', results_folder)

    # Score the signatures on the fract_pos table written for the new labelling
    f = pd.read_csv(results_folder + "/labelings/" + cluster_renamed + "/fract_pos.gct", sep="\t", skiprows=2)
    df = bc.tl.sig.score_mw(f, mymarkers)
    # Base cutoff from the Ubi signature, scaled with values from the config file below.
    # NOTE(review): the factor is fixed to 0.5 here and does not follow `myfactor` - confirm this is intended
    myc = np.median(df.loc['Ubi', :] * 0.5)

    # Recompute the per-signature cluster sets with the new cutoff (Ubi itself is not scored)
    df = df.drop('Ubi')
    sigscores = {
        signame: bc.tl.sig.getset(df, signame, sigconfig.loc[signame, 'Cutoff'] * myc)
        for signame in df.index
    }

    # Adjust cutoffs if needed
    # sigconfig.loc['Epithelial','Cutoff']=1.5

    cnames = bc.tl.sig.make_anno(df, sigscores, sigconfig, levsk, toexclude=toexclude)
    cnamesDBlabel = bc.tl.sig.obtain_dblabel(bescapath+'/besca/datasets/nomenclature/CellTypes_v1.tsv', cnames)

    for level in ('celltype0', 'celltype2', 'celltype3'):
        adata_rc.obs[level] = bc.tl.sig.add_anno(adata_rc, cnamesDBlabel, level, 'leiden')

    # Names are listed by cluster id '0', '1', ... (the original notes that this ordering is needed)
    cluster_ids = [str(idx) for idx in range(cnames.shape[0])]
    names_2 = [cnames['celltype2'][cluster_id] for cluster_id in cluster_ids]
    names_3 = [cnames['celltype3'][cluster_id] for cluster_id in cluster_ids]

    # Carry the new names from the reclustered subset over to the full object
    for new_label, new_names in (('celltype2', names_2), ('celltype3', names_3)):
        bc.tl.rc.annotate_new_cellnames(adata, adata_rc, names=new_names, new_label=new_label, method='leiden')

    sc.pl.umap(adata, color=['celltype2', 'celltype2_original', 'celltype3'], ncols=1)

### Save annotation result and export labelling

Chosen labels can also be exported as a new folder in labelings/

In [None]:
### Attribute the cell annotation level of choice (typically the highest resolution one, if estimated to be reliable) to dblabel
adata.obs['dblabel']=adata.obs['celltype3']

### Export file for future reference
adata.write(results_file)
#adata=sc.read(results_file)

In [None]:
### Save labelling celltype1
adata = bc.st.additional_labeling(adata, 'celltype1', 'celltype1', 'Major cell types', annot_author, results_folder)

### Save labelling celltype2
adata = bc.st.additional_labeling(adata, 'celltype2', 'celltype2', 'Intermediate cell types', annot_author, results_folder)


### Save labelling dblabel used as reference
adata = bc.st.additional_labeling(adata, 'dblabel', 'dblabel', 'Cell types attributed according to CL nomenclature, based on own annotation (celltype3)', annot_author, results_folder)

### Follow-up analyses for marker generation and inspecting the annotation

#### Cell-centered analysis

In [None]:
### Breakdown of cell types per experiment (sample)
bc.pl.celllabel_quant_stackedbar(adata, count_variable='celltype3', subset_variable = 'experiment');


In case two annotation versions are present, one can perform a riverplot to compare them.

In [None]:
bc.pl.riverplot_2categories(adata, ['dblabel','celltype3'])

#### Gene-centered analysis

If one is interested in new markers, one can perform DE at the cell type annotation level of choice. Markers can then also be exported to a .gmtx file for subsequent import into GeMS (signatureDB), following specific conventions. 

In [None]:
if export_sigs:
    ### Metadata setup for cell type signatures
    # Use the annotation author from the configuration cell; the previous
    # `author` is never defined in this notebook and raised NameError here
    User=annot_author
    Source='internal scseq'  
    Subtype='all' # or healthy, onc, ...
    domain='cell marker'
    studyID='mongodb-study-id' # replace with your studyID
    analysisID=analysis_name
    genesetname=studyID+'_dblabel'
    suffix='_model_user' # replace with specific info, e.g. _mc38_pcs
    # Collects the gmtx entries built in the following cells (replaces any earlier signature_dict)
    signature_dict={}

In [None]:
### Perform DE cells of each celltype3 vs. all other cells
DEgenes=bc.tl.dge.get_de(adata,'celltype3',demethod='wilcoxon',topnr=5000, logfc=1,padj=0.05)

##### Example 1: naive B cell markers

In [None]:
### Select the 35 genes with the highest Log2FC for one cell type and plot their expression per cell type
coi='naive B cell' # full dblabel
coishort='NaiBcell' #dblabel short
tops=list(DEgenes[coi].sort_values('Log2FC',ascending=False)['Name'][0:35])
sc.pl.dotplot(adata, var_names=tops,groupby='celltype3')

In [None]:
if export_sigs:
    setName=coishort+suffix
    desc='Genes higher expressed in '+ coi + ' vs. all other cells in MC38 in vivo exp ID tumor; coefs are log2FC'
    pdout=DEgenes[coi].sort_values('Log2FC',ascending=False)[0:30]
    genes="\t".join(list(pdout['Name'].astype(str) + " | " + pdout['Log2FC'].round(2).astype(str)))
    signature_dict[setName] = bc.tl.sig.make_gmtx(setName,desc,User,Source,Subtype,domain,genesetname,
                                                  genes,studyID,analysisID,celltype=coishort)


##### Example 2: pDC markers

In [None]:
### Select the 35 genes with the highest Log2FC for one cell type and plot their expression per cell type
coi='plasmacytoid dendritic cell' # full dblabel
coishort='pDC' #dblabel short
tops=list(DEgenes[coi].sort_values('Log2FC',ascending=False)['Name'][0:35])
sc.pl.dotplot(adata, var_names=tops,groupby='celltype3')

In [None]:
if export_sigs:
    setName=coishort+suffix
    desc='Genes higher expressed in '+ coi + ' vs. all other cells in MC38 in vivo exp ID tumor; coefs are log2FC'
    pdout=DEgenes[coi].sort_values('Log2FC',ascending=False)[0:30]
    genes="\t".join(list(pdout['Name'].astype(str) + " | " + pdout['Log2FC'].round(2).astype(str)))
    signature_dict[setName] = bc.tl.sig.make_gmtx(setName,desc,User,Source,Subtype,domain,genesetname,
                                                  genes,studyID,analysisID,celltype=coishort)


In [None]:
### Export sigs if relevant 
if export_sigs:
    outgmtfile=results_folder+'/Celltypemarkers.gmtx'
    bc.tl.sig.write_gmtx_forgems(signature_dict, outgmtfile)

If one is interested in additional markers correlated with the marker of interest, 
one can calculate the Spearman correlation. 

In [175]:
import scipy.stats as ss
import operator

# Marker of interest. Gene symbols follow the species convention used elsewhere
# in this notebook (e.g. 'B2M' for human, 'B2m' for mouse), so the human symbol
# 'MS4A1' cannot be looked up in a mouse dataset.
reference_gene = 'Ms4a1' if species == 'mouse' else 'MS4A1'


def _expression_vector(gene):
    """Return the raw expression of *gene* across all cells as a flat 1-D array."""
    values = adata.raw[:, gene].X
    # raw.X may be stored sparse or dense; only sparse matrices need densifying
    if sparse.issparse(values):
        values = values.toarray()
    return np.asarray(values).ravel()


# Identical for every comparison, so extract it once instead of once per gene
reference_expr = _expression_vector(reference_gene)

allmarkers=adata.var_names

spearcorsAll = {}
for gene in allmarkers:
    rho = ss.spearmanr(_expression_vector(gene), reference_expr)[0]
    # A gene with constant expression has no defined rank correlation (NaN);
    # NaN values would make the ordering produced by sorted() below undefined
    if not np.isnan(rho):
        spearcorsAll[gene] = rho

# Ascending by correlation: most negatively correlated genes first, most positive last
spearcorsAll=sorted(spearcorsAll.items(),key=operator.itemgetter(1))
spearcorsAll_ids = [idx for idx, val in spearcorsAll]
goiNegAll=spearcorsAll_ids[0:30]
goiPosAll=spearcorsAll_ids[-30:]

In [None]:
sc.pl.dotplot(adata, var_names=goiPosAll,groupby='celltype3')

### Convert to html

In [None]:
%%javascript

IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

In [None]:
nb_name = os.path.join(os.getcwd(), nb_name)

In [None]:
! jupyter nbconvert --to html {nb_name}