# Signature exports

This notebook demonstrates how to export various types of gene signatures, either to a `.gmtx` file or directly through the API, for storage in [GeMS](https://github.com/bedapub/GeMS).

In [6]:
import besca as bc
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from scipy import sparse, io
import os
import time
import logging
import seaborn as sns
# Print the scanpy header (package versions) so the run environment is recorded in the output.
sc.logging.print_header()

# for standard processing, set verbosity to minimum
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
# NOTE(review): presumably the besca standard-workflow version this notebook follows — confirm.
version = '2.8'
# Wall-clock reference taken when the notebook starts.
start0 = time.time()

scanpy==1.6.0 anndata==0.7.5 umap==0.3.10 numpy==1.20.1 scipy==1.5.1 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.10.2 python-igraph==0.9.0 louvain==0.7.0 leidenalg==0.8.3


In [7]:
# --- Standardized file paths -------------------------------------------------
root_path = os.getcwd()
bescapath_full = os.path.dirname(bc.__file__)  # directory containing the besca package
bescapath = os.path.split(bescapath_full)[0]   # its parent directory

# --- Analysis identity -------------------------------------------------------
# Your analysis name, as per standard workflow
# (e.g. 'standard_workflow_besca2_CLR'; use _CLR or _DSB for citeseq analysis).
analysis_name = 'sw_besca24'
species = 'human'  # or mouse for now
conversion = False
annot_author = 'annot_author'  # replace with own userid

# Clustering the annotation is based on: leiden (RNA) or citeseq (protein).
clusters = 'leiden'

# Results are stored under <root_path>/analyzed/<analysis_name>.
results_folder = os.path.join(root_path, 'analyzed', analysis_name)

# --- Global DE cutoffs -------------------------------------------------------
myfc = 1       # log2 fold-change cutoff
mypval = 0.05  # adjusted p-value cutoff

use_example_dataset = True  # set to False if using own data

In [8]:
if use_example_dataset:
    # Switch the settings over to the Granja et al. 2019 example data.
    analysis_name = 'Granja_processed'
    clusters = 'leiden'
    # Output location: <parent of the working directory>/besca/datasets/data/
    results_folder = os.path.dirname(os.getcwd()) + '/besca/datasets/data/'
    # Loads the processed dataset, downloading it first if necessary.
    adata = bc.datasets.Granja2019_processed()

### Example 1: export cell type markers 

In [9]:
### Metadata shared by the cell type signatures of this example
User = annot_author
Source = 'internal scseq'
Subtype = 'healthy'  # e.g. healthy, onc, ...
domain = 'cell marker'
studyID = 'Granja2019'  # replace with the scMongoDB studyID
analysisID = analysis_name
genesetname = f'{studyID}_dblabel'
suffix = '_BMPBMC_user'  # replace with specific info, e.g. _mc38_pcs
signature_dict = {}  # filled below: signature name -> result of make_gmtx

In [10]:
### Differential expression: cells of each 'dblabel' group vs. all other cells
DEgenes = bc.tl.dge.get_de(
    adata,
    'dblabel',
    demethod='wilcoxon',
    topnr=5000,
    logfc=myfc,
    padj=mypval,
)


##### Example 1: naive B cell markers


In [None]:
### Plot the top 35 genes (ranked by log2FC) and export the top 30 as a signature
coi = 'naive B cell'  # full dblabel
coishort = 'NaiBcell'  # dblabel short

# Rank once, strongest up-regulation first; the plot and the export share this ranking.
ranked = DEgenes[coi].sort_values('Log2FC', ascending=False)
tops = list(ranked['Name'][0:35])
sc.pl.dotplot(adata, var_names=tops, groupby='celltype3')

setName = coishort + suffix
# The description is stored with the signature, so it must name the dataset actually
# analysed (Granja2019 here) — adapt it when running on your own data.
desc = 'Genes higher expressed in ' + coi + ' vs. all other cells in Granja et al. 2019 data; coefs are log2FC'
pdout = ranked[0:30]
# One "gene | log2FC" entry per gene, tab-separated.
genes = "\t".join(pdout['Name'].astype(str) + " | " + pdout['Log2FC'].round(2).astype(str))
signature_dict[setName] = bc.tl.sig.make_gmtx(setName, desc, User, Source, Subtype, domain, genesetname, genes, studyID, analysisID)


##### Example 2: pDC markers

In [None]:
### Plot the top 35 genes (ranked by log2FC) and export the top 30 as a signature
coi = 'plasmacytoid dendritic cell'  # full dblabel
coishort = 'pDC'  # dblabel short

# Rank once, strongest up-regulation first; the plot and the export share this ranking.
ranked = DEgenes[coi].sort_values('Log2FC', ascending=False)
tops = list(ranked['Name'][0:35])
sc.pl.dotplot(adata, var_names=tops, groupby='celltype3')

setName = coishort + suffix
# The description is stored with the signature, so it must name the dataset actually
# analysed (Granja2019 here) — adapt it when running on your own data.
desc = 'Genes higher expressed in ' + coi + ' vs. all other cells in Granja et al. 2019 data; coefs are log2FC'
pdout = ranked[0:30]
# One "gene | log2FC" entry per gene, tab-separated.
genes = "\t".join(pdout['Name'].astype(str) + " | " + pdout['Log2FC'].round(2).astype(str))
signature_dict[setName] = bc.tl.sig.make_gmtx(setName, desc, User, Source, Subtype, domain, genesetname, genes, studyID, analysisID)



And now export the signatures to a .gmtx file

In [None]:
# Destination of the exported cell type signatures — replace with the path and file name of interest.
# os.path.join avoids a doubled separator when results_folder already ends with one.
outgmtfile = os.path.join(results_folder, 'Celltypemarkers.gmtx')

bc.tl.sig.write_gmtx_forgems(signature_dict, outgmtfile)

### Example 2: export condition markers

This setup mirrors a perturbation analysis. Here we restrict the data to a single cell type (CD8+ T cells) and compute the expression differences between bone marrow and PBMC samples, so the resulting signatures are tissue markers.

In [39]:
### Metadata shared by the tissue (condition) signatures of this example
User = annot_author
Source = 'internal scseq'
Subtype = 'healthy'  # e.g. healthy, onc, ...
domain = 'tissue marker'  # change to perturbation if needed
studyID = 'Granja2019'  # replace with the scMongoDB studyID
analysisID = analysis_name
genesetname = f'{studyID}_BMvsPBMC_higher'  # for perturbation e.g. Pembro_induced_MC38CD8Tcell
suffix = f'_{studyID}'  # replace with specific info, e.g. _mc38_pcs
signature_dict = {}  # start a fresh collection for this export


###### Genes differentially expressed between BM and PBMC in CD8+ T cells

In [35]:
# Keep only CD8+ T cells coming from bone marrow (BMMC) or PBMC samples.
is_bm_or_pbmc = adata.obs['Type'].isin(['BMMC', 'PBMC'])
is_cd8_tcell = adata.obs['celltype2'] == 'CD8-positive, alpha-beta T cell'
# Take an explicit copy: subsetting alone returns a view of `adata`, and writing results
# to a view triggers the "Trying to set attribute `.uns` of view, copying." warning.
totest = adata[is_bm_or_pbmc & is_cd8_tcell].copy()

In [36]:

# Differential expression between the 'Type' groups within the selected cells.
# For a perturbation study the grouping column would be the treatment (e.g. treatment_id).
DEgenes = bc.tl.dge.get_de(
    totest,
    'Type',
    demethod='wilcoxon',
    topnr=5000,
    logfc=myfc,
    padj=mypval,
)

Trying to set attribute `.uns` of view, copying.


In [46]:
# Signature of genes up in bone marrow (BMMC) relative to PBMC.
# For a perturbation the name could be e.g. Pembro_induced_MC38CD8Tcell_T1 and the description
# e.g. 'Genes higher expressed in Pembro vs. vehicle in CD8-positive T cells in MC38 in vivo exp. ID time T2; coefs are log2FC'
setName = f'BMMChigh_inCD8Tcell{suffix}'
desc = (
    'Genes higher expressed in bone marrow vs. PBMC in CD8-positive T cells '
    'in Granja et al. dataset; coefs are log2FC'
)
pdout = DEgenes['BMMC'].sort_values(by='Log2FC', ascending=False)
# One "gene | log2FC" entry per gene, tab-separated.
gene_labels = pdout['Name'].astype(str)
fold_changes = pdout['Log2FC'].round(2).astype(str)
genes = "\t".join(list(gene_labels + " | " + fold_changes))
signature_dict[setName] = bc.tl.sig.make_gmtx(
    setName, desc, User, Source, Subtype, domain, genesetname, genes, studyID, analysisID
)

Metadata for signature BMMChigh_inCD8Tcell_Granja2019 successfully captured


In [48]:
# Signature of genes up in PBMC relative to bone marrow (BMMC).
# For a perturbation the name could be e.g. Pembro_induced_MC38CD8Tcell_T1 and the description
# e.g. 'Genes higher expressed in Pembro vs. vehicle in CD8-positive T cells in MC38 in vivo exp. ID time T2; coefs are log2FC'
setName = f'PBMChigh_inCD8Tcell{suffix}'
desc = (
    'Genes higher expressed in PBMC vs. bone marrow in CD8-positive T cells '
    'in Granja et al. dataset; coefs are log2FC'
)
pdout = DEgenes['PBMC'].sort_values(by='Log2FC', ascending=False)
# One "gene | log2FC" entry per gene, tab-separated.
gene_labels = pdout['Name'].astype(str)
fold_changes = pdout['Log2FC'].round(2).astype(str)
genes = "\t".join(list(gene_labels + " | " + fold_changes))
signature_dict[setName] = bc.tl.sig.make_gmtx(
    setName, desc, User, Source, Subtype, domain, genesetname, genes, studyID, analysisID
)

Metadata for signature PBMChigh_inCD8Tcell_Granja2019 successfully captured


And now export the signatures to a .gmtx file


In [4]:
# Destination of the exported tissue signatures — replace with the path and file name of interest.
# os.path.join avoids a doubled separator when results_folder already ends with one.
outgmtfile = os.path.join(results_folder, 'Tissuemarkers.gmtx')
bc.tl.sig.write_gmtx_forgems(signature_dict, outgmtfile)

### Instead of writing to a .gmtx, directly upload to GeMS

In [45]:
# .gmtx file whose signatures are uploaded to GeMS.
# os.path.join avoids a doubled separator when results_folder already ends with one.
outgmtfile = os.path.join(results_folder, 'Celltypemarkers.gmtx')

In [46]:
# Read the signatures back from the .gmtx file (point outgmtfile at your own file if needed).
mysigs = bc.tl.sig.read_GMT_sign(outgmtfile, directed=False, verbose=False)
# The first entry is split off and kept separately; it later supplies column names.
header_key = list(mysigs.keys())[0]
forheader = mysigs.pop(header_key)

In [47]:
### Read the description of each signature as well
mysigsdesc = pd.read_csv(outgmtfile, sep='\t')
# Element 0 of each index entry is used as the signature name, element 1 as its description.
index_entries = list(mysigsdesc.index)
description = pd.Series(
    [entry[1] for entry in index_entries],
    index=[entry[0] for entry in index_entries],
)

In [48]:
#### Set parameters
# The request metadata is taken from the first signature's fields.
first_sig = mysigs[list(mysigs.keys())[0]]
params = {
    'gf': 0,
    'ti': 9606,  # NOTE(review): presumably the taxonomy ID (9606 = human) — confirm for other species
    'us': first_sig[0],
    'so': first_sig[1],
    'st': first_sig[2],
    'do': first_sig[4],
}

headers = ['setName', 'desc', 'geneset'] + forheader[5:7] + ['genes | logFC']

In [49]:
# Display the request parameters for a visual check before uploading.
params

{'gf': 0,
 'ti': 9606,
 'so': 'internal scseq',
 'us': 'annot_author',
 'st': 'healthy',
 'do': 'cell marker'}

In [50]:
# Build one upload row per signature: [setName, description, geneset, <remaining fields>].
# Field positions in each signature list, as used by the params/headers cell:
#   0: user, 1: source, 2: subtype, 3: geneset, 4: domain, 5+: remaining fields.
# Slicing (instead of popping) leaves `mysigs` untouched, so this cell and the
# params cell can be re-run without silently producing wrong rows.
parsed = [
    [key, description[key], mysigs[key][3]] + list(mysigs[key][5:])
    for key in mysigs.keys()
]

In [52]:
# Payload of the insert request: column names, one row per signature, shared metadata.
dataIn_1 = dict(headers=headers, parsed=parsed, params=params)

In [53]:
# Imported here rather than at the top: only the GeMS upload part needs them.
from requests import post
import yaml

# Replace this with your credentials for the GeMS API if available.
# The parsed credentials are deliberately not displayed.
with open('.credentials/gems.yml') as cred_file:
    dataMap = yaml.safe_load(cred_file)

# Insert endpoint of the GeMS API.
BASE_URL_1 = (
    'http://' + dataMap['hostname']
    + ':' + str(dataMap['localport'])
    + '/api/insert'
)


In [54]:
# Send the signatures to GeMS and show the decoded JSON reply.
insert_reply = post(BASE_URL_1, json=dataIn_1)
returnJSON = insert_reply.json()
print(returnJSON)

{'response': [200]}


#### Remove unwanted signatures from GeMS

In [57]:
# Removal endpoint of the GeMS API, on the host/port read from the credentials file.
BASE_URL_2 = 'http://' + dataMap['hostname'] + ':' + str(dataMap['localport']) + '/api/remove'

In [58]:
# Identify every uploaded signature by its name plus the source/user/subtype it was
# inserted with. The values come from `params`, so the removal request matches the
# insert metadata even after annot_author or the other settings have been changed.
toRemove = [
    {
        'setName': key,
        'source': params['so'],
        'user': params['us'],
        'subtype': params['st'],
    }
    for key in mysigs.keys()
]

dataIn_2 = {
    'genesets': toRemove
}

In [59]:
# Ask GeMS to remove the listed signatures and show the decoded JSON reply.
remove_reply = post(BASE_URL_2, json=dataIn_2)
returnJSON = remove_reply.json()
print(returnJSON)

{'response': 200}


In [60]:
# Convert this notebook to an HTML report (shell command run through IPython's `!`).
! jupyter nbconvert --to html Signature_exports.ipynb


[NbConvertApp] Converting notebook Signature_exports.ipynb to html
[NbConvertApp] Writing 1330474 bytes to Signature_exports.html
