# Signature imports and exports

This workbook exemplifies how to export various types of signatures for storage in [GeMS](https://github.com/bedapub/GeMS). 

In [None]:
import besca as bc
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from scipy import sparse, io
import os
import time
import logging
import seaborn as sns
import yaml
import requests
# Emit scanpy's header information at the top of the notebook output
sc.logging.print_header()

# for standard processing, set verbosity to minimum
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
# NOTE(review): presumably the workflow version this notebook belongs to -- confirm
version = '2.8'
# Timestamp taken when the notebook starts running
start0 = time.time()

In [None]:
# Standardized file paths: working directory and location of the besca package
root_path = os.getcwd()
bescapath_full = os.path.dirname(bc.__file__)
bescapath = os.path.dirname(bescapath_full)

# Analysis settings
analysis_name = 'sw_besca24'  # your analysis name, as per standard workflow
# for citeseq analyses use a _CLR or _DSB name instead, e.g. 'standard_workflow_besca2_CLR'
species = 'human'  # or mouse for now
conversion = False
annot_author = 'annot_author'  # replace with own userid

# Clustering on which the annotation is based: leiden (RNA) or citeseq (protein)
clusters = 'leiden'
results_folder = os.path.join(root_path, 'analyzed', analysis_name)

# Global differential-expression cutoffs
myfc = 1  # log2 fold change
mypval = 0.05  # adjusted p-value

use_example_dataset = True  # set to False if own data

In [None]:
if use_example_dataset:
    # Switch the analysis settings over to the example dataset
    analysis_name = 'Granja_processed'
    results_folder = os.path.dirname(os.getcwd()) + '/besca/datasets/data/'
    clusters = 'leiden'
    # Either downloads the dataset or loads it
    adata = bc.datasets.Granja2019_processed()

## Part 1: Signature exports 

### Example 1: export cell type markers 

In [None]:
### Metadata attached to every cell type signature exported below
User = annot_author
Source = 'internal scseq'
Subtype = 'healthy'  # e.g. healthy, onc, ...
domain = 'cell marker'
studyID = 'Granja2019'  # replace with the scMongoDB studyID
analysisID = analysis_name
genesetname = f'{studyID}_dblabel'
suffix = '_BMPBMC_user'  # replace with specific info, e.g. _mc38_pcs
signature_dict = {}  # filled with one entry per signature

In [None]:
### Differential expression: cells of each 'dblabel' group vs. all other cells
DEgenes = bc.tl.dge.get_de(
    adata,
    'dblabel',
    demethod='wilcoxon',
    topnr=5000,
    logfc=myfc,
    padj=mypval,
)


##### Example 1: naive B cell markers


In [None]:
### Select the top 35 genes (ranked by log2FC) of the cell type of interest and plot their expression per cell type
coi = 'naive B cell'  # full dblabel
coishort = 'NaiBcell'  # dblabel short
# Rank the DE genes of this cell type once; plot and signature both use this ranking
ranked = DEgenes[coi].sort_values('Log2FC', ascending=False)
tops = list(ranked['Name'][0:35])
sc.pl.dotplot(adata, var_names=tops, groupby='celltype3')


### Build the signature from the 30 highest-ranked genes; each entry is "gene | log2FC"
setName = coishort + suffix
# The description names the dataset actually analysed here (studyID Granja2019)
desc = 'Genes higher expressed in ' + coi + ' vs. all other cells in Granja et al. 2019 data; coefs are log2FC'
pdout = ranked[0:30]
genes = "\t".join(list(pdout['Name'].astype(str) + " | " + pdout['Log2FC'].round(2).astype(str)))
signature_dict[setName] = bc.tl.sig.make_gmtx(setName, desc, User, Source, Subtype, domain, genesetname, genes, studyID, analysisID)


##### Example 2: pDC markers

In [None]:
### Select the top 35 genes (ranked by log2FC) of the cell type of interest and plot their expression per cell type
coi = 'plasmacytoid dendritic cell'  # full dblabel
coishort = 'pDC'  # dblabel short
# Rank the DE genes of this cell type once; plot and signature both use this ranking
ranked = DEgenes[coi].sort_values('Log2FC', ascending=False)
tops = list(ranked['Name'][0:35])
sc.pl.dotplot(adata, var_names=tops, groupby='celltype3')

### Build the signature from the 30 highest-ranked genes; each entry is "gene | log2FC"
setName = coishort + suffix
# The description names the dataset actually analysed here (studyID Granja2019)
desc = 'Genes higher expressed in ' + coi + ' vs. all other cells in Granja et al. 2019 data; coefs are log2FC'
pdout = ranked[0:30]
genes = "\t".join(list(pdout['Name'].astype(str) + " | " + pdout['Log2FC'].round(2).astype(str)))
signature_dict[setName] = bc.tl.sig.make_gmtx(setName, desc, User, Source, Subtype, domain, genesetname, genes, studyID, analysisID)



And now export the signatures to a .gmtx file

In [None]:
# Target file for the cell type signatures; replace with the path and file name of interest
outgmtfile = f"{results_folder}/Celltypemarkers.gmtx"

# Write all collected signatures to the .gmtx file
bc.tl.sig.write_gmtx_forgems(signature_dict, outgmtfile)

### Example 2: export condition markers

This setup is similar to a perturbation setting. Here, we focus only on cells of one specific type and calculate the differences between PBMC and bone marrow; the resulting signatures are therefore tissue markers.

In [None]:
### Metadata attached to every tissue (condition) signature exported below
User = annot_author
Source = 'internal scseq'
Subtype = 'healthy'  # e.g. healthy, onc, ...
domain = 'tissue marker'  # change to perturbation if needed
studyID = 'Granja2019'  # replace with the scMongoDB studyID
analysisID = analysis_name
genesetname = f'{studyID}_BMvsPBMC_higher'  # for perturbation e.g. Pembro_induced_MC38CD8Tcell
suffix = f'_{studyID}'  # replace with specific info, e.g. _mc38_pcs
signature_dict = {}  # start a fresh collection for this export


###### Genes differentially expressed between BM and PBMC in CD8+ T cells

In [None]:
# Keep only bone marrow (BMMC) and PBMC cells, as an independent copy of the data
in_tissues = adata.obs['Type'].isin(['BMMC', 'PBMC'])
totest = adata[in_tissues].copy()
# Of those, keep only the CD8-positive T cells
is_cd8_tcell = totest.obs['celltype2'] == 'CD8-positive, alpha-beta T cell'
totest = totest[is_cd8_tcell]

In [None]:

# Differential expression between the values of 'Type' within the selected cells
# (for a perturbation study the grouping column would be treatment_id)
DEgenes = bc.tl.dge.get_de(
    totest,
    'Type',
    demethod='wilcoxon',
    topnr=5000,
    logfc=myfc,
    padj=mypval,
)

In [None]:
# Signature of the genes found for the 'BMMC' group, ordered by decreasing log2FC
# for perturbation e.g. Pembro_induced_MC38CD8Tcell_T1
setName = f'BMMChigh_inCD8Tcell{suffix}'
# for perturbation e.g. 'Genes higher expressed in Pembro vs. vehicle in CD8-positive T cells in MC38 in vivo exp. ID time T2; coefs are log2FC'
desc = 'Genes higher expressed in bone marrow vs. PBMC in CD8-positive T cells in Granja et al. dataset; coefs are log2FC'
pdout = DEgenes['BMMC'].sort_values('Log2FC', ascending=False)
# One "gene | log2FC" entry per gene, tab-separated
gene_entries = pdout['Name'].astype(str) + " | " + pdout['Log2FC'].round(2).astype(str)
genes = "\t".join(gene_entries)
signature_dict[setName] = bc.tl.sig.make_gmtx(
    setName, desc, User, Source, Subtype, domain, genesetname, genes, studyID, analysisID
)

In [None]:
# Signature of the genes found for the 'PBMC' group, ordered by decreasing log2FC
# for perturbation e.g. Pembro_induced_MC38CD8Tcell_T1
setName = f'PBMChigh_inCD8Tcell{suffix}'
# for perturbation e.g. 'Genes higher expressed in Pembro vs. vehicle in CD8-positive T cells in MC38 in vivo exp. ID time T2; coefs are log2FC'
desc = 'Genes higher expressed in PBMC vs. bone marrow in CD8-positive T cells in Granja et al. dataset; coefs are log2FC'
pdout = DEgenes['PBMC'].sort_values('Log2FC', ascending=False)
# One "gene | log2FC" entry per gene, tab-separated
gene_entries = pdout['Name'].astype(str) + " | " + pdout['Log2FC'].round(2).astype(str)
genes = "\t".join(gene_entries)
signature_dict[setName] = bc.tl.sig.make_gmtx(
    setName, desc, User, Source, Subtype, domain, genesetname, genes, studyID, analysisID
)

And now export the signatures to a .gmtx file


In [None]:
# Target file for the tissue signatures; replace with the path and file name of interest
outgmtfile = f"{results_folder}/Tissuemarkers.gmtx"
# Write all collected signatures to the .gmtx file
bc.tl.sig.write_gmtx_forgems(signature_dict, outgmtfile)

### Instead of writing to a .gmtx, directly upload to GeMS

In [None]:
# The cell type marker file written earlier is the one to be uploaded
outgmtfile = f"{results_folder}/Celltypemarkers.gmtx"

In [None]:
# Read the signature file back in; replace this with example file
mysigs = bc.tl.sig.read_GMT_sign(outgmtfile, directed=False, verbose=False)
# The first entry is set aside (it is used further down to build the column headers)
# and removed from the signatures
header_key = list(mysigs)[0]
forheader = mysigs.pop(header_key)

In [None]:
### read the description as well
mysigsdesc=pd.read_csv(outgmtfile,sep='\t')
description=pd.Series([x[1] for x in list(mysigsdesc.index)])
description.index=[x[0] for x in list(mysigsdesc.index)]

In [None]:
#### Set parameters
params = {'gf': 0, 'ti': 9606}
params['us']=mysigs[list(mysigs.keys())[0]][0]
params['so']=mysigs[list(mysigs.keys())[0]][1]
params['st']=mysigs[list(mysigs.keys())[0]][2]
params['do']=mysigs[list(mysigs.keys())[0]][4]

headers = ['setName', 'desc','geneset']+forheader[5:7]+['genes | logFC']

In [None]:
# Display the request parameters for a visual check
params

In [None]:
# Build one row per signature: set name, description, field 3, then fields 5 onwards.
# Fields 0-2 and 4 are left out of the rows (they are sent once via `params`).
# The rows are assembled without modifying `mysigs`, so this cell (and the
# parameter cell above) can safely be re-run.
parsed = [
    [key, description[key], fields[3], *fields[5:]]
    for key, fields in mysigs.items()
]

In [None]:
# Payload of the insert request: column headers, signature rows and shared parameters
dataIn_1 = dict(headers=headers, parsed=parsed, params=params)

In [None]:
from requests import post
import yaml

# Look for the GeMS credential file, first relative to the current working
# directory, then in the user's home directory
gems_file = None
for candidate in ('.credentials/gems.yml', os.path.expanduser('~/.credentials/gems.yml')):
    if os.path.exists(candidate):
        gems_file = candidate
        break
if gems_file is None:
    raise ValueError("Credential file cannot be found in the current path "
                     "or in the user's home directory (~/.credentials/gems.yml)")

# Replace this with your credentials for the GeMS API if available;
# the file must provide the 'hostname' and 'localport' entries
with open(gems_file) as f:
    dataMap = yaml.safe_load(f)

# Endpoint of the GeMS API used to insert signatures
BASE_URL_1 = 'http://' + dataMap['hostname'] + ':' + str(dataMap['localport']) + '/api/insert'


In [None]:
# Send the signatures to GeMS and show the decoded JSON reply
insert_response = post(BASE_URL_1, json=dataIn_1)
returnJSON = insert_response.json()
print(returnJSON)

#### Remove undesired signatures from GeMS

In [None]:
# Endpoint of the GeMS API used to remove signatures (host and port come from the credential file)
BASE_URL_2 = 'http://' + dataMap['hostname'] + ':' + str(dataMap['localport']) + '/api/remove'

In [None]:
# One removal entry per signature. Source, user and subtype are taken from
# `params` (the values sent with the insert request) instead of being
# hard-coded, so the removal still matches after the metadata is customised.
toRemove = [
    {
        'setName': key,
        'source': params['so'],
        'user': params['us'],
        'subtype': params['st'],
    }
    for key in mysigs
]

# Payload of the remove request
dataIn_2 = {
    'genesets': toRemove
}

In [None]:
# Ask GeMS to remove the listed signatures and show the decoded JSON reply
remove_response = post(BASE_URL_2, json=dataIn_2)
returnJSON = remove_response.json()
print(returnJSON)

## Part 2: Signature imports from GeMS

*Use case*: for a given set of signatures, retrieve all from GeMS. 

In [None]:
# Get GEMS hostname and localport from credentials file
with open(os.path.join(os.path.expanduser('~'), '.credentials/gems.yml')) as f:
        cred = yaml.safe_load(f)


In [None]:
# Test if GEMS signature retrieval works
print(bc.tl.sig.get_gems('Tcell_sc', BASE_URL= 'http://' + str(cred['hostname']) + ':' +  str(cred['localport'])))


All signature names related to a certain application can be retrieved with a single call, e.g. **all rtbeda_CIT signatures**.

In [None]:
# Set to True to fetch the signature names from the MongoDB API;
# otherwise the hand-picked list in the else branch is used
apigetsignames = False

if apigetsignames:
    # The query string is appended directly to apihttp, so apihttp must already
    # end with whatever the API expects in front of the first parameter
    apihttp = 'http://youraddress'  # replace with relevant value
    dbname = 'yourdbname'  # replace with relevant value
    colname = 'yourgenesetcollection'  # replace with relevant value
    selcolumn = 'meta.application'  # uses 'application' field in GeMS metadata entry
    applicationname = 'rtbeda_CIT'  # 'application' field equals to rtbeda_CIT; replace with bescaviz, celltypeviz etc.

    # Get CIT signature names from MongoDB API
    api_call = apihttp + 'db=' + dbname + '&' + 'col=' + colname + '&' + 'select=' + selcolumn + '&' + \
               'oper=eq&' + 'val=' + applicationname + '&' + 'fields=setName&' + 'out=jsonArray'

    response = requests.get(api_call)
    # Fail with a clear HTTP error instead of trying to parse an error page
    response.raise_for_status()
    # Not called `json`, so that the standard-library module name is not shadowed
    signature_records = response.json()

    ### List all the signatures of a specific set name
    CIT_signature_names = [dic['setName'] for dic in signature_records]
    print(CIT_signature_names)
else:
    # replace this with your sigs of interest
    CIT_signature_names = ['Immune_l_Ayers17', 'IFNG_l_Ayers17', 'NKcell_sc', 'NaiTcell_sc']

In [None]:
# Get all CIT signatures from GEMS
CIT_signatures = {}
CIT_signatures_UP = {}
for sig in CIT_signature_names:
    CIT_signatures[sig] = bc.tl.sig.get_gems(sig, BASE_URL= 'http://' + str(cred['hostname']) + ':' +  str(cred['localport']))[sig]
    CIT_signatures_UP[sig] = bc.tl.sig.get_gems(sig, BASE_URL= 'http://' + str(cred['hostname']) + ':' +  str(cred['localport']))[sig]['UP']
print(CIT_signatures)

In [None]:
# Export this notebook to HTML using besca's helper
bc.convert_notebook_to_HTML()