# DPCL_cisTopicObject_3K_fragments_80_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results

    Parameters
    ----------
    df
        Data frame with (at least) the columns 'names', 'logfoldchanges'
        and 'pvals_adj'. It is left untouched.
    key
        Value written to the 'Contrast' column of every returned row.
    adjpval_thr
        Rows whose adjusted p-value is above this threshold are dropped.
    log2fc_thr
        Rows whose log2 fold change is below this threshold are dropped.

    Returns
    -------
    A new data frame indexed by the values of 'names' (index name removed)
    with the columns 'Log2FC', 'Adjusted_pval' and 'Contrast', sorted by
    decreasing 'Log2FC' and then increasing 'Adjusted_pval'.
    """
    # Build a fresh frame instead of re-indexing the caller's frame in place
    # and then assigning a column on a slice of it.
    formatted = df.set_index('names')[['logfoldchanges', 'pvals_adj']].copy()
    formatted.columns = ['Log2FC', 'Adjusted_pval']
    formatted['Contrast'] = key
    formatted.index.name = None
    # Comparisons with NaN are False, so rows with missing values are dropped.
    keep = (formatted['Adjusted_pval'] <= adjpval_thr) & (
        formatted['Log2FC'] >= log2fc_thr)
    return formatted.loc[keep].sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of a cell metadata variable.

    For every requested contrast an AnnData object is built from the SCENIC+
    object, groups with fewer than `min_cells` cells are removed, the data
    are normalised and log-transformed, and a Wilcoxon rank test is run per
    group. The formatted results are stored in
    `scplus_obj.uns[contrast][variable]` as a dict keyed by group.

    Parameters
    ----------
    scplus_obj
        SCENIC+ object. `X_EXP`, `X_ACC`, `cell_names`, `gene_names`,
        `region_names`, `metadata_cell` and `uns` are read; `uns` is updated.
    variable
        Column of `scplus_obj.metadata_cell` that defines the groups.
    use_hvg
        Whether to restrict the test to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Maximum adjusted p-value for a feature to be reported.
    log2fc_thr
        Minimum log2 fold change for a feature to be reported.
    min_cells
        Minimum number of cells a group needs in order to be kept.

    Raises
    ------
    ValueError
        If `contrast_type` contains anything other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    # Fail early: an unknown contrast would otherwise hit an undefined
    # `adata` or silently reuse the matrix of the previous contrast.
    unknown = [c for c in contrast_type if c not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "contrast_type entries must be 'DEGs' or 'DARs', got: " + str(unknown))

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:
            # X_ACC is transposed so that rows line up with cell_names
            # (presumably it is stored regions x cells -- verify).
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count[label_count >= min_cells].index
        keep_mask = adata.obs[variable].isin(keeplabels).values
        # Copy so the in-place preprocessing below works on real data
        # instead of on a view of the full matrix.
        adata = adata[keep_mask].copy()

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            # rank_genes_groups writes into `uns`, so again avoid a view.
            adata = adata[:, var_features].copy()
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Run the SCENIC+ workflow step by step while recording, for every step,
    the wall-clock time and the resident memory of the current process.

    Steps whose result key is already present in `scplus_obj.uns`, or which
    are switched off / made unnecessary by the arguments, are skipped; their
    entries in the timing and memory tables are left empty (NaN).

    Parameters
    ----------
    GEX_anndata, cistopic_obj, imputed_acc_obj, menr
        Inputs forwarded to `create_SCENICPLUS_object`.
    variable
        Cell metadata variables used for the specificity scores and for the
        DEG/DAR calculation.
    species
        Forwarded to `get_search_space` and `export_to_UCSC_interact`.
    assembly
        Forwarded to `get_search_space` and to the loom/UCSC/eRegulon exports.
    tf_file
        Forwarded to `calculate_TFs_to_genes_relationships`.
    save_path
        Output folder; created when it does not exist.
    biomart_host
        Forwarded to `get_search_space` and `export_to_UCSC_interact`.
    upstream, downstream
        Forwarded to `get_search_space`.
    region_ranking, gene_ranking
        Precomputed rankings; when None they are computed with
        `make_rankings` and pickled into `save_path`.
    simplified_eGRN
        When True, signatures sharing the same name prefix are merged.
    calculate_TF_eGRN_correlation
        Accepted for interface compatibility; not used by this function.
    calculate_DEGs_DARs
        When True, run `get_differential_features_time` for each variable.
    export_to_loom_file, export_to_UCSC_file
        When True, write the loom files / the UCSC and eRegulon tracks.
    tree_structure
        Forwarded to `export_to_loom`.
    path_bedToBigBed
        Forwarded to the UCSC and eRegulon exports.
    n_cpu
        Number of CPUs forwarded to the parallel steps.
    _temp_dir
        Temporary directory forwarded to the ray-based steps.
    **kwargs
        Forwarded to `calculate_regions_to_genes_relationships`,
        `calculate_TFs_to_genes_relationships` and `build_grn`.

    Returns
    -------
    None. Results are written to `save_path`: intermediate tables and
    pickles, the optional exports, 'running_times.tsv',
    'cummulative_memory.tsv', 'memory_per_step.tsv' and 'scplus_obj.pkl'.
    """
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')

    # Row order of the three report tables written at the end.
    step_names = ['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                  'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC']
    # Only steps that actually ran get an entry, so a skipped step can no
    # longer break the final report with an unbound variable.
    times = {}
    memory = {}

    def _record(step, t_start):
        # Resident set size in bytes / (1024 * 1024) / 1000, then elapsed seconds.
        memory[step] = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        times[step] = time.time() - t_start

    start_time = time.time()
    t1_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    _record('Create_object', t1_start)

    log.info('Filter genes')
    t1_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    _record('Filter_genes', t1_start)

    log.info('Filter regions')
    t1_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    _record('Filter_regions', t1_start)

    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)

    else:
        log.info(save_path + " folder already exists.")

    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        t1_start = time.time()
        merge_cistromes(scplus_obj)
        _record('Merge_cistromes', t1_start)

    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        t1_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly,
                     upstream = upstream,
                     downstream = downstream)
        _record('Search_space', t1_start)

    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        t1_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj,
                        ray_n_cpu = n_cpu,
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        _record('Region_to_gene', t1_start)
        # os.path.join keeps the file inside save_path even when save_path
        # has no trailing separator.
        scplus_obj.uns['region_to_gene'].to_csv(os.path.join(save_path, 'region_to_gene.tsv'), sep='\t')

    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        t1_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj,
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu,
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        _record('TF_to_gene', t1_start)
        scplus_obj.uns['TF2G_adj'].to_csv(os.path.join(save_path, 'tf_to_gene.tsv'), sep='\t')

    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        t1_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True,
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False,
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        _record('GSEA', t1_start)
        # Intermediate checkpoint of the object once the eGRN is built.
        log.info('Saving object')
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)

    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        t1_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', t1_start)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        t1_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata',
                                     key_added='eRegulon_signatures')
        _record('Signatures', t1_start)

    if simplified_eGRN is True:
        # Merge signatures sharing a prefix: the part of the key before '_('
        # minus its last two characters. Keys are rebuilt with the merged
        # size and a 'g' (genes) or 'r' (regions) suffix.
        for signature_key, unit in (('Gene_based', 'g'), ('Region_based', 'r')):
            md = scplus_obj.uns['eRegulon_signatures'][signature_key]
            names = list(set([x.split('_(')[0][:-2] for x in md.keys()]))
            merged = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
            scplus_obj.uns['eRegulon_signatures'][signature_key] = {x+'_('+str(len(merged[x]))+unit+')': merged[x] for x in merged.keys()}

    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            t1_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            _record('Region_ranking', t1_start)
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', t1_start)
        if gene_ranking is None:
            t1_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', t1_start)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', t1_start)

    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        t1_start = time.time()
        binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', t1_start)

    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    run_umap = 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys()
    run_tsne = 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys()
    if run_umap or run_tsne:
        # One timer around whichever embeddings are computed, so the
        # measurement never reuses the start time of an earlier step.
        t1_start = time.time()
        if run_umap:
            log.info('Making eGRNs AUC UMAP')
            run_eRegulons_umap(scplus_obj,
                       scale=True, signature_keys=['Gene_based', 'Region_based'])
        if run_tsne:
            log.info('Making eGRNs AUC tSNE')
            run_eRegulons_tsne(scplus_obj,
                       scale=True, signature_keys=['Gene_based', 'Region_based'])
        _record('Dimensionality_reduction', t1_start)

    if 'RSS' not in scplus_obj.uns.keys():
        t1_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', t1_start)

    if calculate_DEGs_DARs is True:
        t1_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', t1_start)

    if export_to_loom_file is True:
        t1_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', t1_start)

    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        t1_start = time.time()
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', t1_start)

    # Report tables: skipped steps show up as NaN (empty cells in the TSV).
    nan = float('nan')
    df = pd.DataFrame([times.get(step, nan) for step in step_names], index=step_names, columns=['Time'])
    df.to_csv(os.path.join(save_path, 'running_times.tsv'), sep='\t')
    df = pd.DataFrame([memory.get(step, nan) for step in step_names], index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')
    # Memory change of each step relative to the previous step that ran;
    # the first step is reported as its absolute value.
    memory_per_step = []
    previous = 0.0
    for step in step_names:
        if step in memory:
            memory_per_step.append(memory[step] - previous)
            previous = memory[step]
        else:
            memory_per_step.append(nan)
    df = pd.DataFrame(memory_per_step, index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
    
# Load data
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/low_simulation/DPCL_cisTopicObject_3K_fragments_80_cells/'
import pickle
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The `with` blocks close each file even when unpickling fails.
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed data
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
# NOTE(review): this star import runs after the scenicplus star imports and
# may shadow names with the same identifier -- verify.
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/low/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Build the AnnData object from the expression matrix of the loom file
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed pycistarget results (menr)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ workflow; results go to <outDir>/scenicplus/
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

# DPCL_cisTopicObject_3K_fragments_1K_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results

    Parameters
    ----------
    df
        Data frame with (at least) the columns 'names', 'logfoldchanges'
        and 'pvals_adj'. It is left untouched.
    key
        Value written to the 'Contrast' column of every returned row.
    adjpval_thr
        Rows whose adjusted p-value is above this threshold are dropped.
    log2fc_thr
        Rows whose log2 fold change is below this threshold are dropped.

    Returns
    -------
    A new data frame indexed by the values of 'names' (index name removed)
    with the columns 'Log2FC', 'Adjusted_pval' and 'Contrast', sorted by
    decreasing 'Log2FC' and then increasing 'Adjusted_pval'.
    """
    # Build a fresh frame instead of re-indexing the caller's frame in place
    # and then assigning a column on a slice of it.
    formatted = df.set_index('names')[['logfoldchanges', 'pvals_adj']].copy()
    formatted.columns = ['Log2FC', 'Adjusted_pval']
    formatted['Contrast'] = key
    formatted.index.name = None
    # Comparisons with NaN are False, so rows with missing values are dropped.
    keep = (formatted['Adjusted_pval'] <= adjpval_thr) & (
        formatted['Log2FC'] >= log2fc_thr)
    return formatted.loc[keep].sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of a cell metadata variable.

    For every requested contrast an AnnData object is built from the SCENIC+
    object, groups with fewer than `min_cells` cells are removed, the data
    are normalised and log-transformed, and a Wilcoxon rank test is run per
    group. The formatted results are stored in
    `scplus_obj.uns[contrast][variable]` as a dict keyed by group.

    Parameters
    ----------
    scplus_obj
        SCENIC+ object. `X_EXP`, `X_ACC`, `cell_names`, `gene_names`,
        `region_names`, `metadata_cell` and `uns` are read; `uns` is updated.
    variable
        Column of `scplus_obj.metadata_cell` that defines the groups.
    use_hvg
        Whether to restrict the test to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Maximum adjusted p-value for a feature to be reported.
    log2fc_thr
        Minimum log2 fold change for a feature to be reported.
    min_cells
        Minimum number of cells a group needs in order to be kept.

    Raises
    ------
    ValueError
        If `contrast_type` contains anything other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    # Fail early: an unknown contrast would otherwise hit an undefined
    # `adata` or silently reuse the matrix of the previous contrast.
    unknown = [c for c in contrast_type if c not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "contrast_type entries must be 'DEGs' or 'DARs', got: " + str(unknown))

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:
            # X_ACC is transposed so that rows line up with cell_names
            # (presumably it is stored regions x cells -- verify).
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count[label_count >= min_cells].index
        keep_mask = adata.obs[variable].isin(keeplabels).values
        # Copy so the in-place preprocessing below works on real data
        # instead of on a view of the full matrix.
        adata = adata[keep_mask].copy()

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            # rank_genes_groups writes into `uns`, so again avoid a view.
            adata = adata[:, var_features].copy()
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Run the SCENIC+ wrapper pipeline while recording the wall-clock time and the
    resident memory of the current process after every step.

    Steps whose result key is already present in ``scplus_obj.uns`` (or that are
    disabled through a flag / pre-computed argument) are skipped. Skipped steps
    are reported as NaN in the benchmark tables instead of aborting the run.

    Parameters
    ----------
    GEX_anndata, cistopic_obj, imputed_acc_obj, menr
        Inputs forwarded to ``create_SCENICPLUS_object``.
    variable
        Cell metadata variables; each one is passed to
        ``regulon_specificity_scores`` and ``get_differential_features_time``.
    species, assembly
        Forwarded to ``get_search_space`` and to the UCSC / loom exporters.
    tf_file
        Forwarded to ``calculate_TFs_to_genes_relationships``.
    save_path
        Output directory; created when it does not exist.
    biomart_host
        Forwarded to ``get_search_space`` and ``export_to_UCSC_interact``.
    upstream, downstream
        Forwarded unchanged to ``get_search_space`` (never mutated here).
    region_ranking, gene_ranking
        Pre-computed rankings. When None they are computed with
        ``make_rankings`` and pickled into ``save_path``.
    simplified_eGRN
        When True, signatures sharing the same name prefix are merged.
    calculate_TF_eGRN_correlation
        Accepted for interface compatibility; not used by this function.
    calculate_DEGs_DARs, export_to_loom_file, export_to_UCSC_file
        Toggle the corresponding optional steps.
    tree_structure
        Forwarded to ``export_to_loom``.
    path_bedToBigBed
        Forwarded to the UCSC exporters.
    n_cpu, _temp_dir
        Parallelism / temporary directory forwarded to the inference steps.
    **kwargs
        Forwarded to ``calculate_regions_to_genes_relationships``,
        ``calculate_TFs_to_genes_relationships`` and ``build_grn``.

    Outputs written to ``save_path``
    --------------------------------
    ``region_to_gene.tsv``, ``tf_to_gene.tsv``, ``region_ranking.pkl``,
    ``gene_ranking.pkl``, ``scplus_obj.pkl``, loom / UCSC exports (optional),
    ``running_times.tsv`` (seconds), ``cummulative_memory.tsv`` and
    ``memory_per_step.tsv`` (RSS in MiB divided by 1000).
    """
    # Local import guards against a later star-import rebinding the name `time`.
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')

    # Row labels of the benchmark tables, in reporting order.
    step_names = ['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                  'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC']
    nan = float('nan')
    # Every step starts as NaN so that a skipped step never leaves a name unbound.
    times = dict.fromkeys(step_names, nan)
    memory = dict.fromkeys(step_names, nan)
    completed = set()

    def _record(step, started):
        # Memory is sampled before the stop time, as in the original measurements.
        memory[step] = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        times[step] = time.time() - started
        completed.add(step)

    start_time = time.time()
    t1_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    _record('Create_object', t1_start)

    log.info('Filter genes')
    t1_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    _record('Filter_genes', t1_start)

    log.info('Filter regions')
    t1_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    _record('Filter_regions', t1_start)

    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)
    else:
        log.info(save_path + " folder already exists.")

    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        t1_start = time.time()
        merge_cistromes(scplus_obj)
        _record('Merge_cistromes', t1_start)

    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        t1_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly,
                     upstream = upstream,
                     downstream = downstream)
        _record('Search_space', t1_start)

    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        t1_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj,
                        ray_n_cpu = n_cpu,
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        _record('Region_to_gene', t1_start)
        # os.path.join keeps the table inside save_path even without a trailing slash.
        scplus_obj.uns['region_to_gene'].to_csv(os.path.join(save_path, 'region_to_gene.tsv'), sep='\t')

    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        t1_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj,
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu,
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        _record('TF_to_gene', t1_start)
        scplus_obj.uns['TF2G_adj'].to_csv(os.path.join(save_path, 'tf_to_gene.tsv'), sep='\t')

    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        t1_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True,
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False,
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        _record('GSEA', t1_start)
        # Checkpoint right after the eGRN build, before the downstream steps run.
        log.info('Saving object')
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)

    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        t1_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', t1_start)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        t1_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata',
                                     key_added='eRegulon_signatures')
        _record('Signatures', t1_start)

    if simplified_eGRN is True:
        # Same merge for both signature types; only the count suffix differs.
        for signature_key, unit in (('Gene_based', 'g'), ('Region_based', 'r')):
            md = scplus_obj.uns['eRegulon_signatures'][signature_key]
            # Name up to '_(' with its last two characters removed.
            prefixes = []
            for x in md.keys():
                base = x.split('_(')[0]
                prefixes.append(base[:len(base) - 2])
            names = list(set(prefixes))
            merged = {x: list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
            scplus_obj.uns['eRegulon_signatures'][signature_key] = {
                x + '_(' + str(len(members)) + unit + ')': members for x, members in merged.items()}

    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            t1_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            _record('Region_ranking', t1_start)
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', t1_start)
        if gene_ranking is None:
            t1_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', t1_start)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', t1_start)

    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        t1_start = time.time()
        binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', t1_start)

    # The timer starts unconditionally so it never reuses the previous step's start,
    # and the measurement is recorded whenever at least one embedding was computed.
    t1_start = time.time()
    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    ran_reduction = False
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_reduction = True
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_reduction = True
    if ran_reduction:
        _record('Dimensionality_reduction', t1_start)

    if 'RSS' not in scplus_obj.uns.keys():
        t1_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', t1_start)

    if calculate_DEGs_DARs is True:
        t1_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', t1_start)

    if export_to_loom_file is True:
        t1_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', t1_start)

    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        t1_start = time.time()
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', t1_start)

    # Benchmark tables: one row per step, NaN for steps that did not run.
    df = pd.DataFrame([times[step] for step in step_names], index=step_names, columns=['Time'])
    df.to_csv(os.path.join(save_path, 'running_times.tsv'), sep='\t')

    df = pd.DataFrame([memory[step] for step in step_names], index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')

    # Per-step memory is the change since the most recent step that actually ran;
    # the first recorded step is reported as its absolute value.
    memory_delta = []
    previous = 0.0
    for step in step_names:
        if step in completed:
            memory_delta.append(memory[step] - previous)
            previous = memory[step]
        else:
            memory_delta.append(nan)
    df = pd.DataFrame(memory_delta, index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
    
# Load data
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/low_simulation/DPCL_cisTopicObject_3K_fragments_1K_cells/'
import pickle
# Context managers close each pickle file even if unpickling raises.
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed data
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/low/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Expression matrix taken from the loom file as-is
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed motif enrichment results (menr)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ pipeline; n_cpu matches the 20 CPUs requested in the job script.
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM submission script for the DPCL_cisTopicObject_3K_fragments_1K_cells benchmark run.

# Scheduling target (partition / cluster / account) and e-mail notifications.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs (the Python script passes n_cpu = 20), 8 hours wall time, 300G memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_3K_fragments_1K_cells
#SBATCH --output=DPCL_cisTopicObject_3K_fragments_1K_cells.out
#SBATCH --error=DPCL_cisTopicObject_3K_fragments_1K_cells.err

# Run the benchmark script with the python of the scenicplus Singularity image;
# -B bind-mounts the listed host directories into the container.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_3K_fragments_1K_cells.py

# DPCL_cisTopicObject_3K_fragments_10K_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results
    """
    df.index = df['names']
    df = df[['logfoldchanges', 'pvals_adj']]
    df.columns = ['Log2FC', 'Adjusted_pval']
    df['Contrast'] = key
    df.index.name = None
    df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
    df = df.loc[df['Log2FC'] >= log2fc_thr]
    df = df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )
    return df

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of `variable` and store them in
    ``scplus_obj.uns[contrast][variable]``.

    Parameters
    ----------
    scplus_obj
        Object providing X_EXP, X_ACC, cell_names, gene_names, region_names,
        metadata_cell and uns.
    variable
        Column of ``scplus_obj.metadata_cell`` that defines the groups.
    use_hvg
        When True, only features flagged by ``sc.pp.highly_variable_genes``
        are tested.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' (uses X_EXP and
        gene_names) or 'DARs' (uses X_ACC and region_names).
    adjpval_thr, log2fc_thr
        Thresholds forwarded to ``_format_df``.
    min_cells
        Groups with fewer cells than this are dropped before testing.

    Raises
    ------
    ValueError
        If `contrast_type` contains a name other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    # Validate up front: an unknown name would otherwise hit an unbound `adata`
    # or silently re-test the matrix left over from the previous contrast.
    contrasts = list(contrast_type)
    unknown = [name for name in contrasts if name not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "Unknown contrast type(s) " + str(unknown) + "; expected 'DEGs' and/or 'DARs'")

    for contrast in contrasts:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:
            # X_ACC is transposed so that rows pair with cell_names
            # (presumably stored as regions x cells - TODO confirm).
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count[label_count >= min_cells].index.tolist()
        keep_mask = adata.obs[variable].isin(keeplabels).to_numpy()
        # Copy the subset so the in-place normalisation below works on real data
        # rather than on an AnnData view.
        adata = adata[keep_mask].copy()

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            # rank_genes_groups writes into adata.uns, so materialise this subset too.
            adata = adata[:, var_features].copy()
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')
    
    import time
    start_time = time.time()
    t1_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    m1=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_create_object = t1_stop-t1_start
    
    log.info('Filter genes')
    import time
    t1_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    m2=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_genes = t1_stop-t1_start
    log.info('Filter regions')
    import time
    t1_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    m3=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_regions = t1_stop-t1_start
    
    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)

    else:
        log.info(save_path + " folder already exists.")
    
    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        import time
        t1_start = time.time()
        merge_cistromes(scplus_obj)
        m4=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_cistromes = t1_stop-t1_start
    
    
    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        import time
        t1_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly, 
                     upstream = upstream,
                     downstream = downstream)
        m5=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_search_space = t1_stop-t1_start
                 
    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        import time
        t1_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj, 
                        ray_n_cpu = n_cpu, 
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        m6=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_region_to_gene = t1_stop-t1_start
        scplus_obj.uns['region_to_gene'].to_csv(save_path+'region_to_gene.tsv', sep='\t')

                        
    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        import time
        t1_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj, 
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu, 
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        m7=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_tf_to_gene = t1_stop-t1_start
        scplus_obj.uns['TF2G_adj'].to_csv(save_path+'tf_to_gene.tsv', sep='\t')
                        
    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        import time
        t1_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True, 
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False, 
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        m8=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_gsea = t1_stop-t1_start
        log.info('Saving object')         
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)
                 
    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        import time
        t1_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        m9=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_format_regulons = t1_stop-t1_start

                    
    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        import time
        t1_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata', 
                                     key_added='eRegulon_signatures')
        m10=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_signatures = t1_stop-t1_start
                                     
    if simplified_eGRN is True:
        md = scplus_obj.uns['eRegulon_signatures']['Gene_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Gene_based'][x]))+'g)': scplus_obj.uns['eRegulon_signatures']['Gene_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Gene_based'].keys()}

        md = scplus_obj.uns['eRegulon_signatures']['Region_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Region_based'][x]))+'r)': scplus_obj.uns['eRegulon_signatures']['Region_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Region_based'].keys()}

    
    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            import time
            t1_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            m11=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_region_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        m12=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_regions = t1_stop-t1_start
        if gene_ranking is None:
            import time
            t1_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            m13=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_gene_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        m14=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_genes = t1_stop-t1_start
                
                                
    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        import time
        t1_start = time.time()
        binarize_AUC(scplus_obj, 
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        m15=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_binarize_regulons = t1_stop-t1_start
             
    if not hasattr(scplus_obj, 'dr_cell'):
        import time
        t1_start = time.time()
        scplus_obj.dr_cell = {}         
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        m16=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_dimensionality_reduction = t1_stop-t1_start
                   
    if 'RSS' not in scplus_obj.uns.keys():
        import time
        t1_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        m17=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_rss = t1_stop-t1_start
                         
    if calculate_DEGs_DARs is True:
        import time
        t1_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        m18=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_degs_dars = t1_stop-t1_start
            
    if export_to_loom_file is True:
        import time
        t1_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj, 
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj, 
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        m19=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_loom = t1_stop-t1_start
               
    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        import time
        t1_start = time.time()
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        m20=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_ucsc = t1_stop-t1_start
        
    times = [time_create_object, time_filter_genes, time_filter_regions, time_cistromes, time_search_space, time_region_to_gene, time_tf_to_gene, time_gsea, time_format_regulons, time_signatures, time_region_ranking, time_aucell_regions, 
             time_gene_ranking, time_aucell_genes, time_binarize_regulons, time_dimensionality_reduction, time_rss, time_degs_dars, time_export_loom, time_export_ucsc]
    df = pd.DataFrame(times, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Time'])
    df.to_csv(save_path+'running_times.tsv', sep='\t') 
    memory=[m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18,m19,m20]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'cummulative_memory.tsv', sep='\t')  
    memory=[m1,m2-m1,m3-m2,m4-m3, m5-m4, m6-m5, m7-m6,m8-m7,m9-m8,m10-m9,m11-m10,m12-m11,m13-m12, m14-m13, m15-m14, m16-m15, m17-m16, m18-m17, m19-m18, m20-m19]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'memory_per_step.tsv', sep='\t')  
        
    log.info('Saving object')         
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)
        
    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60)) 
    
# Load data
# All benchmark inputs are read from `outDir`; every pickle is opened through a
# context manager so the handle is closed even if unpickling raises.
import pickle

## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/low_simulation/DPCL_cisTopicObject_3K_fragments_10K_cells/'
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed accessibility
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/low/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Expression matrix exactly as exposed by the loom reader; it is passed to
# AnnData unchanged (presumably cells x genes -- TODO confirm orientation).
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Motif enrichment results (read from the pycistarget output folder)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ wrapper; outputs are written under outDir + 'scenicplus/'.
# n_cpu is set to 20 here, the same value the SLURM submission script requests
# via --cpus-per-task.
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM submission script: runs DPCL_cisTopicObject_3K_fragments_10K_cells.py
# inside the SCENIC+ Singularity image.

# Partition, cluster and account the job is submitted under; e-mail
# notifications for all job events go to the address below. One task only.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources for that task: 20 CPUs (presumably meant to match n_cpu = 20 in the
# Python script -- confirm), an 8-hour time limit and 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name plus the files that receive stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_3K_fragments_10K_cells
#SBATCH --output=DPCL_cisTopicObject_3K_fragments_10K_cells.out
#SBATCH --error=DPCL_cisTopicObject_3K_fragments_10K_cells.err

# -B bind-mounts the listed host directories into the container so the input,
# output and scratch paths used by the Python script resolve inside it.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_3K_fragments_10K_cells.py

# DPCL_cisTopicObject_3K_fragments_25K_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results

    Parameters
    ----------
    df
        Data frame with at least the columns 'names', 'logfoldchanges' and
        'pvals_adj'. It is not modified.
    key
        Value written to the 'Contrast' column of every returned row.
    adjpval_thr
        Rows are kept only if their adjusted p-value is <= this threshold.
    log2fc_thr
        Rows are kept only if their log2 fold change is >= this threshold.

    Returns
    -------
    A new data frame indexed by the values of 'names' (index name removed),
    with columns 'Log2FC', 'Adjusted_pval' and 'Contrast', sorted by
    descending 'Log2FC' and then ascending 'Adjusted_pval'.
    """
    # Work on an explicit copy: the caller's frame keeps its own index, and the
    # column assignment below never targets a slice of another frame.
    out = df.set_index('names')[['logfoldchanges', 'pvals_adj']].copy()
    out.columns = ['Log2FC', 'Adjusted_pval']
    out['Contrast'] = key
    out.index.name = None
    # Both thresholds are inclusive.
    passes = (out['Adjusted_pval'] <= adjpval_thr) & (out['Log2FC'] >= log2fc_thr)
    out = out.loc[passes]
    return out.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features (DEGs and/or DARs) between the groups of a
    cell metadata variable and store them in the SCENIC+ object.

    Parameters
    ----------
    scplus_obj
        SCENIC+ object. Reads `X_EXP`, `X_ACC`, `cell_names`, `gene_names`,
        `region_names` and `metadata_cell`; results are written to `uns`.
    variable
        Column of `scplus_obj.metadata_cell` that defines the groups.
    use_hvg
        If True, restrict the test to features flagged as highly variable by
        `sc.pp.highly_variable_genes`.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Maximum adjusted p-value for a feature to be kept.
    log2fc_thr
        Minimum log2 fold change for a feature to be kept.
    min_cells
        Groups with fewer cells than this are removed before testing.

    Returns
    -------
    None. For every contrast, `scplus_obj.uns[contrast][variable]` is set to a
    dict mapping each group to its formatted result table.

    Raises
    ------
    ValueError
        If `contrast_type` contains an entry other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    # Validate up front: an unknown contrast would otherwise hit an undefined
    # `adata`, or silently reuse the matrix left over from the previous contrast.
    contrast_type = list(contrast_type)
    unknown = [c for c in contrast_type if c not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "Unknown contrast type(s) " + str(unknown) + "; expected 'DEGs' and/or 'DARs'")

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:
            # X_ACC is transposed so that cells become observations
            # (presumably X_ACC is regions x cells -- TODO confirm).
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = [label for label, count in zip(label_count.index, label_count.values) if count >= min_cells]
        # One vectorised membership test instead of a per-cell Series lookup.
        keep_mask = adata.obs[variable].isin(keeplabels).to_numpy()
        # Materialise the subset so the in-place steps below act on a real
        # AnnData object rather than on a view.
        adata = adata[keep_mask].copy()

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features].copy()
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')
    
    import time
    start_time = time.time()
    t1_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    m1=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_create_object = t1_stop-t1_start
    
    log.info('Filter genes')
    import time
    t1_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    m2=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_genes = t1_stop-t1_start
    log.info('Filter regions')
    import time
    t1_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    m3=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_regions = t1_stop-t1_start
    
    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)

    else:
        log.info(save_path + " folder already exists.")
    
    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        import time
        t1_start = time.time()
        merge_cistromes(scplus_obj)
        m4=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_cistromes = t1_stop-t1_start
    
    
    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        import time
        t1_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly, 
                     upstream = upstream,
                     downstream = downstream)
        m5=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_search_space = t1_stop-t1_start
                 
    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        import time
        t1_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj, 
                        ray_n_cpu = n_cpu, 
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        m6=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_region_to_gene = t1_stop-t1_start
        scplus_obj.uns['region_to_gene'].to_csv(save_path+'region_to_gene.tsv', sep='\t')

                        
    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        import time
        t1_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj, 
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu, 
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        m7=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_tf_to_gene = t1_stop-t1_start
        scplus_obj.uns['TF2G_adj'].to_csv(save_path+'tf_to_gene.tsv', sep='\t')
                        
    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        import time
        t1_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True, 
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False, 
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        m8=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_gsea = t1_stop-t1_start
        log.info('Saving object')         
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)
                 
    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        import time
        t1_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        m9=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_format_regulons = t1_stop-t1_start

                    
    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        import time
        t1_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata', 
                                     key_added='eRegulon_signatures')
        m10=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_signatures = t1_stop-t1_start
                                     
    if simplified_eGRN is True:
        md = scplus_obj.uns['eRegulon_signatures']['Gene_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Gene_based'][x]))+'g)': scplus_obj.uns['eRegulon_signatures']['Gene_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Gene_based'].keys()}

        md = scplus_obj.uns['eRegulon_signatures']['Region_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Region_based'][x]))+'r)': scplus_obj.uns['eRegulon_signatures']['Region_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Region_based'].keys()}

    
    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            import time
            t1_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            m11=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_region_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        m12=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_regions = t1_stop-t1_start
        if gene_ranking is None:
            import time
            t1_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            m13=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_gene_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        m14=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_genes = t1_stop-t1_start
                
                                
    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        import time
        t1_start = time.time()
        binarize_AUC(scplus_obj, 
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        m15=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_binarize_regulons = t1_stop-t1_start
             
    if not hasattr(scplus_obj, 'dr_cell'):
        import time
        t1_start = time.time()
        scplus_obj.dr_cell = {}         
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        m16=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_dimensionality_reduction = t1_stop-t1_start
                   
    if 'RSS' not in scplus_obj.uns.keys():
        import time
        t1_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        m17=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_rss = t1_stop-t1_start
                         
    if calculate_DEGs_DARs is True:
        import time
        t1_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        m18=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_degs_dars = t1_stop-t1_start
            
    if export_to_loom_file is True:
        import time
        t1_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj, 
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj, 
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        m19=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_loom = t1_stop-t1_start
               
    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        import time
        t1_start = time.time()
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        m20=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_ucsc = t1_stop-t1_start
        
    times = [time_create_object, time_filter_genes, time_filter_regions, time_cistromes, time_search_space, time_region_to_gene, time_tf_to_gene, time_gsea, time_format_regulons, time_signatures, time_region_ranking, time_aucell_regions, 
             time_gene_ranking, time_aucell_genes, time_binarize_regulons, time_dimensionality_reduction, time_rss, time_degs_dars, time_export_loom, time_export_ucsc]
    df = pd.DataFrame(times, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Time'])
    df.to_csv(save_path+'running_times.tsv', sep='\t') 
    memory=[m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18,m19,m20]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'cummulative_memory.tsv', sep='\t')  
    memory=[m1,m2-m1,m3-m2,m4-m3, m5-m4, m6-m5, m7-m6,m8-m7,m9-m8,m10-m9,m11-m10,m12-m11,m13-m12, m14-m13, m15-m14, m16-m15, m17-m16, m18-m17, m19-m18, m20-m19]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'memory_per_step.tsv', sep='\t')  
        
    log.info('Saving object')         
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)
        
    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60)) 
    
# Load data
# Every pickle is read inside a `with` block so the file handle is released
# even when unpickling raises.
import pickle
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/low_simulation/DPCL_cisTopicObject_3K_fragments_25K_cells/'
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed data
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/low/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Expression matrix taken from the loom file as-is (no renaming is done here)
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed pycistarget results (menr.pkl)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ workflow; all outputs go to <outDir>/scenicplus/
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM submission script for the DPCL_cisTopicObject_3K_fragments_25K_cells
# benchmark: it requests the resources below and then runs the matching
# Python script inside the SCENIC+ Singularity image.

# Scheduling target, accounting and e-mail notifications; a single task.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs (the Python script is called with n_cpu = 20),
# an 8 hour time limit and 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_3K_fragments_25K_cells
#SBATCH --output=DPCL_cisTopicObject_3K_fragments_25K_cells.out
#SBATCH --error=DPCL_cisTopicObject_3K_fragments_25K_cells.err

# -B bind-mounts the listed host directories into the container so the
# absolute paths used by the Python script resolve there as well.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_3K_fragments_25K_cells.py

# DPCL_cisTopicObject_10K_fragments_80_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    Format the differential test results of one group.

    The input frame is left untouched: a new frame is built instead of
    re-indexing `df` in place and assigning into a column slice of it.

    Parameters
    ----------
    df
        Data frame with (at least) the columns 'names', 'logfoldchanges'
        and 'pvals_adj'.
    key
        Value written to the 'Contrast' column of every row.
    adjpval_thr
        Rows whose adjusted p-value is above this threshold are dropped.
    log2fc_thr
        Rows whose log2 fold change is below this threshold are dropped.

    Returns
    -------
    A new data frame indexed by the values of 'names' (index name cleared)
    with the columns 'Log2FC', 'Adjusted_pval' and 'Contrast', sorted by
    decreasing 'Log2FC' and, for ties, increasing 'Adjusted_pval'.
    """
    # set_index / rename / rename_axis each return a new object, so the
    # caller's frame keeps its own index and columns.
    formatted = (
        df.set_index('names')[['logfoldchanges', 'pvals_adj']]
        .rename(columns={'logfoldchanges': 'Log2FC',
                         'pvals_adj': 'Adjusted_pval'})
        .rename_axis(None)
    )
    formatted['Contrast'] = key
    # Both thresholds are inclusive.
    keep = (formatted['Adjusted_pval'] <= adjpval_thr) & (
        formatted['Log2FC'] >= log2fc_thr)
    return formatted.loc[keep].sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential genes (DEGs) and/or regions (DARs) per group of `variable`.

    For every requested contrast an AnnData object is built from the
    expression (`X_EXP`) or accessibility (`X_ACC`, transposed) matrix of
    `scplus_obj`, groups with fewer than `min_cells` cells are removed, the
    data are normalised and log-transformed, and a Wilcoxon test is run for
    each group. Results are stored in `scplus_obj.uns[contrast][variable]`
    as a dictionary mapping each group to its formatted result table.

    Parameters
    ----------
    scplus_obj
        SCENIC+ object providing `X_EXP`, `X_ACC`, `cell_names`, `gene_names`,
        `region_names`, `metadata_cell` and `uns`.
    variable
        Name of the column of `scplus_obj.metadata_cell` that defines the groups.
    use_hvg
        Whether to restrict the test to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Adjusted p-value threshold passed to `_format_df`.
    log2fc_thr
        Log2 fold change threshold passed to `_format_df`.
    min_cells
        Minimum number of cells a group needs in order to be kept.

    Raises
    ------
    ValueError
        If `contrast_type` contains an entry other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        elif contrast == 'DARs':
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        else:
            # Without this guard an unknown contrast would either fail with an
            # unrelated NameError or silently reuse the previous iteration's
            # (already normalised and subset) data.
            raise ValueError(
                "Unknown contrast type '" + str(contrast) +
                "'; expected 'DEGs' or 'DARs'")
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count.index[(label_count >= min_cells).values]
        # Vectorised membership test instead of one Python-level lookup per cell.
        keepcells = adata.obs[variable].isin(keeplabels).values
        adata = adata[keepcells]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Run the SCENIC+ workflow while recording wall-clock time and memory per step.

    After each step the elapsed time (seconds) and the resident set size of
    the current process (bytes / 1024**2 / 1000) are recorded. Three tables
    are written to `save_path`: 'running_times.tsv', 'cummulative_memory.tsv'
    and 'memory_per_step.tsv'. A step that is skipped (its result is already
    in `scplus_obj.uns`, a ranking was supplied, or the corresponding flag is
    off) is reported as NaN instead of aborting the run before the final save.
    The SCENIC+ object is pickled with dill to 'scplus_obj.pkl' after the
    eGRN is built and again at the end.

    Parameters
    ----------
    GEX_anndata, cistopic_obj, imputed_acc_obj, menr
        Inputs forwarded to `create_SCENICPLUS_object`.
    variable
        Cell metadata columns used for the regulon specificity scores and the
        differential features.
    species, assembly, biomart_host, upstream, downstream
        Forwarded to `get_search_space`; `species`, `assembly` and
        `biomart_host` are also used by the loom/UCSC exports.
    tf_file
        Forwarded to `calculate_TFs_to_genes_relationships`.
    save_path
        Output folder; created when it does not exist.
    region_ranking, gene_ranking
        Precomputed rankings; computed (and pickled) when None.
    simplified_eGRN
        When True, signatures sharing the same shortened name are merged.
    calculate_TF_eGRN_correlation
        Accepted for interface compatibility; not used by this function.
    calculate_DEGs_DARs, export_to_loom_file, export_to_UCSC_file
        Switches for the corresponding optional steps.
    tree_structure
        Forwarded to `export_to_loom`.
    path_bedToBigBed
        Forwarded to the UCSC export functions.
    n_cpu, _temp_dir
        Parallelisation settings forwarded to the inference steps.
    **kwargs
        Forwarded to `calculate_regions_to_genes_relationships`,
        `calculate_TFs_to_genes_relationships` and `build_grn`.
    """
    # Kept local so the name always refers to the stdlib module, whatever the
    # file's wildcard imports bind at module level.
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')

    # Row order of the three benchmark tables.
    step_names = ['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space',
                  'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                  'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons',
                  'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC']
    # Only steps that actually ran get an entry; the rest become NaN below.
    times = {}
    memory = {}

    def _record(step, t_start):
        """Store the current process memory and the time elapsed since `t_start`."""
        memory[step] = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        times[step] = time.time() - t_start

    start_time = time.time()
    t_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    _record('Create_object', t_start)

    log.info('Filter genes')
    t_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    _record('Filter_genes', t_start)

    log.info('Filter regions')
    t_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    _record('Filter_regions', t_start)

    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)
    else:
        log.info(save_path + " folder already exists.")

    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        t_start = time.time()
        merge_cistromes(scplus_obj)
        _record('Merge_cistromes', t_start)

    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        t_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly,
                     upstream = upstream,
                     downstream = downstream)
        _record('Search_space', t_start)

    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        t_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj,
                        ray_n_cpu = n_cpu,
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        _record('Region_to_gene', t_start)
        # os.path.join keeps the file inside save_path even when save_path
        # has no trailing separator.
        scplus_obj.uns['region_to_gene'].to_csv(
            os.path.join(save_path, 'region_to_gene.tsv'), sep='\t')

    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        t_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj,
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu,
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        _record('TF_to_gene', t_start)
        scplus_obj.uns['TF2G_adj'].to_csv(
            os.path.join(save_path, 'tf_to_gene.tsv'), sep='\t')

    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        t_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True,
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False,
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        _record('GSEA', t_start)
        # Checkpoint: the steps up to here are the expensive ones.
        log.info('Saving object')
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)

    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        t_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', t_start)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        t_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata',
                                     key_added='eRegulon_signatures')
        _record('Signatures', t_start)

    if simplified_eGRN is True:
        signatures = scplus_obj.uns['eRegulon_signatures']
        for signature_key, unit in (('Gene_based', 'g'), ('Region_based', 'r')):
            md = signatures[signature_key]
            # Name before '_(' minus its last two characters.
            # NOTE(review): presumably this strips a sign suffix - confirm.
            names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
            # Union of all signatures whose key starts with the shortened name.
            merged = {x: list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
            # Re-append the size suffix, e.g. '_(12g)' / '_(12r)'.
            signatures[signature_key] = {x+'_('+str(len(merged[x]))+unit+')': merged[x] for x in merged.keys()}

    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            t_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            _record('Region_ranking', t_start)
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', t_start)
        if gene_ranking is None:
            t_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', t_start)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', t_start)

    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        t_start = time.time()
        binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', t_start)

    # The timer always starts here, so the reported time covers only the
    # UMAP/tSNE work and is recorded whenever either projection was computed.
    t_start = time.time()
    ran_dimensionality_reduction = False
    # Also covers an attribute that exists but is None, which would otherwise
    # fail on `.keys()` below.
    if getattr(scplus_obj, 'dr_cell', None) is None:
        scplus_obj.dr_cell = {}
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if ran_dimensionality_reduction:
        _record('Dimensionality_reduction', t_start)

    if 'RSS' not in scplus_obj.uns.keys():
        t_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', t_start)

    if calculate_DEGs_DARs is True:
        t_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', t_start)

    if export_to_loom_file is True:
        t_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', t_start)

    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        t_start = time.time()
        # Region-to-gene links coloured by correlation (rho).
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        # Same links coloured by importance, scaled per gene.
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', t_start)

    # Benchmark tables. Skipped steps are written as NaN.
    missing = float('nan')
    df = pd.DataFrame([times.get(step, missing) for step in step_names],
                      index=step_names, columns=['Time'])
    df.to_csv(os.path.join(save_path, 'running_times.tsv'), sep='\t')

    df = pd.DataFrame([memory.get(step, missing) for step in step_names],
                      index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')

    # Memory per step = change relative to the last step that was measured;
    # the first measured step is reported as its absolute value.
    memory_per_step = []
    previous = 0.0
    for step in step_names:
        if step in memory:
            memory_per_step.append(memory[step] - previous)
            previous = memory[step]
        else:
            memory_per_step.append(missing)
    df = pd.DataFrame(memory_per_step, index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
    
# Load data
# Every pickle is read inside a `with` block so the file handle is released
# even when unpickling raises.
import pickle
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_80_cells/'
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed data
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/medium/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Expression matrix taken from the loom file as-is (no renaming is done here)
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed pycistarget results (menr.pkl)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ workflow; all outputs go to <outDir>/scenicplus/
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM submission script: runs DPCL_cisTopicObject_10K_fragments_80_cells.py
# with the python interpreter of the scenicplus Singularity image.

# Scheduling target (partition / cluster / account) and e-mail notifications.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources for the single task: 20 CPUs, 8 hours wall time, 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_10K_fragments_80_cells
#SBATCH --output=DPCL_cisTopicObject_10K_fragments_80_cells.out
#SBATCH --error=DPCL_cisTopicObject_10K_fragments_80_cells.err

# -B bind-mounts the listed host directories into the container so the paths
# used by the Python script resolve inside it.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_10K_fragments_80_cells.py

# DPCL_cisTopicObject_10K_fragments_1K_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results
    """
    df.index = df['names']
    df = df[['logfoldchanges', 'pvals_adj']]
    df.columns = ['Log2FC', 'Adjusted_pval']
    df['Contrast'] = key
    df.index.name = None
    df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
    df = df.loc[df['Log2FC'] >= log2fc_thr]
    df = df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )
    return df

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of `variable` and store them in
    `scplus_obj.uns[contrast][variable]`.

    Parameters
    ----------
    scplus_obj
        Object providing X_EXP, X_ACC, cell_names, gene_names, region_names,
        metadata_cell and uns.
    variable
        Column of `scplus_obj.metadata_cell` defining the groups to contrast.
    use_hvg
        If True, restrict the test to the features flagged by
        `sc.pp.highly_variable_genes`.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Maximum adjusted p-value kept in the results.
    log2fc_thr
        Minimum log2 fold change kept in the results.
    min_cells
        Groups with fewer cells than this are removed before testing.

    Raises
    ------
    ValueError
        If `contrast_type` contains anything other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    # Fail early: an unknown contrast would otherwise hit an unbound `adata`
    # or silently reuse the matrix of the previous contrast.
    unknown = [c for c in contrast_type if c not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "contrast_type entries must be 'DEGs' or 'DARs', got: {}".format(unknown))

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:
            # 'DARs': X_ACC is transposed so that rows line up with cell_names
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count.index[label_count.values >= min_cells]
        # Vectorised mask instead of one Python-level lookup per cell
        keep_mask = adata.obs[variable].isin(keeplabels).values
        adata = adata[keep_mask]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Run the SCENIC+ workflow step by step and record, for every step, the
    elapsed wall-clock time and the resident memory of the process.

    A step is skipped when its result key is already present in
    `scplus_obj.uns`, when a precomputed ranking is supplied, or when it is
    disabled through its flag. Skipped steps are reported as NaN in the
    tables instead of aborting the run with a NameError.

    Parameters
    ----------
    GEX_anndata, cistopic_obj, imputed_acc_obj, menr
        Inputs forwarded to `create_SCENICPLUS_object`.
    variable
        Cell metadata variables; each one is used for the regulon specificity
        scores and for the DEG/DAR calculation.
    species, assembly
        Forwarded to `get_search_space` and to the export functions.
    tf_file
        Forwarded to `calculate_TFs_to_genes_relationships`.
    save_path
        Output folder; created when it does not exist.
    biomart_host
        Forwarded to `get_search_space` and `export_to_UCSC_interact`.
    upstream, downstream
        Forwarded to `get_search_space`.
    region_ranking, gene_ranking
        Precomputed rankings. When None they are computed with
        `make_rankings` and pickled into `save_path`.
    simplified_eGRN
        If True, signatures whose names share the same prefix (the part
        before '_(' minus its last two characters) are merged.
    calculate_TF_eGRN_correlation
        Accepted for interface compatibility; not used by this function.
    calculate_DEGs_DARs, export_to_loom_file, export_to_UCSC_file
        Switches for the corresponding steps.
    tree_structure
        Forwarded to `export_to_loom`.
    path_bedToBigBed
        Forwarded to the UCSC/eRegulon export functions.
    n_cpu, _temp_dir
        Parallelism and temporary directory forwarded to the heavy steps.
    **kwargs
        Forwarded to `calculate_regions_to_genes_relationships`,
        `calculate_TFs_to_genes_relationships` and `build_grn`.

    Outputs written to `save_path`
    ------------------------------
    running_times.tsv, cummulative_memory.tsv, memory_per_step.tsv,
    region_to_gene.tsv, tf_to_gene.tsv, scplus_obj.pkl and, depending on
    the steps run, the ranking pickles, loom files and bed/bigBed files.
    """
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')

    # Row labels of the three benchmark tables, in reporting order.
    step_names = ['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space',
                  'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                  'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons',
                  'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC']
    not_run = float('nan')
    times = dict.fromkeys(step_names, not_run)
    memory = dict.fromkeys(step_names, not_run)

    def _record(step, t_start):
        # Resident set size in bytes, divided by 1024*1024 and then by 1000,
        # sampled right after the step and before the timer is stopped.
        memory[step] = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        times[step] = time.time() - t_start

    start_time = time.time()
    t_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    _record('Create_object', t_start)

    log.info('Filter genes')
    t_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    _record('Filter_genes', t_start)

    log.info('Filter regions')
    t_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    _record('Filter_regions', t_start)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)
    else:
        log.info(save_path + " folder already exists.")

    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        t_start = time.time()
        merge_cistromes(scplus_obj)
        _record('Merge_cistromes', t_start)

    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        t_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly,
                     upstream = upstream,
                     downstream = downstream)
        _record('Search_space', t_start)

    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        t_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj,
                        ray_n_cpu = n_cpu,
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        _record('Region_to_gene', t_start)
        # Writing the table is deliberately outside the timed section
        scplus_obj.uns['region_to_gene'].to_csv(os.path.join(save_path, 'region_to_gene.tsv'), sep='\t')

    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        t_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj,
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu,
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        _record('TF_to_gene', t_start)
        scplus_obj.uns['TF2G_adj'].to_csv(os.path.join(save_path, 'tf_to_gene.tsv'), sep='\t')

    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        t_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True,
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False,
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        _record('GSEA', t_start)
        # Checkpoint the object once the eGRN has been built
        log.info('Saving object')
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)

    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        t_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', t_start)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        t_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata',
                                     key_added='eRegulon_signatures')
        _record('Signatures', t_start)

    if simplified_eGRN is True:
        # Merge signatures sharing the same name prefix and re-append the
        # size suffix ('g' for genes, 'r' for regions).
        for signature_type, unit in (('Gene_based', 'g'), ('Region_based', 'r')):
            md = scplus_obj.uns['eRegulon_signatures'][signature_type]
            names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
            merged = {x: list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
            scplus_obj.uns['eRegulon_signatures'][signature_type] = {
                x + '_(' + str(len(merged[x])) + unit + ')': merged[x] for x in merged.keys()}

    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            t_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            _record('Region_ranking', t_start)
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', t_start)
        if gene_ranking is None:
            t_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', t_start)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', t_start)

    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        t_start = time.time()
        binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', t_start)

    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    # The timer always starts here, so the measurement never includes a
    # previous step; it is recorded when at least one embedding is computed.
    t_start = time.time()
    ran_dimensionality_reduction = False
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if ran_dimensionality_reduction:
        _record('Dimensionality_reduction', t_start)

    if 'RSS' not in scplus_obj.uns.keys():
        t_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', t_start)

    if calculate_DEGs_DARs is True:
        t_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', t_start)

    if export_to_loom_file is True:
        t_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', t_start)

    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        t_start = time.time()
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', t_start)

    # Benchmark tables: one row per step, NaN for steps that did not run.
    df = pd.DataFrame([times[step] for step in step_names], index=step_names, columns=['Time'])
    df.to_csv(os.path.join(save_path, 'running_times.tsv'), sep='\t')

    df = pd.DataFrame([memory[step] for step in step_names], index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')

    # Per-step memory: difference with the last step that was actually
    # measured (the first measured step reports its absolute value).
    memory_per_step = []
    previous = None
    for step in step_names:
        current = memory[step]
        if pd.isna(current):
            memory_per_step.append(not_run)
            continue
        memory_per_step.append(current if previous is None else current - previous)
        previous = current
    df = pd.DataFrame(memory_per_step, index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
    
# Load data
# All inputs are read from the simulation folder below; the pickles are opened
# with context managers so the handles are closed even if unpickling fails.
# NOTE: pickle.load executes arbitrary code; only use on trusted files.
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_1K_cells/'
import pickle
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed accessibility
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/medium/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Expression matrix taken from the loom file and wrapped in an AnnData object
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed pycistarget results (menr.pkl)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ workflow; outputs go to <outDir>/scenicplus/
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM submission script: runs DPCL_cisTopicObject_10K_fragments_1K_cells.py
# with the python interpreter of the scenicplus Singularity image.

# Scheduling target (partition / cluster / account) and e-mail notifications.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources for the single task: 20 CPUs, 8 hours wall time, 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_10K_fragments_1K_cells
#SBATCH --output=DPCL_cisTopicObject_10K_fragments_1K_cells.out
#SBATCH --error=DPCL_cisTopicObject_10K_fragments_1K_cells.err

# -B bind-mounts the listed host directories into the container so the paths
# used by the Python script resolve inside it.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_10K_fragments_1K_cells.py

# DPCL_cisTopicObject_10K_fragments_10K_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results
    """
    df.index = df['names']
    df = df[['logfoldchanges', 'pvals_adj']]
    df.columns = ['Log2FC', 'Adjusted_pval']
    df['Contrast'] = key
    df.index.name = None
    df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
    df = df.loc[df['Log2FC'] >= log2fc_thr]
    df = df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )
    return df

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of `variable` and store them in
    `scplus_obj.uns[contrast][variable]`.

    Parameters
    ----------
    scplus_obj
        Object providing X_EXP, X_ACC, cell_names, gene_names, region_names,
        metadata_cell and uns.
    variable
        Column of `scplus_obj.metadata_cell` defining the groups to contrast.
    use_hvg
        If True, restrict the test to the features flagged by
        `sc.pp.highly_variable_genes`.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Maximum adjusted p-value kept in the results.
    log2fc_thr
        Minimum log2 fold change kept in the results.
    min_cells
        Groups with fewer cells than this are removed before testing.

    Raises
    ------
    ValueError
        If `contrast_type` contains anything other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    # Fail early: an unknown contrast would otherwise hit an unbound `adata`
    # or silently reuse the matrix of the previous contrast.
    unknown = [c for c in contrast_type if c not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "contrast_type entries must be 'DEGs' or 'DARs', got: {}".format(unknown))

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:
            # 'DARs': X_ACC is transposed so that rows line up with cell_names
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count.index[label_count.values >= min_cells]
        # Vectorised mask instead of one Python-level lookup per cell
        keep_mask = adata.obs[variable].isin(keeplabels).values
        adata = adata[keep_mask]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Run the SCENIC+ workflow step by step, recording wall time and memory per step.

    After each step the elapsed wall-clock time and the resident set size of the
    current process are recorded. Three tables are written to `save_path` at the
    end: 'running_times.tsv', 'cummulative_memory.tsv' and 'memory_per_step.tsv'.
    Steps that are skipped (result already present in `scplus_obj.uns`, ranking
    supplied by the caller, or step disabled by a flag) are reported as NaN.

    Parameters
    ----------
    GEX_anndata, cistopic_obj, imputed_acc_obj, menr
        Forwarded to `create_SCENICPLUS_object`.
    variable
        Cell metadata column names; each one is used for the regulon specificity
        scores and, if enabled, for the DEG/DAR calculation.
    species, assembly, biomart_host, upstream, downstream
        Forwarded to `get_search_space`; `species`, `assembly` and `biomart_host`
        are also forwarded to the export functions.
    tf_file
        Forwarded to `calculate_TFs_to_genes_relationships`.
    save_path
        Output directory; created if it does not exist.
    region_ranking, gene_ranking
        Precomputed rankings. When None they are computed with `make_rankings`
        and pickled into `save_path`.
    simplified_eGRN
        If True, signatures whose names share the same prefix (name before
        '_(' minus its last two characters) are merged before scoring.
    calculate_TF_eGRN_correlation
        Accepted for interface compatibility; not used by this function.
    calculate_DEGs_DARs
        If True, run `get_differential_features_time` for each variable.
    export_to_loom_file, export_to_UCSC_file
        Enable the loom export and the UCSC (bed/bigBed) export respectively.
    tree_structure
        Forwarded to `export_to_loom`.
    path_bedToBigBed
        Forwarded to `export_to_UCSC_interact` and `export_eRegulons`.
    n_cpu, _temp_dir
        Forwarded to the parallelised steps.
    **kwargs
        Forwarded to `calculate_regions_to_genes_relationships`,
        `calculate_TFs_to_genes_relationships` and `build_grn`.

    Returns
    -------
    None. Results are stored in the SCENIC+ object, which is pickled to
    'scplus_obj.pkl' inside `save_path`.
    """
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')

    # Row order of the three benchmark tables written at the end.
    step_names = ['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space',
                  'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                  'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes',
                  'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs',
                  'Export_loom', 'Export_UCSC']
    times = {}
    memory = {}

    def _record(step, t_start):
        # Store the measurements of one finished step. Memory is the process RSS
        # in MiB divided by 1000 (the unit used by the benchmark tables).
        memory[step] = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        times[step] = time.time() - t_start

    def _simplify_signatures(signatures, unit):
        # Merge signatures sharing a name prefix and rename them to
        # '<prefix>_(<n><unit>)', where n is the size of the merged signature.
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in signatures.keys()]))
        merged = {x: list(set(sum([value for key, value in signatures.items() if key.startswith(x)], [])))
                  for x in names}
        return {x + '_(' + str(len(merged[x])) + unit + ')': merged[x] for x in merged.keys()}

    start_time = time.time()
    t_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    _record('Create_object', t_start)

    log.info('Filter genes')
    t_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    _record('Filter_genes', t_start)

    log.info('Filter regions')
    t_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    _record('Filter_regions', t_start)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)
    else:
        log.info(save_path + " folder already exists.")

    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        t_start = time.time()
        merge_cistromes(scplus_obj)
        _record('Merge_cistromes', t_start)

    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        t_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly,
                     upstream = upstream,
                     downstream = downstream)
        _record('Search_space', t_start)

    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        t_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj,
                        ray_n_cpu = n_cpu,
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        _record('Region_to_gene', t_start)
        # os.path.join keeps this file inside save_path even without a trailing slash.
        scplus_obj.uns['region_to_gene'].to_csv(os.path.join(save_path, 'region_to_gene.tsv'), sep='\t')

    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        t_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj,
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu,
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        _record('TF_to_gene', t_start)
        scplus_obj.uns['TF2G_adj'].to_csv(os.path.join(save_path, 'tf_to_gene.tsv'), sep='\t')

    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        t_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True,
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False,
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        _record('GSEA', t_start)
        # Intermediate checkpoint right after the eGRN has been built.
        log.info('Saving object')
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)

    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        t_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', t_start)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        t_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata',
                                     key_added='eRegulon_signatures')
        _record('Signatures', t_start)

    if simplified_eGRN is True:
        signatures = scplus_obj.uns['eRegulon_signatures']
        signatures['Gene_based'] = _simplify_signatures(signatures['Gene_based'], 'g')
        signatures['Region_based'] = _simplify_signatures(signatures['Region_based'], 'r')

    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            t_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            _record('Region_ranking', t_start)
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', t_start)
        if gene_ranking is None:
            t_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', t_start)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', t_start)

    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        t_start = time.time()
        binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', t_start)

    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    # Start the timer unconditionally so that this step never reuses the start
    # time of an earlier step.
    t_start = time.time()
    ran_dimensionality_reduction = False
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if ran_dimensionality_reduction:
        _record('Dimensionality_reduction', t_start)

    if 'RSS' not in scplus_obj.uns.keys():
        t_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', t_start)

    if calculate_DEGs_DARs is True:
        t_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', t_start)

    if export_to_loom_file is True:
        t_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', t_start)

    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        t_start = time.time()
        # Region-to-gene links coloured by correlation (rho).
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        # Region-to-gene links coloured by importance, scaled per gene.
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', t_start)

    # Skipped steps have no measurement; report them as NaN instead of failing.
    missing = float('nan')

    df = pd.DataFrame([times.get(step, missing) for step in step_names],
                      index=step_names, columns=['Time'])
    df.to_csv(os.path.join(save_path, 'running_times.tsv'), sep='\t')

    df = pd.DataFrame([memory.get(step, missing) for step in step_names],
                      index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')

    # Memory change of each step relative to the previous recorded step
    # (the first step is reported as its absolute value).
    memory_per_step = []
    previous = 0.0
    for step in step_names:
        if step in memory:
            memory_per_step.append(memory[step] - previous)
            previous = memory[step]
        else:
            memory_per_step.append(missing)
    df = pd.DataFrame(memory_per_step, index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
    
# Load data
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles produced by trusted runs of this pipeline.
import pickle

## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_10K_cells/'
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)

## Precomputed imputed data
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)

## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/medium/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Build the AnnData directly from the loom expression matrix
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)

## Precomputed menr object (pycistarget output)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ workflow; all outputs go to <outDir>/scenicplus/
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM submission script: runs DPCL_cisTopicObject_10K_fragments_10K_cells.py
# inside the scenicplus Singularity image.

# Target partition/cluster/account, e-mail notifications, and a single task.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs (presumably chosen to match n_cpu = 20 in the Python
# script - confirm), 8 hours of wall time, 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_10K_fragments_10K_cells
#SBATCH --output=DPCL_cisTopicObject_10K_fragments_10K_cells.out
#SBATCH --error=DPCL_cisTopicObject_10K_fragments_10K_cells.err

# -B bind-mounts the listed host directories into the container so the data,
# software and scratch paths used by the Python script are visible inside it.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_10K_fragments_10K_cells.py

# DPCL_cisTopicObject_10K_fragments_25K_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results

    Parameters
    ----------
    df
        Data frame with at least the columns 'names', 'logfoldchanges' and
        'pvals_adj'. It is not modified.
    key
        Value stored in the 'Contrast' column of every returned row.
    adjpval_thr
        Rows with an adjusted p-value above this threshold are dropped.
    log2fc_thr
        Rows with a log2 fold change below this threshold are dropped.

    Returns
    -------
    A new data frame indexed by feature name (unnamed index) with the columns
    'Log2FC', 'Adjusted_pval' and 'Contrast', sorted by decreasing Log2FC and
    then by increasing Adjusted_pval.
    """
    # Build a fresh frame instead of editing `df` (or a slice of it) in place,
    # so the caller's data is left untouched and no chained assignment occurs.
    formatted = (
        df.set_index('names')[['logfoldchanges', 'pvals_adj']]
        .rename(columns={'logfoldchanges': 'Log2FC', 'pvals_adj': 'Adjusted_pval'})
        .rename_axis(None)
        .assign(Contrast=key)
    )
    passes = (formatted['Adjusted_pval'] <= adjpval_thr) & (formatted['Log2FC'] >= log2fc_thr)
    return formatted.loc[passes].sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential genes (DEGs) and/or regions (DARs) per group of cells.

    For each requested contrast an AnnData object is built from the expression
    matrix (`X_EXP`, for 'DEGs') or the transposed accessibility matrix
    (`X_ACC`, for 'DARs') of `scplus_obj`. Cells whose label in `variable`
    occurs fewer than `min_cells` times are removed, the data are normalised and
    log-transformed, optionally restricted to highly variable features, and
    groups are compared with a Wilcoxon test using Bonferroni correction.

    Parameters
    ----------
    scplus_obj
        SCENIC+ object; results are written to `scplus_obj.uns`.
    variable
        Name of the column in `scplus_obj.metadata_cell` defining the groups.
    use_hvg
        Whether to restrict the test to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Maximum adjusted p-value for a feature to be reported.
    log2fc_thr
        Minimum log2 fold change for a feature to be reported.
    min_cells
        Minimum number of cells a label needs in order to be kept.

    Returns
    -------
    None. `scplus_obj.uns[contrast][variable]` is set to a dictionary mapping
    each group to its formatted result data frame.

    Raises
    ------
    ValueError
        If `contrast_type` contains an entry other than 'DEGs' or 'DARs'.
    """
    # Fail early: an unknown contrast would otherwise leave `adata` undefined
    # (or silently reuse the already processed data of the previous contrast).
    unknown = [contrast for contrast in contrast_type if contrast not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "contrast_type entries must be 'DEGs' or 'DARs', got: " + str(unknown))

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:
            # 'DARs': the accessibility matrix is transposed before use.
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count[label_count >= min_cells].index
        # Vectorised membership test instead of one Series lookup per cell.
        keep_mask = adata.obs[variable].isin(keeplabels).to_numpy()
        keepcellids = adata.obs.index[keep_mask].tolist()
        adata = adata[keepcellids]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')
    
    import time
    start_time = time.time()
    t1_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    m1=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_create_object = t1_stop-t1_start
    
    log.info('Filter genes')
    import time
    t1_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    m2=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_genes = t1_stop-t1_start
    log.info('Filter regions')
    import time
    t1_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    m3=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_regions = t1_stop-t1_start
    
    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)

    else:
        log.info(save_path + " folder already exists.")
    
    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        import time
        t1_start = time.time()
        merge_cistromes(scplus_obj)
        m4=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_cistromes = t1_stop-t1_start
    
    
    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        import time
        t1_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly, 
                     upstream = upstream,
                     downstream = downstream)
        m5=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_search_space = t1_stop-t1_start
                 
    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        import time
        t1_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj, 
                        ray_n_cpu = n_cpu, 
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        m6=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_region_to_gene = t1_stop-t1_start
        scplus_obj.uns['region_to_gene'].to_csv(save_path+'region_to_gene.tsv', sep='\t')

                        
    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        import time
        t1_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj, 
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu, 
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        m7=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_tf_to_gene = t1_stop-t1_start
        scplus_obj.uns['TF2G_adj'].to_csv(save_path+'tf_to_gene.tsv', sep='\t')
                        
    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        import time
        t1_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True, 
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False, 
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        m8=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_gsea = t1_stop-t1_start
        log.info('Saving object')         
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)
                 
    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        import time
        t1_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        m9=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_format_regulons = t1_stop-t1_start

                    
    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        import time
        t1_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata', 
                                     key_added='eRegulon_signatures')
        m10=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_signatures = t1_stop-t1_start
                                     
    if simplified_eGRN is True:
        md = scplus_obj.uns['eRegulon_signatures']['Gene_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Gene_based'][x]))+'g)': scplus_obj.uns['eRegulon_signatures']['Gene_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Gene_based'].keys()}

        md = scplus_obj.uns['eRegulon_signatures']['Region_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Region_based'][x]))+'r)': scplus_obj.uns['eRegulon_signatures']['Region_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Region_based'].keys()}

    
    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            import time
            t1_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            m11=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_region_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        m12=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_regions = t1_stop-t1_start
        if gene_ranking is None:
            import time
            t1_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            m13=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_gene_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        m14=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_genes = t1_stop-t1_start
                
                                
    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        import time
        t1_start = time.time()
        binarize_AUC(scplus_obj, 
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        m15=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_binarize_regulons = t1_stop-t1_start
             
    if not hasattr(scplus_obj, 'dr_cell'):
        import time
        t1_start = time.time()
        scplus_obj.dr_cell = {}         
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        m16=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_dimensionality_reduction = t1_stop-t1_start
                   
    if 'RSS' not in scplus_obj.uns.keys():
        import time
        t1_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        m17=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_rss = t1_stop-t1_start
                         
    if calculate_DEGs_DARs is True:
        import time
        t1_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        m18=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_degs_dars = t1_stop-t1_start
            
    if export_to_loom_file is True:
        import time
        t1_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj, 
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj, 
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        m19=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_loom = t1_stop-t1_start
               
    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        import time
        t1_start = time.time()
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        m20=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_ucsc = t1_stop-t1_start
        
    times = [time_create_object, time_filter_genes, time_filter_regions, time_cistromes, time_search_space, time_region_to_gene, time_tf_to_gene, time_gsea, time_format_regulons, time_signatures, time_region_ranking, time_aucell_regions, 
             time_gene_ranking, time_aucell_genes, time_binarize_regulons, time_dimensionality_reduction, time_rss, time_degs_dars, time_export_loom, time_export_ucsc]
    df = pd.DataFrame(times, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Time'])
    df.to_csv(save_path+'running_times.tsv', sep='\t') 
    memory=[m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18,m19,m20]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'cummulative_memory.tsv', sep='\t')  
    memory=[m1,m2-m1,m3-m2,m4-m3, m5-m4, m6-m5, m7-m6,m8-m7,m9-m8,m10-m9,m11-m10,m12-m11,m13-m12, m14-m13, m15-m14, m16-m15, m17-m16, m18-m17, m19-m18, m20-m19]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'memory_per_step.tsv', sep='\t')  
        
    log.info('Saving object')         
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)
        
    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60)) 
    
# Load data
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_25K_cells/'
import pickle
# Context managers guarantee each file handle is closed even if unpickling fails.
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed data
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/medium/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Use the loom's expression matrix (ex_mtx) directly as AnnData.X
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed motif enrichment results (pycistarget/menr.pkl)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed pipeline; outputs are written under outDir + 'scenicplus/'
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results
    """
    df.index = df['names']
    df = df[['logfoldchanges', 'pvals_adj']]
    df.columns = ['Log2FC', 'Adjusted_pval']
    df['Contrast'] = key
    df.index.name = None
    df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
    df = df.loc[df['Log2FC'] >= log2fc_thr]
    df = df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )
    return df

def get_differential_features_time(scplus_obj: 'SCENICPLUS',
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of `variable` and store them in
    `scplus_obj.uns[contrast][variable]`.

    Parameters
    ----------
    scplus_obj
        Object providing X_EXP, X_ACC, cell_names, gene_names, region_names,
        metadata_cell and the `uns` dictionary that receives the results.
    variable
        Column of `scplus_obj.metadata_cell` used to group the cells.
    use_hvg
        If truthy, restrict the test to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
        The default list is only iterated, never mutated.
    adjpval_thr
        Maximum adjusted p-value of a reported feature.
    log2fc_thr
        Minimum log2 fold change of a reported feature.
    min_cells
        Groups of `variable` with fewer cells than this are removed before testing.

    Raises
    ------
    ValueError
        If `contrast_type` contains an entry other than 'DEGs' or 'DARs'.
    """
    # Fail fast: without this check an unknown contrast would either hit an
    # unbound `adata` or silently re-test the previous iteration's matrix.
    unknown = [contrast for contrast in contrast_type
               if contrast not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "contrast_type entries must be 'DEGs' or 'DARs', got {}".format(unknown))

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:  # 'DARs' (validated above); X_ACC is transposed to put cells on the obs axis
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        # (single vectorised mask instead of one pandas lookup per cell)
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count[label_count >= min_cells].index
        adata = adata[adata.obs[variable].isin(keeplabels)]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        # One formatted, thresholded table per group of `variable`
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(scplus_obj,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Run the downstream SCENIC+ steps (from eGRN formatting onwards) on an
    existing object while recording wall time and resident memory per step.

    A step is skipped when its result key is already present in
    `scplus_obj.uns` / `scplus_obj.dr_cell`. Skipped steps are reported as NaN
    in the three TSV reports instead of aborting the run.

    Parameters
    ----------
    scplus_obj
        Object with `uns` (expected to hold 'eRegulons' and 'TF2G_adj' when
        eGRN formatting still has to run) that is updated in place.
    variable
        Metadata columns used for eRSS and DEG/DAR calculation.
    species, biomart_host
        Forwarded to `export_to_UCSC_interact`.
    assembly
        Forwarded to the loom and UCSC/eRegulon exports.
    save_path
        Directory receiving the rankings, loom/bed files, reports and the
        pickled object.
    region_ranking, gene_ranking
        Precomputed rankings; computed (and pickled) when None and AUC
        scoring has to run.
    simplified_eGRN
        If True, merge signatures that share the same name prefix.
    calculate_DEGs_DARs, export_to_loom_file, export_to_UCSC_file
        Switches for the corresponding steps.
    tree_structure
        Forwarded to `export_to_loom`.
    path_bedToBigBed
        Forwarded to the UCSC/eRegulon exports.
    n_cpu
        Number of CPUs for AUC scoring and binarization.
    tf_file, upstream, downstream, calculate_TF_eGRN_correlation, _temp_dir, **kwargs
        Accepted for signature compatibility; not used by this variant.

    Notes
    -----
    Writes 'running_times.tsv', 'cummulative_memory.tsv' and
    'memory_per_step.tsv' to `save_path`. Memory is the process RSS in
    bytes / (1024 * 1024) / 1000. Region ranking is not part of the reports.
    """
    # Reference point for the final 'Finished!' message.
    start_time = time.time()

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')

    # Row order of the three reports.
    report_steps = ['Format_regulons', 'Signatures', 'Aucell_regions', 'Gene_ranking',
                    'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction',
                    'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC']
    elapsed = {}    # step -> seconds
    rss_after = {}  # step -> memory right after the step finished

    def _record(step, t_start):
        # Memory is sampled before the clock is stopped.
        rss_after[step] = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        elapsed[step] = time.time() - t_start

    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        t_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', t_start)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        t_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata',
                                     key_added='eRegulon_signatures')
        _record('Signatures', t_start)

    if simplified_eGRN is True:
        # Merge all signatures sharing the same name prefix (name before '_(' minus
        # its last two characters) and re-label them with the merged size.
        for signature_key, unit in (('Gene_based', 'g'), ('Region_based', 'r')):
            md = scplus_obj.uns['eRegulon_signatures'][signature_key]
            names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
            merged = {x: list({item for key, value in md.items() if key.startswith(x) for item in value})
                      for x in names}
            scplus_obj.uns['eRegulon_signatures'][signature_key] = {
                x+'_('+str(len(merged[x]))+unit+')': merged[x] for x in merged}

    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            region_ranking = make_rankings(scplus_obj, target='region')
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', t_start)
        if gene_ranking is None:
            t_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', t_start)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', t_start)

    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        t_start = time.time()
        binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', t_start)

    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    # The timer always starts here and is recorded when UMAP and/or tSNE ran,
    # so the measurement no longer depends on whether `dr_cell` pre-existed.
    t_start = time.time()
    ran_dimensionality_reduction = False
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if ran_dimensionality_reduction:
        _record('Dimensionality_reduction', t_start)

    if 'RSS' not in scplus_obj.uns.keys():
        t_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', t_start)

    if calculate_DEGs_DARs is True:
        t_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', t_start)

    if export_to_loom_file is True:
        t_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', t_start)

    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        t_start = time.time()
        # Region-to-gene links coloured by rho
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        # Region-to-gene links coloured by importance, scaled per gene
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', t_start)

    # Reports: steps that did not run in this call are written as NaN.
    missing = float('nan')
    times = [elapsed.get(step, missing) for step in report_steps]
    df = pd.DataFrame(times, index=report_steps, columns=['Time'])
    df.to_csv(os.path.join(save_path, 'running_times.tsv'), sep='\t')

    memory = [rss_after.get(step, missing) for step in report_steps]
    df = pd.DataFrame(memory, index=report_steps, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')

    # Per-step memory is the difference to the previously recorded step; the
    # first recorded step is reported as its absolute value.
    memory = []
    previous = 0.0
    for step in report_steps:
        if step in rss_after:
            memory.append(rss_after[step] - previous)
            previous = rss_after[step]
        else:
            memory.append(missing)
    df = pd.DataFrame(memory, index=report_steps, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
    
# Load data
## SCENIC+ object pickled under scenicplus/
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_25K_cells/'
import pickle
# Context managers guarantee each file handle is closed even if unpickling fails.
with open(outDir + 'scenicplus/scplus_obj.pkl', 'rb') as infile:
    scplus_obj = pickle.load(infile)
# Region ranking 
with open(outDir+'scenicplus/region_ranking.pkl', 'rb') as infile:
    region_ranking = pickle.load(infile)

# Continue the timed run, reusing the region ranking loaded above
run_scenicplus_time(scplus_obj,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill',
    region_ranking = region_ranking
    )

In [None]:
#!/bin/bash
# SLURM batch script that runs DPCL_cisTopicObject_10K_fragments_25K_cells_v2.py
# as a single task.

# Scheduling target and e-mail notifications.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs (the Python script is called with n_cpu = 20),
# 24 h wall time, 1500G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=24:00:00
#SBATCH --mem=1500G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_10K_fragments_25K_cells_v2
#SBATCH --output=DPCL_cisTopicObject_10K_fragments_25K_cells_v2.out
#SBATCH --error=DPCL_cisTopicObject_10K_fragments_25K_cells_v2.err

# Run the benchmark script with the Python of the scenicplus Singularity image;
# -B bind-mounts the listed host directories into the container.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_10K_fragments_25K_cells_v2.py

# DPCL_cisTopicObject_20K_fragments_80_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results
    """
    df.index = df['names']
    df = df[['logfoldchanges', 'pvals_adj']]
    df.columns = ['Log2FC', 'Adjusted_pval']
    df['Contrast'] = key
    df.index.name = None
    df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
    df = df.loc[df['Log2FC'] >= log2fc_thr]
    df = df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )
    return df

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of `variable` and store them in
    `scplus_obj.uns[contrast][variable]`.

    Parameters
    ----------
    scplus_obj
        Object providing X_EXP, X_ACC, cell_names, gene_names, region_names,
        metadata_cell and uns.
    variable
        Column of `scplus_obj.metadata_cell` used to group the cells.
    use_hvg
        Whether to restrict the test to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Adjusted p-value threshold applied to the test results.
    log2fc_thr
        Log2 fold change threshold applied to the test results.
    min_cells
        Groups with fewer cells than this are removed before testing.

    Raises
    ------
    ValueError
        If `contrast_type` contains anything other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        elif contrast == 'DARs':
            # X_ACC is transposed so that cells become the observations.
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        else:
            # Fail loudly instead of reusing the matrix of the previous
            # contrast (or hitting a NameError on the first iteration).
            raise ValueError(
                "Unknown contrast type '" + str(contrast) + "'; expected 'DEGs' or 'DARs'")
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count.index[label_count.values >= min_cells].tolist()
        # Vectorised membership test instead of one Series lookup per cell.
        keepcellids = adata.obs.index[adata.obs[variable].isin(keeplabels)].tolist()
        adata = adata[keepcellids]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        # One formatted result table per group.
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Run the SCENIC+ workflow from object creation to export, recording the
    wall-clock time and the resident memory of the process after every step.

    Steps whose result is already present in the object (or that are switched
    off) are skipped and simply do not appear in the benchmark tables.

    Parameters
    ----------
    GEX_anndata, cistopic_obj, imputed_acc_obj, menr
        Inputs passed to `create_SCENICPLUS_object`.
    variable
        Cell metadata columns used for the specificity scores and the
        DEGs/DARs.
    species, assembly
        Passed to the search space, loom export and UCSC export steps.
    tf_file
        Passed to `calculate_TFs_to_genes_relationships`.
    save_path
        Output folder; it is created when missing.
    biomart_host
        Passed to `get_search_space` and `export_to_UCSC_interact`.
    upstream, downstream
        Passed to `get_search_space`.
    region_ranking, gene_ranking
        Precomputed rankings; each is computed (and pickled to `save_path`)
        only when None.
    simplified_eGRN
        When True, signatures sharing a name prefix are merged before scoring.
    calculate_TF_eGRN_correlation
        Accepted for interface compatibility; not used by this function.
    calculate_DEGs_DARs
        Whether to run `get_differential_features_time` for each variable.
    export_to_loom_file, export_to_UCSC_file
        Whether to write the loom files / the UCSC tracks.
    tree_structure
        Passed to `export_to_loom`.
    path_bedToBigBed
        Passed to the UCSC export functions.
    n_cpu
        Number of CPUs passed to the parallel steps.
    _temp_dir
        Passed as `_temp_dir` to the ray-based steps.
    **kwargs
        Forwarded unchanged to `calculate_regions_to_genes_relationships`,
        `calculate_TFs_to_genes_relationships` and `build_grn`.

    Outputs written to `save_path`
    ------------------------------
    region_to_gene.tsv, tf_to_gene.tsv, region_ranking.pkl, gene_ranking.pkl,
    the loom and UCSC files, scplus_obj.pkl, and the benchmark tables
    running_times.tsv (seconds), cummulative_memory.tsv (RSS after each step)
    and memory_per_step.tsv (RSS difference with the previous recorded step).
    Memory is the process RSS in bytes / (1024 * 1024) / 1000.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')

    import time
    start_time = time.time()

    # One (label, seconds, memory) entry per step that actually ran, in run
    # order. Collecting them here (instead of one variable per step) keeps the
    # final tables valid when some steps are skipped.
    steps = []

    def _record(label, t_start):
        # Memory is sampled first, so the sampling cost is part of the step time.
        rss = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        steps.append((label, time.time() - t_start, rss))

    def _simplify_signatures(signatures, unit):
        # Merge all signatures whose key starts with the same prefix into one
        # de-duplicated signature, and relabel it '<prefix>_(<size><unit>)'.
        # The prefix is the part before '_(' minus its last two characters
        # (presumably a sign tag such as '_+' - verify against the key format).
        prefixes = {key.split('_(')[0][:-2] for key in signatures.keys()}
        merged = {
            prefix: list(set().union(
                *[members for key, members in signatures.items() if key.startswith(prefix)]))
            for prefix in prefixes
        }
        return {prefix + '_(' + str(len(members)) + unit + ')': members
                for prefix, members in merged.items()}

    log.info('Creating SCENIC+ object')
    t_start = time.time()
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    _record('Create_object', t_start)

    log.info('Filter genes')
    t_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    _record('Filter_genes', t_start)

    log.info('Filter regions')
    t_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    _record('Filter_regions', t_start)

    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)
    else:
        log.info(save_path + " folder already exists.")

    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        t_start = time.time()
        merge_cistromes(scplus_obj)
        _record('Merge_cistromes', t_start)

    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        t_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly,
                     upstream = upstream,
                     downstream = downstream)
        _record('Search_space', t_start)

    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        t_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj,
                        ray_n_cpu = n_cpu,
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        _record('Region_to_gene', t_start)
        # Writing the table is deliberately outside the timed section.
        scplus_obj.uns['region_to_gene'].to_csv(
            os.path.join(save_path, 'region_to_gene.tsv'), sep='\t')

    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        t_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj,
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu,
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        _record('TF_to_gene', t_start)
        scplus_obj.uns['TF2G_adj'].to_csv(
            os.path.join(save_path, 'tf_to_gene.tsv'), sep='\t')

    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        t_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True,
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False,
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        _record('GSEA', t_start)
        # Checkpoint after the eGRN step.
        log.info('Saving object')
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)

    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        t_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', t_start)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        t_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata',
                                     key_added='eRegulon_signatures')
        _record('Signatures', t_start)

    if simplified_eGRN is True:
        signatures = scplus_obj.uns['eRegulon_signatures']
        signatures['Gene_based'] = _simplify_signatures(signatures['Gene_based'], 'g')
        signatures['Region_based'] = _simplify_signatures(signatures['Region_based'], 'r')

    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            t_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            _record('Region_ranking', t_start)
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', t_start)
        if gene_ranking is None:
            log.info('Calculating gene ranking')
            t_start = time.time()
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', t_start)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', t_start)

    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        t_start = time.time()
        binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', t_start)

    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    # UMAP and tSNE are timed together as a single step; the clock always
    # starts here so it never inherits the start time of an earlier step.
    t_start = time.time()
    ran_dimensionality_reduction = False
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if ran_dimensionality_reduction:
        _record('Dimensionality_reduction', t_start)

    if 'RSS' not in scplus_obj.uns.keys():
        log.info('Calculating eRSS')
        t_start = time.time()
        for var in variable:
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', t_start)

    if calculate_DEGs_DARs is True:
        log.info('Calculating DEGs/DARs')
        t_start = time.time()
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', t_start)

    if export_to_loom_file is True:
        log.info('Exporting to loom file')
        t_start = time.time()
        export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', t_start)

    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        t_start = time.time()
        # Region-to-gene links coloured by correlation (rho).
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        # Same links coloured by importance, scaled per gene.
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', t_start)

    # Benchmark tables: one row per recorded step, in execution order.
    labels = [label for label, _, _ in steps]
    seconds = [elapsed for _, elapsed, _ in steps]
    rss_after = [rss for _, _, rss in steps]
    df = pd.DataFrame(seconds, index=labels, columns=['Time'])
    df.to_csv(os.path.join(save_path, 'running_times.tsv'), sep='\t')
    df = pd.DataFrame(rss_after, index=labels, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')
    # Growth relative to the previous recorded step; the first step is
    # reported as its absolute value.
    rss_delta = [current - previous
                 for previous, current in zip([0.0] + rss_after[:-1], rss_after)]
    df = pd.DataFrame(rss_delta, index=labels, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
    
# Load data
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_80_cells/'
import pickle
# `with` closes each file even if unpickling fails.
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed data
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/high/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Build the AnnData directly from the expression matrix of the loom file
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed motif enrichment results (passed as `menr`)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM batch script that runs DPCL_cisTopicObject_20K_fragments_80_cells.py
# as a single task.

# Scheduling target and e-mail notifications.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs (the Python script is called with n_cpu = 20),
# 8 h wall time, 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_20K_fragments_80_cells
#SBATCH --output=DPCL_cisTopicObject_20K_fragments_80_cells.out
#SBATCH --error=DPCL_cisTopicObject_20K_fragments_80_cells.err

# Run the benchmark script with the Python of the scenicplus Singularity image;
# -B bind-mounts the listed host directories into the container.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_20K_fragments_80_cells.py

# DPCL_cisTopicObject_20K_fragments_80_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results
    """
    df.index = df['names']
    df = df[['logfoldchanges', 'pvals_adj']]
    df.columns = ['Log2FC', 'Adjusted_pval']
    df['Contrast'] = key
    df.index.name = None
    df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
    df = df.loc[df['Log2FC'] >= log2fc_thr]
    df = df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )
    return df

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of `variable` and store them in
    `scplus_obj.uns[contrast][variable]`.

    Parameters
    ----------
    scplus_obj
        Object providing X_EXP, X_ACC, cell_names, gene_names, region_names,
        metadata_cell and uns.
    variable
        Column of `scplus_obj.metadata_cell` used to group the cells.
    use_hvg
        Whether to restrict the test to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Adjusted p-value threshold applied to the test results.
    log2fc_thr
        Log2 fold change threshold applied to the test results.
    min_cells
        Groups with fewer cells than this are removed before testing.

    Raises
    ------
    ValueError
        If `contrast_type` contains anything other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        elif contrast == 'DARs':
            # X_ACC is transposed so that cells become the observations.
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        else:
            # Fail loudly instead of reusing the matrix of the previous
            # contrast (or hitting a NameError on the first iteration).
            raise ValueError(
                "Unknown contrast type '" + str(contrast) + "'; expected 'DEGs' or 'DARs'")
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count.index[label_count.values >= min_cells].tolist()
        # Vectorised membership test instead of one Series lookup per cell.
        keepcellids = adata.obs.index[adata.obs[variable].isin(keeplabels)].tolist()
        adata = adata[keepcellids]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        # One formatted result table per group.
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Run the SCENIC+ workflow step by step, recording the wall-clock time and
    the resident memory of the current process after every step.

    A step that is skipped (its result key is already present, a ranking was
    supplied by the caller, or the corresponding flag is not ``True``) is
    reported as NaN in the benchmark tables instead of aborting the run.

    Parameters
    ----------
    GEX_anndata, cistopic_obj, imputed_acc_obj, menr
        Inputs forwarded unchanged to `create_SCENICPLUS_object`.
    variable
        Cell metadata column names; each one is passed to
        `regulon_specificity_scores` and `get_differential_features_time`.
    species, assembly
        Forwarded to `get_search_space` and to the UCSC export;
        `assembly` is also used as loom nomenclature and for `export_eRegulons`.
    tf_file
        Forwarded to `calculate_TFs_to_genes_relationships`.
    save_path
        Output directory; it is created when it does not exist.
    biomart_host, upstream, downstream
        Forwarded to `get_search_space` (`biomart_host` also to the UCSC export).
    region_ranking, gene_ranking
        Precomputed rankings. When ``None`` they are computed with
        `make_rankings` and pickled into `save_path`.
    simplified_eGRN
        When ``True``, signatures whose names share the same trimmed prefix
        are merged before AUC scoring.
    calculate_TF_eGRN_correlation
        Accepted for interface compatibility; not used by this function.
    calculate_DEGs_DARs, export_to_loom_file, export_to_UCSC_file
        Each optional stage runs only when its flag is exactly ``True``.
    tree_structure
        Forwarded to `export_to_loom`.
    path_bedToBigBed
        Forwarded to `export_to_UCSC_interact` and `export_eRegulons`.
    n_cpu, _temp_dir
        Forwarded to the parallel steps.
    **kwargs
        Forwarded to `calculate_regions_to_genes_relationships`,
        `calculate_TFs_to_genes_relationships` and `build_grn`.

    Returns
    -------
    None. Results are stored in the SCENIC+ object, which is pickled to
    ``scplus_obj.pkl`` in `save_path`, next to ``running_times.tsv``,
    ``cummulative_memory.tsv`` and ``memory_per_step.tsv``.
    """
    import time

    # Create logger
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        handlers=[logging.StreamHandler(stream=sys.stdout)])
    log = logging.getLogger('SCENIC+_wrapper')

    # Row labels of the benchmark tables, in reporting order.
    step_labels = ['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes',
                   'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons',
                   'Signatures', 'Region_ranking', 'Aucell_regions', 'Gene_ranking',
                   'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS',
                   'DEGs_DARs', 'Export_loom', 'Export_UCSC']
    # Every step starts as NaN so that skipped steps never cause a NameError
    # when the tables are assembled at the end of the run.
    not_run = float('nan')
    elapsed = dict.fromkeys(step_labels, not_run)
    cumulative_mem = dict.fromkeys(step_labels, not_run)
    mem_delta = dict.fromkeys(step_labels, not_run)
    last_rss = 0.0

    def _record(step, started):
        """Store elapsed time and memory for `step`, which began at `started`."""
        nonlocal last_rss
        # RSS in bytes / 1024**2 / 1000, i.e. MiB divided by 1000.
        rss = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        elapsed[step] = time.time() - started
        cumulative_mem[step] = rss
        # Difference to the most recent *recorded* step (the first step is
        # measured against zero).
        mem_delta[step] = rss - last_rss
        last_rss = rss

    def _merge_signatures(signatures, unit):
        """Merge signatures sharing a trimmed name; re-append '_(<n><unit>)'."""
        # The name is the part before '_(' minus its last two characters
        # (presumably a direction suffix -- TODO confirm against format_egrns).
        prefixes = list(set(
            [name.split('_(')[0][:len(name.split('_(')[0]) - 2] for name in signatures.keys()]))
        merged = {
            prefix: list(set(sum(
                [members for name, members in signatures.items() if name.startswith(prefix)], [])))
            for prefix in prefixes}
        return {prefix + '_(' + str(len(members)) + unit + ')': members
                for prefix, members in merged.items()}

    start_time = time.time()

    log.info('Creating SCENIC+ object')
    started = time.time()
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    _record('Create_object', started)

    log.info('Filter genes')
    started = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    _record('Filter_genes', started)

    log.info('Filter regions')
    started = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    _record('Filter_regions', started)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)
    else:
        log.info(save_path + " folder already exists.")

    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        started = time.time()
        merge_cistromes(scplus_obj)
        _record('Merge_cistromes', started)

    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        started = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly,
                     upstream = upstream,
                     downstream = downstream)
        _record('Search_space', started)

    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        started = time.time()
        calculate_regions_to_genes_relationships(scplus_obj,
                        ray_n_cpu = n_cpu,
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        _record('Region_to_gene', started)
        # Writing the table is deliberately kept outside the timed section.
        scplus_obj.uns['region_to_gene'].to_csv(
            os.path.join(save_path, 'region_to_gene.tsv'), sep='\t')

    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        started = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj,
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu,
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        _record('TF_to_gene', started)
        scplus_obj.uns['TF2G_adj'].to_csv(
            os.path.join(save_path, 'tf_to_gene.tsv'), sep='\t')

    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        started = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True,
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False,
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        _record('GSEA', started)
        # Intermediate checkpoint right after the eGRN has been built.
        log.info('Saving object')
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)

    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        started = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', started)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        started = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata',
                                     key_added='eRegulon_signatures')
        _record('Signatures', started)

    if simplified_eGRN is True:
        signatures = scplus_obj.uns['eRegulon_signatures']
        signatures['Gene_based'] = _merge_signatures(signatures['Gene_based'], 'g')
        signatures['Region_based'] = _merge_signatures(signatures['Region_based'], 'r')

    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            started = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            _record('Region_ranking', started)
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        started = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', started)
        if gene_ranking is None:
            started = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', started)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        started = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', started)

    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        started = time.time()
        binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', started)

    # The timer always starts here, so the measurement never reuses the start
    # time of an earlier step; it is recorded when UMAP and/or tSNE actually ran.
    started = time.time()
    reduction_ran = False
    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        reduction_ran = True
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        reduction_ran = True
    if reduction_ran:
        _record('Dimensionality_reduction', started)

    if 'RSS' not in scplus_obj.uns.keys():
        started = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj,
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', started)

    if calculate_DEGs_DARs is True:
        started = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', started)

    if export_to_loom_file is True:
        started = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', started)

    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        started = time.time()
        # Region-to-gene links coloured by correlation (rho).
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        # Same links coloured by importance, scaled per gene.
        export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', started)

    # Benchmark tables: one row per step, NaN for steps that did not run.
    # os.path.join keeps the files inside save_path even without a trailing
    # separator; the file names themselves are unchanged.
    pd.DataFrame({'Time': [elapsed[step] for step in step_labels]},
                 index=step_labels).to_csv(
        os.path.join(save_path, 'running_times.tsv'), sep='\t')
    pd.DataFrame({'Memory': [cumulative_mem[step] for step in step_labels]},
                 index=step_labels).to_csv(
        os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')
    pd.DataFrame({'Memory': [mem_delta[step] for step in step_labels]},
                 index=step_labels).to_csv(
        os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
    
# Load data
import pickle

## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_1K_cells/'
# `with` guarantees the file handle is closed even if unpickling fails.
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed accessibility
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
# NOTE(review): this star import runs after the scenicplus star imports and
# may rebind same-named globals used by run_scenicplus_time -- confirm.
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/high/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Wrap the loom expression matrix in an AnnData object
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed menr object (pycistarget/menr.pkl)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed workflow; outputs go to <outDir>/scenicplus/
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM batch script that runs DPCL_cisTopicObject_20K_fragments_1K_cells.py
# inside the scenicplus Singularity image.

#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs (the Python script is called with n_cpu = 20),
# 8 hours of wall time and 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_20K_fragments_1K_cells
#SBATCH --output=DPCL_cisTopicObject_20K_fragments_1K_cells.out
#SBATCH --error=DPCL_cisTopicObject_20K_fragments_1K_cells.err

# -B bind-mounts the listed host directories into the container so the
# absolute paths used by the Python script are available inside it.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_20K_fragments_1K_cells.py

# DPCL_cisTopicObject_20K_fragments_10K_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results
    """
    df.index = df['names']
    df = df[['logfoldchanges', 'pvals_adj']]
    df.columns = ['Log2FC', 'Adjusted_pval']
    df['Contrast'] = key
    df.index.name = None
    df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
    df = df.loc[df['Log2FC'] >= log2fc_thr]
    df = df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )
    return df

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of `variable` and store them in
    ``scplus_obj.uns[contrast][variable]``.

    Parameters
    ----------
    scplus_obj
        SCENIC+ object; `X_EXP`, `X_ACC`, `cell_names`, `gene_names`,
        `region_names` and `metadata_cell` are read, `uns` is updated.
    variable
        Column of ``scplus_obj.metadata_cell`` that defines the groups.
    use_hvg
        When truthy, the test is restricted to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr, log2fc_thr
        Thresholds forwarded to `_format_df`.
    min_cells
        Groups with fewer cells than this are removed before testing.

    Raises
    ------
    ValueError
        If `contrast_type` contains a name other than 'DEGs' or 'DARs'.
    """
    # Create logger
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        handlers=[logging.StreamHandler(stream=sys.stdout)])
    log = logging.getLogger('SCENIC+')

    # Fail before any computation: an unknown contrast would otherwise hit an
    # undefined `adata` or silently reuse the one from the previous contrast.
    unknown = [contrast for contrast in contrast_type
               if contrast not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "Unknown contrast type(s) " + str(unknown) + "; expected 'DEGs' and/or 'DARs'")

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:  # 'DARs'
            # X_ACC is transposed so that rows match cell_names
            # (presumably stored as regions x cells -- TODO confirm).
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count.index[label_count.values >= min_cells]
        # Vectorised membership test instead of one Python lookup per cell.
        keep_mask = adata.obs[variable].isin(keeplabels).values
        adata = adata[keep_mask]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[
                adata.var.highly_variable].index.tolist()
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')
    
    import time
    start_time = time.time()
    t1_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    m1=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_create_object = t1_stop-t1_start
    
    log.info('Filter genes')
    import time
    t1_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    m2=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_genes = t1_stop-t1_start
    log.info('Filter regions')
    import time
    t1_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    m3=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_regions = t1_stop-t1_start
    
    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)

    else:
        log.info(save_path + " folder already exists.")
    
    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        import time
        t1_start = time.time()
        merge_cistromes(scplus_obj)
        m4=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_cistromes = t1_stop-t1_start
    
    
    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        import time
        t1_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly, 
                     upstream = upstream,
                     downstream = downstream)
        m5=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_search_space = t1_stop-t1_start
                 
    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        import time
        t1_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj, 
                        ray_n_cpu = n_cpu, 
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        m6=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_region_to_gene = t1_stop-t1_start
        scplus_obj.uns['region_to_gene'].to_csv(save_path+'region_to_gene.tsv', sep='\t')

                        
    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        import time
        t1_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj, 
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu, 
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        m7=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_tf_to_gene = t1_stop-t1_start
        scplus_obj.uns['TF2G_adj'].to_csv(save_path+'tf_to_gene.tsv', sep='\t')
                        
    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        import time
        t1_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True, 
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False, 
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        m8=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_gsea = t1_stop-t1_start
        log.info('Saving object')         
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)
                 
    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        import time
        t1_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        m9=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_format_regulons = t1_stop-t1_start

                    
    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        import time
        t1_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata', 
                                     key_added='eRegulon_signatures')
        m10=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_signatures = t1_stop-t1_start
                                     
    if simplified_eGRN is True:
        md = scplus_obj.uns['eRegulon_signatures']['Gene_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Gene_based'][x]))+'g)': scplus_obj.uns['eRegulon_signatures']['Gene_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Gene_based'].keys()}

        md = scplus_obj.uns['eRegulon_signatures']['Region_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Region_based'][x]))+'r)': scplus_obj.uns['eRegulon_signatures']['Region_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Region_based'].keys()}

    
    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            import time
            t1_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            m11=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_region_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        m12=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_regions = t1_stop-t1_start
        if gene_ranking is None:
            import time
            t1_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            m13=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_gene_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        m14=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_genes = t1_stop-t1_start
                
                                
    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        import time
        t1_start = time.time()
        binarize_AUC(scplus_obj, 
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        m15=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_binarize_regulons = t1_stop-t1_start
             
    if not hasattr(scplus_obj, 'dr_cell'):
        import time
        t1_start = time.time()
        scplus_obj.dr_cell = {}         
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        m16=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_dimensionality_reduction = t1_stop-t1_start
                   
    if 'RSS' not in scplus_obj.uns.keys():
        import time
        t1_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        m17=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_rss = t1_stop-t1_start
                         
    if calculate_DEGs_DARs is True:
        import time
        t1_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        m18=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_degs_dars = t1_stop-t1_start
            
    if export_to_loom_file is True:
        import time
        t1_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj, 
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj, 
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        m19=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_loom = t1_stop-t1_start
               
    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        import time
        t1_start = time.time()
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        m20=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_ucsc = t1_stop-t1_start
        
    times = [time_create_object, time_filter_genes, time_filter_regions, time_cistromes, time_search_space, time_region_to_gene, time_tf_to_gene, time_gsea, time_format_regulons, time_signatures, time_region_ranking, time_aucell_regions, 
             time_gene_ranking, time_aucell_genes, time_binarize_regulons, time_dimensionality_reduction, time_rss, time_degs_dars, time_export_loom, time_export_ucsc]
    df = pd.DataFrame(times, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Time'])
    df.to_csv(save_path+'running_times.tsv', sep='\t') 
    memory=[m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18,m19,m20]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'cummulative_memory.tsv', sep='\t')  
    memory=[m1,m2-m1,m3-m2,m4-m3, m5-m4, m6-m5, m7-m6,m8-m7,m9-m8,m10-m9,m11-m10,m12-m11,m13-m12, m14-m13, m15-m14, m16-m15, m17-m16, m18-m17, m19-m18, m20-m19]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'memory_per_step.tsv', sep='\t')  
        
    log.info('Saving object')         
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)
        
    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60)) 
    
# Load data
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# trusted, self-generated inputs such as the benchmark files below.
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_10K_cells/'
import pickle
# Context managers guarantee the file handles are closed even if unpickling fails.
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed data
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/high/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Expression matrix exactly as stored in the loom file (no renaming is applied here)
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed pycistarget results (menr.pkl), passed on as `menr`
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ workflow; outputs are written below outDir + 'scenicplus/'
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM batch script: runs DPCL_cisTopicObject_20K_fragments_10K_cells.py
# inside the scenicplus Singularity image.

# Scheduling target (partition / cluster / account) and e-mail notifications.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Requested resources: 20 CPUs (the Python script is called with n_cpu = 20),
# an 8 hour wall-time limit and 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_20K_fragments_10K_cells
#SBATCH --output=DPCL_cisTopicObject_20K_fragments_10K_cells.out
#SBATCH --error=DPCL_cisTopicObject_20K_fragments_10K_cells.err

# -B lists the host directories made available inside the container; the
# script path is relative, so the job must start in the directory holding it.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_20K_fragments_10K_cells.py

# DPCL_cisTopicObject_20K_fragments_25K_cells

In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results
    """
    df.index = df['names']
    df = df[['logfoldchanges', 'pvals_adj']]
    df.columns = ['Log2FC', 'Adjusted_pval']
    df['Contrast'] = key
    df.index.name = None
    df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
    df = df.loc[df['Log2FC'] >= log2fc_thr]
    df = df.sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )
    return df

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of `variable` and store them in the object.

    For every requested contrast an AnnData is built from the expression
    matrix (``'DEGs'``, ``scplus_obj.X_EXP``) or from the transposed
    accessibility matrix (``'DARs'``, ``scplus_obj.X_ACC``), groups with fewer
    than `min_cells` cells are removed, counts are normalised and
    log-transformed, and groups are compared with a Wilcoxon test using
    Bonferroni correction.

    Parameters
    ----------
    scplus_obj
        Object providing ``X_EXP``, ``X_ACC``, ``cell_names``, ``gene_names``,
        ``region_names``, ``metadata_cell`` and ``uns``.
    variable
        Column of ``scplus_obj.metadata_cell`` that defines the groups.
    use_hvg
        If true, restrict the test to highly variable features.
    contrast_type
        Contrasts to compute; each entry must be ``'DEGs'`` or ``'DARs'``.
    adjpval_thr
        Maximum adjusted p-value kept in the formatted results.
    log2fc_thr
        Minimum log2 fold change kept in the formatted results.
    min_cells
        Minimum number of cells a group needs in order to be kept.

    Raises
    ------
    ValueError
        If `contrast_type` contains an entry other than ``'DEGs'``/``'DARs'``.

    Results are written to ``scplus_obj.uns[contrast][variable]`` as a dict
    mapping each group to its formatted data frame; nothing is returned.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    # Fail early on unknown contrasts: otherwise `adata` would be undefined or
    # silently carried over from the previous loop iteration.
    unknown = [c for c in contrast_type if c not in ('DEGs', 'DARs')]
    if unknown:
        raise ValueError(
            "contrast_type entries must be 'DEGs' or 'DARs', got: " + str(unknown))

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:  # 'DARs': X_ACC is transposed so that cells become the rows
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        labels = adata.obs[variable]
        label_count = labels.value_counts()
        keeplabels = label_count.index[(label_count >= min_cells).to_numpy()].tolist()
        # Vectorised membership test instead of one pandas lookup per cell.
        keepcellids = adata.obs.index[labels.isin(keeplabels).to_numpy()].tolist()
        adata = adata[keepcellids]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(GEX_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    r2g_file,
    tf2g_file,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')
    
    import time
    start_time = time.time()
    t1_start = time.time()
    log.info('Creating SCENIC+ object')
    scplus_obj = create_SCENICPLUS_object(
        GEX_anndata = GEX_anndata,
        cisTopic_obj = cistopic_obj,
        imputed_acc_obj = imputed_acc_obj,
        menr = menr,
        ACC_prefix = 'ACC_',
        GEX_prefix = 'GEX_',
        bc_transform_func = lambda x: x,
        normalize_imputed_acc = False)
    m1=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_create_object = t1_stop-t1_start
    
    log.info('Filter genes')
    import time
    t1_start = time.time()
    filter_genes(scplus_obj, min_pct = 0.5)
    m2=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_genes = t1_stop-t1_start
    log.info('Filter regions')
    import time
    t1_start = time.time()
    filter_regions(scplus_obj, min_pct = 0.5)
    m3=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
    t1_stop = time.time()
    time_filter_regions = t1_stop-t1_start
    
    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : "+ save_path)

    else:
        log.info(save_path + " folder already exists.")
    
    if 'Cistromes' not in scplus_obj.uns.keys():
        log.info('Merging cistromes')
        import time
        t1_start = time.time()
        merge_cistromes(scplus_obj)
        m4=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_cistromes = t1_stop-t1_start
    
    
    if 'search_space' not in scplus_obj.uns.keys():
        log.info('Getting search space')
        import time
        t1_start = time.time()
        get_search_space(scplus_obj,
                     biomart_host = biomart_host,
                     species = species,
                     assembly = assembly, 
                     upstream = upstream,
                     downstream = downstream)
        m5=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_search_space = t1_stop-t1_start
                 
    scplus_obj.uns['region_to_gene'] = pd.read_csv(r2g_file, sep='\t')
    time_region_to_gene = 0
    m6 = 0
    if 'region_to_gene' not in scplus_obj.uns.keys():
        log.info('Inferring region to gene relationships')
        import time
        t1_start = time.time()
        calculate_regions_to_genes_relationships(scplus_obj, 
                        ray_n_cpu = n_cpu, 
                        _temp_dir = _temp_dir,
                        importance_scoring_method = 'GBM',
                        importance_scoring_kwargs = GBM_KWARGS,
                        **kwargs)
        m6=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_region_to_gene = t1_stop-t1_start
        scplus_obj.uns['region_to_gene'].to_csv(save_path+'region_to_gene.tsv', sep='\t')

    
    scplus_obj.uns['TF2G_adj'] = pd.read_csv(tf2g_file, sep='\t')  
    time_tf_to_gene = 0
    m7 = 0
    if 'TF2G_adj' not in scplus_obj.uns.keys():
        log.info('Inferring TF to gene relationships')
        import time
        t1_start = time.time()
        calculate_TFs_to_genes_relationships(scplus_obj, 
                        tf_file = tf_file,
                        ray_n_cpu = n_cpu, 
                        method = 'GBM',
                        _temp_dir = _temp_dir,
                        key= 'TF2G_adj',
                        **kwargs)
        m7=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_tf_to_gene = t1_stop-t1_start
        scplus_obj.uns['TF2G_adj'].to_csv(save_path+'tf_to_gene.tsv', sep='\t')
                        
    if 'eRegulons' not in scplus_obj.uns.keys():
        log.info('Build eGRN')
        import time
        t1_start = time.time()
        build_grn(scplus_obj,
                 min_target_genes = 10,
                 adj_pval_thr = 1,
                 min_regions_per_gene = 0,
                 quantiles = (0.85, 0.90, 0.95),
                 top_n_regionTogenes_per_gene = (5, 10, 15),
                 top_n_regionTogenes_per_region = (),
                 binarize_using_basc = True,
                 rho_dichotomize_tf2g = True,
                 rho_dichotomize_r2g = True,
                 rho_dichotomize_eregulon = True,
                 rho_threshold = 0.05,
                 keep_extended_motif_annot = True,
                 merge_eRegulons = True, 
                 order_regions_to_genes_by = 'importance',
                 order_TFs_to_genes_by = 'importance',
                 key_added = 'eRegulons',
                 cistromes_key = 'Unfiltered',
                 disable_tqdm = False, 
                 ray_n_cpu = n_cpu,
                 _temp_dir = _temp_dir,
                 **kwargs)
        m8=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_gsea = t1_stop-t1_start
        log.info('Saving object')         
        with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
            dill.dump(scplus_obj, f, protocol = -1)
                 
    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        import time
        t1_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        m9=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_format_regulons = t1_stop-t1_start

                    
    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        import time
        t1_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata', 
                                     key_added='eRegulon_signatures')
        m10=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_signatures = t1_stop-t1_start
                                     
    if simplified_eGRN is True:
        md = scplus_obj.uns['eRegulon_signatures']['Gene_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Gene_based'][x]))+'g)': scplus_obj.uns['eRegulon_signatures']['Gene_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Gene_based'].keys()}

        md = scplus_obj.uns['eRegulon_signatures']['Region_based']
        names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
        scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Region_based'][x]))+'r)': scplus_obj.uns['eRegulon_signatures']['Region_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Region_based'].keys()}

    
    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            import time
            t1_start = time.time()
            region_ranking = make_rankings(scplus_obj, target='region')
            m11=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_region_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        m12=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_regions = t1_stop-t1_start
        if gene_ranking is None:
            import time
            t1_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            m13=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_gene_ranking = t1_stop-t1_start
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        import time
        t1_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        m14=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_aucell_genes = t1_stop-t1_start
                
                                
    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        import time
        t1_start = time.time()
        binarize_AUC(scplus_obj, 
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        m15=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_binarize_regulons = t1_stop-t1_start
             
    if not hasattr(scplus_obj, 'dr_cell'):
        import time
        t1_start = time.time()
        scplus_obj.dr_cell = {}         
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        m16=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_dimensionality_reduction = t1_stop-t1_start
                   
    if 'RSS' not in scplus_obj.uns.keys():
        import time
        t1_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        m17=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_rss = t1_stop-t1_start
                         
    if calculate_DEGs_DARs is True:
        import time
        t1_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        m18=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_degs_dars = t1_stop-t1_start
            
    if export_to_loom_file is True:
        import time
        t1_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj, 
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj, 
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        m19=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_loom = t1_stop-t1_start
               
    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        import time
        t1_start = time.time()
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        m20=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        t1_stop = time.time()
        time_export_ucsc = t1_stop-t1_start
        
    times = [time_create_object, time_filter_genes, time_filter_regions, time_cistromes, time_search_space, time_region_to_gene, time_tf_to_gene, time_gsea, time_format_regulons, time_signatures, time_region_ranking, time_aucell_regions, 
             time_gene_ranking, time_aucell_genes, time_binarize_regulons, time_dimensionality_reduction, time_rss, time_degs_dars, time_export_loom, time_export_ucsc]
    df = pd.DataFrame(times, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Time'])
    df.to_csv(save_path+'running_times.tsv', sep='\t') 
    memory=[m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15,m16,m17,m18,m19,m20]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'cummulative_memory.tsv', sep='\t')  
    memory=[m1,m2-m1,m3-m2,m4-m3, m5-m4, m6-m5, m7-m6,m8-m7,m9-m8,m10-m9,m11-m10,m12-m11,m13-m12, m14-m13, m15-m14, m16-m15, m17-m16, m18-m17, m19-m18, m20-m19]
    df = pd.DataFrame(memory, index=['Create_object', 'Filter_genes', 'Filter_regions', 'Merge_cistromes', 'Search_space', 'Region_to_gene', 'TF_to_gene', 'GSEA', 'Format_regulons', 'Signatures',
                                   'Region_ranking', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC'], columns=['Memory'])
    df.to_csv(save_path+'memory_per_step.tsv', sep='\t')  
        
    log.info('Saving object')         
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)
        
    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60)) 
    
# Load data
# Every pickle below is opened through a context manager so the file handle is
# released even when unpickling fails.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on files produced by this pipeline.
## ATAC - cisTopic object
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_25K_cells/'
import pickle
with open(outDir + 'cistopic_obj.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
## Precomputed imputed accessibility
with open(outDir + 'Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
path_to_loom = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/data/high/rna/count_matrix.loom'
loom = SCopeLoom.read_loom(path_to_loom)
# Wrap the loom expression matrix in an AnnData object
expr_mat = loom.ex_mtx
rna_anndata = anndata.AnnData(X=expr_mat)
## Precomputed pycistarget results (menr.pkl)
with open(outDir+'pycistarget/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

# Run the timed SCENIC+ workflow; n_cpu matches the 20 CPUs requested in the
# accompanying SLURM script.
run_scenicplus_time(rna_anndata,
    cistopic_obj,
    imputed_acc_obj,
    menr,
    '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_25K_cells/scenicplus/region_to_gene.tsv',
    '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_25K_cells/scenicplus/tf_to_gene.tsv',
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

In [None]:
#!/bin/bash
# SLURM batch script: runs the benchmark Python script inside the scenicplus
# Singularity image.

# Partition, cluster and account to submit under; e-mail on all job events;
# a single task.
#SBATCH --partition=bigmem
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources for that task: 20 CPUs (the Python script is called with
# n_cpu = 20), 8 hours of wall time and 300G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=8:00:00
#SBATCH --mem=300G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_20K_fragments_25K_cells
#SBATCH --output=DPCL_cisTopicObject_20K_fragments_25K_cells.out
#SBATCH --error=DPCL_cisTopicObject_20K_fragments_25K_cells.err

# -B bind-mounts the listed host directories into the container; the script is
# executed with the python found inside the image.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_20K_fragments_25K_cells.py

In [1]:
## Reload the SCENIC+ object saved by the previous run
import pickle
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_25K_cells/'
# Context manager closes the file even if unpickling raises.
with open(outDir+'scenicplus/scplus_obj.pkl', 'rb') as infile:
    scplus_obj = pickle.load(infile)

In [14]:
# Preview TF2G_adj restricted to column positions 1-7, i.e. without the column
# at position 0.
scplus_obj.uns['TF2G_adj'].iloc[:,1:8] 

Unnamed: 0,TF,target,importance,regulation,rho,importance_x_rho,importance_x_abs_rho
0,A1CF,ATAD2B,0.510889,0,0.026045,0.013306,0.013306
1,A1CF,MPV17,0.061977,-1,-0.092891,-0.005757,0.005757
2,A1CF,NAGLU,0.493043,1,0.198487,0.097863,0.097863
3,A1CF,AC127904.2,0.255126,0,-0.003474,-0.000886,0.000886
4,A1CF,CEP83,0.015408,-1,-0.095726,-0.001475,0.001475
...,...,...,...,...,...,...,...
11607269,KLF12,KLF12,13.856801,1,1.000000,13.856801,13.856801
11607270,OVOL1,OVOL1,6.372256,1,1.000000,6.372256,6.372256
11607271,ZNF529,ZNF529,11.553746,1,1.000000,11.553746,11.553746
11607272,ZNF816,ZNF816,8.773581,1,1.000000,8.773581,8.773581


In [13]:
# Display the full TF2G_adj table stored in scplus_obj.uns.
scplus_obj.uns['TF2G_adj']

Unnamed: 0.1,Unnamed: 0,TF,target,importance,regulation,rho,importance_x_rho,importance_x_abs_rho
0,0,A1CF,ATAD2B,0.510889,0,0.026045,0.013306,0.013306
1,1,A1CF,MPV17,0.061977,-1,-0.092891,-0.005757,0.005757
2,2,A1CF,NAGLU,0.493043,1,0.198487,0.097863,0.097863
3,3,A1CF,AC127904.2,0.255126,0,-0.003474,-0.000886,0.000886
4,4,A1CF,CEP83,0.015408,-1,-0.095726,-0.001475,0.001475
...,...,...,...,...,...,...,...,...
11607269,11607269,KLF12,KLF12,13.856801,1,1.000000,13.856801,13.856801
11607270,11607270,OVOL1,OVOL1,6.372256,1,1.000000,6.372256,6.372256
11607271,11607271,ZNF529,ZNF529,11.553746,1,1.000000,11.553746,11.553746
11607272,11607272,ZNF816,ZNF816,8.773581,1,1.000000,8.773581,8.773581


In [None]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time
import psutil

def _format_df(df, key, adjpval_thr, log2fc_thr):
    """
    A helper function to format differential test results

    Parameters
    ----------
    df
        Data frame with at least the columns 'names', 'logfoldchanges' and
        'pvals_adj'. It is not modified.
    key
        Value written to the 'Contrast' column of every returned row.
    adjpval_thr
        Rows with an adjusted p-value above this threshold are dropped.
    log2fc_thr
        Rows with a log2 fold change below this threshold are dropped.

    Returns
    -------
    A new data frame indexed by the values of 'names' (index left unnamed),
    with columns 'Log2FC', 'Adjusted_pval' and 'Contrast', sorted by
    decreasing 'Log2FC' and then increasing 'Adjusted_pval'.
    """
    # Build an independent frame: the previous version replaced the caller's
    # index in place and assigned a column on a slice of the input, which
    # triggers pandas' chained-assignment warning.
    formatted = (
        df.set_index('names')[['logfoldchanges', 'pvals_adj']]
        .rename_axis(None)
        .copy()
    )
    formatted.columns = ['Log2FC', 'Adjusted_pval']
    formatted['Contrast'] = key
    # Both thresholds are inclusive.
    passes = (
        (formatted['Adjusted_pval'] <= adjpval_thr)
        & (formatted['Log2FC'] >= log2fc_thr)
    )
    return formatted.loc[passes].sort_values(
        ['Log2FC', 'Adjusted_pval'], ascending=[False, True]
    )

def get_differential_features_time(scplus_obj: SCENICPLUS,
                              variable,
                              use_hvg: Optional[bool] = True,
                              contrast_type: Optional[List] = ['DARs', 'DEGs'],
                              adjpval_thr: Optional[float] = 0.05,
                              log2fc_thr: Optional[float] = np.log2(1.5),
                              min_cells: Optional[int] = 2
                              ):
    """
    Compute differential features per group of a cell annotation and store
    them on the SCENIC+ object.

    For each requested contrast an AnnData object is built from the
    expression matrix ('DEGs') or the transposed accessibility matrix
    ('DARs'), annotated with `scplus_obj.metadata_cell`, restricted to groups
    with at least `min_cells` cells, normalised, log-transformed and
    optionally restricted to highly variable features. Groups are then
    compared with a Wilcoxon test (Bonferroni correction) and the formatted
    results are written to `scplus_obj.uns[contrast][variable]` as a
    dictionary mapping each group to a data frame.

    Parameters
    ----------
    scplus_obj
        SCENIC+ object providing X_EXP, X_ACC, cell_names, gene_names,
        region_names, metadata_cell and uns.
    variable
        Name of the column of `scplus_obj.metadata_cell` that defines the groups.
    use_hvg
        Whether to keep only highly variable features before testing.
    contrast_type
        Contrasts to compute; each entry must be 'DEGs' or 'DARs'.
    adjpval_thr
        Adjusted p-value threshold passed to `_format_df`.
    log2fc_thr
        Log2 fold change threshold passed to `_format_df`.
    min_cells
        Minimum number of cells a group needs in order to be kept.

    Raises
    ------
    ValueError
        If `contrast_type` contains anything other than 'DEGs' or 'DARs'.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+')

    # Reject unsupported contrasts before doing any work. Previously such a
    # value either raised a NameError or silently reused the AnnData object of
    # the preceding contrast and stored its results under the wrong key.
    unsupported = [c for c in contrast_type if c not in ('DEGs', 'DARs')]
    if unsupported:
        raise ValueError(
            "contrast_type entries must be 'DEGs' or 'DARs', got: "
            + ', '.join(str(c) for c in unsupported))

    for contrast in contrast_type:
        log.info('Calculating ' + contrast + ' for variable ' + variable)
        if contrast == 'DEGs':
            adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
            min_disp = 0.5
        else:
            # 'DARs': X_ACC is transposed so that rows line up with cell_names
            # and columns with region_names.
            adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
                index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
            min_disp = 0.05
        adata.obs = scplus_obj.metadata_cell

        # remove annotations with less than 'min_cells'
        label_count = adata.obs[variable].value_counts()
        keeplabels = label_count[label_count >= min_cells].index.tolist()
        # Vectorised membership test instead of one label lookup per cell;
        # cell order is preserved.
        keep_mask = adata.obs[variable].isin(keeplabels).to_numpy()
        keepcellids = adata.obs.index[keep_mask].tolist()
        adata = adata[keepcellids]

        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        if use_hvg:
            sc.pp.highly_variable_genes(
                adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf, n_bins=10)
            var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
            )
            adata = adata[:, var_features]
            log.info('There are ' + str(len(var_features)) +
                     ' variable features')

        sc.tl.rank_genes_groups(
            adata, variable, method='wilcoxon', corr_method='bonferroni')
        groups = adata.uns['rank_genes_groups']['names'].dtype.names
        diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
            adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
        if contrast not in scplus_obj.uns.keys():
            scplus_obj.uns[contrast] = {}
        scplus_obj.uns[contrast][variable] = diff_dict
        log.info('Finished calculating ' + contrast +
                 ' for variable ' + variable)

def run_scenicplus_time(scplus_obj,
    variable: List[str],
    species: str,
    assembly: str,
    tf_file: str,
    save_path: str,
    biomart_host: Optional[str] = 'http://www.ensembl.org',
    upstream: Optional[List] = [1000, 150000],
    downstream: Optional[List] = [1000, 150000],
    region_ranking: Optional['CisTopicImputedFeatures'] = None,
    gene_ranking: Optional['CisTopicImputedFeatures'] = None,
    simplified_eGRN: Optional[bool] = False,
    calculate_TF_eGRN_correlation: Optional[bool] = True,
    calculate_DEGs_DARs: Optional[bool] = True,
    export_to_loom_file: Optional[bool] = True,
    export_to_UCSC_file: Optional[bool] = True,
    tree_structure: Sequence[str] = (),
    path_bedToBigBed: Optional[str] = None,
    n_cpu: Optional[int] = 1,
    _temp_dir: Optional[str] = '',
    **kwargs
    ):
    """
    Resume the SCENIC+ workflow on an existing object from eRegulon formatting
    onwards, recording wall time and process memory for every step.

    Steps whose result key is already present on `scplus_obj` are skipped.
    Skipped steps are reported as NaN in the benchmark tables.

    Parameters
    ----------
    scplus_obj
        SCENIC+ object whose `uns` already holds 'region_to_gene', 'TF2G_adj'
        and 'eRegulons'. It is modified in place.
    variable
        Cell metadata columns used for the specificity scores and for the
        DEG/DAR calculation.
    species
        Species name passed to the UCSC interaction export.
    assembly
        Genome assembly passed to the loom and UCSC exports.
    tf_file, upstream, downstream, calculate_TF_eGRN_correlation, _temp_dir, **kwargs
        Accepted but not used by this function.
    save_path
        Directory receiving all output files.
    biomart_host
        Host passed to the UCSC interaction export.
    region_ranking, gene_ranking
        Precomputed rankings; when None they are computed here and saved to
        'region_ranking.pkl' / 'gene_ranking.pkl'.
    simplified_eGRN
        When True, signatures sharing the same name prefix are merged.
    calculate_DEGs_DARs, export_to_loom_file, export_to_UCSC_file
        Enable the corresponding step; each is tested with `is True`.
    tree_structure
        Passed to the loom export.
    path_bedToBigBed
        Passed to the UCSC and eRegulon exports.
    n_cpu
        Number of CPUs passed to the AUC scoring and binarisation steps.

    Notes
    -----
    Besides the step outputs, 'running_times.tsv', 'cummulative_memory.tsv',
    'memory_per_step.tsv' and 'scplus_obj.pkl' are written to `save_path`.
    """
    
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('SCENIC+_wrapper')

    # Reference point for the total run time reported at the end; it was
    # previously never assigned, so the last log call raised a NameError.
    start_time = time.time()

    # Rows of the benchmark tables, in reporting order. The region ranking is
    # not reported as a row of its own.
    step_names = ['Format_regulons', 'Signatures', 'Aucell_regions', 'Gene_ranking', 'Aucell_genes', 'Binarize_regulons', 'Dimensionality_reduction', 'RSS', 'DEGs_DARs', 'Export_loom', 'Export_UCSC']
    # NaN marks a step that did not run. Previously the per-step variables
    # only existed when their step ran, so any skipped step made building the
    # tables fail with a NameError.
    times = dict.fromkeys(step_names, float('nan'))
    memory = dict.fromkeys(step_names, float('nan'))

    def _record(step, t_start):
        # Resident set size in bytes converted to MiB and divided by 1000,
        # sampled before the stop time as in the original measurements.
        memory[step] = psutil.Process().memory_info().rss / (1024 * 1024) / 1000
        times[step] = time.time() - t_start
    
    # Keep column positions 1-7 of both tables, i.e. drop the first column.
    # NOTE(review): presumably removes a stray index column from tables that
    # were re-read from disk; it is applied unconditionally, so running this
    # twice on the same object drops a further column -- confirm.
    scplus_obj.uns['region_to_gene'] = scplus_obj.uns['region_to_gene'].iloc[:,1:8] 
    scplus_obj.uns['TF2G_adj'] = scplus_obj.uns['TF2G_adj'].iloc[:,1:8] 
                     
    if 'eRegulon_metadata' not in scplus_obj.uns.keys():
        log.info('Formatting eGRNs')
        t_start = time.time()
        format_egrns(scplus_obj,
                      eregulons_key = 'eRegulons',
                      TF2G_key = 'TF2G_adj',
                      key_added = 'eRegulon_metadata')
        _record('Format_regulons', t_start)

    if 'eRegulon_signatures' not in scplus_obj.uns.keys():
        log.info('Converting eGRNs to signatures')
        t_start = time.time()
        get_eRegulons_as_signatures(scplus_obj,
                                     eRegulon_metadata_key='eRegulon_metadata', 
                                     key_added='eRegulon_signatures')
        _record('Signatures', t_start)
                                     
    if simplified_eGRN is True:
        # Merge signatures that share a name prefix (signature name up to
        # '_(' minus its last two characters) and rebuild the size suffix.
        for sig_key, unit in (('Gene_based', 'g'), ('Region_based', 'r')):
            md = scplus_obj.uns['eRegulon_signatures'][sig_key]
            names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()]))
            merged = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names}
            scplus_obj.uns['eRegulon_signatures'][sig_key] = {x+'_('+str(len(merged[x]))+unit+')': merged[x] for x in merged.keys()}
    
    if 'eRegulon_AUC' not in scplus_obj.uns.keys():
        log.info('Calculating eGRNs AUC')
        if region_ranking is None:
            log.info('Calculating region ranking')
            region_ranking = make_rankings(scplus_obj, target='region')
            with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f:
                dill.dump(region_ranking, f, protocol = -1)
        log.info('Calculating eGRNs region based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = n_cpu)
        _record('Aucell_regions', t_start)
        if gene_ranking is None:
            t_start = time.time()
            log.info('Calculating gene ranking')
            gene_ranking = make_rankings(scplus_obj, target='gene')
            _record('Gene_ranking', t_start)
            with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f:
                dill.dump(gene_ranking, f, protocol = -1)
        log.info('Calculating eGRNs gene based AUC')
        t_start = time.time()
        score_eRegulons(scplus_obj,
                gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC', 
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize= False,
                n_cpu = n_cpu)
        _record('Aucell_genes', t_start)
                                
    if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys():
        log.info('Binarizing eGRNs AUC')
        t_start = time.time()
        binarize_AUC(scplus_obj, 
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=n_cpu)
        _record('Binarize_regulons', t_start)
             
    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    # The timer always starts here and the step is recorded whenever either
    # embedding is computed. Previously the start time was only reset when
    # dr_cell was missing and the result was only recorded in the tSNE branch.
    t_start = time.time()
    ran_dimensionality_reduction = False
    if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC UMAP')
        run_eRegulons_umap(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys():
        log.info('Making eGRNs AUC tSNE')
        run_eRegulons_tsne(scplus_obj,
                   scale=True, signature_keys=['Gene_based', 'Region_based'])
        ran_dimensionality_reduction = True
    if ran_dimensionality_reduction:
        _record('Dimensionality_reduction', t_start)
                   
    if 'RSS' not in scplus_obj.uns.keys():
        t_start = time.time()
        log.info('Calculating eRSS')
        for var in variable:
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
            regulon_specificity_scores(scplus_obj, 
                         var,
                         signature_keys=['Region_based'],
                         out_key_suffix='_region_based',
                         scale=False)
        _record('RSS', t_start)
                         
    if calculate_DEGs_DARs is True:
        t_start = time.time()
        log.info('Calculating DEGs/DARs')
        for var in variable:
            get_differential_features_time(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs'])
        _record('DEGs_DARs', t_start)
            
    if export_to_loom_file is True:
        t_start = time.time()
        log.info('Exporting to loom file')
        export_to_loom(scplus_obj, 
               signature_key = 'Gene_based',
               tree_structure = tree_structure,
               title =  'Gene based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom'))
        export_to_loom(scplus_obj, 
               signature_key = 'Region_based',
               tree_structure = tree_structure,
               title =  'Region based eGRN',
               nomenclature = assembly,
               out_fname=os.path.join(save_path,'SCENIC+_region_based.loom'))
        _record('Export_loom', t_start)
               
    if export_to_UCSC_file is True:
        log.info('Exporting to UCSC')
        t_start = time.time()
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.rho.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='rho',
                            scale_by_gene=False,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        r2g_data = export_to_UCSC_interact(scplus_obj,
                            species,
                            os.path.join(save_path,'r2g.importance.bed'),
                            path_bedToBigBed=path_bedToBigBed,
                            bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'),
                            region_to_gene_key='region_to_gene',
                            pbm_host=biomart_host,
                            assembly=assembly,
                            ucsc_track_name='R2G',
                            ucsc_description='SCENIC+ region to gene links',
                            cmap_neg='Reds',
                            cmap_pos='Greens',
                            key_for_color='importance',
                            scale_by_gene=True,
                            subset_for_eRegulons_regions=True,
                            eRegulons_key='eRegulons')
        regions = export_eRegulons(scplus_obj,
                os.path.join(save_path,'eRegulons.bed'),
                assembly,
                bigbed_outfile = os.path.join(save_path,'eRegulons.bb'),
                eRegulon_metadata_key = 'eRegulon_metadata',
                eRegulon_signature_key = 'eRegulon_signatures',
                path_bedToBigBed=path_bedToBigBed)
        _record('Export_UCSC', t_start)
      
    # Benchmark tables. Paths are built with os.path.join like every other
    # output of this function, so a save_path without a trailing separator no
    # longer places these files next to the directory instead of inside it.
    df = pd.DataFrame([times[step] for step in step_names], index=step_names, columns=['Time'])
    df.to_csv(os.path.join(save_path, 'running_times.tsv'), sep='\t') 
    df = pd.DataFrame([memory[step] for step in step_names], index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'cummulative_memory.tsv'), sep='\t')  
    # Per-step memory: difference to the previous measured step. The first
    # measured step reports its absolute value and skipped steps stay NaN.
    memory_per_step = []
    previous = None
    for step in step_names:
        current = memory[step]
        if pd.isna(current):
            memory_per_step.append(current)
            continue
        memory_per_step.append(current if previous is None else current - previous)
        previous = current
    df = pd.DataFrame(memory_per_step, index=step_names, columns=['Memory'])
    df.to_csv(os.path.join(save_path, 'memory_per_step.tsv'), sep='\t')  
        
    log.info('Saving object')         
    with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f:
        dill.dump(scplus_obj, f, protocol = -1)
        
    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60)) 
    
# Load data
## SCENIC+ object saved by the previous run
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_25K_cells/'
import pickle
# Context manager closes the file even if unpickling raises.
with open(outDir + 'scenicplus/scplus_obj.pkl', 'rb') as infile:
    scplus_obj = pickle.load(infile)

# Resume the timed workflow on the reloaded object; results are written back
# into the same scenicplus/ directory the object was loaded from.
run_scenicplus_time(scplus_obj,
    variable = ['ACC_cell_type'],
    species = 'hsapiens',
    assembly = 'hg38',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_hg38.txt',
    save_path = outDir + 'scenicplus/',
    biomart_host = 'http://dec2017.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],   
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('DPCL', 'SCENIC+', 'grnboost'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )