# DPCL_cisTopicObject_3K_fragments_80_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and/or DEM on every group of region sets and record benchmarks.

    For each key of `region_sets` the function optionally runs `run_cistarget`
    (when `ctx_db_path` is given) and `DEM` (when `dem_db_path` is given), each
    on all regions and, when `run_without_promoters` is True, again on the
    regions that do not overlap a promoter window. Results are written as one
    HTML table per region set, optionally pickled per analysis, and finally
    pickled together as `menr.pkl` in `save_path`. Wall-clock time and resident
    memory of each stage are written as TSV files in `save_path`.

    Parameters
    ----------
    region_sets
        Dictionary of groups; each value is itself indexed by region-set name
        and every entry must support `count_overlaps` and expose `.df`.
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (annotation is queried through pybiomart) or 'custom'
        (annotation is taken from `custom_annot`).
    save_path
        Output folder; created if missing.
    custom_annot
        TSS table with at least 'Chromosome' and 'Start' columns. Required when
        `species` is 'custom'.
    save_partial
        Whether to pickle each analysis result separately as well.
    ctx_db_path, dem_db_path
        Paths handed to `cisTargetDatabase` / `DEMDatabase`. A stage is skipped
        when its path is None.
    run_without_promoters
        Must be exactly True to also run each stage on non-promoter regions.
    biomart_host
        Host passed to `pybiomart.Dataset`.
    promoter_space
        Half-width, around each TSS, of the promoter window.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM`.
    annotation, motif_similarity_fdr, path_to_motif_annotations,
    annotation_version, n_cpu, _temp_dir
        Forwarded to both `run_cistarget` and `DEM`.
    exclude_motifs
        Path to a header-less CSV whose first column lists index labels to
        drop from the database tables.
    exclude_collection
        Substrings; database rows whose index contains one of them are dropped.
    **kwargs
        Forwarded unchanged to every `run_cistarget` and `DEM` call.

    Raises
    ------
    TypeError
        If `species` is not one of the recognised values.
    ValueError
        If `species` is 'custom' and `custom_annot` is None.
    """
    # Function-scope import so `time` resolves even if a star import shadows it.
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()

    def ensure_folder(path, created_prefix="Created folder : "):
        # Create `path` if needed and log what happened. `created_prefix`
        # exists only so every log line keeps its original text.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info(created_prefix + path)
        else:
            log.info(path + " folder already exists.")

    def current_rss():
        # Resident set size of this process in MiB / 1000 (roughly GB).
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def write_html(table, out_file):
        # Context manager so the handle is closed (the original leaked it).
        with open(out_file, 'w') as handle:
            table.to_html(handle, escape=False, col_space=80)

    def dump_partial(name):
        # Pickle the result stored under `name` into '<name>.pkl'. Using one
        # name for both the file and the lookup prevents saving the wrong result.
        if(save_partial):
            with open(os.path.join(save_path, name + '.pkl'), 'wb') as f:
                dill.dump(menr[name], f, protocol=-1)

    def drop_excluded(table):
        # Remove excluded motifs / motif collections from a database table.
        if exclude_motifs is not None:
            out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
            table = table.drop(out)
        if exclude_collection is not None:
            for col in exclude_collection:
                table = table[~table.index.str.contains(col)]
        return table

    def promoter_windows(tss_table):
        # Turn a TSS table into [TSS - promoter_space, TSS + promoter_space]
        # intervals. 'End' is derived first because 'Start' is overwritten next.
        tss_table['End'] = tss_table['Start'].astype(int)+promoter_space
        tss_table['Start'] = tss_table['Start'].astype(int)-promoter_space
        return pr.PyRanges(tss_table[['Chromosome', 'Start', 'End']])

    def remove_promoters(regions, db):
        # Keep only regions with no promoter overlap and re-map them onto the
        # regions already present in `db` (updates `db.regions_to_db` in place).
        overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
        return regions_np

    def call_cistarget(db, sets):
        return run_cistarget(ctx_db = db,
                             region_sets = sets,
                             specie = species,
                             auc_threshold = ctx_auc_threshold,
                             nes_threshold = ctx_nes_threshold,
                             rank_threshold = ctx_rank_threshold,
                             annotation = annotation,
                             motif_similarity_fdr = motif_similarity_fdr,
                             path_to_motif_annotations = path_to_motif_annotations,
                             n_cpu = n_cpu,
                             _temp_dir= _temp_dir,
                             annotation_version = annotation_version,
                             **kwargs)

    def call_dem(db, sets, **extra):
        return DEM(dem_db = db,
                   region_sets = sets,
                   log2fc_thr = dem_log2fc_thr,
                   motif_hit_thr = dem_motif_hit_thr,
                   max_bg_regions = dem_max_bg_regions,
                   specie = species,
                   promoter_space = promoter_space,
                   motif_annotation = annotation,
                   motif_similarity_fdr = motif_similarity_fdr,
                   path_to_motif_annotations = path_to_motif_annotations,
                   n_cpu = n_cpu,
                   annotation_version = annotation_version,
                   tmp_dir = save_path,
                   _temp_dir= _temp_dir,
                   **extra,
                   **kwargs)

    ensure_folder(save_path)

    def get_species_annotation(species: str):
        # Query TSS positions of protein-coding transcripts from BioMart.
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        # NOTE(review): Series.replace is called without regex=True, so this
        # looks like a whole-value match that never fires - confirm.
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        # NOTE(review): the second test re-adds 'chr' right after the first one
        # strips it, so the annotation seems to always end up 'chr'-prefixed
        # even when the regions are not - confirm this is intended.
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot = promoter_windows(annot)
        return annot, annot_dem

    # Prepare annotation
    biomart_datasets = {'homo_sapiens': 'hsapiens_gene_ensembl',
                        'mus_musculus': 'mmusculus_gene_ensembl',
                        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
                        'gallus_gallus': 'ggallus_gene_ensembl'}
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        if custom_annot is None:
            raise ValueError("custom_annot must be provided when species is 'custom'")
        annot_dem = custom_annot
        # Define promoter space (on a copy, so custom_annot is left untouched)
        annot = promoter_windows(annot_dem.copy())
    else:
        raise TypeError("Species not recognized")

    menr = {}
    steps = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    for key in region_sets.keys():
        regions = region_sets[key]
        # Reset the metrics for every group: a stage that is skipped is
        # reported as NaN instead of raising NameError or reusing the value
        # measured for the previous group.
        elapsed = dict.fromkeys(steps, float('nan'))
        memory_gb = dict.fromkeys(steps, float('nan'))

        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            ## CISTARGET
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            ctx_db.db_rankings = drop_excluded(ctx_db.db_rankings)
            ## DEFAULT
            name = 'CTX_'+key+'_All'
            t_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[name] = call_cistarget(ctx_db, regions)
            memory_gb['CTX_all'] = current_rss()
            elapsed['CTX_all'] = time.time()-t_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].keys():
                write_html(menr[name][str(x)].motif_enrichment,
                           os.path.join(out_folder, str(x) +'.html'))
            dump_partial(name)

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                regions_np = remove_promoters(regions, ctx_db)
                name = 'CTX_'+key+'_No_promoters'
                t_start = time.time()
                menr[name] = call_cistarget(ctx_db, regions_np)
                memory_gb['CTX_np'] = current_rss()
                elapsed['CTX_np'] = time.time()-t_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder, created_prefix="Created folder:")
                for x in menr[name].keys():
                    write_html(menr[name][str(x)].motif_enrichment,
                               os.path.join(out_folder, str(x) +'.html'))
                dump_partial(name)

        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            dem_db = DEMDatabase(dem_db_path, regions)
            dem_db.db_scores = drop_excluded(dem_db.db_scores)
            name = 'DEM_'+key+'_All'
            t_start = time.time()
            menr[name] = call_dem(dem_db, regions, genome_annotation = annot_dem)
            memory_gb['DEM_all'] = current_rss()
            elapsed['DEM_all'] = time.time()-t_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].motif_enrichment.keys():
                write_html(menr[name].motif_enrichment[str(x)],
                           os.path.join(out_folder, str(x) +'.html'))
            dump_partial(name)

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                regions_np = remove_promoters(regions, dem_db)
                name = 'DEM_'+key+'_No_promoters'
                t_start = time.time()
                # NOTE(review): genome_annotation is not passed here, unlike
                # the run on all regions - kept as written, confirm intent.
                menr[name] = call_dem(dem_db, regions_np)
                memory_gb['DEM_np'] = current_rss()
                elapsed['DEM_np'] = time.time()-t_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder)
                for x in menr[name].motif_enrichment.keys():
                    write_html(menr[name].motif_enrichment[str(x)],
                               os.path.join(out_folder, str(x) +'.html'))
                # Saves the no-promoters result (the original dumped the
                # '_All' result into this file).
                dump_partial(name)

        # Benchmark tables. os.path.join yields the same path as plain
        # concatenation when save_path ends with a separator, and keeps the
        # files inside save_path when it does not.
        df = pd.DataFrame([elapsed[s] for s in steps], index=steps, columns=['Time'])
        df.to_csv(os.path.join(save_path, key+'running_times.tsv'), sep='\t')
        snapshots = [memory_gb[s] for s in steps]
        df = pd.DataFrame(snapshots, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'cummulative_memory.tsv'), sep='\t')
        # Increase relative to the snapshot taken after the previous stage.
        per_step = [snapshots[0]] + [after - before for before, after in zip(snapshots, snapshots[1:])]
        df = pd.DataFrame(per_step, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver for the 3K fragments / 80 cells benchmark: load the binarized topics
# and DARs, convert them to PyRanges region sets and run run_pycistarget.
import os
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

# Load region binarized topics
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/low_simulation/DPCL_cisTopicObject_3K_fragments_80_cells/'
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# Context managers close the files even if unpickling fails.
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets
region_sets = {}
region_sets['Topics'] = {key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist())) for key in binarized_topic_region.keys()}
# DAR names are sanitised: every run of non-alphanumeric characters becomes '_'
region_sets['DARs'] = {re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist())) for key in DARs_dict.keys()}
# Create save_path (the trailing '/' is kept as in the original path)
save_path = outDir + 'pycistarget/'
# Create save_Dir
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM submission script for the 3K fragments / 80 cells benchmark.
# It runs DPCL_cisTopicObject_3K_fragments_80_cells.py with the python
# interpreter of the scenicplus singularity image.

# Scheduling target and notifications.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Requested resources: 20 CPUs, 3 hours of wall time, 180G of memory.
# NOTE(review): the Python cell above calls run_pycistarget with n_cpu = 8
# while 20 CPUs are requested here - confirm the mismatch is intended.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and stdout/stderr files.
#SBATCH --job-name=DPCL_cisTopicObject_3K_fragments_80_cells
#SBATCH --output=DPCL_cisTopicObject_3K_fragments_80_cells.out
#SBATCH --error=DPCL_cisTopicObject_3K_fragments_80_cells.err

# -B bind-mounts the listed host directories into the container.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_3K_fragments_80_cells.py

# DPCL_cisTopicObject_3K_fragments_1K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and/or DEM on every group of region sets and record benchmarks.

    For each key of `region_sets` the function optionally runs `run_cistarget`
    (when `ctx_db_path` is given) and `DEM` (when `dem_db_path` is given), each
    on all regions and, when `run_without_promoters` is True, again on the
    regions that do not overlap a promoter window. Results are written as one
    HTML table per region set, optionally pickled per analysis, and finally
    pickled together as `menr.pkl` in `save_path`. Wall-clock time and resident
    memory of each stage are written as TSV files in `save_path`.

    Parameters
    ----------
    region_sets
        Dictionary of groups; each value is itself indexed by region-set name
        and every entry must support `count_overlaps` and expose `.df`.
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (annotation is queried through pybiomart) or 'custom'
        (annotation is taken from `custom_annot`).
    save_path
        Output folder; created if missing.
    custom_annot
        TSS table with at least 'Chromosome' and 'Start' columns. Required when
        `species` is 'custom'.
    save_partial
        Whether to pickle each analysis result separately as well.
    ctx_db_path, dem_db_path
        Paths handed to `cisTargetDatabase` / `DEMDatabase`. A stage is skipped
        when its path is None.
    run_without_promoters
        Must be exactly True to also run each stage on non-promoter regions.
    biomart_host
        Host passed to `pybiomart.Dataset`.
    promoter_space
        Half-width, around each TSS, of the promoter window.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM`.
    annotation, motif_similarity_fdr, path_to_motif_annotations,
    annotation_version, n_cpu, _temp_dir
        Forwarded to both `run_cistarget` and `DEM`.
    exclude_motifs
        Path to a header-less CSV whose first column lists index labels to
        drop from the database tables.
    exclude_collection
        Substrings; database rows whose index contains one of them are dropped.
    **kwargs
        Forwarded unchanged to every `run_cistarget` and `DEM` call.

    Raises
    ------
    TypeError
        If `species` is not one of the recognised values.
    ValueError
        If `species` is 'custom' and `custom_annot` is None.
    """
    # Function-scope import so `time` resolves even if a star import shadows it.
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()

    def ensure_folder(path, created_prefix="Created folder : "):
        # Create `path` if needed and log what happened. `created_prefix`
        # exists only so every log line keeps its original text.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info(created_prefix + path)
        else:
            log.info(path + " folder already exists.")

    def current_rss():
        # Resident set size of this process in MiB / 1000 (roughly GB).
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def write_html(table, out_file):
        # Context manager so the handle is closed (the original leaked it).
        with open(out_file, 'w') as handle:
            table.to_html(handle, escape=False, col_space=80)

    def dump_partial(name):
        # Pickle the result stored under `name` into '<name>.pkl'. Using one
        # name for both the file and the lookup prevents saving the wrong result.
        if(save_partial):
            with open(os.path.join(save_path, name + '.pkl'), 'wb') as f:
                dill.dump(menr[name], f, protocol=-1)

    def drop_excluded(table):
        # Remove excluded motifs / motif collections from a database table.
        if exclude_motifs is not None:
            out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
            table = table.drop(out)
        if exclude_collection is not None:
            for col in exclude_collection:
                table = table[~table.index.str.contains(col)]
        return table

    def promoter_windows(tss_table):
        # Turn a TSS table into [TSS - promoter_space, TSS + promoter_space]
        # intervals. 'End' is derived first because 'Start' is overwritten next.
        tss_table['End'] = tss_table['Start'].astype(int)+promoter_space
        tss_table['Start'] = tss_table['Start'].astype(int)-promoter_space
        return pr.PyRanges(tss_table[['Chromosome', 'Start', 'End']])

    def remove_promoters(regions, db):
        # Keep only regions with no promoter overlap and re-map them onto the
        # regions already present in `db` (updates `db.regions_to_db` in place).
        overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
        return regions_np

    def call_cistarget(db, sets):
        return run_cistarget(ctx_db = db,
                             region_sets = sets,
                             specie = species,
                             auc_threshold = ctx_auc_threshold,
                             nes_threshold = ctx_nes_threshold,
                             rank_threshold = ctx_rank_threshold,
                             annotation = annotation,
                             motif_similarity_fdr = motif_similarity_fdr,
                             path_to_motif_annotations = path_to_motif_annotations,
                             n_cpu = n_cpu,
                             _temp_dir= _temp_dir,
                             annotation_version = annotation_version,
                             **kwargs)

    def call_dem(db, sets, **extra):
        return DEM(dem_db = db,
                   region_sets = sets,
                   log2fc_thr = dem_log2fc_thr,
                   motif_hit_thr = dem_motif_hit_thr,
                   max_bg_regions = dem_max_bg_regions,
                   specie = species,
                   promoter_space = promoter_space,
                   motif_annotation = annotation,
                   motif_similarity_fdr = motif_similarity_fdr,
                   path_to_motif_annotations = path_to_motif_annotations,
                   n_cpu = n_cpu,
                   annotation_version = annotation_version,
                   tmp_dir = save_path,
                   _temp_dir= _temp_dir,
                   **extra,
                   **kwargs)

    ensure_folder(save_path)

    def get_species_annotation(species: str):
        # Query TSS positions of protein-coding transcripts from BioMart.
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        # NOTE(review): Series.replace is called without regex=True, so this
        # looks like a whole-value match that never fires - confirm.
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        # NOTE(review): the second test re-adds 'chr' right after the first one
        # strips it, so the annotation seems to always end up 'chr'-prefixed
        # even when the regions are not - confirm this is intended.
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot = promoter_windows(annot)
        return annot, annot_dem

    # Prepare annotation
    biomart_datasets = {'homo_sapiens': 'hsapiens_gene_ensembl',
                        'mus_musculus': 'mmusculus_gene_ensembl',
                        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
                        'gallus_gallus': 'ggallus_gene_ensembl'}
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        if custom_annot is None:
            raise ValueError("custom_annot must be provided when species is 'custom'")
        annot_dem = custom_annot
        # Define promoter space (on a copy, so custom_annot is left untouched)
        annot = promoter_windows(annot_dem.copy())
    else:
        raise TypeError("Species not recognized")

    menr = {}
    steps = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    for key in region_sets.keys():
        regions = region_sets[key]
        # Reset the metrics for every group: a stage that is skipped is
        # reported as NaN instead of raising NameError or reusing the value
        # measured for the previous group.
        elapsed = dict.fromkeys(steps, float('nan'))
        memory_gb = dict.fromkeys(steps, float('nan'))

        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            ## CISTARGET
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            ctx_db.db_rankings = drop_excluded(ctx_db.db_rankings)
            ## DEFAULT
            name = 'CTX_'+key+'_All'
            t_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[name] = call_cistarget(ctx_db, regions)
            memory_gb['CTX_all'] = current_rss()
            elapsed['CTX_all'] = time.time()-t_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].keys():
                write_html(menr[name][str(x)].motif_enrichment,
                           os.path.join(out_folder, str(x) +'.html'))
            dump_partial(name)

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                regions_np = remove_promoters(regions, ctx_db)
                name = 'CTX_'+key+'_No_promoters'
                t_start = time.time()
                menr[name] = call_cistarget(ctx_db, regions_np)
                memory_gb['CTX_np'] = current_rss()
                elapsed['CTX_np'] = time.time()-t_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder, created_prefix="Created folder:")
                for x in menr[name].keys():
                    write_html(menr[name][str(x)].motif_enrichment,
                               os.path.join(out_folder, str(x) +'.html'))
                dump_partial(name)

        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            dem_db = DEMDatabase(dem_db_path, regions)
            dem_db.db_scores = drop_excluded(dem_db.db_scores)
            name = 'DEM_'+key+'_All'
            t_start = time.time()
            menr[name] = call_dem(dem_db, regions, genome_annotation = annot_dem)
            memory_gb['DEM_all'] = current_rss()
            elapsed['DEM_all'] = time.time()-t_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].motif_enrichment.keys():
                write_html(menr[name].motif_enrichment[str(x)],
                           os.path.join(out_folder, str(x) +'.html'))
            dump_partial(name)

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                regions_np = remove_promoters(regions, dem_db)
                name = 'DEM_'+key+'_No_promoters'
                t_start = time.time()
                # NOTE(review): genome_annotation is not passed here, unlike
                # the run on all regions - kept as written, confirm intent.
                menr[name] = call_dem(dem_db, regions_np)
                memory_gb['DEM_np'] = current_rss()
                elapsed['DEM_np'] = time.time()-t_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder)
                for x in menr[name].motif_enrichment.keys():
                    write_html(menr[name].motif_enrichment[str(x)],
                               os.path.join(out_folder, str(x) +'.html'))
                # Saves the no-promoters result (the original dumped the
                # '_All' result into this file).
                dump_partial(name)

        # Benchmark tables. os.path.join yields the same path as plain
        # concatenation when save_path ends with a separator, and keeps the
        # files inside save_path when it does not.
        df = pd.DataFrame([elapsed[s] for s in steps], index=steps, columns=['Time'])
        df.to_csv(os.path.join(save_path, key+'running_times.tsv'), sep='\t')
        snapshots = [memory_gb[s] for s in steps]
        df = pd.DataFrame(snapshots, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'cummulative_memory.tsv'), sep='\t')
        # Increase relative to the snapshot taken after the previous stage.
        per_step = [snapshots[0]] + [after - before for before, after in zip(snapshots, snapshots[1:])]
        df = pd.DataFrame(per_step, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver for the 3K fragments / 1K cells benchmark: load the binarized topics
# and DARs and convert them to PyRanges region sets.
import os
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

# Load region binarized topics
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/low_simulation/DPCL_cisTopicObject_3K_fragments_1K_cells/'
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# Context managers close the files even if unpickling fails.
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets
region_sets = {}
region_sets['Topics'] = {key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist())) for key in binarized_topic_region.keys()}
# DAR names are sanitised: every run of non-alphanumeric characters becomes '_'
region_sets['DARs'] = {re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist())) for key in DARs_dict.keys()}
# Create save_path (the trailing '/' is kept as in the original path)
save_path = outDir + 'pycistarget/'
# Create save_Dir
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM batch script: runs DPCL_cisTopicObject_3K_fragments_1K_cells.py as a
# single task inside the scenicplus Singularity image.

# Scheduling target, billing account and e-mail notifications.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs, 3 hours of wall time, 180G of memory.
# NOTE(review): the Python cell in this notebook calls run_pycistarget with
# n_cpu = 8 while 20 CPUs are requested here -- confirm this is intended.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_3K_fragments_1K_cells
#SBATCH --output=DPCL_cisTopicObject_3K_fragments_1K_cells.out
#SBATCH --error=DPCL_cisTopicObject_3K_fragments_1K_cells.err

# -B bind-mounts the listed host paths into the container so the script can
# reach its input, output and scratch locations.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_3K_fragments_1K_cells.py

# DPCL_cisTopicObject_3K_fragments_10K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and/or DEM motif enrichment on every group of region sets and
    benchmark each step (wall-clock time and resident memory).

    For each group the enrichment is run on all regions and, when
    `run_without_promoters` is True, again on the regions that do not overlap a
    promoter (TSS +/- `promoter_space`). Results are written as HTML tables,
    optionally as per-step pickles, and finally as one `menr.pkl` with
    everything. Per group, three TSV files with running times, memory after each
    step and memory difference between consecutive steps are written.

    Parameters
    ----------
    region_sets
        Mapping from a group name to a dict of named PyRanges. Each group is
        passed as a whole to the cisTarget / DEM database constructors.
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (annotation fetched from BioMart) or 'custom'
        (annotation taken from `custom_annot`).
    save_path
        Output folder; created if it does not exist.
    custom_annot
        Annotation used when `species == 'custom'`. It must provide at least
        'Chromosome' and 'Start' columns.
    save_partial
        If truthy, every individual result is also pickled next to `menr.pkl`.
    ctx_db_path, dem_db_path
        Paths to the cisTarget rankings / DEM scores databases. A step is
        skipped when its path is None.
    run_without_promoters
        Only the exact value True enables the promoter-free runs.
    biomart_host
        BioMart host queried for the gene annotation.
    promoter_space
        Number of bases added on each side of a TSS to define a promoter.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM`.
    annotation, motif_similarity_fdr, path_to_motif_annotations,
    annotation_version, n_cpu, _temp_dir
        Forwarded to both `run_cistarget` and `DEM`.
    exclude_motifs
        Path to a header-less CSV whose first column lists database rows to drop.
    exclude_collection
        Patterns; database rows whose index contains one of them are dropped.
    **kwargs
        Forwarded to both `run_cistarget` and `DEM`.

    Raises
    ------
    TypeError
        If `species` is not one of the supported values.
    """
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')

    # Single local import (the original repeated it three times in the body)
    import time
    start_time = time.time()

    def _ensure_folder(folder, created_prefix="Created folder : "):
        # Create `folder` when missing and log which of the two cases applied
        if not os.path.isdir(folder):
            os.makedirs(folder)
            log.info(created_prefix + folder)
        else:
            log.info(folder + " folder already exists.")

    def _write_html(table, out_file):
        # Write one enrichment table; the context manager closes the handle
        # (the original passed a bare open() and never closed it)
        with open(out_file, 'w') as handle:
            table.to_html(handle, escape=False, col_space=80)

    def _current_memory():
        # Resident set size of this process: bytes -> MiB, then divided by 1000
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    _ensure_folder(save_path)

    def get_species_annotation(species: str):
        # Fetch protein-coding TSSs from BioMart and return
        # (promoters as PyRanges, unmodified TSS table for DEM)
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        # Drop patches, scaffolds and the mitochondrial chromosome
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        # NOTE(review): without regex=True pandas matches the pattern as a
        # literal value, so this is presumably a no-op; the 'chr' prefix is
        # added a few lines below anyway -- confirm before changing.
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr (looks at the first set of the first group)
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem

    # Prepare annotation
    biomart_datasets = {
        'homo_sapiens': 'hsapiens_gene_ensembl',
        'mus_musculus': 'mmusculus_gene_ensembl',
        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
        'gallus_gallus': 'ggallus_gene_ensembl',
    }
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    steps = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    menr = {}
    for key in region_sets.keys():
        # Reset the benchmark values for every group: a skipped step is
        # reported as NaN instead of raising NameError (first group) or
        # silently reusing the value measured for the previous group.
        missing = float('nan')
        time_ctx_all = time_ctx_np = time_dem_all = time_dem_np = missing
        m1 = m2 = m3 = m4 = missing

        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            ## CISTARGET
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                ctx_db.db_rankings = ctx_db.db_rankings.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    ctx_db.db_rankings = ctx_db.db_rankings[~ctx_db.db_rankings.index.str.contains(col)]
            ## DEFAULT
            result_name = 'CTX_'+key+'_All'
            t1_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[result_name] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
            m1 = _current_memory()
            t1_stop = time.time()
            time_ctx_all = t1_stop-t1_start
            out_folder = os.path.join(save_path, result_name)
            _ensure_folder(out_folder)
            for x in menr[result_name].keys():
                out_file = os.path.join(out_folder, str(x) +'.html')
                _write_html(menr[result_name][str(x)].motif_enrichment, out_file)
            if(save_partial):
                with open(os.path.join(save_path, result_name + '.pkl'), 'wb') as f:
                    dill.dump(menr[result_name], f, protocol=-1)

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                result_name = 'CTX_'+key+'_No_promoters'
                log.info('Running cisTarget without promoters for '+key)
                # Keep only the regions that overlap no promoter interval
                regions_overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
                regions_np = {name: regions_overlaps[name][regions_overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
                # Re-map the reduced region sets onto the database regions already loaded
                db_regions = set(pd.concat([ctx_db.regions_to_db[x] for x in ctx_db.regions_to_db.keys()])['Query'])
                ctx_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
                t1_start = time.time()
                menr[result_name] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions_np,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
                m2 = _current_memory()
                t1_stop = time.time()
                time_ctx_np = t1_stop-t1_start
                out_folder = os.path.join(save_path, result_name)
                _ensure_folder(out_folder, created_prefix="Created folder:")
                for x in menr[result_name].keys():
                    out_file = os.path.join(out_folder, str(x) +'.html')
                    _write_html(menr[result_name][str(x)].motif_enrichment, out_file)

                if(save_partial):
                    with open(os.path.join(save_path, result_name + '.pkl'), 'wb') as f:
                        dill.dump(menr[result_name], f, protocol=-1)
        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                dem_db.db_scores = dem_db.db_scores.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    dem_db.db_scores = dem_db.db_scores[~dem_db.db_scores.index.str.contains(col)]
            result_name = 'DEM_'+key+'_All'
            t1_start = time.time()
            menr[result_name] = DEM(dem_db = dem_db,
                               region_sets = regions,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               genome_annotation = annot_dem,
                               promoter_space = promoter_space,
                               motif_annotation =   annotation,
                               motif_similarity_fdr = motif_similarity_fdr,
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
            m3 = _current_memory()
            t1_stop = time.time()
            time_dem_all = t1_stop-t1_start
            out_folder = os.path.join(save_path, result_name)
            _ensure_folder(out_folder)
            for x in menr[result_name].motif_enrichment.keys():
                out_file = os.path.join(out_folder, str(x) +'.html')
                _write_html(menr[result_name].motif_enrichment[str(x)], out_file)
            if(save_partial):
                with open(os.path.join(save_path, result_name + '.pkl'), 'wb') as f:
                    dill.dump(menr[result_name], f, protocol=-1)

            if run_without_promoters is True:
                result_name = 'DEM_'+key+'_No_promoters'
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                regions_overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
                regions_np = {name: regions_overlaps[name][regions_overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
                db_regions = set(pd.concat([dem_db.regions_to_db[x] for x in dem_db.regions_to_db.keys()])['Query'])
                dem_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
                t1_start = time.time()
                # NOTE(review): unlike the run on all regions, genome_annotation
                # is not passed here -- confirm this is intended.
                menr[result_name] = DEM(dem_db = dem_db,
                               region_sets = regions_np,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               promoter_space = promoter_space,
                               motif_annotation = annotation,
                               motif_similarity_fdr = motif_similarity_fdr,
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
                m4 = _current_memory()
                t1_stop = time.time()
                time_dem_np = t1_stop-t1_start
                out_folder = os.path.join(save_path, result_name)
                _ensure_folder(out_folder)
                for x in menr[result_name].motif_enrichment.keys():
                    out_file = os.path.join(out_folder, str(x) +'.html')
                    _write_html(menr[result_name].motif_enrichment[str(x)], out_file)
                if(save_partial):
                    with open(os.path.join(save_path, result_name + '.pkl'), 'wb') as f:
                        # Fixed: the original pickled the '_All' result into the
                        # '_No_promoters' file
                        dill.dump(menr[result_name], f, protocol=-1)

        # Benchmark tables for this group. os.path.join keeps the files inside
        # save_path even when it has no trailing separator (identical paths to
        # the original concatenation when it has one).
        times = [time_ctx_all, time_ctx_np, time_dem_all, time_dem_np]
        df = pd.DataFrame(times, index=steps, columns=['Time'])
        df.to_csv(os.path.join(save_path, key+'running_times.tsv'), sep='\t')
        # Memory measured right after each step
        memory = [m1, m2, m3, m4]
        df = pd.DataFrame(memory, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'cummulative_memory.tsv'), sep='\t')
        # Difference with the measurement taken after the previous step
        memory = [m1, m2-m1, m3-m2, m4-m3]
        df = pd.DataFrame(memory, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver script: load the binarized topics and DARs of the 10K-cell simulation,
# convert them to PyRanges region sets and benchmark pycistarget on them.
import os
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/low_simulation/DPCL_cisTopicObject_3K_fragments_10K_cells/'

# Load region binarized topics
# NOTE: pickle.load can execute arbitrary code; only load files produced by a trusted pipeline.
# Context managers guarantee the handles are closed even if unpickling fails.
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)

# Format region sets: one group per analysis type, each mapping a set name to
# the PyRanges built from the region names found in the table index.
region_sets = {}
region_sets['Topics'] = {
    key: pr.PyRanges(region_names_to_coordinates(table.index.tolist()))
    for key, table in binarized_topic_region.items()
}
# DAR names are sanitised (runs of non-alphanumeric characters -> '_') because
# they are later used to build output file names.
region_sets['DARs'] = {
    re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(table.index.tolist()))
    for key, table in DARs_dict.items()
}

# Create save_path (exist_ok avoids the check-then-create race)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM batch script: runs DPCL_cisTopicObject_3K_fragments_10K_cells.py as a
# single task inside the scenicplus Singularity image.

# Scheduling target, billing account and e-mail notifications.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs, 3 hours of wall time, 180G of memory.
# NOTE(review): the Python cell in this notebook calls run_pycistarget with
# n_cpu = 8 while 20 CPUs are requested here -- confirm this is intended.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_3K_fragments_10K_cells
#SBATCH --output=DPCL_cisTopicObject_3K_fragments_10K_cells.out
#SBATCH --error=DPCL_cisTopicObject_3K_fragments_10K_cells.err

# -B bind-mounts the listed host paths into the container so the script can
# reach its input, output and scratch locations.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_3K_fragments_10K_cells.py

# DPCL_cisTopicObject_3K_fragments_25K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')
    
    import time
    start_time = time.time()
    
    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : " + save_path)
    else:
        log.info(save_path + " folder already exists.")
        
    def get_species_annotation(species: str):
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem
        
    # Prepare annotation
    if species == 'homo_sapiens':
        annot, annot_dem = get_species_annotation('hsapiens_gene_ensembl')
    elif species == 'mus_musculus':
        annot, annot_dem = get_species_annotation('mmusculus_gene_ensembl')
    elif species == 'drosophila_melanogaster':
        annot, annot_dem = get_species_annotation('dmelanogaster_gene_ensembl')
    elif species == 'gallus_gallus':
        annot, annot_dem = get_species_annotation('ggallus_gene_ensembl')
    elif species == 'custom':
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    menr = {}
    for key in region_sets.keys():
        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            ## CISTARGET
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)  
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                ctx_db.db_rankings = ctx_db.db_rankings.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    ctx_db.db_rankings = ctx_db.db_rankings[~ctx_db.db_rankings.index.str.contains(col)]
            ## DEFAULT
            import time
            t1_start = time.time()
            log.info('Running cisTarget for '+key)
            menr['CTX_'+key+'_All'] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
            m1=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_ctx_all = t1_stop-t1_start
            out_folder = os.path.join(save_path,'CTX_'+key+'_All')
            check_folder = os.path.isdir(out_folder)
            if not check_folder:
                os.makedirs(out_folder)
                log.info("Created folder : " + out_folder)
            else:
                log.info(out_folder + " folder already exists.")
            for x in menr['CTX_'+key+'_All'].keys():
                out_file = os.path.join(out_folder, str(x) +'.html')
                menr['CTX_'+key+'_All'][str(x)].motif_enrichment.to_html(open(out_file, 'w'), escape=False, col_space=80)
            if(save_partial):
                with open(os.path.join(save_path,'CTX_'+key+'_All' + '.pkl'), 'wb') as f:
                    dill.dump(menr['CTX_'+key+'_All'], f, protocol=-1)

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                regions_overlaps = {key: regions[key].count_overlaps(annot) for key in regions.keys()}
                regions_np = {key: regions_overlaps[key][regions_overlaps[key].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for key in regions.keys()}
                db_regions = set(pd.concat([ctx_db.regions_to_db[x] for x in ctx_db.regions_to_db.keys()])['Query'])
                ctx_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
                t1_start = time.time()
                menr['CTX_'+key+'_No_promoters'] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions_np,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr, 
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
                m2=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
                t1_stop = time.time()
                time_ctx_np = t1_stop-t1_start
                out_folder = os.path.join(save_path,'CTX_'+key+'_No_promoters')
                check_folder = os.path.isdir(out_folder)
                if not check_folder:
                    os.makedirs(out_folder)
                    log.info("Created folder:" + out_folder)
                else:
                    log.info(out_folder + " folder already exists.")
                for x in menr['CTX_'+key+'_No_promoters'].keys():
                    out_file = os.path.join(out_folder, str(x) +'.html')
                    menr['CTX_'+key+'_No_promoters'][str(x)].motif_enrichment.to_html(open(out_file, 'w'), escape=False, col_space=80)
                
                if(save_partial):
                    with open(os.path.join(save_path,'CTX_'+key+'_No_promoters' + '.pkl'), 'wb') as f:
                      dill.dump(menr['CTX_'+key+'_No_promoters'], f, protocol=-1)
        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)  
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                dem_db.db_scores = dem_db.db_scores.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    dem_db.db_scores = dem_db.db_scores[~dem_db.db_scores.index.str.contains(col)]
            t1_start = time.time()
            menr['DEM_'+key+'_All'] = DEM(dem_db = dem_db,
                               region_sets = regions,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               genome_annotation = annot_dem,
                               promoter_space = promoter_space,
                               motif_annotation =   annotation,
                               motif_similarity_fdr = motif_similarity_fdr, 
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
            m3=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_dem_all = t1_stop-t1_start
            out_folder = os.path.join(save_path,'DEM_'+key+'_All')
            check_folder = os.path.isdir(out_folder)
            if not check_folder:
                os.makedirs(out_folder)
                log.info("Created folder : "+ out_folder)
            else:
                log.info(out_folder + " folder already exists.")
            for x in menr['DEM_'+key+'_All'].motif_enrichment.keys():
                out_file = os.path.join(out_folder, str(x) +'.html')
                menr['DEM_'+key+'_All'].motif_enrichment[str(x)].to_html(open(out_file, 'w'), escape=False, col_space=80)
            if(save_partial):
                with open(os.path.join(save_path, 'DEM_'+key+'_All'+'.pkl'), 'wb') as f:
                  dill.dump(menr['DEM_'+key+'_All'], f, protocol=-1)
                
            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                regions_overlaps = {key: regions[key].count_overlaps(annot) for key in regions.keys()}
                regions_np = {key: regions_overlaps[key][regions_overlaps[key].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for key in regions.keys()}
                db_regions = set(pd.concat([dem_db.regions_to_db[x] for x in dem_db.regions_to_db.keys()])['Query'])
                dem_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
                t1_start = time.time()
                menr['DEM_'+key+'_No_promoters'] = DEM(dem_db = dem_db,
                               region_sets = regions_np,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               promoter_space = promoter_space,
                               motif_annotation = annotation,
                               motif_similarity_fdr = motif_similarity_fdr, 
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
                m4=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
                t1_stop = time.time()
                time_dem_np = t1_stop-t1_start
                out_folder = os.path.join(save_path,'DEM_'+key+'_No_promoters')
                check_folder = os.path.isdir(out_folder)
                if not check_folder:
                    os.makedirs(out_folder)
                    log.info("Created folder : "+ out_folder)
                else:
                    log.info(out_folder + " folder already exists.")
                for x in menr['DEM_'+key+'_No_promoters'].motif_enrichment.keys():
                    out_file = os.path.join(out_folder, str(x) +'.html')
                    menr['DEM_'+key+'_No_promoters'].motif_enrichment[str(x)].to_html(open(out_file, 'w'), escape=False, col_space=80)
                if(save_partial):
                    with open(os.path.join(save_path, 'DEM_'+key+'_No_promoters'+'.pkl'), 'wb') as f:
                      dill.dump(menr['DEM_'+key+'_All'], f, protocol=-1)
                    
        times = [time_ctx_all, time_ctx_np, time_dem_all, time_dem_np]
        df = pd.DataFrame(times, index=['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np'], columns=['Time'])
        df.to_csv(save_path+key+'running_times.tsv', sep='\t') 
        memory=[m1,m2,m3,m4]
        df = pd.DataFrame(memory, index=['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np'], columns=['Memory'])
        df.to_csv(save_path+key+'cummulative_memory.tsv', sep='\t')  
        memory=[m1,m2-m1,m3-m2,m4-m3]
        df = pd.DataFrame(memory, index=['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np'], columns=['Memory'])
        df.to_csv(save_path+key+'memory_per_step.tsv', sep='\t')  
                    
    log.info('Saving object')         
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)
    
    import time
    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60)) 
            
# Driver script: load the binarized topics and DARs for this dataset, turn them
# into PyRanges region sets and run the pycistarget wrapper defined above.
import os
import pickle
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/low_simulation/DPCL_cisTopicObject_3K_fragments_25K_cells/'
# Load region binarized topics (the context manager closes the file even if
# unpickling raises).
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets
import re
import pyranges as pr
from pycistarget.utils import *
region_sets = {}
# One PyRanges per topic, built from the region names in the table's index.
region_sets['Topics'] = {
    topic: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[topic].index.tolist()))
    for topic in binarized_topic_region.keys()
}
# Same for the DARs; every run of non-alphanumeric characters in the set name
# is replaced by '_' before it is used as a key.
region_sets['DARs'] = {
    re.sub('[^A-Za-z0-9]+', '_', dar): pr.PyRanges(region_names_to_coordinates(DARs_dict[dar].index.tolist()))
    for dar in DARs_dict.keys()
}
# Create save_path (no error if the folder is already there)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM submission script: runs DPCL_cisTopicObject_3K_fragments_25K_cells.py
# with the python interpreter of the scenicplus Singularity image.

# Where to run, which account to charge, e-mail notifications, single task.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Requested resources: 20 CPUs for the task, 3 hours wall time, 180G memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files that receive stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_3K_fragments_25K_cells
#SBATCH --output=DPCL_cisTopicObject_3K_fragments_25K_cells.out
#SBATCH --error=DPCL_cisTopicObject_3K_fragments_25K_cells.err

# -B bind-mounts the listed host directories into the container so the script
# can reach its input and output paths.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_3K_fragments_25K_cells.py

# DPCL_cisTopicObject_10K_fragments_80_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and/or DEM on every entry of ``region_sets`` and benchmark each step.

    For each key of ``region_sets`` up to four analyses are run (cisTarget and
    DEM, each on all regions and, optionally, on regions that do not overlap a
    promoter). Each result is stored in a dictionary under
    ``'CTX_<key>_All'``, ``'CTX_<key>_No_promoters'``, ``'DEM_<key>_All'`` or
    ``'DEM_<key>_No_promoters'``, its motif enrichment tables are written as
    HTML files into a folder of the same name inside ``save_path``, and the
    elapsed time and process memory after the step are recorded.

    Parameters
    ----------
    region_sets
        Nested mapping: the outer key names a collection, the inner mapping
        goes from region-set name to a PyRanges (the code calls
        ``count_overlaps`` and ``.df`` on the inner values).
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (annotation is queried from BioMart) or 'custom'
        (annotation is taken from ``custom_annot``).
    save_path
        Output folder; created if it does not exist.
    custom_annot
        Annotation table used when ``species == 'custom'``. Its 'Chromosome'
        and 'Start' columns are used to define promoters.
    save_partial
        If True, each individual result is also pickled with dill to
        ``<save_path>/<result name>.pkl``.
    ctx_db_path
        Path handed to ``cisTargetDatabase``. cisTarget is skipped when None.
    dem_db_path
        Path handed to ``DEMDatabase``. DEM is skipped when None.
    run_without_promoters
        If True (the check is ``is True``), each analysis is repeated on the
        regions that have no overlap with the promoter annotation.
    biomart_host
        Host passed to ``pybiomart.Dataset``.
    promoter_space
        Number of bases subtracted from / added to each annotated start to
        define the promoter interval; also forwarded to ``DEM``.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to ``run_cistarget`` as ``auc_threshold``, ``nes_threshold``
        and ``rank_threshold``.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to ``DEM`` as ``log2fc_thr``, ``motif_hit_thr`` and
        ``max_bg_regions``.
    annotation
        Forwarded to ``run_cistarget`` (``annotation``) and ``DEM``
        (``motif_annotation``). The default list is never modified here.
    motif_similarity_fdr, path_to_motif_annotations, annotation_version, n_cpu, _temp_dir
        Forwarded unchanged to ``run_cistarget`` and ``DEM``.
    exclude_motifs
        Path to a header-less CSV; the values of its first column are dropped
        from the database index before running.
    exclude_collection
        Substrings; database rows whose index contains one of them are removed.
    **kwargs
        Extra keyword arguments forwarded to ``run_cistarget`` and ``DEM``.

    Returns
    -------
    None. Besides the per-result folders, the following files are written to
    ``save_path``: ``<key>running_times.tsv``, ``<key>cummulative_memory.tsv``
    and ``<key>memory_per_step.tsv`` for every key, and ``menr.pkl`` with the
    dictionary of all results. Steps that were not run are reported as NaN.

    Raises
    ------
    TypeError
        If ``species`` is not one of the recognized values.
    """
    # Imported explicitly so the function does not depend on these names
    # leaking in through the module's star imports.
    import logging
    import os
    import sys
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()

    def ensure_folder(path, created_prefix="Created folder : "):
        # Create `path` if needed and log what happened.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info(created_prefix + path)
        else:
            log.info(path + " folder already exists.")

    def rss_gb():
        # Resident set size of this process: bytes / (1024 * 1024), then / 1000.
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def save_result(label, result, tables, created_prefix="Created folder : "):
        # Write one HTML file per table into <save_path>/<label>/ and, when
        # save_partial is set, pickle `result` to <save_path>/<label>.pkl.
        out_folder = os.path.join(save_path, label)
        ensure_folder(out_folder, created_prefix)
        for name, table in tables.items():
            out_file = os.path.join(out_folder, str(name) + '.html')
            # Context manager so the handle is closed after each table.
            with open(out_file, 'w') as handle:
                table.to_html(handle, escape=False, col_space=80)
        if save_partial:
            with open(os.path.join(save_path, label + '.pkl'), 'wb') as f:
                dill.dump(result, f, protocol=-1)

    def ctx_tables(result):
        # cisTarget results: one object per region set, table in .motif_enrichment.
        return {x: result[str(x)].motif_enrichment for x in result.keys()}

    def dem_tables(result):
        # DEM result: .motif_enrichment maps region-set name -> table.
        return {x: result.motif_enrichment[str(x)] for x in result.motif_enrichment.keys()}

    def drop_promoter_regions(regions, db):
        # Keep only regions with zero overlaps with `annot`, and re-map the
        # kept regions onto the database regions (updates db.regions_to_db).
        overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
        return regions_np

    ensure_folder(save_path)

    def get_species_annotation(dataset_name: str):
        # Query TSS annotation from BioMart and derive the promoter intervals.
        dataset = pbm.Dataset(name=dataset_name,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        # NOTE(review): Series.replace is called without regex=True, so the
        # pattern is presumably matched as a literal value and nothing is
        # replaced here -- confirm whether a regex replace was intended.
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem

    # Prepare annotation
    biomart_datasets = {
        'homo_sapiens': 'hsapiens_gene_ensembl',
        'mus_musculus': 'mmusculus_gene_ensembl',
        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
        'gallus_gallus': 'ggallus_gene_ensembl',
    }
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    # Arguments shared by the two cisTarget runs / the two DEM runs.
    ctx_args = dict(specie = species,
                    auc_threshold = ctx_auc_threshold,
                    nes_threshold = ctx_nes_threshold,
                    rank_threshold = ctx_rank_threshold,
                    annotation = annotation,
                    motif_similarity_fdr = motif_similarity_fdr,
                    path_to_motif_annotations = path_to_motif_annotations,
                    n_cpu = n_cpu,
                    _temp_dir = _temp_dir,
                    annotation_version = annotation_version)
    dem_args = dict(log2fc_thr = dem_log2fc_thr,
                    motif_hit_thr = dem_motif_hit_thr,
                    max_bg_regions = dem_max_bg_regions,
                    specie = species,
                    promoter_space = promoter_space,
                    motif_annotation = annotation,
                    motif_similarity_fdr = motif_similarity_fdr,
                    path_to_motif_annotations = path_to_motif_annotations,
                    n_cpu = n_cpu,
                    annotation_version = annotation_version,
                    tmp_dir = save_path,
                    _temp_dir = _temp_dir)

    steps = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    menr = {}
    for key in region_sets.keys():
        # Reset the measurements for every key: a step that is skipped is
        # reported as NaN instead of raising NameError or silently reusing the
        # value measured for the previous key.
        time_ctx_all = time_ctx_np = time_dem_all = time_dem_np = float('nan')
        m1 = m2 = m3 = m4 = float('nan')

        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            ## CISTARGET
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                ctx_db.db_rankings = ctx_db.db_rankings.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    ctx_db.db_rankings = ctx_db.db_rankings[~ctx_db.db_rankings.index.str.contains(col)]
            ## DEFAULT
            label = 'CTX_'+key+'_All'
            t1_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[label] = run_cistarget(ctx_db = ctx_db,
                                        region_sets = regions,
                                        **ctx_args,
                                        **kwargs)
            m1 = rss_gb()
            time_ctx_all = time.time() - t1_start
            save_result(label, menr[label], ctx_tables(menr[label]))

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                regions_np = drop_promoter_regions(regions, ctx_db)
                label = 'CTX_'+key+'_No_promoters'
                t1_start = time.time()
                menr[label] = run_cistarget(ctx_db = ctx_db,
                                            region_sets = regions_np,
                                            **ctx_args,
                                            **kwargs)
                m2 = rss_gb()
                time_ctx_np = time.time() - t1_start
                save_result(label, menr[label], ctx_tables(menr[label]),
                            created_prefix="Created folder:")
        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                dem_db.db_scores = dem_db.db_scores.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    dem_db.db_scores = dem_db.db_scores[~dem_db.db_scores.index.str.contains(col)]
            label = 'DEM_'+key+'_All'
            t1_start = time.time()
            # Only the run on all regions receives the genome annotation.
            menr[label] = DEM(dem_db = dem_db,
                              region_sets = regions,
                              genome_annotation = annot_dem,
                              **dem_args,
                              **kwargs)
            m3 = rss_gb()
            time_dem_all = time.time() - t1_start
            save_result(label, menr[label], dem_tables(menr[label]))

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                regions_np = drop_promoter_regions(regions, dem_db)
                label = 'DEM_'+key+'_No_promoters'
                t1_start = time.time()
                menr[label] = DEM(dem_db = dem_db,
                                  region_sets = regions_np,
                                  **dem_args,
                                  **kwargs)
                m4 = rss_gb()
                time_dem_np = time.time() - t1_start
                # The partial pickle now holds the No_promoters result itself
                # (it used to receive the '_All' result).
                save_result(label, menr[label], dem_tables(menr[label]))

        # Benchmark tables for this key. os.path.join keeps the files inside
        # save_path whether or not it ends with a path separator.
        times = [time_ctx_all, time_ctx_np, time_dem_all, time_dem_np]
        df = pd.DataFrame(times, index=steps, columns=['Time'])
        df.to_csv(os.path.join(save_path, key+'running_times.tsv'), sep='\t')
        memory = [m1, m2, m3, m4]
        df = pd.DataFrame(memory, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'cummulative_memory.tsv'), sep='\t')
        # Difference between consecutive memory readings.
        memory = [m1, m2-m1, m3-m2, m4-m3]
        df = pd.DataFrame(memory, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver script: load the binarized topics and DARs for this dataset, turn them
# into PyRanges region sets and run the pycistarget wrapper defined above.
import os
import pickle
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_80_cells/'
# Load region binarized topics (the context manager closes the file even if
# unpickling raises).
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets
import re
import pyranges as pr
from pycistarget.utils import *
region_sets = {}
# One PyRanges per topic, built from the region names in the table's index.
region_sets['Topics'] = {
    topic: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[topic].index.tolist()))
    for topic in binarized_topic_region.keys()
}
# Same for the DARs; every run of non-alphanumeric characters in the set name
# is replaced by '_' before it is used as a key.
region_sets['DARs'] = {
    re.sub('[^A-Za-z0-9]+', '_', dar): pr.PyRanges(region_names_to_coordinates(DARs_dict[dar].index.tolist()))
    for dar in DARs_dict.keys()
}
# Create save_path (no error if the folder is already there)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM submission script: runs DPCL_cisTopicObject_10K_fragments_80_cells.py
# with the python interpreter of the scenicplus Singularity image.

# Where to run, which account to charge, e-mail notifications, single task.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Requested resources: 20 CPUs for the task, 3 hours wall time, 180G memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files that receive stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_10K_fragments_80_cells
#SBATCH --output=DPCL_cisTopicObject_10K_fragments_80_cells.out
#SBATCH --error=DPCL_cisTopicObject_10K_fragments_80_cells.err

# -B bind-mounts the listed host directories into the container so the script
# can reach its input and output paths.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_10K_fragments_80_cells.py

# DPCL_cisTopicObject_10K_fragments_1K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and/or DEM on every group of region sets and record timing and memory.

    For each key of `region_sets` the function optionally runs cisTarget
    (`ctx_db_path` given) and DEM (`dem_db_path` given), each optionally a
    second time on the regions that do not overlap a promoter. Results are
    exported as HTML tables, optionally pickled per step, and finally pickled
    together as `menr.pkl` inside `save_path`. Per key, three TSV files with
    wall-clock time, resident memory after each step and the memory difference
    between consecutive measured steps are written.

    Parameters
    ----------
    region_sets
        Dictionary of groups; each value is itself a dictionary mapping a
        region-set name to a PyRanges object.
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (annotation is queried through BioMart) or 'custom'
        (annotation is taken from `custom_annot`).
    save_path
        Output directory; created when missing. The statistics file names are
        built as `save_path + key + suffix`, so they only end up inside the
        directory when `save_path` ends with a path separator.
    custom_annot
        Annotation used when `species == 'custom'`; the columns 'Chromosome'
        and 'Start' are read from it.
    save_partial
        Also pickle the result of every individual step with dill.
    ctx_db_path, dem_db_path
        Paths of the cisTarget / DEM databases. A step is skipped when its
        path is None.
    run_without_promoters
        When exactly True, repeat each enabled step on regions without
        promoter overlap.
    biomart_host
        Host passed to `pbm.Dataset`.
    promoter_space
        Number of bases subtracted from / added to each TSS to define the
        promoter interval.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM`.
    annotation, motif_similarity_fdr, path_to_motif_annotations, annotation_version
        Motif annotation settings forwarded to both `run_cistarget` and `DEM`.
    n_cpu, _temp_dir
        Forwarded to both `run_cistarget` and `DEM`.
    exclude_motifs
        Path of a header-less CSV whose first column lists the index labels
        dropped from the database tables.
    exclude_collection
        Patterns; database rows whose index contains one of them are removed.
    **kwargs
        Forwarded unchanged to `run_cistarget` and `DEM`.

    Raises
    ------
    TypeError
        If `species` is not one of the recognised values.
    """
    import time

    # Create logger
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        handlers=[logging.StreamHandler(stream=sys.stdout)])
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()

    step_labels = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    needs_filtering = exclude_motifs is not None or exclude_collection is not None

    def _ensure_folder(path, created_prefix="Created folder : "):
        # Create `path` when missing and log which of the two cases happened.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info(created_prefix + path)
        else:
            log.info(path + " folder already exists.")

    def _write_html(table, out_file):
        # Export one table; the context manager guarantees the handle is closed.
        with open(out_file, 'w') as handle:
            table.to_html(handle, escape=False, col_space=80)

    def _write_pickle(obj, out_file):
        # Serialise `obj` with dill using the highest available protocol.
        with open(out_file, 'wb') as handle:
            dill.dump(obj, handle, protocol=-1)

    def _current_rss():
        # Resident set size of this process: bytes / (1024 * 1024) / 1000.
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def _filter_db(frame):
        # Drop the motifs listed in `exclude_motifs` and the rows whose index
        # matches any entry of `exclude_collection`.
        if exclude_motifs is not None:
            out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
            frame = frame.drop(out)
        if exclude_collection is not None:
            for col in exclude_collection:
                frame = frame[~frame.index.str.contains(col)]
        return frame

    def _drop_promoters(regions, db):
        # Keep only regions with no promoter overlap and re-map them onto the
        # regions already present in `db` (db.regions_to_db is replaced).
        overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
        return regions_np

    _ensure_folder(save_path)

    def get_species_annotation(species: str):
        # Retrieve TSS records from BioMart and derive promoter intervals.
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        # Discard records whose chromosome name contains one of these tags.
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        # NOTE(review): Series.replace without regex=True compares whole values,
        # so this presumably does not add the prefix; the check further down
        # adds 'chr' when it is absent - confirm before changing.
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem

    # Prepare annotation
    biomart_datasets = {
        'homo_sapiens': 'hsapiens_gene_ensembl',
        'mus_musculus': 'mmusculus_gene_ensembl',
        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
        'gallus_gallus': 'ggallus_gene_ensembl',
    }
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    menr = {}
    for key in region_sets.keys():
        # Steps that do not run keep NaN, so the statistics tables can always
        # be written instead of failing on names that were never assigned.
        elapsed = dict.fromkeys(step_labels, float('nan'))
        rss = dict.fromkeys(step_labels, float('nan'))

        ## CISTARGET
        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            if needs_filtering:
                ctx_db.db_rankings = _filter_db(ctx_db.db_rankings)
            ## DEFAULT
            result_name = 'CTX_'+key+'_All'
            t_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[result_name] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
            rss['CTX_all'] = _current_rss()
            elapsed['CTX_all'] = time.time() - t_start
            out_folder = os.path.join(save_path, result_name)
            _ensure_folder(out_folder)
            for x in menr[result_name].keys():
                _write_html(menr[result_name][str(x)].motif_enrichment,
                            os.path.join(out_folder, str(x) +'.html'))
            if save_partial:
                _write_pickle(menr[result_name], os.path.join(save_path, result_name + '.pkl'))

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                result_name = 'CTX_'+key+'_No_promoters'
                regions_np = _drop_promoters(regions, ctx_db)
                t_start = time.time()
                menr[result_name] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions_np,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
                rss['CTX_np'] = _current_rss()
                elapsed['CTX_np'] = time.time() - t_start
                out_folder = os.path.join(save_path, result_name)
                _ensure_folder(out_folder, created_prefix="Created folder:")
                for x in menr[result_name].keys():
                    _write_html(menr[result_name][str(x)].motif_enrichment,
                                os.path.join(out_folder, str(x) +'.html'))
                if save_partial:
                    _write_pickle(menr[result_name], os.path.join(save_path, result_name + '.pkl'))

        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)
            if needs_filtering:
                dem_db.db_scores = _filter_db(dem_db.db_scores)
            result_name = 'DEM_'+key+'_All'
            t_start = time.time()
            menr[result_name] = DEM(dem_db = dem_db,
                               region_sets = regions,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               genome_annotation = annot_dem,
                               promoter_space = promoter_space,
                               motif_annotation =   annotation,
                               motif_similarity_fdr = motif_similarity_fdr,
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
            rss['DEM_all'] = _current_rss()
            elapsed['DEM_all'] = time.time() - t_start
            out_folder = os.path.join(save_path, result_name)
            _ensure_folder(out_folder)
            for x in menr[result_name].motif_enrichment.keys():
                _write_html(menr[result_name].motif_enrichment[str(x)],
                            os.path.join(out_folder, str(x) +'.html'))
            if save_partial:
                _write_pickle(menr[result_name], os.path.join(save_path, result_name+'.pkl'))

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                result_name = 'DEM_'+key+'_No_promoters'
                regions_np = _drop_promoters(regions, dem_db)
                t_start = time.time()
                # NOTE(review): unlike the run on all regions, genome_annotation
                # is not passed here - confirm this is intended.
                menr[result_name] = DEM(dem_db = dem_db,
                               region_sets = regions_np,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               promoter_space = promoter_space,
                               motif_annotation = annotation,
                               motif_similarity_fdr = motif_similarity_fdr,
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
                rss['DEM_np'] = _current_rss()
                elapsed['DEM_np'] = time.time() - t_start
                out_folder = os.path.join(save_path, result_name)
                _ensure_folder(out_folder)
                for x in menr[result_name].motif_enrichment.keys():
                    _write_html(menr[result_name].motif_enrichment[str(x)],
                                os.path.join(out_folder, str(x) +'.html'))
                if save_partial:
                    # Save the no-promoter result itself (the '_All' object was
                    # previously written to this file by mistake).
                    _write_pickle(menr[result_name], os.path.join(save_path, result_name+'.pkl'))

        # Benchmark tables for this key. File names are plain concatenations
        # of save_path, key and a suffix.
        df = pd.DataFrame([elapsed[s] for s in step_labels], index=step_labels, columns=['Time'])
        df.to_csv(save_path+key+'running_times.tsv', sep='\t')
        df = pd.DataFrame([rss[s] for s in step_labels], index=step_labels, columns=['Memory'])
        df.to_csv(save_path+key+'cummulative_memory.tsv', sep='\t')
        # Memory per step: difference to the previous step that actually ran.
        per_step = []
        previous = None
        for s in step_labels:
            value = rss[s]
            if pd.isna(value):
                per_step.append(value)
                continue
            per_step.append(value if previous is None else value - previous)
            previous = value
        df = pd.DataFrame(per_step, index=step_labels, columns=['Memory'])
        df.to_csv(save_path+key+'memory_per_step.tsv', sep='\t')

    log.info('Saving object')
    _write_pickle(menr, os.path.join(save_path,'menr.pkl'))

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver script: load the binarized topics and DARs of this simulation, turn
# them into PyRanges region sets and run the pycistarget benchmark on them.
import os
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

# Load region binarized topics
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_1K_cells/'
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets: one PyRanges per topic / per DAR group, built from the
# region names stored in the index of each table.
region_sets = {}
region_sets['Topics'] = {key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist())) for key in binarized_topic_region.keys()}
# DAR group names are sanitised: every run of non-alphanumeric characters becomes '_'.
region_sets['DARs'] = {re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist())) for key in DARs_dict.keys()}
# Create save_path (no error when it already exists)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM batch script that runs DPCL_cisTopicObject_10K_fragments_1K_cells.py
# inside the scenicplus Singularity image.

# Scheduler target (partition / cluster / account) and e-mail notifications.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Requested resources: 20 CPUs for the single task, 3 hours, 180G of memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_10K_fragments_1K_cells
#SBATCH --output=DPCL_cisTopicObject_10K_fragments_1K_cells.out
#SBATCH --error=DPCL_cisTopicObject_10K_fragments_1K_cells.err

# Execute the Python driver in the container; -B lists the host directories
# made available inside it.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_10K_fragments_1K_cells.py

# DPCL_cisTopicObject_10K_fragments_10K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and/or DEM on every group of region sets and record timing and memory.

    For each key of `region_sets` the function optionally runs cisTarget
    (`ctx_db_path` given) and DEM (`dem_db_path` given), each optionally a
    second time on the regions that do not overlap a promoter. Results are
    exported as HTML tables, optionally pickled per step, and finally pickled
    together as `menr.pkl` inside `save_path`. Per key, three TSV files with
    wall-clock time, resident memory after each step and the memory difference
    between consecutive measured steps are written.

    Parameters
    ----------
    region_sets
        Dictionary of groups; each value is itself a dictionary mapping a
        region-set name to a PyRanges object.
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (annotation is queried through BioMart) or 'custom'
        (annotation is taken from `custom_annot`).
    save_path
        Output directory; created when missing. The statistics file names are
        built as `save_path + key + suffix`, so they only end up inside the
        directory when `save_path` ends with a path separator.
    custom_annot
        Annotation used when `species == 'custom'`; the columns 'Chromosome'
        and 'Start' are read from it.
    save_partial
        Also pickle the result of every individual step with dill.
    ctx_db_path, dem_db_path
        Paths of the cisTarget / DEM databases. A step is skipped when its
        path is None.
    run_without_promoters
        When exactly True, repeat each enabled step on regions without
        promoter overlap.
    biomart_host
        Host passed to `pbm.Dataset`.
    promoter_space
        Number of bases subtracted from / added to each TSS to define the
        promoter interval.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM`.
    annotation, motif_similarity_fdr, path_to_motif_annotations, annotation_version
        Motif annotation settings forwarded to both `run_cistarget` and `DEM`.
    n_cpu, _temp_dir
        Forwarded to both `run_cistarget` and `DEM`.
    exclude_motifs
        Path of a header-less CSV whose first column lists the index labels
        dropped from the database tables.
    exclude_collection
        Patterns; database rows whose index contains one of them are removed.
    **kwargs
        Forwarded unchanged to `run_cistarget` and `DEM`.

    Raises
    ------
    TypeError
        If `species` is not one of the recognised values.
    """
    import time

    # Create logger
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        handlers=[logging.StreamHandler(stream=sys.stdout)])
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()

    step_labels = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    needs_filtering = exclude_motifs is not None or exclude_collection is not None

    def _ensure_folder(path, created_prefix="Created folder : "):
        # Create `path` when missing and log which of the two cases happened.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info(created_prefix + path)
        else:
            log.info(path + " folder already exists.")

    def _write_html(table, out_file):
        # Export one table; the context manager guarantees the handle is closed.
        with open(out_file, 'w') as handle:
            table.to_html(handle, escape=False, col_space=80)

    def _write_pickle(obj, out_file):
        # Serialise `obj` with dill using the highest available protocol.
        with open(out_file, 'wb') as handle:
            dill.dump(obj, handle, protocol=-1)

    def _current_rss():
        # Resident set size of this process: bytes / (1024 * 1024) / 1000.
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def _filter_db(frame):
        # Drop the motifs listed in `exclude_motifs` and the rows whose index
        # matches any entry of `exclude_collection`.
        if exclude_motifs is not None:
            out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
            frame = frame.drop(out)
        if exclude_collection is not None:
            for col in exclude_collection:
                frame = frame[~frame.index.str.contains(col)]
        return frame

    def _drop_promoters(regions, db):
        # Keep only regions with no promoter overlap and re-map them onto the
        # regions already present in `db` (db.regions_to_db is replaced).
        overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
        return regions_np

    _ensure_folder(save_path)

    def get_species_annotation(species: str):
        # Retrieve TSS records from BioMart and derive promoter intervals.
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        # Discard records whose chromosome name contains one of these tags.
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        # NOTE(review): Series.replace without regex=True compares whole values,
        # so this presumably does not add the prefix; the check further down
        # adds 'chr' when it is absent - confirm before changing.
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem

    # Prepare annotation
    biomart_datasets = {
        'homo_sapiens': 'hsapiens_gene_ensembl',
        'mus_musculus': 'mmusculus_gene_ensembl',
        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
        'gallus_gallus': 'ggallus_gene_ensembl',
    }
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    menr = {}
    for key in region_sets.keys():
        # Steps that do not run keep NaN, so the statistics tables can always
        # be written instead of failing on names that were never assigned.
        elapsed = dict.fromkeys(step_labels, float('nan'))
        rss = dict.fromkeys(step_labels, float('nan'))

        ## CISTARGET
        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            if needs_filtering:
                ctx_db.db_rankings = _filter_db(ctx_db.db_rankings)
            ## DEFAULT
            result_name = 'CTX_'+key+'_All'
            t_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[result_name] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
            rss['CTX_all'] = _current_rss()
            elapsed['CTX_all'] = time.time() - t_start
            out_folder = os.path.join(save_path, result_name)
            _ensure_folder(out_folder)
            for x in menr[result_name].keys():
                _write_html(menr[result_name][str(x)].motif_enrichment,
                            os.path.join(out_folder, str(x) +'.html'))
            if save_partial:
                _write_pickle(menr[result_name], os.path.join(save_path, result_name + '.pkl'))

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                result_name = 'CTX_'+key+'_No_promoters'
                regions_np = _drop_promoters(regions, ctx_db)
                t_start = time.time()
                menr[result_name] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions_np,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
                rss['CTX_np'] = _current_rss()
                elapsed['CTX_np'] = time.time() - t_start
                out_folder = os.path.join(save_path, result_name)
                _ensure_folder(out_folder, created_prefix="Created folder:")
                for x in menr[result_name].keys():
                    _write_html(menr[result_name][str(x)].motif_enrichment,
                                os.path.join(out_folder, str(x) +'.html'))
                if save_partial:
                    _write_pickle(menr[result_name], os.path.join(save_path, result_name + '.pkl'))

        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)
            if needs_filtering:
                dem_db.db_scores = _filter_db(dem_db.db_scores)
            result_name = 'DEM_'+key+'_All'
            t_start = time.time()
            menr[result_name] = DEM(dem_db = dem_db,
                               region_sets = regions,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               genome_annotation = annot_dem,
                               promoter_space = promoter_space,
                               motif_annotation =   annotation,
                               motif_similarity_fdr = motif_similarity_fdr,
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
            rss['DEM_all'] = _current_rss()
            elapsed['DEM_all'] = time.time() - t_start
            out_folder = os.path.join(save_path, result_name)
            _ensure_folder(out_folder)
            for x in menr[result_name].motif_enrichment.keys():
                _write_html(menr[result_name].motif_enrichment[str(x)],
                            os.path.join(out_folder, str(x) +'.html'))
            if save_partial:
                _write_pickle(menr[result_name], os.path.join(save_path, result_name+'.pkl'))

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                result_name = 'DEM_'+key+'_No_promoters'
                regions_np = _drop_promoters(regions, dem_db)
                t_start = time.time()
                # NOTE(review): unlike the run on all regions, genome_annotation
                # is not passed here - confirm this is intended.
                menr[result_name] = DEM(dem_db = dem_db,
                               region_sets = regions_np,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               promoter_space = promoter_space,
                               motif_annotation = annotation,
                               motif_similarity_fdr = motif_similarity_fdr,
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
                rss['DEM_np'] = _current_rss()
                elapsed['DEM_np'] = time.time() - t_start
                out_folder = os.path.join(save_path, result_name)
                _ensure_folder(out_folder)
                for x in menr[result_name].motif_enrichment.keys():
                    _write_html(menr[result_name].motif_enrichment[str(x)],
                                os.path.join(out_folder, str(x) +'.html'))
                if save_partial:
                    # Save the no-promoter result itself (the '_All' object was
                    # previously written to this file by mistake).
                    _write_pickle(menr[result_name], os.path.join(save_path, result_name+'.pkl'))

        # Benchmark tables for this key. File names are plain concatenations
        # of save_path, key and a suffix.
        df = pd.DataFrame([elapsed[s] for s in step_labels], index=step_labels, columns=['Time'])
        df.to_csv(save_path+key+'running_times.tsv', sep='\t')
        df = pd.DataFrame([rss[s] for s in step_labels], index=step_labels, columns=['Memory'])
        df.to_csv(save_path+key+'cummulative_memory.tsv', sep='\t')
        # Memory per step: difference to the previous step that actually ran.
        per_step = []
        previous = None
        for s in step_labels:
            value = rss[s]
            if pd.isna(value):
                per_step.append(value)
                continue
            per_step.append(value if previous is None else value - previous)
            previous = value
        df = pd.DataFrame(per_step, index=step_labels, columns=['Memory'])
        df.to_csv(save_path+key+'memory_per_step.tsv', sep='\t')

    log.info('Saving object')
    _write_pickle(menr, os.path.join(save_path,'menr.pkl'))

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver script: load the binarized topics and DARs of this simulation and
# turn them into PyRanges region sets for the pycistarget benchmark.
import os
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

# Load region binarized topics
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_10K_cells/'
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets: one PyRanges per topic / per DAR group, built from the
# region names stored in the index of each table.
region_sets = {}
region_sets['Topics'] = {key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist())) for key in binarized_topic_region.keys()}
# DAR group names are sanitised: every run of non-alphanumeric characters becomes '_'.
region_sets['DARs'] = {re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist())) for key in DARs_dict.keys()}
# Create save_path (no error when it already exists)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM submission script for the 10K-fragments / 10K-cells benchmark:
# runs DPCL_cisTopicObject_10K_fragments_10K_cells.py inside the scenicplus
# Singularity image.

# Scheduling target, accounting and e-mail notifications; one task.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs, 3 h wall time, 180G of memory.
# NOTE(review): the Python driver in this notebook passes n_cpu = 8 to
# run_pycistarget - confirm that the 20 requested CPUs are intended.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and stdout/stderr files, named after the simulated data set.
#SBATCH --job-name=DPCL_cisTopicObject_10K_fragments_10K_cells
#SBATCH --output=DPCL_cisTopicObject_10K_fragments_10K_cells.out
#SBATCH --error=DPCL_cisTopicObject_10K_fragments_10K_cells.err

# -B bind-mounts the listed host paths into the container so the script can
# reach its input, output and scratch locations.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_10K_fragments_10K_cells.py

# DPCL_cisTopicObject_10K_fragments_25K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and/or DEM on every group of region sets and benchmark each step.

    For each key of `region_sets` the wrapper optionally runs cisTarget
    (when `ctx_db_path` is given) and DEM (when `dem_db_path` is given), each
    on all regions and, when `run_without_promoters is True`, again on the
    regions that do not overlap a promoter. Wall-clock time and the resident
    memory of the current process are recorded after every step.

    Parameters
    ----------
    region_sets
        Mapping from a group name to a mapping of region-set name -> PyRanges
        (the body indexes it two levels deep).
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (annotation queried from BioMart) or 'custom'
        (annotation taken from `custom_annot`).
    save_path
        Output folder; created when missing.
    custom_annot
        Annotation used when `species == 'custom'`; must provide the
        'Chromosome' and 'Start' columns read here.
    save_partial
        When truthy, every individual result is also pickled to
        `<save_path>/<result name>.pkl`.
    ctx_db_path, dem_db_path
        Paths handed to `cisTargetDatabase` / `DEMDatabase`; `None` skips the
        corresponding method.
    run_without_promoters
        Must be exactly `True` to enable the promoter-free runs.
    biomart_host
        Host passed to `pybiomart.Dataset`.
    promoter_space
        Number of bases added on each side of a TSS to define a promoter.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM`.
    annotation, motif_similarity_fdr, path_to_motif_annotations,
    annotation_version, n_cpu, _temp_dir
        Forwarded to both `run_cistarget` and `DEM`.
    exclude_motifs
        Path to a headerless CSV whose first column lists motifs to drop from
        the databases.
    exclude_collection
        Substrings; database rows whose index contains one of them are dropped.
    **kwargs
        Forwarded to `run_cistarget` and `DEM`.

    Returns
    -------
    None
        Results are written below `save_path`: one HTML file per region set
        in `<method>_<key>_<All|No_promoters>/`, `menr.pkl` with all results,
        and per key `<key>running_times.tsv`, `<key>cummulative_memory.tsv`
        and `<key>memory_per_step.tsv`. Steps that did not run are reported
        as NaN in those tables.

    Raises
    ------
    TypeError
        If `species` is not one of the recognised values.
    """
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()

    # Row labels of the benchmark tables, in execution order.
    steps = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']

    def ensure_folder(path, created_msg="Created folder : "):
        # Create `path` when missing and log which of the two cases applied.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info(created_msg + path)
        else:
            log.info(path + " folder already exists.")

    def write_html(table, out_folder, name):
        # Write one enrichment table to <out_folder>/<name>.html, closing the file.
        out_file = os.path.join(out_folder, str(name) + '.html')
        with open(out_file, 'w') as handle:
            table.to_html(handle, escape=False, col_space=80)

    def dump_partial(name):
        # Pickle the result stored under `name` to <save_path>/<name>.pkl.
        with open(os.path.join(save_path, name + '.pkl'), 'wb') as f:
            dill.dump(menr[name], f, protocol=-1)

    def current_rss():
        # Resident set size of this process: bytes / (1024 * 1024) / 1000.
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def drop_promoters(regions, db):
        # Return the regions without promoter overlap and re-map `db` to them.
        overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']]
                      for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4)
                            for x in regions_np.keys()}
        return regions_np

    ensure_folder(save_path)

    def get_species_annotation(species: str):
        # Query TSS positions from BioMart and return (promoter PyRanges, TSS table).
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        # NOTE(review): Series.replace is called without regex=True, so this
        # presumably matches whole values only and leaves the names unchanged;
        # the 'chr' prefix is added further down. Confirm before altering.
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        check = region_sets[list(region_sets.keys())[0]]
        if not any('chr' in c for c in check[list(check.keys())[0]].df['Chromosome']):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any('chr' in x for x in annot.Chromosome):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem

    # Prepare annotation
    ensembl_datasets = {
        'homo_sapiens': 'hsapiens_gene_ensembl',
        'mus_musculus': 'mmusculus_gene_ensembl',
        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
        'gallus_gallus': 'ggallus_gene_ensembl',
    }
    if species in ensembl_datasets:
        annot, annot_dem = get_species_annotation(ensembl_datasets[species])
    elif species == 'custom':
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    # Arguments shared by the "all regions" and "no promoters" runs.
    ctx_args = dict(specie = species,
                    auc_threshold = ctx_auc_threshold,
                    nes_threshold = ctx_nes_threshold,
                    rank_threshold = ctx_rank_threshold,
                    annotation = annotation,
                    motif_similarity_fdr = motif_similarity_fdr,
                    path_to_motif_annotations = path_to_motif_annotations,
                    n_cpu = n_cpu,
                    _temp_dir = _temp_dir,
                    annotation_version = annotation_version)
    dem_args = dict(log2fc_thr = dem_log2fc_thr,
                    motif_hit_thr = dem_motif_hit_thr,
                    max_bg_regions = dem_max_bg_regions,
                    specie = species,
                    promoter_space = promoter_space,
                    motif_annotation = annotation,
                    motif_similarity_fdr = motif_similarity_fdr,
                    path_to_motif_annotations = path_to_motif_annotations,
                    n_cpu = n_cpu,
                    annotation_version = annotation_version,
                    tmp_dir = save_path,
                    _temp_dir = _temp_dir)

    menr = {}
    for key in region_sets.keys():
        # Reset the measurements for every key: a step that is skipped must
        # show up as NaN instead of raising NameError or silently reusing
        # the value measured for the previous key.
        elapsed = dict.fromkeys(steps, float('nan'))
        rss = dict.fromkeys(steps, float('nan'))

        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            ## CISTARGET
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                ctx_db.db_rankings = ctx_db.db_rankings.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    ctx_db.db_rankings = ctx_db.db_rankings[~ctx_db.db_rankings.index.str.contains(col)]
            ## DEFAULT
            name = 'CTX_'+key+'_All'
            t1_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[name] = run_cistarget(ctx_db = ctx_db,
                                       region_sets = regions,
                                       **ctx_args,
                                       **kwargs)
            rss['CTX_all'] = current_rss()
            elapsed['CTX_all'] = time.time() - t1_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].keys():
                write_html(menr[name][str(x)].motif_enrichment, out_folder, x)
            if save_partial:
                dump_partial(name)

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                name = 'CTX_'+key+'_No_promoters'
                regions_np = drop_promoters(regions, ctx_db)
                t1_start = time.time()
                menr[name] = run_cistarget(ctx_db = ctx_db,
                                           region_sets = regions_np,
                                           **ctx_args,
                                           **kwargs)
                rss['CTX_np'] = current_rss()
                elapsed['CTX_np'] = time.time() - t1_start
                out_folder = os.path.join(save_path, name)
                # This section has always logged the message without spaces
                # around the colon; kept as is.
                ensure_folder(out_folder, created_msg="Created folder:")
                for x in menr[name].keys():
                    write_html(menr[name][str(x)].motif_enrichment, out_folder, x)
                if save_partial:
                    dump_partial(name)

        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                dem_db.db_scores = dem_db.db_scores.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    dem_db.db_scores = dem_db.db_scores[~dem_db.db_scores.index.str.contains(col)]
            name = 'DEM_'+key+'_All'
            t1_start = time.time()
            menr[name] = DEM(dem_db = dem_db,
                             region_sets = regions,
                             genome_annotation = annot_dem,
                             **dem_args,
                             **kwargs)
            rss['DEM_all'] = current_rss()
            elapsed['DEM_all'] = time.time() - t1_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].motif_enrichment.keys():
                write_html(menr[name].motif_enrichment[str(x)], out_folder, x)
            if save_partial:
                dump_partial(name)

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                name = 'DEM_'+key+'_No_promoters'
                regions_np = drop_promoters(regions, dem_db)
                t1_start = time.time()
                # As before, genome_annotation is not passed for this run.
                menr[name] = DEM(dem_db = dem_db,
                                 region_sets = regions_np,
                                 **dem_args,
                                 **kwargs)
                rss['DEM_np'] = current_rss()
                elapsed['DEM_np'] = time.time() - t1_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder)
                for x in menr[name].motif_enrichment.keys():
                    write_html(menr[name].motif_enrichment[str(x)], out_folder, x)
                if save_partial:
                    # Fixed: the '_All' result used to be written to this file.
                    dump_partial(name)

        # Benchmark tables for this key. os.path.join keeps the files inside
        # save_path whether or not it ends with a path separator.
        df = pd.DataFrame([elapsed[s] for s in steps], index=steps, columns=['Time'])
        df.to_csv(os.path.join(save_path, key+'running_times.tsv'), sep='\t')
        df = pd.DataFrame([rss[s] for s in steps], index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'cummulative_memory.tsv'), sep='\t')
        # Difference with the preceding step; NaN when either side was skipped.
        per_step = [rss['CTX_all'],
                    rss['CTX_np'] - rss['CTX_all'],
                    rss['DEM_all'] - rss['CTX_np'],
                    rss['DEM_np'] - rss['DEM_all']]
        df = pd.DataFrame(per_step, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver for the 10K-fragments / 25K-cells simulation: load the binarized
# topics and DARs pickled under outDir, turn them into PyRanges region sets
# and run the run_pycistarget wrapper defined above on them.
import os
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/medium_simulation/DPCL_cisTopicObject_10K_fragments_25K_cells/'

# Load region binarized topics (context manager closes the handle even if
# unpickling fails)
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)

# Format region sets: one PyRanges per topic / per DAR contrast, built from
# the region names stored in the index of each table.
region_sets = {}
region_sets['Topics'] = {
    key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist()))
    for key in binarized_topic_region.keys()
}
# DAR names are sanitised (runs of non-alphanumeric characters -> '_') because
# they end up in output file names.
region_sets['DARs'] = {
    re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist()))
    for key in DARs_dict.keys()
}

# Create save_path; exist_ok avoids the check-then-create race
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM submission script for the 10K-fragments / 25K-cells benchmark:
# runs DPCL_cisTopicObject_10K_fragments_25K_cells.py inside the scenicplus
# Singularity image.

# Scheduling target, accounting and e-mail notifications; one task.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs, 3 h wall time, 180G of memory.
# NOTE(review): the Python driver in this notebook passes n_cpu = 8 to
# run_pycistarget - confirm that the 20 requested CPUs are intended.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and stdout/stderr files, named after the simulated data set.
#SBATCH --job-name=DPCL_cisTopicObject_10K_fragments_25K_cells
#SBATCH --output=DPCL_cisTopicObject_10K_fragments_25K_cells.out
#SBATCH --error=DPCL_cisTopicObject_10K_fragments_25K_cells.err

# -B bind-mounts the listed host paths into the container so the script can
# reach its input, output and scratch locations.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_10K_fragments_25K_cells.py

# DPCL_cisTopicObject_20K_fragments_80_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')
    
    import time
    start_time = time.time()
    
    check_folder = os.path.isdir(save_path)
    if not check_folder:
        os.makedirs(save_path)
        log.info("Created folder : " + save_path)
    else:
        log.info(save_path + " folder already exists.")
        
    def get_species_annotation(species: str):
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem
        
    # Prepare annotation
    if species == 'homo_sapiens':
        annot, annot_dem = get_species_annotation('hsapiens_gene_ensembl')
    elif species == 'mus_musculus':
        annot, annot_dem = get_species_annotation('mmusculus_gene_ensembl')
    elif species == 'drosophila_melanogaster':
        annot, annot_dem = get_species_annotation('dmelanogaster_gene_ensembl')
    elif species == 'gallus_gallus':
        annot, annot_dem = get_species_annotation('ggallus_gene_ensembl')
    elif species == 'custom':
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    menr = {}
    for key in region_sets.keys():
        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            ## CISTARGET
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)  
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                ctx_db.db_rankings = ctx_db.db_rankings.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    ctx_db.db_rankings = ctx_db.db_rankings[~ctx_db.db_rankings.index.str.contains(col)]
            ## DEFAULT
            import time
            t1_start = time.time()
            log.info('Running cisTarget for '+key)
            menr['CTX_'+key+'_All'] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
            m1=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_ctx_all = t1_stop-t1_start
            out_folder = os.path.join(save_path,'CTX_'+key+'_All')
            check_folder = os.path.isdir(out_folder)
            if not check_folder:
                os.makedirs(out_folder)
                log.info("Created folder : " + out_folder)
            else:
                log.info(out_folder + " folder already exists.")
            for x in menr['CTX_'+key+'_All'].keys():
                out_file = os.path.join(out_folder, str(x) +'.html')
                menr['CTX_'+key+'_All'][str(x)].motif_enrichment.to_html(open(out_file, 'w'), escape=False, col_space=80)
            if(save_partial):
                with open(os.path.join(save_path,'CTX_'+key+'_All' + '.pkl'), 'wb') as f:
                    dill.dump(menr['CTX_'+key+'_All'], f, protocol=-1)

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                regions_overlaps = {key: regions[key].count_overlaps(annot) for key in regions.keys()}
                regions_np = {key: regions_overlaps[key][regions_overlaps[key].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for key in regions.keys()}
                db_regions = set(pd.concat([ctx_db.regions_to_db[x] for x in ctx_db.regions_to_db.keys()])['Query'])
                ctx_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
                t1_start = time.time()
                menr['CTX_'+key+'_No_promoters'] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions_np,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr, 
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
                m2=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
                t1_stop = time.time()
                time_ctx_np = t1_stop-t1_start
                out_folder = os.path.join(save_path,'CTX_'+key+'_No_promoters')
                check_folder = os.path.isdir(out_folder)
                if not check_folder:
                    os.makedirs(out_folder)
                    log.info("Created folder:" + out_folder)
                else:
                    log.info(out_folder + " folder already exists.")
                for x in menr['CTX_'+key+'_No_promoters'].keys():
                    out_file = os.path.join(out_folder, str(x) +'.html')
                    menr['CTX_'+key+'_No_promoters'][str(x)].motif_enrichment.to_html(open(out_file, 'w'), escape=False, col_space=80)
                
                if(save_partial):
                    with open(os.path.join(save_path,'CTX_'+key+'_No_promoters' + '.pkl'), 'wb') as f:
                      dill.dump(menr['CTX_'+key+'_No_promoters'], f, protocol=-1)
        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)  
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                dem_db.db_scores = dem_db.db_scores.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    dem_db.db_scores = dem_db.db_scores[~dem_db.db_scores.index.str.contains(col)]
            t1_start = time.time()
            menr['DEM_'+key+'_All'] = DEM(dem_db = dem_db,
                               region_sets = regions,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               genome_annotation = annot_dem,
                               promoter_space = promoter_space,
                               motif_annotation =   annotation,
                               motif_similarity_fdr = motif_similarity_fdr, 
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
            m3=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
            t1_stop = time.time()
            time_dem_all = t1_stop-t1_start
            out_folder = os.path.join(save_path,'DEM_'+key+'_All')
            check_folder = os.path.isdir(out_folder)
            if not check_folder:
                os.makedirs(out_folder)
                log.info("Created folder : "+ out_folder)
            else:
                log.info(out_folder + " folder already exists.")
            for x in menr['DEM_'+key+'_All'].motif_enrichment.keys():
                out_file = os.path.join(out_folder, str(x) +'.html')
                menr['DEM_'+key+'_All'].motif_enrichment[str(x)].to_html(open(out_file, 'w'), escape=False, col_space=80)
            if(save_partial):
                with open(os.path.join(save_path, 'DEM_'+key+'_All'+'.pkl'), 'wb') as f:
                  dill.dump(menr['DEM_'+key+'_All'], f, protocol=-1)
                
            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                regions_overlaps = {key: regions[key].count_overlaps(annot) for key in regions.keys()}
                regions_np = {key: regions_overlaps[key][regions_overlaps[key].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for key in regions.keys()}
                db_regions = set(pd.concat([dem_db.regions_to_db[x] for x in dem_db.regions_to_db.keys()])['Query'])
                dem_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
                t1_start = time.time()
                menr['DEM_'+key+'_No_promoters'] = DEM(dem_db = dem_db,
                               region_sets = regions_np,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               promoter_space = promoter_space,
                               motif_annotation = annotation,
                               motif_similarity_fdr = motif_similarity_fdr, 
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
                m4=psutil.Process().memory_info().rss / (1024 * 1024) / 1000
                t1_stop = time.time()
                time_dem_np = t1_stop-t1_start
                out_folder = os.path.join(save_path,'DEM_'+key+'_No_promoters')
                check_folder = os.path.isdir(out_folder)
                if not check_folder:
                    os.makedirs(out_folder)
                    log.info("Created folder : "+ out_folder)
                else:
                    log.info(out_folder + " folder already exists.")
                for x in menr['DEM_'+key+'_No_promoters'].motif_enrichment.keys():
                    out_file = os.path.join(out_folder, str(x) +'.html')
                    menr['DEM_'+key+'_No_promoters'].motif_enrichment[str(x)].to_html(open(out_file, 'w'), escape=False, col_space=80)
                if(save_partial):
                    with open(os.path.join(save_path, 'DEM_'+key+'_No_promoters'+'.pkl'), 'wb') as f:
                      dill.dump(menr['DEM_'+key+'_All'], f, protocol=-1)
                    
        times = [time_ctx_all, time_ctx_np, time_dem_all, time_dem_np]
        df = pd.DataFrame(times, index=['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np'], columns=['Time'])
        df.to_csv(save_path+key+'running_times.tsv', sep='\t') 
        memory=[m1,m2,m3,m4]
        df = pd.DataFrame(memory, index=['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np'], columns=['Memory'])
        df.to_csv(save_path+key+'cummulative_memory.tsv', sep='\t')  
        memory=[m1,m2-m1,m3-m2,m4-m3]
        df = pd.DataFrame(memory, index=['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np'], columns=['Memory'])
        df.to_csv(save_path+key+'memory_per_step.tsv', sep='\t')  
                    
    log.info('Saving object')         
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)
    
    import time
    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60)) 
            
# Driver script: load the binarized topics and DARs for this simulation,
# convert them to PyRanges region sets and run the pycistarget wrapper.
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_80_cells/'

# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were produced by this pipeline.
# Load region binarized topics (the context manager closes the file even if
# unpickling fails)
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)

# Format region sets: one PyRanges per topic / per DAR group, built from the
# region names found in each table's index
region_sets = {}
region_sets['Topics'] = {key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist())) for key in binarized_topic_region.keys()}
# DAR group names are sanitised: every run of non-alphanumeric characters becomes '_'
region_sets['DARs'] = {re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist())) for key in DARs_dict.keys()}

# Create save_path (no error if it already exists)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM batch script that runs DPCL_cisTopicObject_20K_fragments_80_cells.py
# inside the scenicplus Singularity image.

# Scheduling target, accounting and e-mail notifications; one task is requested.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs for the task, a 3 hour time limit and 180G of memory.
# NOTE(review): presumably the launched script is the Python cell above, which
# calls run_pycistarget with n_cpu = 8 -- confirm 20 CPUs is intended.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_20K_fragments_80_cells
#SBATCH --output=DPCL_cisTopicObject_20K_fragments_80_cells.out
#SBATCH --error=DPCL_cisTopicObject_20K_fragments_80_cells.err

# -B bind-mounts the listed host directories into the container so the script
# can reach its input/output paths (e.g. /staging, /scratch).
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_20K_fragments_80_cells.py

# DPCL_cisTopicObject_20K_fragments_1K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and/or DEM on every collection in `region_sets`, recording
    the wall time and resident memory of each step.

    For each key of `region_sets` the function optionally runs cisTarget
    (when `ctx_db_path` is given) and DEM (when `dem_db_path` is given), each
    on all regions and, if `run_without_promoters` is True, again on the
    regions that do not overlap a promoter window. Results are written as one
    HTML table per region set, optionally pickled per step, and finally
    pickled together as `menr.pkl`. Per-key timing/memory tables are written
    as TSV files in `save_path`.

    Parameters
    ----------
    region_sets
        Mapping from a collection name to a dict of named PyRanges (the code
        indexes it as `region_sets[key][name]`).
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (annotation fetched from BioMart) or 'custom'
        (annotation taken from `custom_annot`).
    save_path
        Output directory; created when missing.
    custom_annot
        Annotation table used when `species == 'custom'`. Must have
        'Chromosome' and 'Start' columns.
    save_partial
        When True, pickle each step's result to `<save_path>/<step>.pkl`.
    ctx_db_path, dem_db_path
        Paths handed to `cisTargetDatabase` / `DEMDatabase`. A step is skipped
        when its path is None.
    run_without_promoters
        Also run each enabled method on regions without promoter overlap.
    biomart_host
        Host passed to `pybiomart.Dataset`.
    promoter_space
        Half-width added on each side of a TSS to define the promoter window;
        also forwarded to `DEM`.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget` as `auc_threshold`, `nes_threshold` and
        `rank_threshold`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM` as `log2fc_thr`, `motif_hit_thr` and
        `max_bg_regions`.
    annotation, motif_similarity_fdr, path_to_motif_annotations,
    annotation_version, n_cpu, _temp_dir
        Forwarded unchanged to `run_cistarget` and `DEM`. The `annotation`
        default list is never mutated here.
    exclude_motifs
        Path to a header-less CSV; the values in its first column are dropped
        from the database index.
    exclude_collection
        Substrings; database rows whose index contains any of them are removed.
    **kwargs
        Extra keyword arguments forwarded to `run_cistarget` and `DEM`.

    Raises
    ------
    TypeError
        If `species` is not one of the recognised values.
    ValueError
        If `species == 'custom'` and `custom_annot` is None.
    """
    # Function-scope imports so the wrapper does not depend on these names
    # leaking in through the module's star imports.
    import logging
    import os
    import sys
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()
    nan = float('nan')

    def _ensure_folder(path, created_prefix="Created folder : "):
        # Create `path` when missing and log which of the two cases occurred.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info(created_prefix + path)
        else:
            log.info(path + " folder already exists.")

    def _rss():
        # Resident set size of this process: bytes / (1024 * 1024) / 1000.
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def _export_html(table_keys, get_table, folder):
        # Write one HTML file per region set; the handle is closed after each table.
        for x in table_keys:
            out_file = os.path.join(folder, str(x) + '.html')
            with open(out_file, 'w') as handle:
                get_table(str(x)).to_html(handle, escape=False, col_space=80)

    def _dump_partial(result, name):
        # Pickle a single step's result next to the HTML folders.
        with open(os.path.join(save_path, name + '.pkl'), 'wb') as f:
            dill.dump(result, f, protocol=-1)

    def _remove_promoters(regions, db, promoters):
        # Keep only regions with no overlap with `promoters` and remap the
        # database's region lookup to the remaining regions.
        overlaps = {name: regions[name].count_overlaps(promoters) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
        return regions_np

    _ensure_folder(save_path)

    def get_species_annotation(species: str):
        # Download TSS annotation from BioMart and return
        # (promoter windows as PyRanges, protein-coding TSS table for DEM).
        dataset = pbm.Dataset(name=species,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        # Drop entries whose chromosome name contains any of these tokens
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        # NOTE(review): Series.replace without regex=True only replaces values
        # equal to the literal pattern string, so this line appears to have no
        # effect; the 'chr' prefix is added by the check further down instead.
        annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1')
        annot = annot[annot.Transcript_type == 'protein_coding']
        annot = annot.dropna(subset = ['Chromosome', 'Start'])
        # Check if chromosomes have chr
        check = region_sets[list(region_sets.keys())[0]]
        if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]):
            annot.Chromosome = annot.Chromosome.str.replace('chr', '')
        # NOTE(review): when the region sets lack 'chr', the prefix is stripped
        # above and then re-added here; confirm the intended behaviour for
        # genomes without the 'chr' prefix.
        if not any(['chr' in x for x in annot.Chromosome]):
            annot.Chromosome = [f'chr{x}' for x in annot.Chromosome]
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem

    # Prepare annotation
    biomart_datasets = {
        'homo_sapiens': 'hsapiens_gene_ensembl',
        'mus_musculus': 'mmusculus_gene_ensembl',
        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
        'gallus_gallus': 'ggallus_gene_ensembl',
    }
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        if custom_annot is None:
            raise ValueError("custom_annot must be provided when species is 'custom'")
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    menr = {}
    stat_index = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    for key in region_sets.keys():
        # Steps that are skipped for this key are reported as NaN instead of
        # raising NameError or reusing values from a previous key.
        time_ctx_all = time_ctx_np = time_dem_all = time_dem_np = nan
        m1 = m2 = m3 = m4 = nan

        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            ## CISTARGET
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                ctx_db.db_rankings = ctx_db.db_rankings.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    ctx_db.db_rankings = ctx_db.db_rankings[~ctx_db.db_rankings.index.str.contains(col)]
            ## DEFAULT
            step = 'CTX_'+key+'_All'
            t1_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[step] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
            m1 = _rss()
            time_ctx_all = time.time() - t1_start
            out_folder = os.path.join(save_path, step)
            _ensure_folder(out_folder)
            ctx_result = menr[step]
            _export_html(ctx_result.keys(), lambda k, r=ctx_result: r[k].motif_enrichment, out_folder)
            if save_partial:
                _dump_partial(ctx_result, step)

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                step = 'CTX_'+key+'_No_promoters'
                regions_np = _remove_promoters(regions, ctx_db, annot)
                t1_start = time.time()
                menr[step] = run_cistarget(ctx_db = ctx_db,
                                   region_sets = regions_np,
                                   specie = species,
                                   auc_threshold = ctx_auc_threshold,
                                   nes_threshold = ctx_nes_threshold,
                                   rank_threshold = ctx_rank_threshold,
                                   annotation = annotation,
                                   motif_similarity_fdr = motif_similarity_fdr,
                                   path_to_motif_annotations = path_to_motif_annotations,
                                   n_cpu = n_cpu,
                                   _temp_dir= _temp_dir,
                                   annotation_version = annotation_version,
                                   **kwargs)
                m2 = _rss()
                time_ctx_np = time.time() - t1_start
                out_folder = os.path.join(save_path, step)
                # This step's "created" log message has no space before the path.
                _ensure_folder(out_folder, created_prefix="Created folder:")
                ctx_result = menr[step]
                _export_html(ctx_result.keys(), lambda k, r=ctx_result: r[k].motif_enrichment, out_folder)
                if save_partial:
                    _dump_partial(ctx_result, step)

        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)
            if exclude_motifs is not None:
                out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
                dem_db.db_scores = dem_db.db_scores.drop(out)
            if exclude_collection is not None:
                for col in exclude_collection:
                    dem_db.db_scores = dem_db.db_scores[~dem_db.db_scores.index.str.contains(col)]
            step = 'DEM_'+key+'_All'
            t1_start = time.time()
            menr[step] = DEM(dem_db = dem_db,
                               region_sets = regions,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               genome_annotation = annot_dem,
                               promoter_space = promoter_space,
                               motif_annotation =   annotation,
                               motif_similarity_fdr = motif_similarity_fdr,
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
            m3 = _rss()
            time_dem_all = time.time() - t1_start
            out_folder = os.path.join(save_path, step)
            _ensure_folder(out_folder)
            dem_result = menr[step]
            _export_html(dem_result.motif_enrichment.keys(), lambda k, r=dem_result: r.motif_enrichment[k], out_folder)
            if save_partial:
                _dump_partial(dem_result, step)

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                step = 'DEM_'+key+'_No_promoters'
                regions_np = _remove_promoters(regions, dem_db, annot)
                t1_start = time.time()
                menr[step] = DEM(dem_db = dem_db,
                               region_sets = regions_np,
                               log2fc_thr = dem_log2fc_thr,
                               motif_hit_thr = dem_motif_hit_thr,
                               max_bg_regions = dem_max_bg_regions,
                               specie = species,
                               promoter_space = promoter_space,
                               motif_annotation = annotation,
                               motif_similarity_fdr = motif_similarity_fdr,
                               path_to_motif_annotations = path_to_motif_annotations,
                               n_cpu = n_cpu,
                               annotation_version = annotation_version,
                               tmp_dir = save_path,
                               _temp_dir= _temp_dir,
                               **kwargs)
                m4 = _rss()
                time_dem_np = time.time() - t1_start
                out_folder = os.path.join(save_path, step)
                _ensure_folder(out_folder)
                dem_result = menr[step]
                _export_html(dem_result.motif_enrichment.keys(), lambda k, r=dem_result: r.motif_enrichment[k], out_folder)
                if save_partial:
                    # Save the no-promoters result itself (the previous code
                    # pickled the '_All' result under this file name).
                    _dump_partial(dem_result, step)

        # Per-key benchmark tables; os.path.join keeps them inside save_path
        # whether or not save_path ends with a path separator.
        times = [time_ctx_all, time_ctx_np, time_dem_all, time_dem_np]
        df = pd.DataFrame(times, index=stat_index, columns=['Time'])
        df.to_csv(os.path.join(save_path, key+'running_times.tsv'), sep='\t')
        memory=[m1,m2,m3,m4]
        df = pd.DataFrame(memory, index=stat_index, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'cummulative_memory.tsv'), sep='\t')
        # Difference between consecutive memory readings
        memory=[m1,m2-m1,m3-m2,m4-m3]
        df = pd.DataFrame(memory, index=stat_index, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver script: load the binarized topics and DARs for this simulation,
# convert them to PyRanges region sets and run the pycistarget wrapper.
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_1K_cells/'

# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were produced by this pipeline.
# Load region binarized topics (the context manager closes the file even if
# unpickling fails)
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)

# Format region sets: one PyRanges per topic / per DAR group, built from the
# region names found in each table's index
region_sets = {}
region_sets['Topics'] = {key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist())) for key in binarized_topic_region.keys()}
# DAR group names are sanitised: every run of non-alphanumeric characters becomes '_'
region_sets['DARs'] = {re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist())) for key in DARs_dict.keys()}

# Create save_path (no error if it already exists)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM batch script that runs DPCL_cisTopicObject_20K_fragments_1K_cells.py
# inside the scenicplus Singularity image.

# Scheduling target, accounting and e-mail notifications; one task is requested.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources: 20 CPUs for the task, a 3 hour time limit and 180G of memory.
# NOTE(review): presumably the launched script is the Python cell above, which
# calls run_pycistarget with n_cpu = 8 -- confirm 20 CPUs is intended.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_20K_fragments_1K_cells
#SBATCH --output=DPCL_cisTopicObject_20K_fragments_1K_cells.out
#SBATCH --error=DPCL_cisTopicObject_20K_fragments_1K_cells.err

# -B bind-mounts the listed host directories into the container so the script
# can reach its input/output paths (e.g. /staging, /scratch).
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_20K_fragments_1K_cells.py

# DPCL_cisTopicObject_20K_fragments_10K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and DEM motif enrichment on every group of region sets and
    record the wall-clock time and resident memory of each step.

    Parameters
    ---------
    region_sets
        Two-level mapping: group name -> {region set name -> pr.PyRanges}.
        Each group is analysed separately and its results are stored under
        keys prefixed with 'CTX_<group>' / 'DEM_<group>'.
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (TSS annotation is queried from BioMart) or 'custom'
        (annotation taken from `custom_annot`).
    save_path
        Output folder; created when it does not exist.
    custom_annot
        Annotation used when `species == 'custom'`. The 'Chromosome' and
        'Start' columns are read to build the promoter windows.
    save_partial
        When True, every result is pickled to `save_path` right after it is
        computed, in addition to the final 'menr.pkl'.
    ctx_db_path, dem_db_path
        Paths of the cisTarget rankings / DEM scores databases. A step is
        skipped when its path is None.
    run_without_promoters
        When True, each enabled step is repeated on the regions that do not
        overlap any promoter window.
    biomart_host
        BioMart host used to query the annotation.
    promoter_space
        Number of bases added on each side of a TSS to define a promoter.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM`.
    annotation, motif_similarity_fdr, path_to_motif_annotations,
    annotation_version, n_cpu, _temp_dir
        Forwarded to both `run_cistarget` and `DEM`. `annotation` is only
        read, never mutated, so the shared list default is safe.
    exclude_motifs
        Path to a header-less CSV; the entries of its first column are
        dropped from the database index.
    exclude_collection
        Patterns; database rows whose index matches any of them
        (`str.contains`) are removed.
    **kwargs
        Forwarded unchanged to both `run_cistarget` and `DEM`.

    Raises
    ------
    TypeError
        If `species` is not one of the recognized values.
    ValueError
        If `species == 'custom'` but no `custom_annot` is given.

    Notes
    -----
    Written to `save_path`: one folder of HTML tables per executed step, the
    per-group '<group>running_times.tsv', '<group>cummulative_memory.tsv' and
    '<group>memory_per_step.tsv' tables, and the final 'menr.pkl'. Steps that
    were not executed are reported as NaN in the tables.
    """
    # Kept function-local, as in the original, so the wrapper does not depend
    # on which `time` name the module-level star imports leave in scope.
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()

    def ensure_folder(path):
        # Create `path` if needed and log what happened.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info("Created folder : " + path)
        else:
            log.info(path + " folder already exists.")

    def write_html(table, out_file):
        # Write one enrichment table; the handle is closed even on error.
        with open(out_file, 'w') as handle:
            table.to_html(handle, escape=False, col_space=80)

    def save_partial_result(name, result):
        # Pickle an intermediate result under its own name when requested.
        if save_partial:
            with open(os.path.join(save_path, name + '.pkl'), 'wb') as f:
                dill.dump(result, f, protocol=-1)

    def current_memory():
        # Resident set size of this process: bytes -> MiB, divided by 1000.
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def remove_promoters(regions, db):
        # Keep only regions without promoter overlap and re-map them to the
        # regions already present in `db` (updates db.regions_to_db in place).
        overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
        return regions_np

    def filter_motifs(table):
        # Apply exclude_motifs / exclude_collection to a database table.
        if excluded_motifs is not None:
            table = table.drop(excluded_motifs)
        if exclude_collection is not None:
            for col in exclude_collection:
                table = table[~table.index.str.contains(col)]
        return table

    def run_ctx(db, sets):
        # Single place holding the cisTarget parameters shared by both runs.
        return run_cistarget(ctx_db = db,
                             region_sets = sets,
                             specie = species,
                             auc_threshold = ctx_auc_threshold,
                             nes_threshold = ctx_nes_threshold,
                             rank_threshold = ctx_rank_threshold,
                             annotation = annotation,
                             motif_similarity_fdr = motif_similarity_fdr,
                             path_to_motif_annotations = path_to_motif_annotations,
                             n_cpu = n_cpu,
                             _temp_dir= _temp_dir,
                             annotation_version = annotation_version,
                             **kwargs)

    def run_dem(db, sets, **extra):
        # Single place holding the DEM parameters shared by both runs.
        return DEM(dem_db = db,
                   region_sets = sets,
                   log2fc_thr = dem_log2fc_thr,
                   motif_hit_thr = dem_motif_hit_thr,
                   max_bg_regions = dem_max_bg_regions,
                   specie = species,
                   promoter_space = promoter_space,
                   motif_annotation = annotation,
                   motif_similarity_fdr = motif_similarity_fdr,
                   path_to_motif_annotations = path_to_motif_annotations,
                   n_cpu = n_cpu,
                   annotation_version = annotation_version,
                   tmp_dir = save_path,
                   _temp_dir= _temp_dir,
                   **extra,
                   **kwargs)

    def get_species_annotation(dataset_name: str):
        # Query protein-coding TSSs from BioMart and return
        # (promoter windows as PyRanges, TSS table as DataFrame).
        dataset = pbm.Dataset(name=dataset_name,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        annot = annot[annot.Transcript_type == 'protein_coding']
        # .copy() so the column assignments below act on an independent frame.
        annot = annot.dropna(subset = ['Chromosome', 'Start']).copy()
        # Follow the chromosome naming of the first region set. The original
        # stripped 'chr' and then unconditionally re-added it, so regions
        # without the prefix never overlapped any promoter.
        check = region_sets[list(region_sets.keys())[0]]
        first_set = check[list(check.keys())[0]]
        if any('chr' in c for c in first_set.df['Chromosome']):
            if not any('chr' in x for x in annot.Chromosome):
                annot['Chromosome'] = [f'chr{x}' for x in annot.Chromosome]
        else:
            annot['Chromosome'] = annot['Chromosome'].str.replace('chr', '')
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem

    ensure_folder(save_path)

    # Prepare annotation
    biomart_datasets = {'homo_sapiens': 'hsapiens_gene_ensembl',
                        'mus_musculus': 'mmusculus_gene_ensembl',
                        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
                        'gallus_gallus': 'ggallus_gene_ensembl'}
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        if custom_annot is None:
            raise ValueError("custom_annot is required when species is 'custom'")
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    # The exclusion list does not depend on the group: read it once.
    excluded_motifs = None
    if exclude_motifs is not None:
        excluded_motifs = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()

    steps = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    menr = {}
    for key in region_sets.keys():
        # NaN marks steps that are not executed, so the report below never
        # reads an undefined (or stale, from a previous group) measurement.
        times = dict.fromkeys(steps, float('nan'))
        memory = dict.fromkeys(steps, float('nan'))

        ## CISTARGET
        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            ctx_db.db_rankings = filter_motifs(ctx_db.db_rankings)
            ## DEFAULT
            name = 'CTX_'+key+'_All'
            t1_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[name] = run_ctx(ctx_db, regions)
            memory['CTX_all'] = current_memory()
            times['CTX_all'] = time.time()-t1_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].keys():
                write_html(menr[name][str(x)].motif_enrichment, os.path.join(out_folder, str(x) +'.html'))
            save_partial_result(name, menr[name])

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                name = 'CTX_'+key+'_No_promoters'
                # Region filtering is deliberately outside the timed section.
                regions_np = remove_promoters(regions, ctx_db)
                t1_start = time.time()
                menr[name] = run_ctx(ctx_db, regions_np)
                memory['CTX_np'] = current_memory()
                times['CTX_np'] = time.time()-t1_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder)
                for x in menr[name].keys():
                    write_html(menr[name][str(x)].motif_enrichment, os.path.join(out_folder, str(x) +'.html'))
                save_partial_result(name, menr[name])

        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)
            dem_db.db_scores = filter_motifs(dem_db.db_scores)
            name = 'DEM_'+key+'_All'
            t1_start = time.time()
            menr[name] = run_dem(dem_db, regions, genome_annotation = annot_dem)
            memory['DEM_all'] = current_memory()
            times['DEM_all'] = time.time()-t1_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].motif_enrichment.keys():
                write_html(menr[name].motif_enrichment[str(x)], os.path.join(out_folder, str(x) +'.html'))
            save_partial_result(name, menr[name])

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                name = 'DEM_'+key+'_No_promoters'
                regions_np = remove_promoters(regions, dem_db)
                t1_start = time.time()
                # NOTE(review): unlike the run on all regions, no
                # genome_annotation is passed here (as in the original) --
                # confirm this is intended.
                menr[name] = run_dem(dem_db, regions_np)
                memory['DEM_np'] = current_memory()
                times['DEM_np'] = time.time()-t1_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder)
                for x in menr[name].motif_enrichment.keys():
                    write_html(menr[name].motif_enrichment[str(x)], os.path.join(out_folder, str(x) +'.html'))
                # The original pickled the 'DEM_<key>_All' result into this
                # file; save the no-promoters result instead.
                save_partial_result(name, menr[name])

        # Benchmark report for this group. os.path.join keeps the files
        # inside save_path whether or not it ends with a path separator.
        df = pd.DataFrame([times[s] for s in steps], index=steps, columns=['Time'])
        df.to_csv(os.path.join(save_path, key+'running_times.tsv'), sep='\t')
        df = pd.DataFrame([memory[s] for s in steps], index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'cummulative_memory.tsv'), sep='\t')
        # Increase relative to the previous executed step (the first executed
        # step is reported as-is); skipped steps stay NaN.
        per_step = []
        previous = 0.0
        for s in steps:
            value = memory[s]
            if pd.isna(value):
                per_step.append(value)
            else:
                per_step.append(value - previous)
                previous = value
        df = pd.DataFrame(per_step, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver for the 20K fragments / 10K cells benchmark: load the binarized
# topics and DARs, turn them into PyRanges region sets and run pycistarget.
# `os` is imported explicitly instead of relying on the star imports.
import os
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_10K_cells/'
# Load region binarized topics
# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets: group name -> {region set name -> PyRanges}
region_sets = {}
region_sets['Topics'] = {key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist())) for key in binarized_topic_region.keys()}
# DAR names are sanitized so they contain only letters, digits and '_'.
region_sets['DARs'] = {re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist())) for key in DARs_dict.keys()}
# Create save_path (the trailing '/' is kept on purpose)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM submission script: runs DPCL_cisTopicObject_20K_fragments_10K_cells.py
# inside the scenicplus Singularity image.

# Scheduling target, accounting and e-mail notifications.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Resources for the single task: CPUs, wall-time limit and memory.
# NOTE(review): 20 CPUs are requested; the run_pycistarget call in the
# accompanying Python script passes n_cpu = 8 -- confirm the difference is
# intended.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_20K_fragments_10K_cells
#SBATCH --output=DPCL_cisTopicObject_20K_fragments_10K_cells.out
#SBATCH --error=DPCL_cisTopicObject_20K_fragments_10K_cells.err

# -B bind-mounts the listed host directories into the container so the
# script can reach its inputs, outputs and scratch space.
singularity exec -B /lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif python DPCL_cisTopicObject_20K_fragments_10K_cells.py

# DPCL_cisTopicObject_20K_fragments_25K_cells

In [None]:
from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time
import psutil

def run_pycistarget(region_sets: Dict[str, pr.PyRanges],
                 species: str,
                 save_path: str,
                 custom_annot: pd.DataFrame = None,
                 save_partial: bool = False,
                 ctx_db_path: str = None,
                 dem_db_path: str = None,
                 run_without_promoters: bool = False,
                 biomart_host: str = 'http://www.ensembl.org',
                 promoter_space: int = 500,
                 ctx_auc_threshold: float = 0.005,
                 ctx_nes_threshold: float = 3.0,
                 ctx_rank_threshold: float = 0.05,
                 dem_log2fc_thr: float = 0.5,
                 dem_motif_hit_thr: float = 3.0,
                 dem_max_bg_regions: int = 500,
                 annotation : List[str] = ['Direct_annot', 'Orthology_annot'],
                 motif_similarity_fdr: float = 0.000001,
                 path_to_motif_annotations: str = None,
                 annotation_version: str = 'v9',
                 n_cpu : int = 1,
                 _temp_dir: str = None,
                 exclude_motifs: str = None,
                 exclude_collection: List[str] = None,
                 **kwargs):
    """
    Run cisTarget and DEM motif enrichment on every group of region sets and
    record the wall-clock time and resident memory of each step.

    Parameters
    ---------
    region_sets
        Two-level mapping: group name -> {region set name -> pr.PyRanges}.
        Each group is analysed separately and its results are stored under
        keys prefixed with 'CTX_<group>' / 'DEM_<group>'.
    species
        One of 'homo_sapiens', 'mus_musculus', 'drosophila_melanogaster',
        'gallus_gallus' (TSS annotation is queried from BioMart) or 'custom'
        (annotation taken from `custom_annot`).
    save_path
        Output folder; created when it does not exist.
    custom_annot
        Annotation used when `species == 'custom'`. The 'Chromosome' and
        'Start' columns are read to build the promoter windows.
    save_partial
        When True, every result is pickled to `save_path` right after it is
        computed, in addition to the final 'menr.pkl'.
    ctx_db_path, dem_db_path
        Paths of the cisTarget rankings / DEM scores databases. A step is
        skipped when its path is None.
    run_without_promoters
        When True, each enabled step is repeated on the regions that do not
        overlap any promoter window.
    biomart_host
        BioMart host used to query the annotation.
    promoter_space
        Number of bases added on each side of a TSS to define a promoter.
    ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold
        Forwarded to `run_cistarget`.
    dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions
        Forwarded to `DEM`.
    annotation, motif_similarity_fdr, path_to_motif_annotations,
    annotation_version, n_cpu, _temp_dir
        Forwarded to both `run_cistarget` and `DEM`. `annotation` is only
        read, never mutated, so the shared list default is safe.
    exclude_motifs
        Path to a header-less CSV; the entries of its first column are
        dropped from the database index.
    exclude_collection
        Patterns; database rows whose index matches any of them
        (`str.contains`) are removed.
    **kwargs
        Forwarded unchanged to both `run_cistarget` and `DEM`.

    Raises
    ------
    TypeError
        If `species` is not one of the recognized values.
    ValueError
        If `species == 'custom'` but no `custom_annot` is given.

    Notes
    -----
    Written to `save_path`: one folder of HTML tables per executed step, the
    per-group '<group>running_times.tsv', '<group>cummulative_memory.tsv' and
    '<group>memory_per_step.tsv' tables, and the final 'menr.pkl'. Steps that
    were not executed are reported as NaN in the tables.
    """
    # Kept function-local, as in the original, so the wrapper does not depend
    # on which `time` name the module-level star imports leave in scope.
    import time

    # Create logger
    level = logging.INFO
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)
    log = logging.getLogger('pycisTarget_wrapper')

    start_time = time.time()

    def ensure_folder(path):
        # Create `path` if needed and log what happened.
        if not os.path.isdir(path):
            os.makedirs(path)
            log.info("Created folder : " + path)
        else:
            log.info(path + " folder already exists.")

    def write_html(table, out_file):
        # Write one enrichment table; the handle is closed even on error.
        with open(out_file, 'w') as handle:
            table.to_html(handle, escape=False, col_space=80)

    def save_partial_result(name, result):
        # Pickle an intermediate result under its own name when requested.
        if save_partial:
            with open(os.path.join(save_path, name + '.pkl'), 'wb') as f:
                dill.dump(result, f, protocol=-1)

    def current_memory():
        # Resident set size of this process: bytes -> MiB, divided by 1000.
        return psutil.Process().memory_info().rss / (1024 * 1024) / 1000

    def remove_promoters(regions, db):
        # Keep only regions without promoter overlap and re-map them to the
        # regions already present in `db` (updates db.regions_to_db in place).
        overlaps = {name: regions[name].count_overlaps(annot) for name in regions.keys()}
        regions_np = {name: overlaps[name][overlaps[name].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for name in regions.keys()}
        db_regions = set(pd.concat([db.regions_to_db[x] for x in db.regions_to_db.keys()])['Query'])
        db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()}
        return regions_np

    def filter_motifs(table):
        # Apply exclude_motifs / exclude_collection to a database table.
        if excluded_motifs is not None:
            table = table.drop(excluded_motifs)
        if exclude_collection is not None:
            for col in exclude_collection:
                table = table[~table.index.str.contains(col)]
        return table

    def run_ctx(db, sets):
        # Single place holding the cisTarget parameters shared by both runs.
        return run_cistarget(ctx_db = db,
                             region_sets = sets,
                             specie = species,
                             auc_threshold = ctx_auc_threshold,
                             nes_threshold = ctx_nes_threshold,
                             rank_threshold = ctx_rank_threshold,
                             annotation = annotation,
                             motif_similarity_fdr = motif_similarity_fdr,
                             path_to_motif_annotations = path_to_motif_annotations,
                             n_cpu = n_cpu,
                             _temp_dir= _temp_dir,
                             annotation_version = annotation_version,
                             **kwargs)

    def run_dem(db, sets, **extra):
        # Single place holding the DEM parameters shared by both runs.
        return DEM(dem_db = db,
                   region_sets = sets,
                   log2fc_thr = dem_log2fc_thr,
                   motif_hit_thr = dem_motif_hit_thr,
                   max_bg_regions = dem_max_bg_regions,
                   specie = species,
                   promoter_space = promoter_space,
                   motif_annotation = annotation,
                   motif_similarity_fdr = motif_similarity_fdr,
                   path_to_motif_annotations = path_to_motif_annotations,
                   n_cpu = n_cpu,
                   annotation_version = annotation_version,
                   tmp_dir = save_path,
                   _temp_dir= _temp_dir,
                   **extra,
                   **kwargs)

    def get_species_annotation(dataset_name: str):
        # Query protein-coding TSSs from BioMart and return
        # (promoter windows as PyRanges, TSS table as DataFrame).
        dataset = pbm.Dataset(name=dataset_name,  host=biomart_host)
        annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
        annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
        annot['Chromosome'] = annot['Chromosome'].astype('str')
        filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI')
        annot = annot[~filterf]
        annot = annot[annot.Transcript_type == 'protein_coding']
        # .copy() so the column assignments below act on an independent frame.
        annot = annot.dropna(subset = ['Chromosome', 'Start']).copy()
        # Follow the chromosome naming of the first region set. The original
        # stripped 'chr' and then unconditionally re-added it, so regions
        # without the prefix never overlapped any promoter.
        check = region_sets[list(region_sets.keys())[0]]
        first_set = check[list(check.keys())[0]]
        if any('chr' in c for c in first_set.df['Chromosome']):
            if not any('chr' in x for x in annot.Chromosome):
                annot['Chromosome'] = [f'chr{x}' for x in annot.Chromosome]
        else:
            annot['Chromosome'] = annot['Chromosome'].str.replace('chr', '')
        annot_dem=annot.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
        return annot, annot_dem

    ensure_folder(save_path)

    # Prepare annotation
    biomart_datasets = {'homo_sapiens': 'hsapiens_gene_ensembl',
                        'mus_musculus': 'mmusculus_gene_ensembl',
                        'drosophila_melanogaster': 'dmelanogaster_gene_ensembl',
                        'gallus_gallus': 'ggallus_gene_ensembl'}
    if species in biomart_datasets:
        annot, annot_dem = get_species_annotation(biomart_datasets[species])
    elif species == 'custom':
        if custom_annot is None:
            raise ValueError("custom_annot is required when species is 'custom'")
        annot_dem = custom_annot
        annot = annot_dem.copy()
        # Define promoter space
        annot['End'] = annot['Start'].astype(int)+promoter_space
        annot['Start'] = annot['Start'].astype(int)-promoter_space
        annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])
    else:
        raise TypeError("Species not recognized")

    # The exclusion list does not depend on the group: read it once.
    excluded_motifs = None
    if exclude_motifs is not None:
        excluded_motifs = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()

    steps = ['CTX_all', 'CTX_np', 'DEM_all', 'DEM_np']
    menr = {}
    for key in region_sets.keys():
        # NaN marks steps that are not executed, so the report below never
        # reads an undefined (or stale, from a previous group) measurement.
        times = dict.fromkeys(steps, float('nan'))
        memory = dict.fromkeys(steps, float('nan'))

        ## CISTARGET
        if ctx_db_path is not None:
            log.info('Loading cisTarget database for ' + key)
            regions = region_sets[key]
            ctx_db = cisTargetDatabase(ctx_db_path, regions)
            ctx_db.db_rankings = filter_motifs(ctx_db.db_rankings)
            ## DEFAULT
            name = 'CTX_'+key+'_All'
            t1_start = time.time()
            log.info('Running cisTarget for '+key)
            menr[name] = run_ctx(ctx_db, regions)
            memory['CTX_all'] = current_memory()
            times['CTX_all'] = time.time()-t1_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].keys():
                write_html(menr[name][str(x)].motif_enrichment, os.path.join(out_folder, str(x) +'.html'))
            save_partial_result(name, menr[name])

            if run_without_promoters is True:
                ## REMOVE PROMOTERS
                log.info('Running cisTarget without promoters for '+key)
                name = 'CTX_'+key+'_No_promoters'
                # Region filtering is deliberately outside the timed section.
                regions_np = remove_promoters(regions, ctx_db)
                t1_start = time.time()
                menr[name] = run_ctx(ctx_db, regions_np)
                memory['CTX_np'] = current_memory()
                times['CTX_np'] = time.time()-t1_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder)
                for x in menr[name].keys():
                    write_html(menr[name][str(x)].motif_enrichment, os.path.join(out_folder, str(x) +'.html'))
                save_partial_result(name, menr[name])

        ## DEM
        if dem_db_path is not None:
            log.info('Running DEM for '+key)
            regions = region_sets[key]
            dem_db = DEMDatabase(dem_db_path, regions)
            dem_db.db_scores = filter_motifs(dem_db.db_scores)
            name = 'DEM_'+key+'_All'
            t1_start = time.time()
            menr[name] = run_dem(dem_db, regions, genome_annotation = annot_dem)
            memory['DEM_all'] = current_memory()
            times['DEM_all'] = time.time()-t1_start
            out_folder = os.path.join(save_path, name)
            ensure_folder(out_folder)
            for x in menr[name].motif_enrichment.keys():
                write_html(menr[name].motif_enrichment[str(x)], os.path.join(out_folder, str(x) +'.html'))
            save_partial_result(name, menr[name])

            if run_without_promoters is True:
                log.info('Running DEM without promoters for '+key)
                ## REMOVE PROMOTERS
                name = 'DEM_'+key+'_No_promoters'
                regions_np = remove_promoters(regions, dem_db)
                t1_start = time.time()
                # NOTE(review): unlike the run on all regions, no
                # genome_annotation is passed here (as in the original) --
                # confirm this is intended.
                menr[name] = run_dem(dem_db, regions_np)
                memory['DEM_np'] = current_memory()
                times['DEM_np'] = time.time()-t1_start
                out_folder = os.path.join(save_path, name)
                ensure_folder(out_folder)
                for x in menr[name].motif_enrichment.keys():
                    write_html(menr[name].motif_enrichment[str(x)], os.path.join(out_folder, str(x) +'.html'))
                # The original pickled the 'DEM_<key>_All' result into this
                # file; save the no-promoters result instead.
                save_partial_result(name, menr[name])

        # Benchmark report for this group. os.path.join keeps the files
        # inside save_path whether or not it ends with a path separator.
        df = pd.DataFrame([times[s] for s in steps], index=steps, columns=['Time'])
        df.to_csv(os.path.join(save_path, key+'running_times.tsv'), sep='\t')
        df = pd.DataFrame([memory[s] for s in steps], index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'cummulative_memory.tsv'), sep='\t')
        # Increase relative to the previous executed step (the first executed
        # step is reported as-is); skipped steps stay NaN.
        per_step = []
        previous = 0.0
        for s in steps:
            value = memory[s]
            if pd.isna(value):
                per_step.append(value)
            else:
                per_step.append(value - previous)
                previous = value
        df = pd.DataFrame(per_step, index=steps, columns=['Memory'])
        df.to_csv(os.path.join(save_path, key+'memory_per_step.tsv'), sep='\t')

    log.info('Saving object')
    with open(os.path.join(save_path,'menr.pkl'), 'wb') as f:
        dill.dump(menr, f, protocol=-1)

    log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))
            
# Driver for the 20K fragments / 25K cells benchmark: load the binarized
# topics and DARs and turn them into PyRanges region sets.
# `os` is imported explicitly instead of relying on the star imports.
import os
import pickle
import re
import pyranges as pr
from pycistarget.utils import *

outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/speed_benchmark/high_simulation/DPCL_cisTopicObject_20K_fragments_25K_cells/'
# Load region binarized topics
# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
with open(outDir+'binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets: group name -> {region set name -> PyRanges}
region_sets = {}
region_sets['Topics'] = {key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist())) for key in binarized_topic_region.keys()}
# DAR names are sanitized so they contain only letters, digits and '_'.
region_sets['DARs'] = {re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist())) for key in DARs_dict.keys()}
# Create save_path (the trailing '/' is kept on purpose)
save_path = outDir + 'pycistarget/'
os.makedirs(save_path, exist_ok=True)


# Run pycistarget
run_pycistarget(region_sets,
                 ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.rankings.feather',
                 species = 'homo_sapiens',
                 save_path = save_path,
                 dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL_feather_v2.regions_vs_motifs.scores.feather',
                 run_without_promoters = True,
                 biomart_host = 'http://www.ensembl.org',
                 promoter_space = 500,
                 ctx_auc_threshold = 0.005,
                 ctx_nes_threshold = 3.0,
                 ctx_rank_threshold = 0.05,
                 dem_log2fc_thr = 0.5,
                 dem_motif_hit_thr = 3.0,
                 dem_max_bg_regions = 500,
                 path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10nr_clust-nr.mgi-m0.00001-o0.0.tbl',
                 annotation_version = 'v10nr_clust',
                 annotation = ['Direct_annot', 'Orthology_annot'],
                 n_cpu = 8,
                 _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill')

In [None]:
#!/bin/bash
# SLURM batch script: runs DPCL_cisTopicObject_20K_fragments_25K_cells.py with
# the python interpreter of the scenicplus Singularity image.

# Where to run and which account to charge; e-mail notifications on all job events.
#SBATCH --partition=batch
#SBATCH --cluster wice
#SBATCH --account lp_wice_pilot
#SBATCH --mail-type=ALL
#SBATCH --mail-user=carmen.bravogonzalezblas@kuleuven.be
#SBATCH --ntasks=1

# Requested resources: 20 CPUs for the single task, 3 hours wall time, 180G memory.
#SBATCH --cpus-per-task=20
#SBATCH --time=3:00:00
#SBATCH --mem=180G

# Job name and the files receiving stdout / stderr.
#SBATCH --job-name=DPCL_cisTopicObject_20K_fragments_25K_cells
#SBATCH --output=DPCL_cisTopicObject_20K_fragments_25K_cells.out
#SBATCH --error=DPCL_cisTopicObject_20K_fragments_25K_cells.err

# Host directories passed to singularity's -B (bind) option.
bind_paths=/lustre1,/staging,/data,/vsc-hard-mounts,/scratch,/local_scratch
# Container image and the script to execute inside it.
image=/data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif
script=DPCL_cisTopicObject_20K_fragments_25K_cells.py

singularity exec -B "${bind_paths}" "${image}" python "${script}"