## Generate pycisTarget db

In [None]:
#### Build a custom cisTarget motif database for the DPCL consensus peaks:
#### extract FASTA -> score motifs in chunks -> merge chunks -> create rankings.
#### Paths shared by the steps below
peaks_dir='/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/data/MACS_ATAC/iterative/peak_filtering_norm'
motif_dir='/staging/leuven/stg_00002/lcb/cbravo/motif_clustering/RNA_harmony_snn_res_5_clusters/motif_collection_combined_motifs_stamp_and_singlets'
db_dir='/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/pycistarget/dbs'
db_prefix="${db_dir}/DPCL"
n_parts=10
#### Get fasta sequences
module load BEDTools
bedtools getfasta \
    -fi /staging/leuven/stg_00002/lcb/resources/human/hg38/hg38.fa \
    -bed "${peaks_dir}/combined_summits_final.bed" \
    > "${peaks_dir}/combined_summits_final.fa"
#### Activate environment
conda_initialize /staging/leuven/stg_00002/lcb/ghuls/software/miniconda3/
conda activate create_cistarget_databases
#### Set ${create_cistarget_databases_dir} to https://github.com/aertslab/create_cisTarget_databases
create_cistarget_databases_dir='/staging/leuven/stg_00002/lcb/ghuls/software/create_cisTarget_databases'
#### Score the motifs in ${n_parts} chunks
#### Stop at the first failing chunk so that an incomplete set of partial
#### databases is never merged.
scoring_ok=true
for (( current_part = 1; current_part <= n_parts; current_part++ )) ; do
     # -p takes the current part number followed by the total number of parts.
     if ! python3.8 "${create_cistarget_databases_dir}/create_cistarget_motif_databases.py" \
         -f "${peaks_dir}/combined_summits_final.fa" \
         -M "${motif_dir}/singletons/" \
         -m "${motif_dir}/motifs.txt" \
         -p "${current_part}" "${n_parts}" \
         -o "${db_prefix}" \
         -t 20
     then
         echo "Motif scoring failed for part ${current_part}/${n_parts}" >&2
         scoring_ok=false
         break
     fi
done
#### Merge scores
#### The chunks are only removed, and the rankings only created, after a
#### successful merge; otherwise the partial databases are kept for inspection.
if [ "${scoring_ok}" = true ] && \
   "${create_cistarget_databases_dir}/combine_partial_regions_or_genes_vs_motifs_or_tracks_cistarget_dbs.py" \
       -i "${db_prefix}" \
       -o "${db_dir}/"
then
    #### Remove chunks
    rm "${db_prefix}"*part*
    #### Create rankings
    # NOTE(review): -s 555 is presumably the random seed used by the ranking script -- confirm with its --help.
    "${create_cistarget_databases_dir}/convert_motifs_or_tracks_vs_regions_or_genes_scores_to_rankings_cistarget_dbs.py" \
        -i "${db_prefix}.motifs_vs_regions.scores.feather" \
        -s 555
else
    echo "Scoring or merging failed: partial databases were kept and no rankings were created" >&2
fi

## 1. Motif enrichment

In [None]:
# r06i01n13 (presumably the compute node this session was run on -- confirm)
# Start an ipython3 session inside the scenicplus Singularity image, with the
# listed host directories bind-mounted into the container.
singularity exec \
    --bind /lustre1,/staging,/data,/vsc-hard-mounts,/scratch \
    /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif \
    ipython3

In [None]:
# Load the binarized topic regions and the DARs stored under the pycisTopic
# output directory and convert them into the region sets passed to pycistarget.
import pickle
import re
import pyranges as pr
from pycistarget.utils import *


def _to_pyranges(region_names):
    """Build a PyRanges object from a list of region names via region_names_to_coordinates."""
    return pr.PyRanges(region_names_to_coordinates(region_names))


outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/pycisTopic/'
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# Load region binarized topics
# The context manager closes the file even if unpickling raises.
with open(outDir+'topic_binarization/binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs/DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets
region_sets = {}
# One PyRanges per topic, built from the index of each topic's table.
region_sets['Topics'] = {
    key: _to_pyranges(binarized_topic_region[key].index.tolist())
    for key in binarized_topic_region.keys()
}
# Same for the DARs; every run of non-alphanumeric characters in the set name
# is replaced by a single underscore.
region_sets['DARs'] = {
    re.sub('[^A-Za-z0-9]+', '_', key): _to_pyranges(DARs_dict[key].index.tolist())
    for key in DARs_dict.keys()
}

In [None]:
# Run pycistarget on the region sets with the cluster_V10_DPCL databases.
# run_without_promoters = True, will run the methods in all regions + the region sets without promoters
# NOTE(review): the database paths below (ctx_db/cluster_V10_DPCL.*) differ from the
# pycistarget/dbs/DPCL prefix written by the database generation cell -- confirm
# that these are the intended files.
import os
# Import scenicplus after changing into its source checkout (presumably so the
# package is importable without being installed -- confirm).
os.chdir('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/scenicplus/src/')
from scenicplus.wrappers.run_pycistarget import *

# All keyword arguments are collected first so they can be inspected before the run.
pycistarget_params = {
    'ctx_db_path': '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL.regions_vs_motifs.rankings.feather',
    'species': 'homo_sapiens',
    'save_path': '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/pycistarget/cluster_V10_V2/',
    'dem_db_path': '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/ctx_db/cluster_V10_DPCL.regions_vs_motifs.scores.feather',
    'run_without_promoters': True,
    'biomart_host': 'http://www.ensembl.org',
    'promoter_space': 500,
    'ctx_auc_threshold': 0.005,
    'ctx_nes_threshold': 3.0,
    'ctx_rank_threshold': 0.05,
    'dem_log2fc_thr': 0.5,
    'dem_motif_hit_thr': 3.0,
    'dem_max_bg_regions': 500,
    'path_to_motif_annotations': '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10-nr.more_orthology.hgnc-mm0.00001-o0.0_clust.tsv',
    'annotation_version': 'v10nr_clust',
    'annotation': ['Direct_annot', 'Orthology_annot'],
    'n_cpu': 1,
    '_temp_dir': '/scratch/leuven/313/vsc31305/ray_spill',
}
run_pycistarget(region_sets, **pycistarget_params)

## 2. Motif enrichment with the SCREEN database

In [None]:
# r06i01n13 (presumably the compute node this session was run on -- confirm)
# Start an ipython3 session inside the scenicplus Singularity image, with the
# listed host directories bind-mounted into the container.
singularity exec \
    --bind /lustre1,/staging,/data,/vsc-hard-mounts,/scratch \
    /data/leuven/software/biomed/singularity_images/scenicplus/scenicplus.sif \
    ipython3

In [None]:
# Load the binarized topic regions and the DARs stored under the pycisTopic
# output directory and convert them into the region sets passed to pycistarget.
import pickle
import re
import pyranges as pr
from pycistarget.utils import *


def _to_pyranges(region_names):
    """Build a PyRanges object from a list of region names via region_names_to_coordinates."""
    return pr.PyRanges(region_names_to_coordinates(region_names))


outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/pycisTopic/'
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
# Load region binarized topics
# The context manager closes the file even if unpickling raises.
with open(outDir+'topic_binarization/binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs/DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets
region_sets = {}
# One PyRanges per topic, built from the index of each topic's table.
region_sets['Topics'] = {
    key: _to_pyranges(binarized_topic_region[key].index.tolist())
    for key in binarized_topic_region.keys()
}
# Same for the DARs; every run of non-alphanumeric characters in the set name
# is replaced by a single underscore.
region_sets['DARs'] = {
    re.sub('[^A-Za-z0-9]+', '_', key): _to_pyranges(DARs_dict[key].index.tolist())
    for key in DARs_dict.keys()
}

In [None]:
# Run pycistarget on the region sets with the cluster_SCREEN databases.
# run_without_promoters = True, will run the methods in all regions + the region sets without promoters
import os
# Import scenicplus after changing into its source checkout (presumably so the
# package is importable without being installed -- confirm).
os.chdir('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/scenicplus/src/')
from scenicplus.wrappers.run_pycistarget import *

# All keyword arguments are collected first so they can be inspected before the run.
pycistarget_params = {
    'ctx_db_path': '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/hg38_SCREEN_cluster_db/cluster_SCREEN.regions_vs_motifs.rankings.feather',
    'species': 'homo_sapiens',
    'save_path': '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/DPCL/pycistarget/SCREEN_cluster_V10_V2/',
    'dem_db_path': '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/hg38_SCREEN_cluster_db/cluster_SCREEN.regions_vs_motifs.scores.feather',
    'run_without_promoters': True,
    'biomart_host': 'http://www.ensembl.org',
    'promoter_space': 500,
    'ctx_auc_threshold': 0.005,
    'ctx_nes_threshold': 3.0,
    'ctx_rank_threshold': 0.05,
    'dem_log2fc_thr': 0.5,
    'dem_motif_hit_thr': 3.0,
    'dem_max_bg_regions': 500,
    'path_to_motif_annotations': '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10_no_desso_no_factorbook/snapshots/motifs-v10-nr.more_orthology.hgnc-mm0.00001-o0.0_clust.tsv',
    'annotation_version': 'v10nr_clust',
    'annotation': ['Direct_annot', 'Orthology_annot'],
    'n_cpu': 1,
    '_temp_dir': '/scratch/leuven/313/vsc31305/ray_spill',
}
run_pycistarget(region_sets, **pycistarget_params)