### 1. Run cisTarget

In [2]:
# Load region binarized topics
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycisTopic/'
# Context managers close the handle even if unpickling raises
with open(outDir+'topic_binarization/binarized_topic_region_otsu.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)
# Load DARs
with open(outDir+'DARs/DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)
# Format region sets
import re
import pyranges as pr
from pycistarget.utils import *
# region_sets maps a collection name ('Topics', 'DARs') to {set name: PyRanges},
# built from the region names stored in each table's index.
region_sets = {}
region_sets['Topics'] = {
    key: pr.PyRanges(region_names_to_coordinates(binarized_topic_region[key].index.tolist()))
    for key in binarized_topic_region.keys()
}
# DAR set names are sanitised: every run of non-alphanumeric characters becomes '_'
region_sets['DARs'] = {
    re.sub('[^A-Za-z0-9]+', '_', key): pr.PyRanges(region_names_to_coordinates(DARs_dict[key].index.tolist()))
    for key in DARs_dict.keys()
}

#### Optional: Check % promoters

In [3]:
# Load promoter annotations
promoter_space = 500
biomart_host = 'http://nov2020.archive.ensembl.org/'
import pybiomart as pbm
# Prepare annotation
name = 'mmusculus_gene_ensembl'
dataset = pbm.Dataset(name=name,  host=biomart_host)
annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
# Drop rows whose chromosome name contains CHR, GL, JH or MT
filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT')
# .copy() so the column assignment below writes to an independent frame, not a slice of the query result
annot = annot[~filterf].copy()
# regex=True must be explicit: the pattern uses a capture group and back-reference, and
# pandas versions that default to regex=False would treat it literally and add no 'chr' prefix.
annot['Chromosome'] = annot['Chromosome'].str.replace(r'(\b\S)', r'chr\1', regex=True)
annot = annot[annot.Transcript_type == 'protein_coding']
annot = annot.dropna(subset = ['Chromosome', 'Start'])
# Check if chromosomes have chr
# Looks at the first region of the first region set; if it has no 'chr', strip it from the annotation too
check = region_sets[list(region_sets.keys())[0]]
if 'chr' not in check[list(check.keys())[0]].df['Chromosome'][0]:
    # Plain substring removal, so regex matching is explicitly disabled
    annot.Chromosome = annot.Chromosome.str.replace('chr', '', regex=False)
# Copy taken before the promoter windows are built, so it keeps the original TSS coordinates and all columns
annot_dem=annot.copy()
# Define promoter space
# Window is TSS +/- promoter_space; End is computed first because it reads the unmodified Start
annot['End'] = annot['Start'].astype(int)+promoter_space
annot['Start'] = annot['Start'].astype(int)-promoter_space
annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']])

  annot['Chromosome'] = annot['Chromosome'].str.replace(r'(\b\S)', r'chr\1')


In [7]:
# Check in topics
# For every topic, report the percentage of its regions that overlap at least one
# promoter window in `annot`, computed as 100 minus the share with zero overlaps.
regions = region_sets['Topics']
regions_overlaps = {}
regions_np = {}
for key in regions.keys():
    regions_overlaps[key] = regions[key].count_overlaps(annot)
    # Regions with no promoter overlap, reduced to their coordinates
    without_promoter = regions_overlaps[key].NumberOverlaps == 0
    regions_np[key] = regions_overlaps[key][without_promoter][['Chromosome', 'Start', 'End']]
{x: 100 - (regions_np[x].df.shape[0] / regions[x].df.shape[0] * 100) for x in regions.keys()}

{'Topic1': 21.210813355828464,
 'Topic2': 9.781204503045089,
 'Topic3': 83.92430644864965,
 'Topic4': 14.196506205043079,
 'Topic5': 24.576877234803334,
 'Topic6': 8.817407351773568,
 'Topic7': 59.81668293741187,
 'Topic8': 7.837035447294056,
 'Topic9': 8.124505407544177,
 'Topic10': 8.460155337646214,
 'Topic11': 9.373598445208557,
 'Topic12': 9.499362825414153,
 'Topic13': 13.650677979335285,
 'Topic14': 12.996638451257553,
 'Topic15': 10.372540992437138,
 'Topic16': 9.171997659072105,
 'Topic17': 6.7513760393488695,
 'Topic18': 16.983523447401765,
 'Topic19': 11.653046594982072,
 'Topic20': 53.36717009160684,
 'Topic21': 7.262173328939696,
 'Topic22': 6.091685632370755,
 'Topic23': 44.93381170357541,
 'Topic24': 48.04245163255979,
 'Topic25': 6.797842528700997,
 'Topic26': 12.775764680805707,
 'Topic27': 5.87823448758715,
 'Topic28': 70.8575112830432,
 'Topic29': 7.821117682238025,
 'Topic30': 8.537369130299581,
 'Topic31': 13.157428723215872,
 'Topic32': 8.099600555748523,
 'Topic3

In [8]:
# Check in DARs
# For every DAR set, report the percentage of its regions that overlap at least one
# promoter window in `annot`, computed as 100 minus the share with zero overlaps.
regions = region_sets['DARs']
regions_overlaps = {}
regions_np = {}
for key in regions.keys():
    regions_overlaps[key] = regions[key].count_overlaps(annot)
    # Regions with no promoter overlap, reduced to their coordinates
    without_promoter = regions_overlaps[key].NumberOverlaps == 0
    regions_np[key] = regions_overlaps[key][without_promoter][['Chromosome', 'Start', 'End']]
{x: 100 - (regions_np[x].df.shape[0] / regions[x].df.shape[0] * 100) for x in regions.keys()}

{'AST': 16.866771068963516,
 'CGE_LAMP5': 11.406059469092952,
 'CGE_SNCG': 14.176322418136024,
 'CGE_VIP': 14.084640989063246,
 'ENDO': 20.44513094151513,
 'L2_3_IT': 5.098241393137428,
 'L4_IT': 5.389614091090465,
 'L5_IT': 4.402430790006747,
 'L5_PT': 4.9559552890665515,
 'L6_CT': 5.318964018772803,
 'L6_IT': 4.637026745397705,
 'L6_IT_CAR3': 5.732869942288687,
 'L6b': 6.440961955450419,
 'MGE_PVALB': 14.813473379210436,
 'MGE_SST': 12.228376651043888,
 'MGL': 21.486301369863014,
 'NP': 7.733429982331359,
 'OL': 14.036265950302223,
 'OPC': 14.768122467357045}

#### Create V10 data set specific CTX db

In [None]:
# For future times, splitting to 5 parts is much faster than 10
qsub /staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/ctx_db/create_ctx.pbs

#### Run cisTarget

In [None]:
# Run pycistarget
# run_without_promoters = True, will run the methods in all regions + the region sets without promoters
import os
# The wrapper is imported from the source checkout, so the working directory is switched there first
os.chdir('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/scenicplus/src/')
from scenicplus.wrappers.run_pycistarget import *
# Keyword arguments are collected first, then expanded into the call
pycistarget_kwargs = dict(
    ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/ctx_db/TEW_mouse_cortex.regions_vs_motifs.rankings.feather',
    species = 'mus_musculus',
    save_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycistarget_clustered_data_set_specific/',
    dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/ctx_db/TEW_mouse_cortex.regions_vs_motifs.scores.feather',
    run_without_promoters = True,
    # Same Ensembl archive and promoter window as the optional promoter check
    biomart_host = 'http://nov2020.archive.ensembl.org/',
    promoter_space = 500,
    ctx_auc_threshold = 0.005,
    ctx_nes_threshold = 3.0,
    ctx_rank_threshold = 0.05,
    dem_log2fc_thr = 0.5,
    dem_motif_hit_thr = 3.0,
    dem_max_bg_regions = 500,
    path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10/snapshots/motifs-v10-nr.mgi-m0.001-o0.0_clust.tsv',
    annotation_version = 'v10nr_clust',
    annotation = ['Direct_annot', 'Orthology_annot'],
    n_cpu = 1,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill',
    # This run excludes the motifs listed in the promoter-bias file
    exclude_motifs = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10/Promoter_bias/clustered/Promoter_motifs.tsv',
)
run_pycistarget(region_sets, **pycistarget_kwargs)
# Here I renamed the menr.pkl output to: /staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycistarget_clustered_data_set_specific/menr_DT_nosimilarity.pkl'

In [None]:
# Select only annotated TFs
# Run pycistarget
# run_without_promoters = True, will run the methods in all regions + the region sets without promoters
import os
# The wrapper is imported from the source checkout, so the working directory is switched there first
os.chdir('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/scenicplus/src/')
from scenicplus.wrappers.run_pycistarget import *
# Keyword arguments are collected first, then expanded into the call
pycistarget_kwargs = dict(
    ctx_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/ctx_db/TEW_mouse_cortex.regions_vs_motifs.rankings.feather',
    species = 'mus_musculus',
    # Results go to a separate "_only_annotated" directory
    save_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycistarget_clustered_data_set_specific_only_annotated/',
    dem_db_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/ctx_db/TEW_mouse_cortex.regions_vs_motifs.scores.feather',
    run_without_promoters = True,
    biomart_host = 'http://nov2020.archive.ensembl.org/',
    promoter_space = 500,
    ctx_auc_threshold = 0.005,
    ctx_nes_threshold = 3.0,
    ctx_rank_threshold = 0.05,
    dem_log2fc_thr = 0.5,
    dem_motif_hit_thr = 3.0,
    dem_max_bg_regions = 500,
    path_to_motif_annotations = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10/snapshots/motifs-v10-nr.mgi-m0.001-o0.0_clust.tsv',
    annotation_version = 'v10nr_clust',
    annotation = ['Direct_annot', 'Orthology_annot'],
    n_cpu = 1,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill',
    # This run excludes the motifs listed in mm_not_annotated_DO.tsv
    exclude_motifs = '/staging/leuven/stg_00002/lcb/cbravo/cluster_motif_collection_V10/mm_not_annotated_DO.tsv',
)
run_pycistarget(region_sets, **pycistarget_kwargs)
# NOTE(review): the trailing comment originally here was copied from the previous cell (it described
# renaming menr.pkl to .../pycistarget_clustered_data_set_specific/menr_DT_nosimilarity.pkl).
# The "simplified" SCENIC+ section loads
# .../pycistarget_clustered_data_set_specific_only_annotated/menr.pkl, so presumably no rename
# applies to this run - confirm.

### 2. Run SCENIC+

#### V10: Direct+Orthology == DARs/Topics

In [2]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *

In [None]:
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycisTopic/'
# Load cisTopic object
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context managers close the handle even if unpickling raises
with open(outDir + 'cisTopicObject.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
# Load imputed accessibility
# Original note: "Here I am using pycisTopic gene activity matrix, but could be any :)"
# NOTE(review): the file loaded is DARs/Imputed_accessibility.pkl - confirm which matrix is meant.
with open(outDir + 'DARs/Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
projDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/'
path_to_annotated_rna_loom = projDir + 'data/MO_GEX_seurat_Cortex.loom'
loom = SCopeLoom.read_loom(path_to_annotated_rna_loom)
cell_data = get_metadata(loom)
# Fix names
# Long sample identifiers -> short names; applied one at a time, replacing whole cell values
sample_renames = {
    'TEW__c14e1d__Multiome_RNA_brain_10x_no_perm': '10x_no_perm',
    'TEW__3cc0d9__bb22bc__Multiome_brain_TST_NP40_004': 'TST_NP40_004',
    'TEW__75da5c__5b0f59__Multiome_brain_TST': 'TST',
    'TEW__c3f7c1__1ac906__Multiome_brain_10xcomplex_UC': '10x_complex_UC',
    'TEW__d112c8__547ada__Multiome_RNA_brain_10x_complex': '10x_complex',
}
for long_name, short_name in sample_renames.items():
    cell_data = cell_data.replace(long_name, short_name)
# Rebuild the cell index as <barcode>___<sample_id>, using the renamed sample_id values
cell_data['barcode'] = [x.split('___')[0] for x in cell_data.index.tolist()]
cell_data.index = cell_data['barcode'] + '___' + cell_data['sample_id']
expr_mat = loom.ex_mtx
# Positional assignment: assumes the rows of loom.ex_mtx are in the same order as cell_data - TODO confirm
expr_mat.index = cell_data.index
rna_anndata = anndata.AnnData(X=expr_mat)
rna_anndata.obs = cell_data

In [4]:
# Fix region data (bug in old pycistopic versions)
# Rebuilds cistopic_obj.region_data from the region names and the two count matrices.
# numpy was not imported explicitly anywhere above; presumably `np` leaked in via a star import.
import numpy as np
from pycisTopic.utils import region_names_to_coordinates
fragment_matrix = cistopic_obj.fragment_matrix
binary_matrix = cistopic_obj.binary_matrix
region_data = region_names_to_coordinates(cistopic_obj.region_names)
region_data['Width'] = abs(region_data.End -region_data.Start).astype(np.int32)
# Row sums of each matrix (rows are assumed to be regions - TODO confirm)
region_data['cisTopic_nr_frag'] = np.array(fragment_matrix.sum(axis=1)).flatten()
# log10 of a zero count is -inf (numpy emits a RuntimeWarning)
region_data['cisTopic_log_nr_frag'] = np.log10(region_data['cisTopic_nr_frag'])
region_data['cisTopic_nr_acc'] = np.array(binary_matrix.sum(axis=1)).flatten()
region_data['cisTopic_log_nr_acc'] = np.log10(region_data['cisTopic_nr_acc'])
cistopic_obj.region_data = region_data

In [5]:
## Precomputed pycistarget results (menr)
# Loads the renamed menr pickle from the pycistarget_clustered_data_set_specific run.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context manager closes the handle even if unpickling raises
with open('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycistarget_clustered_data_set_specific/menr_DT_nosimilarity.pkl', 'rb') as infile:
    menr = pickle.load(infile)

In [6]:
# Assemble the SCENIC+ object from the RNA AnnData, the cisTopic object,
# the imputed accessibility and the pycistarget results.
scplus_inputs = dict(
    GEX_anndata = rna_anndata,
    cisTopic_obj = cistopic_obj,
    imputed_acc_obj = imputed_acc_obj,
    menr = menr,
    ACC_prefix = 'ACC_',
    GEX_prefix = 'GEX_',
    # Identity function: values are passed through unchanged
    bc_transform_func = lambda x: x,
    normalize_imputed_acc = False,
)
scplus_obj = create_SCENICPLUS_object(**scplus_inputs)

In [8]:
# Apply the same min_pct threshold to genes and regions
min_pct = 0.5
filter_genes(scplus_obj, min_pct = min_pct)
filter_regions(scplus_obj, min_pct = min_pct)

2022-02-02 15:57:59,548 Preprocessing INFO     Going from 33160 genes to 13365 genes.
2022-02-02 16:00:00,004 Preprocessing INFO     Going from 438464 regions to 391957 regions.


In [9]:
# Save
# Pickle the filtered SCENIC+ object so the downstream cell can reload it
import pickle
scplus_obj_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/scenicplus_v10_direct_ortho/scplus_obj.pkl'
with open(scplus_obj_path, 'wb') as out_handle:
    pickle.dump(scplus_obj, out_handle)

In [None]:
# For the downstream analyses
# Reloads the saved SCENIC+ object and runs the SCENIC+ wrapper, writing results next to it.
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/scenicplus_v10_direct_ortho/'
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context manager closes the handle even if unpickling raises
with open(outDir+'scplus_obj.pkl', 'rb') as infile:
    scplus_obj = pickle.load(infile)

from scenicplus.wrappers.run_scenicplus import *
run_scenicplus(scplus_obj,
    variable = ['ACC_consensus_cell_type'],
    species = 'mmusculus',
    assembly = 'mm10',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_mm.txt',
    # Same directory the object was loaded from (was a duplicated literal)
    save_path = outDir,
    biomart_host = 'http://nov2020.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],
    region_ranking = None,
    gene_ranking = None,
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('Mouse_cortex_TEW', 'SCENIC+'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

#### V9: Direct+Orthology == DARs/Topics

In [2]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *

In [None]:
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycisTopic/'
# Load cisTopic object
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context managers close the handle even if unpickling raises
with open(outDir + 'cisTopicObject.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
# Load imputed accessibility
# Original note: "Here I am using pycisTopic gene activity matrix, but could be any :)"
# NOTE(review): the file loaded is DARs/Imputed_accessibility.pkl - confirm which matrix is meant.
with open(outDir + 'DARs/Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
projDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/'
path_to_annotated_rna_loom = projDir + 'data/MO_GEX_seurat_Cortex.loom'
loom = SCopeLoom.read_loom(path_to_annotated_rna_loom)
cell_data = get_metadata(loom)
# Fix names
# Long sample identifiers -> short names; applied one at a time, replacing whole cell values
sample_renames = {
    'TEW__c14e1d__Multiome_RNA_brain_10x_no_perm': '10x_no_perm',
    'TEW__3cc0d9__bb22bc__Multiome_brain_TST_NP40_004': 'TST_NP40_004',
    'TEW__75da5c__5b0f59__Multiome_brain_TST': 'TST',
    'TEW__c3f7c1__1ac906__Multiome_brain_10xcomplex_UC': '10x_complex_UC',
    'TEW__d112c8__547ada__Multiome_RNA_brain_10x_complex': '10x_complex',
}
for long_name, short_name in sample_renames.items():
    cell_data = cell_data.replace(long_name, short_name)
# Rebuild the cell index as <barcode>___<sample_id>, using the renamed sample_id values
cell_data['barcode'] = [x.split('___')[0] for x in cell_data.index.tolist()]
cell_data.index = cell_data['barcode'] + '___' + cell_data['sample_id']
expr_mat = loom.ex_mtx
# Positional assignment: assumes the rows of loom.ex_mtx are in the same order as cell_data - TODO confirm
expr_mat.index = cell_data.index
rna_anndata = anndata.AnnData(X=expr_mat)
rna_anndata.obs = cell_data

In [4]:
# Fix region data (bug in old pycistopic versions)
# Rebuilds cistopic_obj.region_data from the region names and the two count matrices.
# numpy was not imported explicitly anywhere above; presumably `np` leaked in via a star import.
import numpy as np
from pycisTopic.utils import region_names_to_coordinates
fragment_matrix = cistopic_obj.fragment_matrix
binary_matrix = cistopic_obj.binary_matrix
region_data = region_names_to_coordinates(cistopic_obj.region_names)
region_data['Width'] = abs(region_data.End -region_data.Start).astype(np.int32)
# Row sums of each matrix (rows are assumed to be regions - TODO confirm)
region_data['cisTopic_nr_frag'] = np.array(fragment_matrix.sum(axis=1)).flatten()
# log10 of a zero count is -inf (numpy emits a RuntimeWarning)
region_data['cisTopic_log_nr_frag'] = np.log10(region_data['cisTopic_nr_frag'])
region_data['cisTopic_nr_acc'] = np.array(binary_matrix.sum(axis=1)).flatten()
region_data['cisTopic_log_nr_acc'] = np.log10(region_data['cisTopic_nr_acc'])
cistopic_obj.region_data = region_data

In [5]:
## Precomputed pycistarget results (menr)
# Loads menr.pkl from the pycistarget_clustered_data_set_specific_V9 directory.
# NOTE(review): the run that produced this file is not visible in this section - confirm provenance.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context manager closes the handle even if unpickling raises
with open('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycistarget_clustered_data_set_specific_V9/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

In [6]:
# Assemble the SCENIC+ object from the RNA AnnData, the cisTopic object,
# the imputed accessibility and the pycistarget results.
scplus_inputs = dict(
    GEX_anndata = rna_anndata,
    cisTopic_obj = cistopic_obj,
    imputed_acc_obj = imputed_acc_obj,
    menr = menr,
    ACC_prefix = 'ACC_',
    GEX_prefix = 'GEX_',
    # Identity function: values are passed through unchanged
    bc_transform_func = lambda x: x,
    normalize_imputed_acc = False,
)
scplus_obj = create_SCENICPLUS_object(**scplus_inputs)

In [8]:
# Apply the same min_pct threshold to genes and regions
min_pct = 0.5
filter_genes(scplus_obj, min_pct = min_pct)
filter_regions(scplus_obj, min_pct = min_pct)

2022-02-02 15:57:59,548 Preprocessing INFO     Going from 33160 genes to 13365 genes.
2022-02-02 16:00:00,004 Preprocessing INFO     Going from 438464 regions to 391957 regions.


In [9]:
# Save
# Pickle the filtered SCENIC+ object so the downstream cell can reload it
import pickle
scplus_obj_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/scenicplus_v9_direct_ortho/scplus_obj.pkl'
with open(scplus_obj_path, 'wb') as out_handle:
    pickle.dump(scplus_obj, out_handle)

In [None]:
# For the downstream analyses
# Reloads the saved SCENIC+ object and runs the SCENIC+ wrapper, writing results next to it.
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/scenicplus_v9_direct_ortho/'
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context manager closes the handle even if unpickling raises
with open(outDir+'scplus_obj.pkl', 'rb') as infile:
    scplus_obj = pickle.load(infile)

from scenicplus.wrappers.run_scenicplus import *
run_scenicplus(scplus_obj,
    variable = ['ACC_consensus_cell_type'],
    species = 'mmusculus',
    assembly = 'mm10',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_mm.txt',
    # Same directory the object was loaded from (was a duplicated literal)
    save_path = outDir,
    biomart_host = 'http://nov2020.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],
    region_ranking = None,
    gene_ranking = None,
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('Mouse_cortex_TEW', 'SCENIC+'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )

#### V10: Direct+Orthology == DARs/Topics simplified

In [2]:
# Load functions
from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *

In [None]:
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycisTopic/'
# Load cisTopic object
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context managers close the handle even if unpickling raises
with open(outDir + 'cisTopicObject.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)
# Load imputed accessibility
# Original note: "Here I am using pycisTopic gene activity matrix, but could be any :)"
# NOTE(review): the file loaded is DARs/Imputed_accessibility.pkl - confirm which matrix is meant.
with open(outDir + 'DARs/Imputed_accessibility.pkl', 'rb') as infile:
    imputed_acc_obj = pickle.load(infile)
## RNA - Create Anndata
from loomxpy.loomxpy import SCopeLoom
from pycisTopic.loom import *
import itertools
import anndata
projDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/'
path_to_annotated_rna_loom = projDir + 'data/MO_GEX_seurat_Cortex.loom'
loom = SCopeLoom.read_loom(path_to_annotated_rna_loom)
cell_data = get_metadata(loom)
# Fix names
# Long sample identifiers -> short names; applied one at a time, replacing whole cell values
sample_renames = {
    'TEW__c14e1d__Multiome_RNA_brain_10x_no_perm': '10x_no_perm',
    'TEW__3cc0d9__bb22bc__Multiome_brain_TST_NP40_004': 'TST_NP40_004',
    'TEW__75da5c__5b0f59__Multiome_brain_TST': 'TST',
    'TEW__c3f7c1__1ac906__Multiome_brain_10xcomplex_UC': '10x_complex_UC',
    'TEW__d112c8__547ada__Multiome_RNA_brain_10x_complex': '10x_complex',
}
for long_name, short_name in sample_renames.items():
    cell_data = cell_data.replace(long_name, short_name)
# Rebuild the cell index as <barcode>___<sample_id>, using the renamed sample_id values
cell_data['barcode'] = [x.split('___')[0] for x in cell_data.index.tolist()]
cell_data.index = cell_data['barcode'] + '___' + cell_data['sample_id']
expr_mat = loom.ex_mtx
# Positional assignment: assumes the rows of loom.ex_mtx are in the same order as cell_data - TODO confirm
expr_mat.index = cell_data.index
rna_anndata = anndata.AnnData(X=expr_mat)
rna_anndata.obs = cell_data

In [4]:
# Fix region data (bug in old pycistopic versions)
# Rebuilds cistopic_obj.region_data from the region names and the two count matrices.
# numpy was not imported explicitly anywhere above; presumably `np` leaked in via a star import.
import numpy as np
from pycisTopic.utils import region_names_to_coordinates
fragment_matrix = cistopic_obj.fragment_matrix
binary_matrix = cistopic_obj.binary_matrix
region_data = region_names_to_coordinates(cistopic_obj.region_names)
region_data['Width'] = abs(region_data.End -region_data.Start).astype(np.int32)
# Row sums of each matrix (rows are assumed to be regions - TODO confirm)
region_data['cisTopic_nr_frag'] = np.array(fragment_matrix.sum(axis=1)).flatten()
# log10 of a zero count is -inf (numpy emits a RuntimeWarning)
region_data['cisTopic_log_nr_frag'] = np.log10(region_data['cisTopic_nr_frag'])
region_data['cisTopic_nr_acc'] = np.array(binary_matrix.sum(axis=1)).flatten()
region_data['cisTopic_log_nr_acc'] = np.log10(region_data['cisTopic_nr_acc'])
cistopic_obj.region_data = region_data

In [5]:
## Precomputed pycistarget results (menr)
# Loads menr.pkl from the pycistarget_clustered_data_set_specific_only_annotated run.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context manager closes the handle even if unpickling raises
with open('/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/pycistarget_clustered_data_set_specific_only_annotated/menr.pkl', 'rb') as infile:
    menr = pickle.load(infile)

In [6]:
# Assemble the SCENIC+ object from the RNA AnnData, the cisTopic object,
# the imputed accessibility and the pycistarget results.
scplus_inputs = dict(
    GEX_anndata = rna_anndata,
    cisTopic_obj = cistopic_obj,
    imputed_acc_obj = imputed_acc_obj,
    menr = menr,
    ACC_prefix = 'ACC_',
    GEX_prefix = 'GEX_',
    # Identity function: values are passed through unchanged
    bc_transform_func = lambda x: x,
    normalize_imputed_acc = False,
)
scplus_obj = create_SCENICPLUS_object(**scplus_inputs)

In [8]:
# Apply the same min_pct threshold to genes and regions
min_pct = 0.5
filter_genes(scplus_obj, min_pct = min_pct)
filter_regions(scplus_obj, min_pct = min_pct)

2022-02-02 15:57:59,548 Preprocessing INFO     Going from 33160 genes to 13365 genes.
2022-02-02 16:00:00,004 Preprocessing INFO     Going from 438464 regions to 391957 regions.


In [9]:
# Save
# Pickle the filtered SCENIC+ object so the downstream cell can reload it
import pickle
scplus_obj_path = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/scenicplus_v10_direct_ortho_annotated_motifs/scplus_obj.pkl'
with open(scplus_obj_path, 'wb') as out_handle:
    pickle.dump(scplus_obj, out_handle)

In [None]:
# For the downstream analyses
# Reloads the saved SCENIC+ object and runs the SCENIC+ wrapper, writing results next to it.
outDir = '/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/10x_multiome_mouse_cortex/TEW_cortex/scenicplus_v10_direct_ortho_annotated_motifs/'
# NOTE(review): pickle.load can execute arbitrary code; only load pickles produced by this pipeline.
import pickle
# Context manager closes the handle even if unpickling raises
with open(outDir+'scplus_obj.pkl', 'rb') as infile:
    scplus_obj = pickle.load(infile)

from scenicplus.wrappers.run_scenicplus import *
run_scenicplus(scplus_obj,
    variable = ['ACC_consensus_cell_type'],
    species = 'mmusculus',
    assembly = 'mm10',
    tf_file = '/staging/leuven/stg_00002/lcb/cflerin/resources/allTFs_mm.txt',
    # Same directory the object was loaded from (was a duplicated literal)
    save_path = outDir,
    biomart_host = 'http://nov2020.archive.ensembl.org/',
    upstream = [1000, 150000],
    downstream = [1000, 150000],
    region_ranking = None,
    gene_ranking = None,
    calculate_TF_eGRN_correlation = False,
    calculate_DEGs_DARs = True,
    export_to_loom_file = True,
    export_to_UCSC_file = True,
    tree_structure = ('Mouse_cortex_TEW', 'SCENIC+'),
    path_bedToBigBed = '/data/leuven/software/biomed/haswell_centos7/2018a/software/Kent_tools/20190730-linux.x86_64/bin/',
    n_cpu = 20,
    _temp_dir = '/scratch/leuven/313/vsc31305/ray_spill'
    )