In [None]:
# Suppress FutureWarning messages raised by the libraries imported below.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
# Keep a reference to the real stderr and an open handle on os.devnull.
# NOTE(review): neither `_stderr` nor `null` is used in the visible cells —
# presumably intended for silencing ray's stderr output; confirm before removing.
_stderr = sys.stderr
null = open(os.devnull,'wb')

import scanpy as sc
# Set some figure parameters for nice display inside Jupyter notebooks.
%matplotlib inline
sc.settings.set_figure_params(dpi=80, frameon=False, figsize=(5, 5), facecolor='white')

## Prep our RNA adata structure

In [None]:
# Read the 10x filtered feature-barcode matrix, make duplicated gene names unique,
# and display a summary of the resulting AnnData object.
counts_h5 = "/rugpfs/fs0/tavz_lab/scratch/amillet/11_16_2022_Alon/E4AD_1yr-ATAC/outs/filtered_feature_bc_matrix.h5"
adata = sc.read_10x_h5(counts_h5)
adata.var_names_make_unique()
adata

In [None]:
import pandas as pd
import numpy as np
# Per-cell annotation table; the first CSV column becomes the index
# (it is matched against the AnnData barcodes in the following cells).
annot_csv = '/ru-auth/local/home/amillet/Multivelo/E4AD_1yr/Mglia_Only/annots.csv'
cell_annot = pd.read_csv(annot_csv, index_col=0)

In [None]:
# Keep only the cells whose barcode has an entry in the annotation table.
adata = adata[adata.obs_names.isin(cell_annot.index)]

In [None]:
# Reorder the cells to follow the annotation table. Indexing an AnnData returns a
# view, and adding an .obs column to a view forces an implicit copy (with an
# ImplicitModificationWarning), so materialise the subset explicitly.
adata = adata[cell_annot.index,:].copy()
# Attach the microglia identity labels; pandas aligns the values on the barcode index.
adata.obs['celltype'] = cell_annot['mglia_ident']

In [None]:
# Drop genes detected in fewer than 3 cells.
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Freeze the current matrix (all genes that passed filtering, before normalisation)
# in .raw; it is retrieved later with adata.raw.to_adata() for SCENIC+.
adata.raw = adata
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# Subsetting returns a view; copy it so that the in-place scaling below works on a
# real AnnData object instead of triggering an implicit copy of the view.
adata = adata[:, adata.var.highly_variable].copy()
sc.pp.scale(adata, max_value=10)

In [None]:
# Build the kNN graph (10 neighbours, 10 PCs), embed with UMAP and colour by cell type.
# NOTE(review): no explicit sc.tl.pca call precedes this — presumably scanpy falls back
# to computing the PCA itself; confirm, since 'GEX_X_pca' is used further down.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=10)
sc.tl.umap(adata)
sc.pl.umap(adata, color = 'celltype')

In [None]:
# Make sure the output folder exists before writing the processed RNA object.
os.makedirs('E4AD_1yr', exist_ok=True)
adata.write('E4AD_1yr/rna_adata.h5ad', compression='gzip')

## Prep our ATAC adata structure

In [None]:
import os
import pycisTopic
#set some figure parameters for nice display inside jupyternotebooks.
%matplotlib inline

In [None]:
import scanpy as sc
# Only the per-cell metadata is needed here, so the AnnData object is dropped right away.
adata = sc.read_h5ad('E4AD_1yr/rna_adata.h5ad')
cell_data = adata.obs.assign(
    sample_id='E4AD_1yr',
    # celltype must be a plain string column, otherwise export_pseudobulk complains.
    celltype=lambda obs: obs['celltype'].astype(str),
)
del adata

In [None]:
# Get chromosome sizes (for mm10 here)
import pyranges as pr
import requests
import pandas as pd


def _cellranger_contig_name(ucsc_name):
    """Convert a UCSC contig name to the naming used in the CellRanger ARC annotation.

    'v' is replaced by '.', and for names containing underscores only the
    second underscore-separated field is kept; other names are returned as is.
    """
    dotted = ucsc_name.replace('v', '.')
    fields = dotted.split('_')
    return fields[1] if len(fields) > 1 else dotted


target_url='https://hgdownload.cse.ucsc.edu/goldenPath/mm10/bigZips/mm10.chrom.sizes'
chromsizes = pd.read_csv(target_url, sep='\t', header=None)
chromsizes.columns = ['Chromosome', 'End']
# Every chromosome starts at 0; column order is Chromosome, Start, End.
chromsizes['Start'] = 0
chromsizes = chromsizes[['Chromosome', 'Start', 'End']]
# Exceptionally in this case, to agree with CellRangerARC annotations
chromsizes['Chromosome'] = chromsizes['Chromosome'].map(_cellranger_contig_name)
chromsizes = pr.PyRanges(chromsizes)

In [None]:
# Some contigs in the fragments file are not present in chromsizes. To filter them out,
# chromsizes is written to a CSV that is then used in R to filter the fragments TSV.
chromsizes.to_csv("mm10_chromsizes.csv")

In [None]:
# The filtering itself is done in scenicplus/E4AD_1yr/frag_filtering.R.
# The filtered fragments file is saved as scenicplus/E4AD_1yr/atac_fragments_filtered.tsv.gz.
# Maps sample id -> path of that sample's fragments file.
fragments_dict = {'E4AD_1yr': '/lustre/fs4/home/amillet/scenicplus/E4AD_1yr/atac_fragments_filtered.tsv.gz'}

In [None]:
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk
# Write one pseudobulk BED and bigWig file per cell type; returns the paths of both file sets.
bw_paths, bed_paths = export_pseudobulk(input_data = cell_data,
                 variable = 'celltype',                                                                     # variable by which to generate pseudobulk profiles, in this case we want pseudobulks per celltype
                 sample_id_col = 'sample_id',
                 chromsizes = chromsizes,
                 bed_path = '/lustre/fs4/home/amillet/scenicplus/E4AD_1yr/consensus_peak_calling/pseudobulk_bed_files/',  # specify where pseudobulk_bed_files should be stored
                 bigwig_path = '/lustre/fs4/home/amillet/scenicplus/E4AD_1yr/consensus_peak_calling/pseudobulk_bw_files/',# specify where pseudobulk_bw_files should be stored
                 path_to_fragments = fragments_dict,                                                        # location of fragment files
                 n_cpu = 8,                                                                                 # specify the number of cores to use, we use ray for multi processing
                 normalize_bigwig = True,
                 remove_duplicates = True,
                 _temp_dir = "/lustre/fs4/home/amillet/ray_spill",      
                 split_pattern = '-')

In [None]:
import pickle
# Persist the pseudobulk file paths so peak calling can be resumed in a fresh kernel.
# Both pickles live in the pseudobulk_bed_files folder, which is where they are read back from.
# Context managers guarantee each file is flushed and closed.
with open('E4AD_1yr/consensus_peak_calling/pseudobulk_bed_files/bed_paths.pkl', 'wb') as handle:
    pickle.dump(bed_paths, handle)
with open('E4AD_1yr/consensus_peak_calling/pseudobulk_bed_files/bw_paths.pkl', 'wb') as handle:
    pickle.dump(bw_paths, handle)

In [None]:
# Display the pseudobulk BED paths returned by export_pseudobulk as a sanity check.
bed_paths

In [None]:
import pickle
# Reload the pseudobulk paths (closing the files once read) so this cell can run in a fresh kernel.
with open('E4AD_1yr/consensus_peak_calling/pseudobulk_bed_files/bed_paths.pkl', 'rb') as handle:
    bed_paths = pickle.load(handle)
with open('E4AD_1yr/consensus_peak_calling/pseudobulk_bed_files/bw_paths.pkl', 'rb') as handle:
    bw_paths = pickle.load(handle)
from pycisTopic.pseudobulk_peak_calling import peak_calling
macs_path='macs2'
# Run peak calling on each pseudobulk BED file; results are written under MACS/.
narrow_peaks_dict = peak_calling(macs_path,
                                 bed_paths,
                                 'E4AD_1yr/consensus_peak_calling/MACS/',
                                 genome_size='mm',
                                 n_cpu=8,
                                 input_format='BEDPE',
                                 shift=73,
                                 ext_size=146,
                                 keep_dup = 'all',
                                 q_value = 0.05,
                                 _temp_dir = "/lustre/fs4/home/amillet/ray_spill")

In [None]:
# ray has restrictions on the length of the temp_dir name, so a folder close to
# home was used for the actual run; it is moved into the scenicplus/E4AD_1yr
# folder after the fact.
# NOTE(review): later cells still pass "/lustre/fs4/home/amillet/ray_spill" as
# _temp_dir — presumably ray recreates it; confirm. Re-running this cell after a
# successful move may fail because the destination already exists.
import shutil
shutil.move("/lustre/fs4/home/amillet/ray_spill", "/lustre/fs4/home/amillet/scenicplus/E4AD_1yr")

In [None]:
# Persist the MACS results; the context manager closes the file once the dump is complete.
with open('E4AD_1yr/consensus_peak_calling/MACS/narrow_peaks_dict.pkl', 'wb') as handle:
    pickle.dump(narrow_peaks_dict, handle)

In [None]:
# NOTE(review): star import kept as is — later cells may rely on names it injects.
from pycisTopic.iterative_peak_calling import *
# Other parameters
peak_half_width = 250  # passed to get_consensus_peaks as the peak half width
path_to_blacklist= 'mm10-blacklist.v2.bed' # downloaded from the aertslab GitHub
# Get consensus peaks from the per-celltype narrow peaks
consensus_peaks=get_consensus_peaks(narrow_peaks_dict, peak_half_width, chromsizes=chromsizes, path_to_blacklist=path_to_blacklist)

In [None]:
# Write the consensus regions to a BED file; this path is reused as `path_to_regions` below.
consensus_bed = 'E4AD_1yr/consensus_peak_calling/consensus_regions.bed'
consensus_peaks.to_bed(path=consensus_bed, keep=True, compression='infer', chain=False)

### QC

In [None]:
import pybiomart as pbm
# Download the TSS annotation from Ensembl BioMart.
# NOTE(review): 'http://www.ensembl.org' serves the current Ensembl release, which may not
# be on the mm10 assembly used for the fragments — consider pinning an mm10 archive host
# (one is selected further down in this notebook); confirm.
dataset = pbm.Dataset(name='mmusculus_gene_ensembl',  host='http://www.ensembl.org')
annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].to_numpy(dtype = str)
# Drop entries whose chromosome name contains CHR, GL, JH or MT. The mask gets its own
# name so that the builtin `filter` is not shadowed for the rest of the notebook.
non_standard_contig = annot['Chromosome/scaffold name'].str.contains('CHR|GL|JH|MT')
annot = annot[~non_standard_contig].copy()
# Prefix the names with 'chr'. regex=True must be explicit: pandas >= 2.0 treats the
# pattern literally by default, which would silently leave the names unprefixed.
annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].str.replace(r'(\b\S)', r'chr\1', regex=True)
annot.columns=['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
annot = annot[annot.Transcript_type == 'protein_coding']
from pycisTopic.qc import *
path_to_regions = {'E4AD_1yr':'E4AD_1yr/consensus_peak_calling/consensus_regions.bed'}

metadata_bc, profile_data_dict = compute_qc_stats(
                fragments_dict = fragments_dict,
                tss_annotation = annot,
                stats=['barcode_rank_plot', 'duplicate_rate', 'insert_size_distribution', 'profile_tss', 'frip'],
                label_list = None,
                path_to_regions = path_to_regions,
                n_cpu = 1,
                valid_bc = None,
                n_frag = 100,
                n_bc = None,
                tss_flank_window = 1000,
                tss_window = 50,
                tss_minimum_signal_window = 100,
                tss_rolling_window = 10,
                remove_duplicates = True,
                _temp_dir = "/lustre/fs4/home/amillet/ray_spill")

os.makedirs('E4AD_1yr/quality_control', exist_ok=True)

# Persist both QC outputs; context managers close the files once written.
with open('E4AD_1yr/quality_control/metadata_bc.pkl', 'wb') as handle:
    pickle.dump(metadata_bc, handle)

with open('E4AD_1yr/quality_control/profile_data_dict.pkl', 'wb') as handle:
    pickle.dump(profile_data_dict, handle)

In [None]:
                        #[min,  #max]
QC_filters = {
    'Log_unique_nr_frag': [3.3 , None],
    'FRIP':               [0.4, None],
    'TSS_enrichment':     [1   , None],
    'Dupl_rate':          [None, None]

}

# Return figure to plot together with other metrics, and cells passing filters. Figure will be saved as pdf.
from pycisTopic.qc import *
FRIP_NR_FRAG_fig, FRIP_NR_FRAG_filter=plot_barcode_metrics(metadata_bc['E4AD_1yr'],
                                       var_x='Log_unique_nr_frag',
                                       var_y='FRIP',
                                       min_x=QC_filters['Log_unique_nr_frag'][0],
                                       max_x=QC_filters['Log_unique_nr_frag'][1],
                                       min_y=QC_filters['FRIP'][0],
                                       max_y=QC_filters['FRIP'][1],
                                       return_cells=True,
                                       return_fig=True,
                                       plot=False)
# Return figure to plot together with other metrics, and cells passing filters
TSS_NR_FRAG_fig, TSS_NR_FRAG_filter=plot_barcode_metrics(metadata_bc['E4AD_1yr'],
                                      var_x='Log_unique_nr_frag',
                                      var_y='TSS_enrichment',
                                      min_x=QC_filters['Log_unique_nr_frag'][0],
                                      max_x=QC_filters['Log_unique_nr_frag'][1],
                                      min_y=QC_filters['TSS_enrichment'][0],
                                      max_y=QC_filters['TSS_enrichment'][1],
                                      return_cells=True,
                                      return_fig=True,
                                      plot=False)
# Return figure to plot together with other metrics, but not returning cells (no filter applied for the duplication rate  per barcode)
DR_NR_FRAG_fig=plot_barcode_metrics(metadata_bc['E4AD_1yr'],
                                      var_x='Log_unique_nr_frag',
                                      var_y='Dupl_rate',
                                      min_x=QC_filters['Log_unique_nr_frag'][0],
                                      max_x=QC_filters['Log_unique_nr_frag'][1],
                                      min_y=QC_filters['Dupl_rate'][0],
                                      max_y=QC_filters['Dupl_rate'][1],
                                      return_cells=False,
                                      return_fig=True,
                                      plot=False,
                                      plot_as_hexbin = True)

# Plot barcode stats in one figure
fig=plt.figure(figsize=(10,10))
plt.subplot(1, 3, 1)
img = fig2img(FRIP_NR_FRAG_fig)
plt.imshow(img)
plt.axis('off')
plt.subplot(1, 3, 2)
img = fig2img(TSS_NR_FRAG_fig)
plt.imshow(img)
plt.axis('off')
plt.subplot(1, 3, 3)
img = fig2img(DR_NR_FRAG_fig)
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
# A barcode passes QC only if it passed both the FRIP and the TSS-enrichment filters.
bc_passing_filters = {
    'E4AD_1yr': list(set(FRIP_NR_FRAG_filter) & set(TSS_NR_FRAG_filter))
}
# The context manager closes the pickle file once the dump is complete.
with open('E4AD_1yr/quality_control/bc_passing_filters.pkl', 'wb') as handle:
    pickle.dump(bc_passing_filters, handle)
print(f"{len(bc_passing_filters['E4AD_1yr'])} barcodes passed QC stats")

### Topic Modeling

In [None]:
import scanpy as sc
# Reload the RNA object only for its barcodes and per-cell metadata, then drop it.
adata = sc.read_h5ad('E4AD_1yr/rna_adata.h5ad')
scRNA_bc = adata.obs_names
cell_data = adata.obs.assign(
    sample_id='E4AD_1yr',
    # celltype must be a plain string column, otherwise export_pseudobulk complains.
    celltype=lambda obs: obs['celltype'].astype(str),
)
del adata

In [None]:
import pickle
# Inputs for building the cisTopic object, redefined so this section runs in a fresh kernel.
fragments_dict = {'E4AD_1yr': '/lustre/fs4/home/amillet/scenicplus/E4AD_1yr/atac_fragments_filtered.tsv.gz'}
path_to_regions = {'E4AD_1yr': 'E4AD_1yr/consensus_peak_calling/consensus_regions.bed'}
path_to_blacklist= 'mm10-blacklist.v2.bed'
# Read the QC pickles back, closing each file once loaded.
with open('E4AD_1yr/quality_control/metadata_bc.pkl', 'rb') as handle:
    metadata_bc = pickle.load(handle)
with open('E4AD_1yr/quality_control/bc_passing_filters.pkl', 'rb') as handle:
    bc_passing_filters = pickle.load(handle)

In [None]:
# Count the barcodes shared by the ATAC QC-passing set and the RNA barcodes.
n_shared_bc = len(set(bc_passing_filters['E4AD_1yr']).intersection(scRNA_bc))
print(f"{n_shared_bc} cell barcodes pass both scATAC-seq and scRNA-seq based filtering")


In [None]:
from pycisTopic.cistopic_class import *
key = 'E4AD_1yr'
# Build the cisTopic object for this sample from its fragments and the consensus regions.
cistopic_obj = create_cistopic_object_from_fragments(
                            path_to_fragments=fragments_dict[key],
                            path_to_regions=path_to_regions[key],
                            path_to_blacklist=path_to_blacklist,
                            metrics=metadata_bc[key],
                            valid_bc=list(set(scRNA_bc)), # not intersected with `set(bc_passing_filters[key])`: the RNA barcodes are already prefiltered on both
                            n_cpu=1,
                            project=key,
                            split_pattern='-')
# Attach the RNA-derived per-cell metadata (including 'celltype') to the cisTopic object.
cistopic_obj.add_cell_data(cell_data, split_pattern='-')
print(cistopic_obj)

In [None]:
# Persist the cisTopic object; the context manager closes the file once the dump is complete.
with open('E4AD_1yr/cistopic_obj.pkl', 'wb') as handle:
    pickle.dump(cistopic_obj, handle)

In [None]:
# Reload the cisTopic object (closing the file once read) so this cell can run on its own.
# Relies on `pickle` having been imported by an earlier cell.
with open('E4AD_1yr/cistopic_obj.pkl', 'rb') as handle:
    cistopic_obj = pickle.load(handle)
from pycisTopic.cistopic_class import *
# Fit one model per candidate number of topics; random_state is fixed for reproducibility.
models=run_cgs_models(cistopic_obj,
                    n_topics=[2,4,10,16,32,48],
                    n_cpu=5,
                    n_iter=500,
                    random_state=555,
                    alpha=50,
                    alpha_by_topic=True,
                    eta=0.1,
                    eta_by_topic=False,
                    save_path=None,
                    _temp_dir = "/lustre/fs4/home/amillet/ray_spill")

In [None]:
os.makedirs('E4AD_1yr/models', exist_ok=True)

# NOTE(review): the "10x_pbmc" file name looks like a tutorial leftover; it is kept
# because the model-selection cell reads this exact path.
with open('E4AD_1yr/models/10x_pbmc_models_500_iter_LDA.pkl', 'wb') as handle:
    pickle.dump(models, handle)

In [None]:
# Reload the models and the cisTopic object, closing each file once read.
with open('E4AD_1yr/models/10x_pbmc_models_500_iter_LDA.pkl', 'rb') as handle:
    models = pickle.load(handle)
with open('E4AD_1yr/cistopic_obj.pkl', 'rb') as handle:
    cistopic_obj = pickle.load(handle)
from pycisTopic.lda_models import *
# Evaluate all models on the listed metrics and return the 32-topic model.
model = evaluate_models(models,
                       select_model=32,
                       return_model=True,
                       metrics=['Arun_2010','Cao_Juan_2009', 'Minmo_2011', 'loglikelihood'],
                       plot_metrics=False)

In [None]:
# Attach the selected model and overwrite the saved cisTopic object with the updated one.
cistopic_obj.add_LDA_model(model)
with open('E4AD_1yr/cistopic_obj.pkl', 'wb') as handle:
    pickle.dump(cistopic_obj, handle)

In [None]:
from pycisTopic.clust_vis import *
# Compute a cell-level UMAP on the cisTopic object and colour it by cell type.
run_umap(cistopic_obj, target  = 'cell', scale=True)
plot_metadata(cistopic_obj, reduction_name = 'UMAP', variables = ['celltype'])

In [None]:
# Plot the topics on the UMAP, four panels per row.
plot_topic(cistopic_obj, reduction_name = 'UMAP', num_columns = 4)

In [None]:
from pycisTopic.topic_binarization import *
# Two binarisations of the topics: the 'otsu' method, and the 'ntop' method with ntop = 3000.
region_bin_topics_otsu = binarize_topics(cistopic_obj, method='otsu')
region_bin_topics_top3k = binarize_topics(cistopic_obj, method='ntop', ntop = 3000)

In [None]:
import numpy as np
from pycisTopic.diff_features import *
# Impute accessibility, normalise it, and pick the highly variable regions ...
imputed_acc_obj = impute_accessibility(cistopic_obj, selected_cells=None, selected_regions=None, scale_factor=10**6)
normalized_imputed_acc_obj = normalize_scores(imputed_acc_obj, scale_factor=10**4)
variable_regions = find_highly_variable_features(normalized_imputed_acc_obj, plot = False)
# ... then test those regions for differential accessibility per cell type
# (thresholds: adjusted p-value 0.1, log2 fold change log2(1.1)).
markers_dict = find_diff_features(cistopic_obj, imputed_acc_obj, variable='celltype', 
                                  var_features=variable_regions, split_pattern = '-',
                                 adjpval_thr = 0.1, log2fc_thr = np.log2(1.1))

In [None]:
os.makedirs('E4AD_1yr/candidate_enhancers', exist_ok=True)
import pickle
# Persist the three candidate-enhancer collections; each file is closed once written.
with open('E4AD_1yr/candidate_enhancers/region_bin_topics_otsu.pkl', 'wb') as handle:
    pickle.dump(region_bin_topics_otsu, handle)
with open('E4AD_1yr/candidate_enhancers/region_bin_topics_top3k.pkl', 'wb') as handle:
    pickle.dump(region_bin_topics_top3k, handle)
with open('E4AD_1yr/candidate_enhancers/markers_dict.pkl', 'wb') as handle:
    pickle.dump(markers_dict, handle)

## PyCisTarget Motif Enrichment Analysis

In [None]:
import pickle
# Reload the candidate-enhancer collections, closing each file once read.
with open('E4AD_1yr/candidate_enhancers/region_bin_topics_otsu.pkl', 'rb') as handle:
    region_bin_topics_otsu = pickle.load(handle)
with open('E4AD_1yr/candidate_enhancers/region_bin_topics_top3k.pkl', 'rb') as handle:
    region_bin_topics_top3k = pickle.load(handle)
with open('E4AD_1yr/candidate_enhancers/markers_dict.pkl', 'rb') as handle:
    markers_dict = pickle.load(handle)

In [None]:
import pyranges as pr
from pycistarget.utils import region_names_to_coordinates


def _chr_regions_as_pyranges(region_table):
    """Return the regions of `region_table` whose name starts with 'chr' as a PyRanges.

    Region names are read from the table's index; anything not on a 'chr'-prefixed
    sequence is dropped (only keep regions on known chromosomes).
    """
    region_names = region_table.index
    on_known_chrom = region_names[region_names.str.startswith('chr')]
    return pr.PyRanges(region_names_to_coordinates(on_known_chrom))


# One PyRanges per topic / per DAR group, grouped by the way the regions were selected.
region_sets = {
    'topics_otsu': {
        topic: _chr_regions_as_pyranges(table)
        for topic, table in region_bin_topics_otsu.items()
    },
    'topics_top_3': {
        topic: _chr_regions_as_pyranges(table)
        for topic, table in region_bin_topics_top3k.items()
    },
    'DARs': {
        DAR: _chr_regions_as_pyranges(table)
        for DAR, table in markers_dict.items()
    },
}

In [None]:
# List the names of the region sets available in each group.
for key, group in region_sets.items():
    print(f'{key}: {group.keys()}')

In [None]:
# Local paths of the cisTarget databases and the motif annotation table
# (passed below as ctx_db_path, dem_db_path and path_to_motif_annotations).
rankings_db = '/ru-auth/local/home/amillet/scratch/references/cistarget/mm10_screen_v10_clust.regions_vs_motifs.rankings.feather'
scores_db = '/ru-auth/local/home/amillet/scratch/references/cistarget/mm10_screen_v10_clust.regions_vs_motifs.scores.feather'
motif_annotation = '/ru-auth/local/home/amillet/scratch/references/cistarget/motifs-v10nr_clust-nr.mgi-m0.001-o0.0.tbl'

In [None]:
# Create the output folder on first run.
if not os.path.exists('E4AD_1yr/motifs'):
    os.makedirs('E4AD_1yr/motifs')
from scenicplus.wrappers.run_pycistarget import run_pycistarget
# Run motif enrichment on every region set; results are written under save_path
# (the following cell loads 'menr.pkl' from that folder).
run_pycistarget(
    region_sets = region_sets,
    species = 'mus_musculus',
    save_path = 'E4AD_1yr/motifs',
    ctx_db_path = rankings_db,
    dem_db_path = scores_db,
    path_to_motif_annotations = motif_annotation,
    run_without_promoters = True,
    n_cpu = 8,
    _temp_dir = "/lustre/fs4/home/amillet/ray_spill",
    annotation_version = 'v10nr_clust',
    )

In [None]:
import dill
# Load the motif enrichment results, closing the file once read.
with open('E4AD_1yr/motifs/menr.pkl', 'rb') as handle:
    menr = dill.load(handle)

In [None]:
# Inspect the 'DEM_topics_otsu_All' results for Topic17.
menr['DEM_topics_otsu_All'].DEM_results('Topic17')

# Finally!! Time to run SCENIC+.

In [None]:
import dill
import scanpy as sc
import os
import warnings
warnings.filterwarnings("ignore")
import pandas
import pyranges
# Set stderr to null to avoid strange messages from ray
# NOTE(review): `null` and `_stderr` are prepared here but sys.stderr is never
# actually redirected in the visible cells — confirm whether that is intended.
import sys
_stderr = sys.stderr
null = open(os.devnull,'wb')

# Load the three SCENIC+ inputs; context managers close the pickle files once read.
adata = sc.read_h5ad('E4AD_1yr/rna_adata.h5ad')
with open('E4AD_1yr/cistopic_obj.pkl', 'rb') as handle:
    cistopic_obj = dill.load(handle)
with open('E4AD_1yr/motifs/menr.pkl', 'rb') as handle:
    menr = dill.load(handle)

In [None]:
from scenicplus.scenicplus_class import create_SCENICPLUS_object
import numpy as np
# Combine expression, accessibility and motif enrichment into one SCENIC+ object.
# Expression comes from adata.raw, i.e. the matrix frozen before normalisation and HVG selection.
scplus_obj = create_SCENICPLUS_object(
    GEX_anndata = adata.raw.to_adata(),
    cisTopic_obj = cistopic_obj,
    menr = menr,
    bc_transform_func = lambda x: f'{x}-E4AD_1yr' # appends the sample suffix so barcodes of both modalities match — NOTE(review): confirm which modality's barcodes this is applied to
)
# Store the expression matrix as a dense ndarray (presumably X_EXP is sparse at this point — confirm).
scplus_obj.X_EXP = np.array(scplus_obj.X_EXP.todense())
scplus_obj

Check which biomart host is best:

In [None]:
# Ensembl release number -> BioMart host URL; every host is tried below to find the
# release whose gene names overlap best with the genes in scplus_obj.
# NOTE(review): 'http://www.ensembl.org' always points at the current release, so the
# '105' label may be out of date — confirm.
ensembl_version_dict = {'105': 'http://www.ensembl.org',
                        '104': 'http://may2021.archive.ensembl.org/',
                        '103': 'http://feb2021.archive.ensembl.org/',
                        '102': 'http://nov2020.archive.ensembl.org/',
                        '101': 'http://aug2020.archive.ensembl.org/',
                        '100': 'http://apr2020.archive.ensembl.org/',
                        '99': 'http://jan2020.archive.ensembl.org/',
                        '98': 'http://sep2019.archive.ensembl.org/',
                        '97': 'http://jul2019.archive.ensembl.org/',
                        '96': 'http://apr2019.archive.ensembl.org/',
                        '95': 'http://jan2019.archive.ensembl.org/',
                        '94': 'http://oct2018.archive.ensembl.org/',
                        '93': 'http://jul2018.archive.ensembl.org/',
                        '92': 'http://apr2018.archive.ensembl.org/',
                        '91': 'http://dec2017.archive.ensembl.org/',
                        '90': 'http://aug2017.archive.ensembl.org/',
                        '89': 'http://may2017.archive.ensembl.org/',
                        '88': 'http://mar2017.archive.ensembl.org/',
                        '87': 'http://dec2016.archive.ensembl.org/',
                        '86': 'http://oct2016.archive.ensembl.org/',
                        '80': 'http://may2015.archive.ensembl.org/',
                        '77': 'http://oct2014.archive.ensembl.org/',
                        '75': 'http://feb2014.archive.ensembl.org/',
                        '54': 'http://may2009.archive.ensembl.org/'}

import pybiomart as pbm
def test_ensembl_host(scplus_obj, host, species):
    dataset = pbm.Dataset(name=species+'_gene_ensembl',  host=host)
    annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype'])
    annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
    annot['Chromosome'] = annot['Chromosome'].astype('str')
    filter = annot['Chromosome'].str.contains('CHR|GL|JH|MT')
    annot = annot[~filter]
    annot.columns=['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type']
    gene_names_release = set(annot['Gene'].tolist())
    ov=len([x for x in scplus_obj.gene_names if x in gene_names_release])
    print('Genes recovered: ' + str(ov) + ' out of ' + str(len(scplus_obj.gene_names)))
    return ov

# Query every host and remember how many genes each release recovers.
n_overlap = {}
for version, host in ensembl_version_dict.items():
    print(f'host: {version}')
    try:
        n_overlap[version] = test_ensembl_host(scplus_obj, host, 'mmusculus')
    except Exception as err:
        # Catch Exception only (a bare except would also swallow KeyboardInterrupt)
        # and report the actual reason the host was skipped.
        print(f'Host not reachable ({type(err).__name__}: {err})')
if not n_overlap:
    # Without this guard the selection below fails with an uninformative error.
    raise RuntimeError('None of the Ensembl hosts could be queried; cannot pick a biomart host')
# Release with the largest overlap; ties go to the first release in dictionary order.
v = max(n_overlap, key=n_overlap.get)
print(f"version: {v} has the largest overlap, use {ensembl_version_dict[v]} as biomart host")

In [None]:
# Host chosen for the SCENIC+ run (listed as release '98' in ensembl_version_dict).
biomart_host = "http://sep2019.archive.ensembl.org/"

We prep a list of all known mouse TFs, first by downloading the annotated txt file from http://bioinfo.life.hust.edu.cn/AnimalTFDB4/#/Download for mouse and saving it to `scenicplus/Mus_musculus_TF.txt`. Then we filter to just the TF names and save as a new txt file for use.

In [None]:
import pandas as pd
import numpy as np
# Read the tab-separated TF table and keep only the 'Symbol' column ...
df = pd.read_csv("Mus_musculus_TF.txt", sep = "\t")
df = df['Symbol']
# ... then write one symbol per line to the file passed to run_scenicplus as tf_file.
np.savetxt(r'Mus_musculus_TF_readytouse.txt', df.values, fmt='%s')

In [None]:
# Keep only the first two columns of the stored GEX PCA.
# NOTE(review): the reason is not visible here — presumably a downstream step expects a 2-D embedding; confirm.
scplus_obj.dr_cell['GEX_X_pca'] = scplus_obj.dr_cell['GEX_X_pca'].iloc[:, 0:2]

In [None]:
from scenicplus.wrappers.run_scenicplus import run_scenicplus
# Run the full SCENIC+ wrapper. If any step fails, the object is still written
# to disk before the error is re-raised, so finished work is not lost.
try:
    run_scenicplus(
        scplus_obj = scplus_obj,
        variable = ['GEX_celltype'],
        species = 'mmusculus',
        assembly = 'mm10',
        tf_file = 'Mus_musculus_TF_readytouse.txt',
        save_path = 'E4AD_1yr/scenicplus',
        biomart_host = biomart_host,
        upstream = [1000, 150000],
        downstream = [1000, 150000],
        calculate_TF_eGRN_correlation = True,
        calculate_DEGs_DARs = True,
        export_to_loom_file = True,
        export_to_UCSC_file = True,
        path_bedToBigBed = '/ru-auth/local/home/amillet/scenicplus',
        n_cpu = 12,
        _temp_dir = "/lustre/fs4/home/amillet/ray_spill")
except Exception:
    # Imported here so the rescue path works even if no earlier cell imported dill.
    import dill
    # The output directory may not exist yet if the failure happened early.
    os.makedirs('E4AD_1yr/scenicplus', exist_ok=True)
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open('E4AD_1yr/scenicplus/scplus_obj.pkl', 'wb') as fh:
        dill.dump(scplus_obj, fh, protocol=-1)
    # Bare raise keeps the original traceback intact.
    raise

[32;1mProcessing:[0m Top 15 region-to-gene links per gene, negative r2g:  91%|█████████ | 1206/1326 [00:39<00:02, 41.33it/s][A
[32;1mProcessing:[0m Top 15 region-to-gene links per gene, negative r2g:  91%|█████████▏| 1213/1326 [00:39<00:02, 47.14it/s][A
[32;1mProcessing:[0m Top 15 region-to-gene links per gene, negative r2g:  92%|█████████▏| 1219/1326 [00:39<00:02, 50.29it/s][A
[32;1mProcessing:[0m Top 15 region-to-gene links per gene, negative r2g:  93%|█████████▎| 1227/1326 [00:39<00:01, 57.33it/s][A
[32;1mProcessing:[0m Top 15 region-to-gene links per gene, negative r2g:  93%|█████████▎| 1233/1326 [00:40<00:01, 58.02it/s][A
[32;1mProcessing:[0m Top 15 region-to-gene links per gene, negative r2g:  94%|█████████▎| 1240/1326 [00:40<00:01, 57.79it/s][A
[32;1mProcessing:[0m Top 15 region-to-gene links per gene, negative r2g:  94%|█████████▍| 1247/1326 [00:40<00:01, 59.36it/s][A
[32;1mProcessing:[0m Top 15 region-to-gene links per gene, negative r2g:  94%|█████████▍

[32;1mProcessing:[0m BASC binarized, negative r2g:  59%|█████▉    | 780/1326 [00:21<00:17, 31.96it/s][A
[32;1mProcessing:[0m BASC binarized, negative r2g:  59%|█████▉    | 784/1326 [00:21<00:17, 30.11it/s][A
[32;1mProcessing:[0m BASC binarized, negative r2g:  59%|█████▉    | 788/1326 [00:21<00:18, 29.25it/s][A
[32;1mProcessing:[0m BASC binarized, negative r2g:  60%|█████▉    | 794/1326 [00:21<00:15, 34.84it/s][A
[32;1mProcessing:[0m BASC binarized, negative r2g:  60%|██████    | 800/1326 [00:21<00:13, 38.89it/s][A
[32;1mProcessing:[0m BASC binarized, negative r2g:  61%|██████    | 805/1326 [00:21<00:14, 36.66it/s][A
[32;1mProcessing:[0m BASC binarized, negative r2g:  61%|██████    | 809/1326 [00:22<00:16, 31.89it/s][A
[32;1mProcessing:[0m BASC binarized, negative r2g:  61%|██████▏   | 813/1326 [00:22<00:16, 31.41it/s][A
[32;1mProcessing:[0m BASC binarized, negative r2g:  62%|██████▏   | 817/1326 [00:22<00:17, 28.61it/s][A
[32;1mProcessing:[0m BASC binarized

[32;1mProcessing:[0m 0.85 quantile, positive r2g:  26%|██▋       | 350/1326 [00:07<00:25, 38.51it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  27%|██▋       | 356/1326 [00:07<00:39, 24.33it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  27%|██▋       | 361/1326 [00:08<00:34, 27.60it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  28%|██▊       | 367/1326 [00:08<00:29, 32.10it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  28%|██▊       | 373/1326 [00:08<00:26, 36.64it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  29%|██▊       | 378/1326 [00:08<00:24, 38.40it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  29%|██▉       | 387/1326 [00:08<00:19, 49.30it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  30%|██▉       | 395/1326 [00:08<00:16, 56.16it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  30%|███       | 404/1326 [00:08<00:14, 63.44it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive

[32;1mProcessing:[0m 0.85 quantile, positive r2g:  95%|█████████▌| 1263/1326 [00:30<00:00, 65.39it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  96%|█████████▌| 1270/1326 [00:31<00:01, 44.52it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  96%|█████████▌| 1276/1326 [00:31<00:01, 43.93it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  97%|█████████▋| 1282/1326 [00:31<00:01, 36.08it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  97%|█████████▋| 1287/1326 [00:31<00:01, 33.83it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  97%|█████████▋| 1292/1326 [00:31<00:00, 36.16it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  98%|█████████▊| 1297/1326 [00:31<00:00, 39.04it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  98%|█████████▊| 1303/1326 [00:31<00:00, 42.36it/s][A
[32;1mProcessing:[0m 0.85 quantile, positive r2g:  99%|█████████▊| 1308/1326 [00:32<00:00, 43.61it/s][A
[32;1mProcessing:[0m 0.85 quantile,

[32;1mProcessing:[0m positive r2g, 0.9 quantile:  78%|███████▊  | 1030/1326 [00:20<00:04, 73.30it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quantile:  78%|███████▊  | 1039/1326 [00:20<00:03, 75.31it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quantile:  79%|███████▉  | 1047/1326 [00:21<00:03, 72.13it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quantile:  80%|███████▉  | 1055/1326 [00:21<00:04, 62.73it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quantile:  80%|████████  | 1062/1326 [00:21<00:04, 56.40it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quantile:  81%|████████  | 1068/1326 [00:21<00:07, 33.94it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quantile:  81%|████████  | 1073/1326 [00:22<00:10, 23.58it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quantile:  81%|████████  | 1077/1326 [00:22<00:12, 20.68it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quantile:  81%|████████▏ | 1080/1326 [00:22<00:11, 21.79it/s][A
[32;1mProcessing:[0m positive r2g, 0.9 quant

[32;1mProcessing:[0m positive r2g, 0.95 quantile:  52%|█████▏    | 687/1326 [00:12<00:08, 79.66it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quantile:  52%|█████▏    | 696/1326 [00:12<00:07, 79.85it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quantile:  53%|█████▎    | 705/1326 [00:13<00:07, 79.52it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quantile:  54%|█████▍    | 716/1326 [00:13<00:07, 86.60it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quantile:  55%|█████▍    | 728/1326 [00:13<00:06, 95.23it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quantile:  56%|█████▌    | 738/1326 [00:13<00:06, 85.01it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quantile:  56%|█████▋    | 747/1326 [00:13<00:09, 61.25it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quantile:  57%|█████▋    | 755/1326 [00:14<00:13, 41.41it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quantile:  57%|█████▋    | 761/1326 [00:14<00:16, 34.41it/s][A
[32;1mProcessing:[0m positive r2g, 0.95 quan

[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  26%|██▌       | 339/1326 [00:11<00:23, 41.97it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  26%|██▌       | 345/1326 [00:11<00:21, 46.51it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  26%|██▋       | 350/1326 [00:12<00:27, 35.18it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  27%|██▋       | 355/1326 [00:12<00:34, 28.55it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  27%|██▋       | 360/1326 [00:12<00:30, 32.00it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  28%|██▊       | 368/1326 [00:12<00:23, 41.07it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  28%|██▊       | 376/1326 [00:12<00:20, 46.52it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  29%|██▉       | 383/1326 [00:

[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  76%|███████▌  | 1006/1326 [00:29<00:06, 48.92it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  76%|███████▋  | 1012/1326 [00:29<00:06, 45.03it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  77%|███████▋  | 1017/1326 [00:30<00:09, 32.13it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  77%|███████▋  | 1021/1326 [00:30<00:09, 32.14it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  78%|███████▊  | 1028/1326 [00:30<00:07, 38.83it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  78%|███████▊  | 1033/1326 [00:30<00:07, 41.06it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  78%|███████▊  | 1039/1326 [00:30<00:06, 42.21it/s][A
[32;1mProcessing:[0m positive r2g, Top 5 region-to-gene links per gene:  79%|███████▊  | 1044/1

[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  13%|█▎        | 176/1326 [00:06<00:57, 20.07it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  13%|█▎        | 179/1326 [00:07<01:11, 15.95it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  14%|█▎        | 181/1326 [00:07<01:20, 14.17it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  14%|█▍        | 183/1326 [00:07<01:16, 14.93it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  14%|█▍        | 186/1326 [00:07<01:10, 16.11it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  14%|█▍        | 188/1326 [00:07<01:09, 16.35it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  14%|█▍        | 190/1326 [00:07<01:10, 16.14it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  15%|█▍        | 193/1

[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  57%|█████▋    | 762/1326 [00:24<00:23, 23.77it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  58%|█████▊    | 768/1326 [00:24<00:17, 31.58it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  58%|█████▊    | 772/1326 [00:25<00:24, 22.27it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  58%|█████▊    | 775/1326 [00:25<00:30, 17.89it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  59%|█████▊    | 778/1326 [00:25<00:36, 15.21it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  59%|█████▉    | 780/1326 [00:25<00:39, 13.75it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  59%|█████▉    | 782/1326 [00:26<00:38, 14.15it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  59%|█████▉    | 784/1

[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  95%|█████████▍| 1254/1326 [00:42<00:02, 25.28it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  95%|█████████▍| 1259/1326 [00:43<00:02, 28.59it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  95%|█████████▌| 1264/1326 [00:43<00:01, 33.31it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  96%|█████████▌| 1268/1326 [00:43<00:02, 28.38it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  96%|█████████▌| 1275/1326 [00:43<00:01, 36.69it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  97%|█████████▋| 1280/1326 [00:43<00:01, 38.83it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  97%|█████████▋| 1285/1326 [00:43<00:01, 39.52it/s][A
[32;1mProcessing:[0m positive r2g, Top 10 region-to-gene links per gene:  97%|█████████▋

[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  35%|███▌      | 465/1326 [00:20<00:38, 22.32it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  35%|███▌      | 468/1326 [00:20<00:41, 20.81it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  36%|███▌      | 471/1326 [00:20<00:37, 22.82it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  36%|███▌      | 474/1326 [00:20<00:34, 24.38it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  36%|███▌      | 478/1326 [00:20<00:30, 27.71it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  36%|███▋      | 483/1326 [00:21<00:26, 32.36it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  37%|███▋      | 488/1326 [00:21<00:24, 34.74it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  37%|███▋      | 492/1

[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  72%|███████▏  | 951/1326 [00:39<00:11, 32.11it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  72%|███████▏  | 955/1326 [00:39<00:10, 33.92it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  72%|███████▏  | 960/1326 [00:39<00:09, 36.74it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  73%|███████▎  | 965/1326 [00:39<00:09, 39.52it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  73%|███████▎  | 971/1326 [00:39<00:08, 43.46it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  74%|███████▎  | 976/1326 [00:39<00:08, 40.46it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  74%|███████▍  | 981/1326 [00:40<00:10, 33.21it/s][A
[32;1mProcessing:[0m positive r2g, Top 15 region-to-gene links per gene:  74%|███████▍  | 985/1

[32;1mProcessing:[0m BASC binarized, positive r2g:   6%|▋         | 86/1326 [00:03<00:53, 23.16it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:   7%|▋         | 89/1326 [00:03<00:55, 22.25it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:   7%|▋         | 92/1326 [00:03<00:57, 21.32it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:   7%|▋         | 95/1326 [00:03<01:12, 17.01it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:   7%|▋         | 97/1326 [00:03<01:17, 15.86it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:   7%|▋         | 99/1326 [00:03<01:20, 15.33it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:   8%|▊         | 101/1326 [00:03<01:16, 16.03it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:   8%|▊         | 103/1326 [00:04<01:28, 13.79it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:   8%|▊         | 106/1326 [00:04<01:16, 15.86it/s][A
[32;1mProcessing:[0m BASC binarized, posi

[32;1mProcessing:[0m BASC binarized, positive r2g:  76%|███████▌  | 1008/1326 [00:23<00:04, 66.58it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:  77%|███████▋  | 1016/1326 [00:24<00:05, 61.83it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:  77%|███████▋  | 1024/1326 [00:24<00:04, 64.71it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:  78%|███████▊  | 1032/1326 [00:24<00:04, 67.51it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:  78%|███████▊  | 1039/1326 [00:24<00:04, 66.72it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:  79%|███████▉  | 1046/1326 [00:24<00:04, 66.79it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:  79%|███████▉  | 1053/1326 [00:24<00:04, 65.13it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:  80%|████████  | 1061/1326 [00:24<00:03, 68.38it/s][A
[32;1mProcessing:[0m BASC binarized, positive r2g:  81%|████████  | 1068/1326 [00:24<00:05, 48.91it/s][A
[32;1mProcessing:[0m BASC 

2022-12-23 17:20:50,743 GSEA         INFO     Subsetting TF2G adjacencies for TF with motif.


2022-12-23 17:20:59,833	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m


2022-12-23 17:21:01,679 GSEA         INFO     Running GSEA...


[2m[36m(_ray_run_gsea_for_e_module pid=15319)[0m   norm_no_tag = 1.0/Nmiss
[2m[36m(_ray_run_gsea_for_e_module pid=15319)[0m   RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
initializing: 100%|██████████| 17869/17869 [05:29<00:00, 54.20it/s] 
Running using 12 cores: 100%|██████████| 10726/10726 [00:58<00:00, 184.81it/s]


2022-12-23 17:27:32,113 GSEA         INFO     Subsetting on adjusted pvalue: 1, minimal NES: 0 and minimal leading edge genes 10
2022-12-23 17:27:33,672 GSEA         INFO     Merging eRegulons
2022-12-23 17:27:34,012 GSEA         INFO     Storing eRegulons in .uns[eRegulons].
2022-12-23 17:27:37,653 SCENIC+_wrapper INFO     Formatting eGRNs
2022-12-23 17:30:38,524 SCENIC+_wrapper INFO     Converting eGRNs to signatures
2022-12-23 17:30:43,468 SCENIC+_wrapper INFO     Calculating eGRNs AUC
2022-12-23 17:30:43,469 SCENIC+_wrapper INFO     Calculating region ranking
2022-12-23 17:32:36,796 SCENIC+_wrapper INFO     Calculating eGRNs region based AUC
2022-12-23 17:32:53,046 SCENIC+_wrapper INFO     Calculating gene ranking
2022-12-23 17:33:00,173 SCENIC+_wrapper INFO     Calculating eGRNs gene based AUC
2022-12-23 17:33:10,871 SCENIC+_wrapper INFO     Calculating TF-eGRNs AUC correlation
2022-12-23 17:33:26,591 SCENIC+_wrapper INFO     Binarizing eGRNs AUC
2022-12-23 17:39:52,798 SCENIC+_wr

... storing 'ACC_celltype' as categorical
... storing 'ACC_sample_id' as categorical


2022-12-23 17:41:12,742 SCENIC+      INFO     Finished calculating DEGs for variable GEX_celltype
2022-12-23 17:41:12,744 SCENIC+      INFO     Calculating DARs for variable GEX_celltype
2022-12-23 17:41:53,013 SCENIC+      INFO     There are 41955 variable features


... storing 'ACC_celltype' as categorical
... storing 'ACC_sample_id' as categorical


2022-12-23 17:42:34,818 SCENIC+      INFO     Finished calculating DARs for variable GEX_celltype
2022-12-23 17:42:34,822 SCENIC+_wrapper INFO     Exporting to loom file
2022-12-23 17:42:34,823 SCENIC+      INFO     Formatting data
2022-12-23 17:42:37,812 SCENIC+      INFO     Creating minimal loom
2022-12-23 17:42:48,901 SCENIC+      INFO     Adding annotations
2022-12-23 17:42:50,442 SCENIC+      INFO     Adding clusterings
2022-12-23 17:42:50,497 SCENIC+      INFO     Adding markers
2022-12-23 17:42:50,767 SCENIC+      INFO     Exporting
2022-12-23 17:42:55,394 SCENIC+      INFO     Formatting data
2022-12-23 17:43:28,258 SCENIC+      INFO     Creating minimal loom
2022-12-23 17:45:32,777 SCENIC+      INFO     Adding annotations
2022-12-23 17:45:50,763 SCENIC+      INFO     Adding clusterings
2022-12-23 17:45:50,862 SCENIC+      INFO     Adding markers
2022-12-23 17:45:54,601 SCENIC+      INFO     Exporting
2022-12-23 17:46:58,177 SCENIC+_wrapper INFO     Exporting to UCSC
2022-12-2

Note: for this code to run, I had to edit line 174 of `src/scenicplus/loom.py` from `), columns=cv.get_feature_names(), index=regulons.keys())` to `), columns=cv.get_feature_names_out(), index=regulons.keys())`, because newer scikit-learn releases replaced `get_feature_names` with `get_feature_names_out`. See: https://github.com/aertslab/scenicplus/issues/76

In [None]:
# Display the object's representation as the cell output.
scplus_obj

## FINALLY time to explore the data!

In [None]:
import warnings
# Silence FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
# Keep a reference to the current stderr and open a handle on the null device.
# NOTE(review): neither name is used in the cells that follow — presumably meant for
# temporarily redirecting stderr; confirm, and close `null` once it is no longer needed.
_stderr = sys.stderr
null = open(os.devnull,'wb')

In [None]:
import dill
# Reload the SCENIC+ object written by the wrapper run. The context manager
# closes the file handle once loading is done.
# Only unpickle files you produced yourself: loading a pickle can execute code.
with open('E4AD_1yr/scenicplus/scplus_obj.pkl', 'rb') as fh:
    scplus_obj = dill.load(fh)

In [None]:
from scenicplus.preprocessing.filtering import apply_std_filtering_to_eRegulons
# Apply SCENIC+'s standard eRegulon filtering to the object.
# NOTE(review): presumably this creates the '*_filtered' entries in `.uns` used by the following cells — confirm.
apply_std_filtering_to_eRegulons(scplus_obj)

In [None]:
work_dir = "E4AD_1yr"
from scenicplus.eregulon_enrichment import score_eRegulons

# Load the rankings calculated by the wrapper function. The context managers
# close each file handle as soon as the pickle has been read.
with open(os.path.join(work_dir, 'scenicplus/region_ranking.pkl'), 'rb') as fh:
    region_ranking = dill.load(fh)
with open(os.path.join(work_dir, 'scenicplus/gene_ranking.pkl'), 'rb') as fh:
    gene_ranking = dill.load(fh)

# Score the filtered eRegulon signatures, once on the region ranking and once
# on the gene ranking. Both calls store their result under 'eRegulon_AUC_filtered'.
score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures_filtered',
                key_added = 'eRegulon_AUC_filtered',
                enrichment_type = 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = 5)
score_eRegulons(scplus_obj,
                ranking = gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures_filtered',
                key_added = 'eRegulon_AUC_filtered',
                enrichment_type = 'gene',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = 5)

In [None]:
from scenicplus.dimensionality_reduction import run_eRegulons_tsne, run_eRegulons_umap
# Recompute both embeddings from the filtered AUC values. The reduction names
# are reused on purpose so previously calculated UMAP/tSNE results are overwritten.
for embed_fn, embed_name in (
    (run_eRegulons_umap, 'eRegulons_UMAP'),
    (run_eRegulons_tsne, 'eRegulons_tSNE'),
):
    embed_fn(
        scplus_obj = scplus_obj,
        auc_key = 'eRegulon_AUC_filtered',
        reduction_name = embed_name,
    )

In [None]:
# Count how many entries of the cell metadata carry each 'GEX_celltype' label.
scplus_obj.metadata_cell['GEX_celltype'].value_counts()

In [None]:
from scenicplus.dimensionality_reduction import plot_metadata_given_ax
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Colour assigned to each cell-type label.
color_dict = {
    'Homeostatic Microglia': "#065143",
    'Selplg-lo Microglia': "#70B77E",
    'mt-Enriched Microglia': "#E0A890",
    'mt-Depleted Microglia': "#053C5E",
    'DAM-1': "#F56476",
    'DAM-2': "#CE1483" ,
    'TIMs': "#38A3A5",
    'Siglech-hi Microglia': "#80ED99"
}

# One panel per embedding: UMAP on the left, tSNE on the right.
fig, axs = plt.subplots(ncols=2, figsize = (16, 8))
for panel_ax, embedding in zip(axs, ('eRegulons_UMAP', 'eRegulons_tSNE')):
    plot_metadata_given_ax(
        scplus_obj=scplus_obj,
        ax = panel_ax,
        reduction_name = embedding,
        # The GEX_ prefix marks metadata that came from the gene expression
        # side, where the cell type annotation was made.
        variable = 'GEX_celltype',
        color_dictionary={'GEX_celltype': color_dict}
    )
fig.tight_layout()
# Remove the top and right edge of each axis border.
for panel_ax in axs:
    sns.despine(ax = panel_ax)
plt.show()

In [None]:
from scenicplus.dimensionality_reduction import plot_eRegulon
# Plot five hand-picked eRegulons on the 'eRegulons_UMAP' reduction, using the
# filtered AUC values with scaling enabled.
plot_eRegulon(
    scplus_obj = scplus_obj,
    reduction_name = 'eRegulons_UMAP',
    selected_regulons = ['Fos_+', 'Jun_+', 'Fosb_+', 'Klf4_+', 'Egr3_+'],
    scale = True,
    auc_key = 'eRegulon_AUC_filtered')

In [None]:
# Show the rows of the filtered eRegulon metadata whose TF is "Jun".
# Note: `df` is reassigned to a different table by a later cell.
df = scplus_obj.uns['eRegulon_metadata_filtered']
df[df.TF == "Jun"]

In [None]:
from scenicplus.dimensionality_reduction import plot_AUC_given_ax

# Plot the gene-based AUC of one eRegulon on the 'eRegulons_tSNE' reduction.
fig, ax = plt.subplots(figsize = (8,8))
plot_AUC_given_ax(
    scplus_obj = scplus_obj,
    reduction_name = 'eRegulons_tSNE',
    # NOTE(review): the full name, including the "(73g)" suffix, is hard-coded;
    # presumably the suffix changes if the eRegulons are recomputed — confirm.
    feature = 'Jun_+_(73g)',
    ax = ax,
    auc_key = 'eRegulon_AUC_filtered',
    signature_key = 'Gene_based')
sns.despine(ax = ax)
plt.show()

In [None]:
from scenicplus.cistromes import TF_cistrome_correlation, generate_pseudobulks

# Build pseudobulks per 'GEX_celltype' for the gene-based and the region-based
# signatures. All pseudobulks are generated before any correlation is computed.
for sig_key in ('Gene_based', 'Region_based'):
    generate_pseudobulks(
        scplus_obj = scplus_obj,
        variable = 'GEX_celltype',
        auc_key = 'eRegulon_AUC_filtered',
        signature_key = sig_key)

# Correlate on the pseudobulks and store each result under its own output key.
for sig_key, result_key in (
    ('Gene_based', 'filtered_gene_based'),
    ('Region_based', 'filtered_region_based'),
):
    TF_cistrome_correlation(
        scplus_obj,
        use_pseudobulk = True,
        variable = 'GEX_celltype',
        auc_key = 'eRegulon_AUC_filtered',
        signature_key = sig_key,
        out_key = result_key)

In [None]:
# Preview the first rows of the region-based TF-cistrome correlation table.
scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based'].head()

In [None]:
import numpy as np
import seaborn as sns

# Region-based TF-cistrome correlation table stored on the object.
cistrome_corr = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']
# Cistrome names are parsed as "<name>(<N>r)"; N is taken as the number of target regions.
n_targets = [int(x.split('(')[1].replace('r)', '')) for x in cistrome_corr['Cistrome']]
rho = cistrome_corr['Rho'].to_list()
adj_pval = cistrome_corr['Adjusted_p-value'].to_list()

# Correlation cut-offs drawn on the plot and reused for the eRegulon selection.
thresholds = {
        'rho': [-0.15, 0.15],
        'n_targets': 0
}
fig, ax = plt.subplots(figsize = (10, 5))
# The handle is deliberately not called `sc`: that would overwrite the scanpy
# alias imported at the top of the notebook and break any later `sc.` call.
scatter = ax.scatter(rho, n_targets, c = -np.log10(adj_pval), s = 5)
ax.set_xlabel('Correlation coefficient')
ax.set_ylabel('nr. target regions')
#ax.hlines(y = thresholds['n_targets'], xmin = min(rho), xmax = max(rho), color = 'black', ls = 'dashed', lw = 1)
ax.vlines(x = thresholds['rho'], ymin = 0, ymax = max(n_targets), color = 'black', ls = 'dashed', lw = 1)
ax.text(x = thresholds['rho'][0], y = max(n_targets), s = str(thresholds['rho'][0]))
ax.text(x = thresholds['rho'][1], y = max(n_targets), s = str(thresholds['rho'][1]))
sns.despine(ax = ax)
fig.colorbar(scatter, label = '-log10(adjusted_pvalue)', ax = ax)
plt.show()

In [None]:
# Keep the cistromes whose correlation lies outside the [low, high] rho band.
corr_table = scplus_obj.uns['TF_cistrome_correlation']['filtered_region_based']
rho_low, rho_high = thresholds['rho'][0], thresholds['rho'][1]
outside_band = (corr_table['Rho'] > rho_high) | (corr_table['Rho'] < rho_low)
selected_cistromes = corr_table.loc[outside_band]['Cistrome'].to_list()

# Names are compared on the part before "_(".
selected_eRegulons = [x.split('_(')[0] for x in selected_cistromes]
filtered_signatures = scplus_obj.uns['eRegulon_signatures_filtered']
selected_eRegulons_gene_sig = [
        name for name in filtered_signatures['Gene_based'].keys()
        if name.split('_(')[0] in selected_eRegulons]
selected_eRegulons_region_sig = [
        name for name in filtered_signatures['Region_based'].keys()
        if name.split('_(')[0] in selected_eRegulons]

# Save the results in the scenicplus object.
scplus_obj.uns['selected_eRegulon'] = {
        'Gene_based': selected_eRegulons_gene_sig,
        'Region_based': selected_eRegulons_region_sig}
print(f'selected: {len(selected_eRegulons_gene_sig)} eRegulons')

In [None]:
# Persist the object with the eRegulon selection. The context manager flushes
# and closes the file even if pickling fails part-way.
with open(os.path.join(work_dir, 'scenicplus/scplus_obj.pkl'), 'wb') as fh:
    dill.dump(scplus_obj, fh, protocol=-1)

In [None]:
from scenicplus.plotting.dotplot import heatmap_dotplot
# Dot plot of the selected eRegulons per cell type, saved to "eregulon_dotplot.png".
heatmap_dotplot(
        scplus_obj = scplus_obj,
        size_matrix = scplus_obj.uns['eRegulon_AUC_filtered']['Region_based'], #specify what to plot as dot sizes, target region enrichment in this case
        color_matrix = scplus_obj.to_df('EXP'), #specify what to plot as colors, TF expression in this case
        scale_size_matrix = True,
        scale_color_matrix = True,
        group_variable = 'GEX_celltype',
        subset_eRegulons = scplus_obj.uns['selected_eRegulon']['Gene_based'],
        index_order = ['Homeostatic Microglia', 'Selplg-lo Microglia', 'mt-Enriched Microglia', 'mt-Depleted Microglia', 'DAM-1', 'DAM-2', 'TIMs', 'Siglech-hi Microglia'],
        figsize = (15, 20),
        orientation = 'vertical',
        split_repressor_activator = False, # set to False since we get no repressors
        save = "eregulon_dotplot.png")

In [None]:
from scenicplus.plotting.dotplot import generate_dotplot_df
# Build the dot plot data as a table, with the same inputs as the dot plot call.
# Note: this reassigns `df`, which earlier held the filtered eRegulon metadata.
df = generate_dotplot_df(scplus_obj = scplus_obj,
        size_matrix = scplus_obj.uns['eRegulon_AUC_filtered']['Region_based'], #specify what to plot as dot sizes, target region enrichment in this case
        color_matrix = scplus_obj.to_df('EXP'), #specify what to plot as colors, TF expression in this case
        scale_size_matrix = True,
        scale_color_matrix = True,
        group_variable = 'GEX_celltype',
        subset_eRegulons = scplus_obj.uns['selected_eRegulon']['Gene_based'],)

In [None]:
# Order in which the cell types are listed.
index_order = ['Homeostatic Microglia', 'Selplg-lo Microglia', 'mt-Enriched Microglia', 'mt-Depleted Microglia', 'DAM-1', 'DAM-2', 'TIMs', 'Siglech-hi Microglia']
# Mean 'color_val' per (group, eRegulon) pair; missing combinations become 0.
tmp = df[['index', 'eRegulon_name', 'color_val']
        ].pivot_table(index = 'index', columns = 'eRegulon_name'
        ).fillna(0)['color_val']
tmp = tmp.loc[index_order]
# For every eRegulon, the group in which its color value is highest.
idx_max = tmp.idxmax(axis = 0)
# Concatenate the eRegulons group by group, following `index_order`. Groups
# that are not the maximum for any eRegulon are skipped. The previous test,
# `len(df[df == x]) > 0`, was always true and compared the entire frame
# element-wise for every group.
per_group = [idx_max[idx_max == x] for x in tmp.index.tolist() if (idx_max == x).any()]
order = pd.concat(per_group).index.tolist() if per_group else []

In [None]:
# Export the dot plot table and the eRegulon ordering as CSV files in the working directory.
df.to_csv("eregulons_df.csv")
pd.DataFrame(order).to_csv("regulon_order.csv")

In [None]:
from scenicplus.RSS import *
# Regulon specificity scores per 'GEX_celltype' on the region-based signatures.
# eRegulons carrying the "_-_" sign token are left out. Testing for a bare '-'
# would also drop any eRegulon whose TF symbol contains a hyphen.
regulon_specificity_scores(
        scplus_obj,
        variable = 'GEX_celltype',
        auc_key = 'eRegulon_AUC_filtered',
        signature_keys = ['Region_based'],
        selected_regulons = [x for x in scplus_obj.uns['selected_eRegulon']['Region_based'] if '_-_' not in x],
        out_key_suffix = '_filtered')

In [None]:
# Plot the specificity scores stored under 'GEX_celltype_filtered' (top_n=10, two columns of panels).
plot_rss(scplus_obj, 'GEX_celltype_filtered', num_columns=2, top_n=10, figsize = (10, 20))

In [None]:
# Flatten one level of nesting: [[a, b], [c]] -> [a, b, c].
flat_list = lambda t: [item for sublist in t for item in sublist]
rss_table = scplus_obj.uns['RSS']['GEX_celltype_filtered']
# Union of the 10 highest-scoring eRegulons of every row of the RSS table.
# The result is sorted because a bare list(set(...)) of strings comes out in a
# different order in every kernel session, which makes reruns irreproducible.
selected_markers = sorted(set(flat_list(
    [rss_table.loc[celltype].sort_values(ascending = False).head(10).index.to_list()
    for celltype in rss_table.index])))

In [None]:
from scenicplus.plotting.correlation_plot import *

# Heatmap of the overlap ('intersect' method) between the region-based
# signatures of the selected marker eRegulons; the underlying data is returned.
# NOTE(review): `region_intersetc_data` looks like a typo for "intersect"; the
# name is kept because later cells may refer to it.
region_intersetc_data, Z = jaccard_heatmap(
        scplus_obj,
        method = 'intersect',
        gene_or_region_based = 'Region_based',
        use_plotly = False,
        selected_regulons = selected_markers,
        signature_key = 'eRegulon_signatures_filtered',
        figsize = (10, 10), return_data = True, vmax = 0.5, cmap = 'plasma')

## Perturbation simulation

In [None]:
from scenicplus.dimensionality_reduction import run_eRegulons_pca
# PCA on the filtered AUC values of the selected gene-based eRegulons, stored
# under the reduction name 'eRegulons_PCA_gene_based'.
run_eRegulons_pca(
        scplus_obj,
        auc_key = 'eRegulon_AUC_filtered',
        reduction_name = 'eRegulons_PCA_gene_based',
        selected_regulons = scplus_obj.uns['selected_eRegulon']['Gene_based'])

In [None]:
from pycisTopic.diff_features import find_highly_variable_features
# Restrict the expression table to the genes listed in the filtered eRegulon
# metadata, transpose it, and request the top 200 highly variable features.
hvg = find_highly_variable_features(scplus_obj.to_df('EXP')[list(set(scplus_obj.uns['eRegulon_metadata_filtered']['Gene']))].T, n_top_features = 200, plot = False)

In [None]:
# Coarse groups and the fine-grained cell-type labels collapsed into each.
summary_groups = {
    "Homeostatic Microglia": [
        "Homeostatic Microglia",
        "Selplg-lo Microglia",
        "mt-Enriched Microglia",
        "Siglech-hi Microglia",
        "mt-Depleted Microglia",
    ],
    "DAMs": ["DAM-2", "DAM-1"],
    "TIMs": ["TIMs"],
}
# Invert the grouping into a fine label -> coarse label lookup.
mapping = {
    fine_label: coarse_label
    for coarse_label, fine_labels in summary_groups.items()
    for fine_label in fine_labels
}
scplus_obj.metadata_cell['summarized_celltype'] = scplus_obj.metadata_cell['GEX_celltype'].map(mapping)

# Colour assigned to each summarized label.
mapping_color_dict = {
    'Homeostatic Microglia': "#1ebd70",
    'DAMs': "#CE1483" ,
    'TIMs': "#e0a038",
}

In [None]:
from typing import Optional
from typing import List
import logging
def plot_perturbation_effect_in_embedding_custom(
    scplus_obj: 'SCENICPLUS', 
    reduction_name: str, 
    variable: str,
    calculate_perturbed_auc_values: bool = True,
    AUC_key: str = 'eRegulon_AUC',
    perturbed_matrix: Optional[pd.DataFrame] = None, 
    perturbation: Optional[dict] = None, 
    eRegulon_metadata_key: Optional[str] = 'eRegulon_metadata',
    eRegulon_signatures_key: str = 'eRegulon_signatures',
    n_iter: Optional[int] = 5, 
    regressors: Optional[dict] = None, 
    genes_to_use: Optional[List] = None, 
    regressor_type: Optional[str] = 'GBM',
    regressor_kwargs: Optional[dict] = None, 
    eRegulons_to_use: Optional[List] = None, 
    grid_offset_frac: Optional[float] = 0.005,
    grid_n_cols: Optional[int] = 25,
    grid_n_rows: Optional[int] = 25,
    grid_n_neighbors: Optional[int] = 25,
    n_cpu: Optional[int] = 1,
    figsize: Optional[tuple] = (6.4, 4.8),
    save: Optional[str] = None,
    density: Optional[float] = 3,
    arrow_width: Optional[float] = 0.5,
    arrowsize: Optional[float] = 1,
    **kwargs):
    """
    Plot a dimensionality reduction with the simulated perturbation effect drawn
    as stream lines on a regular grid.

    The function relies on names that must already exist in the notebook
    namespace when it is called: simulate_perturbation, _make_rankings,
    score_eRegulons, _project_perturbation_in_embedding, _calculate_grid_arrows,
    plot_metadata_given_ax, plt, np, matplotlib and sys.

    Parameters
    ----------
    scplus_obj: `class::SCENICPLUS`
        A SCENICPLUS object.
    reduction_name: str
        Name of the dimensionality reduction on which to plot the perturbation effect.
        Should be included in scplus_obj.dr_cell.keys()
    variable: str
        Categorical variable by which to color cells.
    calculate_perturbed_auc_values: bool, optional
        Specify whether eRegulon AUC values should be calculated from the perturbed matrix.
        When True the perturbed AUC values are compared against scplus_obj.uns[AUC_key]['Gene_based'];
        when False the perturbed matrix is compared against scplus_obj.to_df('EXP').
    AUC_key: str, optional
        In case calculate_perturbed_auc_values is set to True, key under which to find non-perturbed AUC values.
    perturbed_matrix: pd.DataFrame, optional
        Perturbed gene expression matrix, calculated using the simulate_perturbation function. 
        If set to None, this will be calculated.
    perturbation: dict, optional
        Dictionary specifying perturbation to simulate, has to be provided when perturbed_matrix is set to None.
        Example: {"SOX10": 0}.
    eRegulon_metadata_key: str, optional
        Key in scplus_obj.uns.keys() under which to find the eRegulon metadata.
    eRegulon_signatures_key: str, optional
        Key in scplus_obj.uns.keys() under which to find the eRegulon signatures.
    n_iter: int
        Number of iterations to simulate. Default is 5
    regressors: dict
        Dictionary of regressors as generated by train_gene_expression_models. 
        If set to None, this dictionary will be generated internally.
    genes_to_use: List
        List of genes for which to train the regression models (forwarded to
        simulate_perturbation as `genes`). Default uses all genes.
    regressor_type: str
        Method to use for regression, options are GBM (Gradient Boosting Machine) and RF (Random Forest).
    regressor_kwargs: dict
        Keyword arguments containing parameters to use for training the regression model.
    eRegulons_to_use: List
        List of eRegulons to consider as predictors. Default uses all eRegulons in scplus_obj.uns[eRegulon_metadata_key]
    grid_offset_frac: float
        Fraction of whitespace to use surrounding the plot for plotting the arrows.
    grid_n_cols: int
        Number of columns to plot the grid of arrows
    grid_n_rows: int
        Number of rows to plot the grid of arrows
    grid_n_neighbors: int
        Number of neighbors to consider when calculating the grid of arrows.
    n_cpu: int
        Number of cpus to use.
    figsize: tuple
        Tuple indicating the size of the plot
    save: str
        Path where to save the figure (written at 600 dpi). If None the figure is shown instead.
    density: float
        Passed to ax.streamplot as `density`.
    arrow_width: float
        Passed to ax.streamplot as `linewidth`.
    arrowsize: float
        Passed to ax.streamplot as `arrowsize`.
    **kwargs
        Forwarded unchanged to plot_metadata_given_ax.

    Returns
    -------
    The axes the embedding and stream lines were drawn on. It is returned both
    when the figure is shown and when it is saved.

    Raises
    ------
    ValueError
        If both perturbed_matrix and perturbation are None.
    """

    # basicConfig leaves an already configured root logger untouched, so repeated calls are harmless.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        handlers=[logging.StreamHandler(stream=sys.stdout)])
    log = logging.getLogger('perturbation')

    if perturbed_matrix is None:
        if perturbation is None:
            raise ValueError("Please provide a perturbation, by setting the perturbation parameter to {<TF>: <new_expression_value>}")
        log.info(f'Caclulating perturbation matrix for: {perturbation} over {n_iter} iterations.')
        perturbed_matrix = simulate_perturbation(
            scplus_obj = scplus_obj,
            perturbation = perturbation, 
            eRegulon_metadata_key = eRegulon_metadata_key,
            n_iter = n_iter, 
            regressors = regressors, 
            genes = genes_to_use, 
            regressor_type = regressor_type,
            regressor_kwargs = regressor_kwargs, 
            eRegulons_to_use = eRegulons_to_use, 
            keep_intermediate = False)
    
    if calculate_perturbed_auc_values:
        # Replace the perturbed expression matrix by eRegulon scores derived from it,
        # so it is comparable with the unperturbed AUC matrix used below.
        log.info('Generating ranking based on perturbed matrix.')
        perturbed_ranking = _make_rankings(perturbed_matrix)
        log.info('Scoring eRegulons.')
        perturbed_matrix = score_eRegulons(
            scplus_obj = scplus_obj,
            ranking = perturbed_ranking,
            eRegulon_signatures_key = eRegulon_signatures_key,
            enrichment_type = 'gene',
            inplace = False,
            n_cpu = n_cpu)

    log.info(f'Projecting perturbation effect in embedding: {reduction_name}')
    delta_embedding = _project_perturbation_in_embedding(
        scplus_obj = scplus_obj, 
        original_matrix = scplus_obj.uns[AUC_key]['Gene_based'] if calculate_perturbed_auc_values else scplus_obj.to_df('EXP'), 
        perturbed_matrix = perturbed_matrix, 
        reduction_name = reduction_name, 
        sigma_corr = 0.05, 
        n_cpu = n_cpu)

    log.info('Calculating grid of arrows')
    embedding = scplus_obj.dr_cell[reduction_name].to_numpy()
    grid_xy, uv, mask = _calculate_grid_arrows(
        embedding=embedding, 
        delta_embedding=delta_embedding,
        offset_frac=grid_offset_frac,
        n_grid_cols=grid_n_cols,
        n_grid_rows=grid_n_rows,
        n_neighbors=grid_n_neighbors,
        n_cpu=n_cpu)

    # Arrow length per grid point, min-max scaled to [0, 1]. Computed once and
    # vectorised; the previous per-element min()/max() was quadratic and ran twice.
    distances = np.sqrt((uv**2).sum(1))
    scaled_distances = (distances - distances.min()) / (distances.max() - distances.min())

    # Arrows shorter than this scaled length are hidden; the same value is the
    # lower bound of the colour normalisation.
    min_scaled_length = 0.15
    norm = matplotlib.colors.Normalize(vmin=min_scaled_length, vmax=0.5, clip=True)
    uv[np.logical_or(~mask, scaled_distances < min_scaled_length)] = np.nan

    log.info('Plotting')
    fig, ax = plt.subplots(figsize=figsize)
    ax = plot_metadata_given_ax(
        scplus_obj=scplus_obj,
        reduction_name=reduction_name,
        ax = ax,
        variable = variable,
        show_label = False,
        show_legend = True,
        **kwargs)

    grid_shape = (grid_n_cols, grid_n_rows)
    grid_xy_2d = grid_xy.reshape(*grid_shape, 2)
    uv_2d = uv.reshape(*grid_shape, 2)
    ax.streamplot(
            grid_xy_2d[:, :, 0],
            grid_xy_2d[:, :, 1],
            uv_2d[:, :, 0],
            uv_2d[:, :, 1], 
            density = density, 
            color = scaled_distances.reshape(grid_shape),
            cmap = 'Greys', 
            zorder = 10, 
            norm = norm,
            linewidth = arrow_width,
            arrowsize = arrowsize)
    ax.grid(False)
    if save is not None:
        fig.savefig(save,dpi=600)
    else:
        # plt.show() takes no figure argument (only the keyword `block`).
        plt.show()
    return ax

In [None]:
import scenicplus
import scenicplus.simulation
from scenicplus.simulation import _project_perturbation_in_embedding
from scenicplus.simulation import _calculate_grid_arrows
import seaborn as sns
import matplotlib
# Simulated Fos knock-out, drawn on the gene-based eRegulon PCA and written to disk.
_ = plot_perturbation_effect_in_embedding_custom(
    scplus_obj=scplus_obj,
    perturbation={'Fos': 0},  # set the expression of Fos to 0 in all cells
    genes_to_use=hvg,
    n_cpu=5,
    reduction_name='eRegulons_PCA_gene_based',
    variable='summarized_celltype',
    color_dictionary={'summarized_celltype': mapping_color_dict},
    dot_size=15,
    figsize=(5, 5),
    density=1,
    arrow_width=4,
    arrowsize=2,
    save="E4AD_1yr/fos_ko.png",
)

In [None]:
# Simulated knock-outs that share identical plot settings. One loop replaces four
# copy-pasted cells; only the transcription factor and the output file name vary.
# Output files: E4AD_1yr/egr1_ko.png, sox5_ko.png, klf4_ko.png, nfkb2_ko.png.
# NOTE(review): `regressors` is left unset, so per the function's docstring the
# regression models are generated inside every call — consider training them once
# and passing them in via `regressors` if the run time matters.
for knocked_out_tf in ('Egr1', 'Sox5', 'Klf4', 'Nfkb2'):
    _ = plot_perturbation_effect_in_embedding_custom(
            scplus_obj = scplus_obj,
            reduction_name = 'eRegulons_PCA_gene_based',
            n_cpu = 5,
            perturbation = {knocked_out_tf: 0}, #set the expression of this TF to 0 in all cells.
            variable = 'summarized_celltype',
            color_dictionary = {'summarized_celltype': mapping_color_dict},
            genes_to_use = hvg,
            figsize = (5, 5),
            density = 1.1,
            arrow_width = 3.5,
            arrowsize = 2.1,
            dot_size = 15,
            save = f"E4AD_1yr/{knocked_out_tf.lower()}_ko.png")