In [1]:
#import everything

In [3]:
import sys
import scanpy as sc
import pandas as pd
import numpy as np
import os, glob
import pickle

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons, _distributed_calc
from pyscenic.aucell import aucell
from dask.diagnostics import ProgressBar
from distributed import LocalCluster, Client
import seaborn as sns

import logging

ModuleNotFoundError: No module named 'yaml'

In [None]:
#ensure correct path to kernel where packages were installed
sys.executable

In [3]:
#must be ver 1.0.0
import dask
dask.__version__

'1.0.0'

In [4]:
#must NOT be version 1.0 or newer
pd.__version__

'0.25.3'

In [5]:
#must be ver 1.28.1
import distributed
distributed.__version__

'1.28.1'

In [6]:
#easiest to import h5ad files
adata = sc.read_h5ad("Enterocyte_Lineage_Cluster_Counts.h5ad")

In [7]:
#check properties, here we have 35k cells by 35k genes
adata

AnnData object with n_obs × n_vars = 35589 × 35966 
    obs: 'ADTYPE', 'ATYPIA', 'PDIM', 'PTYPE', 'SIZEINVIVO', 'SEGMENT', 'BATCH', 'leiden', 'Enterocyte_Lineage', 'Secretory_Lineage', 'Enteroendocrine', 'Tuft', 'Tumor', 'Hyperplastic_Lineage'

In [8]:
#check current directory
!pwd

/home/bob/Dropbox (VU Basic Sciences)/Collaboration/PCA/Cumulative_SCENIC


In [9]:
# ---- Folders -----------------------------------------------------------------
# Output directory: every result file of this run is written here.
DATA_FOLDER = "/home/bob/Dropbox (VU Basic Sciences)/Collaboration/PCA/Cumulative_SCENIC/Cumulative_SCENIC_out_ent_lin/"
# Resource directory: location of the TF list (..tfs.txt) and the motif annotation .tbl file.
RESOURCES_FOLDER = "/home/bob/SCENIC/pySCENIC/resources/"
# Database directory: location of the .feather ranking databases.
DATABASE_FOLDER = "/home/bob/SCENIC/pySCENIC/databases/"

# ---- Inputs ------------------------------------------------------------------
# A single glob pattern that matches all hg19 feather databases at once.
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "hg19-*.mc9nr.feather")
# Motif annotations file.
MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.hgnc-m0.001-o0.0.tbl")
# Transcription factor list file.
HS_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'hs_hgnc_tfs.txt')

# ---- Outputs (all written inside DATA_FOLDER) --------------------------------
# Adjacency results from the first SCENIC step.
ADJACENCIES_FNAME = os.path.join(DATA_FOLDER, "7_10_species_adj.csv")
# Detected modules (pickle).
MODULES_FNAME = os.path.join(DATA_FOLDER, "7_10_species_modules.p")
# Detected motifs.
MOTIFS_FNAME = os.path.join(DATA_FOLDER, "7_10_species_motifs.csv")
# Regulon enrichments (pickle).
REGULONS_FNAME = os.path.join(DATA_FOLDER, "7_10_species_regulons.p")
# Relative regulon values; this is what is typically used for SCENIC-based UMAPs.
AUC_FNAME = os.path.join(DATA_FOLDER, "7_10_species_aucell.csv")

In [10]:
# Load the TF list and locate the ranking databases. The `dbs` output below should list one
# FeatherRankingDatabase per .feather file (six here); make sure you are importing the mouse
# or the human set depending on the situation.
tf_names = load_tf_names(HS_TFS_FNAME)
# glob returns matches in arbitrary order, so sort for a reproducible database order.
db_fnames = sorted(glob.glob(DATABASES_GLOB))
if not db_fnames:
    # Fail here instead of carrying an empty database list into the pruning step.
    raise FileNotFoundError("No ranking databases matched: " + DATABASES_GLOB)
def name(fname):
    """Return the file name of ``fname`` without its directories and without its final extension."""
    base = os.path.basename(fname)
    stem, _ext = os.path.splitext(base)
    return stem
# One ranking database object per matched file, named after the file name minus its final extension.
dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
# Display the list as a sanity check that all databases were picked up.
dbs

[FeatherRankingDatabase(name="hg19-tss-centered-5kb-7species.mc9nr"),
 FeatherRankingDatabase(name="hg19-tss-centered-10kb-7species.mc9nr"),
 FeatherRankingDatabase(name="hg19-500bp-upstream-10species.mc9nr"),
 FeatherRankingDatabase(name="hg19-500bp-upstream-7species.mc9nr"),
 FeatherRankingDatabase(name="hg19-tss-centered-10kb-10species.mc9nr"),
 FeatherRankingDatabase(name="hg19-tss-centered-5kb-10species.mc9nr")]

In [11]:
# Convert the imported AnnData object into a pandas dataframe (rows = obs_names, columns = var_names).
# AnnData.X can be a scipy sparse matrix, which pd.DataFrame does not turn into a numeric table,
# so densify it first whenever it exposes .toarray(). The matrix is passed inline (no extra name)
# so that no additional reference to it lingers in the notebook namespace.
data_in = pd.DataFrame(
    adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X,
    index=adata.obs_names,
    columns=adata.var_names,
).astype(int)

In [12]:
#check dataframe size
data_in.shape

(35589, 35966)

In [13]:
# Free up memory by dropping the AnnData object; only the dataframe is needed from this point on.
# Rebinding to None (instead of the int 0) releases the reference just the same, does not reuse
# the name for an unrelated type, and keeps the cell safe to re-run.
adata = None

In [14]:
# Start a local Dask cluster so the co-expression step can run in parallel.
# Sizing guidance:
#   - n_workers should equal the number of cores
#   - threads_per_worker should be the number of threads per core
#   - n_workers * threads_per_worker should equal the total number of threads
#     (your mac mini probably has something like 12 threads total)
#   - the memory limit scales per core; threads_per_worker is not incorporated
#   - best to divide the 64GB of RAM evenly across the number of workers
# With bokeh installed, an interactive progress bar interface is available at localhost:2345.
# NOTE(review): the values below (4 workers x 8 threads, 20000MB each) exceed the
# 12-thread / 64GB figures quoted above -- confirm they match the machine in use.
local_cluster = LocalCluster(
    n_workers=4,
    threads_per_worker=8,
    dashboard_address=':2345',
    memory_limit='20000MB',
)  # lots of sleeping processes
custom_client_coexp = Client(local_cluster)  # might not need this

In [None]:
%%time  #this should take the longest time, several hours
adj = grnboost2(data_in, tf_names=tf_names, verbose=True,client_or_address=custom_client_coexp)

preparing dask client
parsing input
creating dask graph


  expression_matrix = expression_data.as_matrix()


4 partitions
computing dask graph


In [None]:
adj.head() #check to see if it outputted properly

In [None]:
# Save the adjacencies (tab-separated) to the previously defined location.
# Create the output folder first so the multi-hour result is not lost to a missing directory.
os.makedirs(os.path.dirname(ADJACENCIES_FNAME), exist_ok=True)
adj.to_csv(ADJACENCIES_FNAME, index=False, sep='\t')

In [None]:
%%time
# Create the modules from the adjacencies; with default settings this takes about 3 minutes.
# A bunch of nanny warnings show up if this runs in the same notebook as the co-expression step.
modules = list(modules_from_adjacencies(adj, data_in))

In [None]:
#

In [None]:
with open(MODULES_FNAME, "wb") as f: #save the modules as a formatted 'pickle' file
    pickle.dump(modules, f)

In [None]:
len(modules) #check length

In [None]:
# Close the Dask client first, then the cluster, so the client disconnects while the
# scheduler it talks to is still running.
custom_client_coexp.close()
local_cluster.close()

In [None]:
####
#IT IS BEST TO COMPLETELY RESTART THE KERNEL AT THIS POINT TO FREE UP MEMORY
#ALL PREVIOUS IMPORTS MUST BE REDONE (along with the path constants and ranking databases used below); skip the adjacency calculation steps, jump directly back to here, and start from the module file saved to disk previously
###

In [10]:
MODULES_FNAME #check save location again

'/home/bob/Dropbox (VU Basic Sciences)/Collaboration/PCA/Cumulative_SCENIC/Cumulative_SCENIC_out_ent_lin/7_10_species_modules.p'

In [8]:
# Load the modules back from the 'pickle' file written earlier.
# Unpickling can execute arbitrary code, so only load pickle files you created yourself.
with open(MODULES_FNAME, 'rb') as f:
    modules = pickle.load(f)

In [9]:
len(modules) #ensure it's the same length as expected

10512

In [None]:
#fire up another dask cluster; n_workers * threads_per_worker must equal 12, I'm guessing, for your mac mini.
#This part is very memory hungry and will either freeze or time out if not enough memory is set per worker.
#To be on the safe side, maybe stick to 4 workers or fewer with 2 threads each and 16GB of memory per worker. If that doesn't work, decrease to 2 workers and double the memory per worker.

In [11]:
# Second local Dask cluster; its client is the one handed to prune2df further down.
local_cluster = LocalCluster(
    n_workers=12,
    threads_per_worker=2,
    dashboard_address=':2345',
    memory_limit='10000MB',
)  # lots of sleeping processes
custom_client_coexp = Client(local_cluster)

In [12]:
#gets messed up/slow if running without closing previous dask client.

In [None]:
# Run prune2df on the ranking databases, the modules and the motif annotations file,
# using the Dask client created above. Stick to the default settings.
# The resulting table is what gets written to MOTIFS_FNAME afterwards.
df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME,client_or_address=custom_client_coexp)

  (["('from-delayed-a1e8ebdea217195b4d3c82e66399f1e4 ... e4', 63071)"],)
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


In [None]:
# Save the enriched motifs table to disk (the regulons are derived from it and saved in later cells).
df.to_csv(MOTIFS_FNAME)

In [None]:
#######

In [None]:
#convert motifs to regulons

In [None]:
regulons = df2regulons(df)

In [None]:
#save regulons to disk
with open(REGULONS_FNAME, "wb") as f:
    pickle.dump(regulons, f)

In [None]:
# Close the parallelization client first, then the cluster, so the client disconnects
# while the scheduler it talks to is still running.
custom_client_coexp.close()
local_cluster.close()

In [None]:
#

Phase III: AUCell

In [None]:
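#again, it is recommended to restart the kernel at this point, like in the previous step, and re-import everything necessary; after a restart, data_in must also be rebuilt from the h5ad file because the aucell call below needs it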

In [14]:
with open(REGULONS_FNAME, 'rb') as f: #load regulon 'pickle' file
    regulons = pickle.load(f)

In [15]:
data_in #check original count matrix

Unnamed: 0,MT-ND3,AC004076.9,PHC3,GNG7,RP11-1348G14.5,TUBD1,EFTUD2,PFN1,ANKRD11,P2RY8,...,RP11-5K23.5,RP11-327J17.9,AC092669.6,SNORA59A,OR4F6,RNA5-8SP7,RP11-1023P17.2,RP11-552O4.1,EGFLAM-AS3,RP11-373J21.1
GACGATTGATTAGTGGAC-mpp_03a-First,1339,0,2,0,0,0,0,32,4,0,...,0,0,0,0,0,0,0,0,0,0
TGAGGTTTCTCCACAGTTT-mpp_03a-First,1353,1,2,0,0,3,5,24,6,0,...,0,0,0,0,0,0,0,0,0,0
TAAATAGGCCAACCGT-mpp_03a-First,983,0,0,0,0,1,4,29,3,0,...,0,0,0,0,0,0,0,0,0,0
AGGCAACGGCCTCTTT-mpp_03a-First,973,0,3,0,0,3,0,40,4,0,...,0,0,0,0,0,0,0,0,0,0
GAGAATTCGTAGCGCCTT-mpp_03a-First,1614,0,1,1,0,2,0,9,2,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTTGCACGGCCACATC-MPP121A1-Second,15,0,0,0,0,0,1,3,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAACTCGAGGGAACGA-MPP121A1-Second,19,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
GAACTAGGATACGAAACG-MPP121A1-Second,11,0,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
TGAACTAGCCACACAAGGC-MPP121A1-Second,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Run AUCell on the count matrix and the regulons; the result is the final enrichment matrix.
# This should be very fast with minimal RAM constraints.
auc_mtx = aucell(data_in, regulons, num_workers=16)

In [17]:
auc_mtx.head() #should be a cell x regulon matrix

Regulon,ARNTL(+),ASCL2(+),ATF3(+),ATF4(+),BCL3(+),BCLAF1(+),BHLHE40(+),BRCA1(+),CBX7(+),CDX2(+),...,XBP1(+),YBX1(+),YY1(+),ZBTB17(+),ZBTB7A(+),ZBTB7B(+),ZMIZ1(+),ZNF76(+),ZNF831(+),ZNF878(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GACGATTGATTAGTGGAC-mpp_03a-First,0.099676,0.05938,0.0943,0.105433,0.151058,0.061749,0.048363,0.029279,0.0,0.726394,...,0.089586,0.762002,0.054935,0.060131,0.183721,0.186385,0.183734,0.0,0.0,0.011836
TGAGGTTTCTCCACAGTTT-mpp_03a-First,0.068311,0.127467,0.061857,0.115404,0.110959,0.067394,0.09311,0.046894,0.0,0.697419,...,0.098821,0.791405,0.068927,0.047526,0.147425,0.138264,0.13722,0.0,0.0,0.0
TAAATAGGCCAACCGT-mpp_03a-First,0.059505,0.106743,0.042071,0.113876,0.06624,0.07414,0.049361,0.104166,0.114002,0.687322,...,0.130027,0.924853,0.089731,0.018241,0.111302,0.078256,0.087889,0.0,0.0,0.0
AGGCAACGGCCTCTTT-mpp_03a-First,0.101625,0.075405,0.120555,0.105697,0.171882,0.056137,0.094188,0.03883,0.071765,0.680486,...,0.088209,0.722617,0.06175,0.049959,0.179117,0.164715,0.215308,0.0,0.0,0.0
GAGAATTCGTAGCGCCTT-mpp_03a-First,0.082671,0.036068,0.083264,0.103427,0.145354,0.053247,0.065623,0.023728,0.032077,0.730583,...,0.073473,0.809067,0.052858,0.047457,0.210144,0.229295,0.250382,0.0,0.0,0.0


In [18]:
# Save the AUCell matrix (cells x regulons) to disk.
auc_mtx.to_csv(AUC_FNAME)

In [None]:
#finally close the cluster again

In [None]:
# Close the Dask client first, then the cluster, so the client disconnects while the
# scheduler it talks to is still running.
custom_client_coexp.close()
local_cluster.close()