__Author:__ Bram Van de Sande

__Date:__ 5 FEB 2018

__Outline:__ This notebook clarifies the process by which the co-expression modules derived from GENIE3 can be refined into true regulomes (i.e. excluding indirect targets of transcription factors). Aka "RcisTarget".

In [1]:
import os
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.genesig import GeneSignature, Regulome
from pyscenic.regulome import module2regulome, derive_regulomes
from pyscenic.utils import load_motif_annotations

from dask import delayed
from dask.dot import dot_graph
from dask.multiprocessing import get
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler
from dask.diagnostics import ProgressBar
from bokeh.io import output_notebook, push_notebook, show
output_notebook()
from dask.diagnostics import visualize

In [2]:
%load_ext snakeviz

In [3]:
# Folder locations default to the original author's layout. Each one can be
# overridden through an environment variable so the notebook can run on another
# machine without editing this cell.
DATA_FOLDER = os.environ.get("PYSCENIC_DATA_FOLDER", "/Users/bramvandesande/Projects/lcb/tmp")
RESOURCES_FOLDER = os.environ.get("PYSCENIC_RESOURCES_FOLDER", "/Users/bramvandesande/Projects/lcb/resources")
DATABASE_FOLDER = os.environ.get("PYSCENIC_DATABASE_FOLDER", "/Users/bramvandesande/Projects/lcb/databases/")

# Glob patterns for the mm9 ranking databases: SQLite sources and feather files.
SQLITE_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.db")
FEATHER_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.feather")

# Motif annotation table consumed by load_motif_annotations and derive_regulomes.
MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl")

# Gene nomenclature handed to the SQLite-to-feather conversion step.
NOMENCLATURE = "MGI"

Make sure the databases are available in feather format.

In [4]:
# One-off conversion of the SQLite ranking databases to feather files.
# The guard is hard-wired to False, so this cell is a no-op unless edited.
if False:
    from pyscenic.rnkdb import convert2feather

    def derive_db_name(fname):
        """Return the base file name of `fname` up to (not including) its first dot."""
        base = os.path.basename(fname)
        return base.partition(".")[0]

    for sqlite_fname in glob.glob(SQLITE_GLOB):
        db_name = derive_db_name(sqlite_fname)
        convert2feather(sqlite_fname, DATABASE_FOLDER, db_name, NOMENCLATURE)

### Load resources

Co-expression modules were derived from GENIE3 output.

In [5]:
# Deserialize the co-expression modules stored as 'modules.pickle' in DATA_FOLDER.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open(os.path.join(DATA_FOLDER,'modules.pickle'), 'rb') as f:
    modules = pickle.load(f)

In [6]:
len(modules)

5106

### Load whole genome ranking databases

In [7]:
def name(fname):
    """Return the database name for `fname`: its base file name up to the first dot."""
    return os.path.basename(fname).split(".")[0]

# glob.glob returns matches in arbitrary order; sort them so that positional
# selections such as dbs[0] and dbs[0:2] pick the same databases on every run.
db_fnames = sorted(glob.glob(FEATHER_GLOB))
# Use the shared NOMENCLATURE constant instead of repeating the "MGI" literal.
dbs = [RankingDatabase(fname=fname, name=name(fname), nomenclature=NOMENCLATURE) for fname in db_fnames]

In [8]:
len(dbs)

6

### Load motif annotations

In [9]:
motif_annotations = load_motif_annotations(MOTIF_ANNOTATIONS_FNAME)

In [10]:
motif_annotations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,motif_similarity_qvalue,orthologous_identity,description
gene_name,#motif_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Hoxa9,bergman__Abd-B,0.0006,1.0,gene is annotated for similar motif cisbp__M10...
Zfp128,bergman__Aef1,0.0,0.220264,motif is annotated for orthologous gene FBgn00...
Zfp853,bergman__Cf2,0.0,0.166667,motif is annotated for orthologous gene FBgn00...
Nr1h2,bergman__EcR_usp,0.0,0.378924,gene is orthologous to FBgn0000546 in D. melan...
Nr1h3,bergman__EcR_usp,0.0,0.408989,gene is orthologous to FBgn0000546 in D. melan...


### Test pipeline and benchmark on a sample

The pipeline is first run on a small sample, before scaling it via dask to the full combinatorial space of databases x modules.

In [11]:
# Run the first 25 modules against the first database, pairing each result with
# the module's position in `modules`.
regulomes = [
    (idx, module2regulome(dbs[0], module, motif_annotations))
    for idx, module in enumerate(modules[:25])
]

In [12]:
regulomes

[(0, None),
 (1, None),
 (2, None),
 (3, None),
 (4,
  Regulome(name='Regulome for Ahr', nomenclature='MGI', gene2weights=<frozendict {'Vps33a': 1.0, 'Picalm': 1.0, 'Arhgap12': 1.0, 'Eif4g1': 1.0, '4933434E20Rik': 1.0, 'Cuta': 1.0, 'Dolk': 1.0, 'Gtf2h1': 1.0, 'Srek1': 1.0, 'Mthfr': 1.0, 'Aff4': 1.0, 'Rragb': 1.0, 'Zzz3': 1.0, 'Spag9': 1.0, 'Ube2q1': 1.0, 'Nr1d1': 1.0, 'Snapc3': 1.0, '1700052N19Rik': 1.0, 'Cpeb1': 1.0, 'Gnptg': 1.0, 'Usp8': 1.0, 'Mfsd1': 1.0, 'Ubn1': 1.0, 'Luc7l2': 1.0, 'Mcoln1': 1.0, 'Phf15': 1.0, 'Pcdh8': 1.0, 'Lox': 1.0, 'Socs5': 1.0, 'Spg21': 1.0, 'Atp2b1': 1.0, 'Erp29': 1.0, 'Srp54c': 1.0, 'Atp11b': 1.0, 'Tug1': 1.0, 'Abhd11': 1.0, 'Aldh3b2': 1.0, 'Mal2': 1.0, 'Flrt2': 1.0, 'Olfm1': 1.0, 'Prickle2': 1.0, 'Tmem110': 1.0, 'Psma6': 1.0, 'Nptn': 1.0, 'U2af2': 1.0, 'Josd1': 1.0, 'Dusp5': 1.0, 'Cbfa2t3': 1.0, 'Ptprs': 1.0, 'Trim30d': 1.0, 'Mtdh': 1.0, 'Ralyl': 1.0, 'Slc36a1': 1.0, 'Fbxo10': 1.0, 'Sla2': 1.0, 'Zmat3': 1.0, 'Morf4l1': 1.0, 'Bdnf': 1.0, 'Jph1': 1.0, 'Lin7c'

Profile a single execution of module2regulome.

In [13]:
%%snakeviz
module2regulome(dbs[0], modules[4], motif_annotations)

 
*** Profile stats marshalled to file '/var/folders/cj/xhw0rd3s7hg5k4p78t4s3hph0000gn/T/tmppvsnwqv1'. 


### Parallelized pipeline using dask

In [21]:
# Call derive_regulomes on a subset (first two databases, first 50 modules)
# while a progress bar is shown and the three dask profilers record the run.
# The motif annotations file name is passed here, not the table loaded earlier.
with ProgressBar():
    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
        regulomes = derive_regulomes(dbs[0:2], modules[0:50], MOTIF_ANNOTATIONS_FNAME)

[########################################] | 100% Completed |  3min  7.8s


In [22]:
len(regulomes)

18

In [24]:
regulomes

[Regulome(name='Regulome for Ahr', nomenclature='MGI', gene2weights=<frozendict {'Vps33a': 1.0, 'Picalm': 1.0, 'Arhgap12': 1.0, 'Eif4g1': 1.0, '4933434E20Rik': 1.0, 'Cuta': 1.0, 'Dolk': 1.0, 'Gtf2h1': 1.0, 'Srek1': 1.0, 'Mthfr': 1.0, 'Aff4': 1.0, 'Rragb': 1.0, 'Zzz3': 1.0, 'Spag9': 1.0, 'Ube2q1': 1.0, 'Nr1d1': 1.0, 'Snapc3': 1.0, '1700052N19Rik': 1.0, 'Cpeb1': 1.0, 'Gnptg': 1.0, 'Usp8': 1.0, 'Mfsd1': 1.0, 'Ubn1': 1.0, 'Luc7l2': 1.0, 'Mcoln1': 1.0, 'Phf15': 1.0, 'Pcdh8': 1.0, 'Lox': 1.0, 'Socs5': 1.0, 'Spg21': 1.0, 'Atp2b1': 1.0, 'Erp29': 1.0, 'Srp54c': 1.0, 'Atp11b': 1.0, 'Tug1': 1.0, 'Abhd11': 1.0, 'Aldh3b2': 1.0, 'Mal2': 1.0, 'Flrt2': 1.0, 'Olfm1': 1.0, 'Prickle2': 1.0, 'Tmem110': 1.0, 'Psma6': 1.0, 'Nptn': 1.0, 'U2af2': 1.0, 'Josd1': 1.0, 'Dusp5': 1.0, 'Cbfa2t3': 1.0, 'Ptprs': 1.0, 'Trim30d': 1.0, 'Mtdh': 1.0, 'Ralyl': 1.0, 'Slc36a1': 1.0, 'Fbxo10': 1.0, 'Sla2': 1.0, 'Zmat3': 1.0, 'Morf4l1': 1.0, 'Bdnf': 1.0, 'Jph1': 1.0, 'Lin7c': 1.0, 'Uvrag': 1.0, 'Irf2bp2': 1.0, 'Cbx6': 1.0, 'Grm

In [23]:
visualize([prof, rprof, cprof])

Test with custom client.

In [25]:
from distributed import LocalCluster, Client

In [26]:
# Local cluster of 6 workers with one thread each, plus a client connected to it.
local_cluster = LocalCluster(n_workers=6, threads_per_worker=1)
custom_client = Client(local_cluster)

In [27]:
custom_client

0,1
Client  Scheduler: tcp://127.0.0.1:52900  Dashboard: http://127.0.0.1:52901,Cluster  Workers: 6  Cores: 6  Memory: 12.88 GB


In [28]:
# Same subset as before (first two databases, first 50 modules), but the work is
# handed to the custom client instead of the default scheduler.
regulomes = derive_regulomes(dbs[0:2],
                             modules[0:50],
                             MOTIF_ANNOTATIONS_FNAME,
                             client_or_address=custom_client)

In [29]:
regulomes

tornado.application - ERROR - Exception in callback <bound method Nanny.memory_monitor of <Nanny: tcp://127.0.0.1:52913, threads: 1>>
Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 348, in catch_zombie
    yield
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
    ret = cext.proc_pidtaskinfo_oneshot(self.pid)
ProcessLookupError: [Errno 3] No such process

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/ioloop.py", line 1026, in _run
    return self.callback()
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/distributed/nanny.py", line 245, in memory_monitor
    memory = psutil.Process(self.process.pid).memory_info().r

tornado.application - ERROR - Exception in callback <bound method Nanny.memory_monitor of <Nanny: tcp://127.0.0.1:52913, threads: 1>>
Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 348, in catch_zombie
    yield
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
    ret = cext.proc_pidtaskinfo_oneshot(self.pid)
ProcessLookupError: [Errno 3] No such process

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/ioloop.py", line 1026, in _run
    return self.callback()
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/distributed/nanny.py", line 245, in memory_monitor
    memory = psutil.Process(self.process.pid).memory_info().r

tornado.application - ERROR - Exception in callback <bound method Nanny.memory_monitor of <Nanny: tcp://127.0.0.1:52996, threads: 1>>
Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 348, in catch_zombie
    yield
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
    ret = cext.proc_pidtaskinfo_oneshot(self.pid)
ProcessLookupError: [Errno 3] No such process

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/ioloop.py", line 1026, in _run
    return self.callback()
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/distributed/nanny.py", line 245, in memory_monitor
    memory = psutil.Process(self.process.pid).memory_info().r

tornado.application - ERROR - Exception in callback <bound method Nanny.memory_monitor of <Nanny: tcp://127.0.0.1:52996, threads: 1>>
Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 348, in catch_zombie
    yield
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
    ret = cext.proc_pidtaskinfo_oneshot(self.pid)
ProcessLookupError: [Errno 3] No such process

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/ioloop.py", line 1026, in _run
    return self.callback()
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/distributed/nanny.py", line 245, in memory_monitor
    memory = psutil.Process(self.process.pid).memory_info().r

tornado.application - ERROR - Exception in callback <bound method Nanny.memory_monitor of <Nanny: tcp://127.0.0.1:52909, threads: 1>>
Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 348, in catch_zombie
    yield
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
    ret = cext.proc_pidtaskinfo_oneshot(self.pid)
ProcessLookupError: [Errno 3] No such process

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/ioloop.py", line 1026, in _run
    return self.callback()
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/distributed/nanny.py", line 245, in memory_monitor
    memory = psutil.Process(self.process.pid).memory_info().r

tornado.application - ERROR - Exception in callback <bound method Nanny.memory_monitor of <Nanny: tcp://127.0.0.1:52909, threads: 1>>
Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 348, in catch_zombie
    yield
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/psutil/_psosx.py", line 387, in _get_pidtaskinfo
    ret = cext.proc_pidtaskinfo_oneshot(self.pid)
ProcessLookupError: [Errno 3] No such process

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/ioloop.py", line 1026, in _run
    return self.callback()
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/distributed/nanny.py", line 245, in memory_monitor
    memory = psutil.Process(self.process.pid).memory_info().r

In [15]:
# Shut down in order: close the client first, then the cluster it was connected to.
custom_client.close()
local_cluster.close()