In [33]:
from pyscenic.rnkdb import RankingDatabase
from pyscenic.genesig import GeneSignature, Regulome
from pyscenic.recovery import enrichment, leading_edge
import matplotlib.pyplot as pyplot
from functools import partial
import glob
import os
from dask import delayed

In [2]:
%matplotlib inline

Gene signatures are downloaded from MSigDB (http://software.broadinstitute.org/gsea/msigdb). The module C6 is used in this notebook.

In [3]:
GMT_FNAME = "/Users/u0043358/Projects/resources/c6.all.v6.1.symbols.gmt.txt"

In [4]:
msigdb_c6 = GeneSignature.from_gmt(
                        fname=GMT_FNAME,
                        nomenclature="HGNC",
                        gene_separator="\t",
                        field_separator="\t")
len(msigdb_c6)

189

In [5]:
msigdb_c6[0]

GeneSignature(name='GLI1_UP.V1_DN', nomenclature='HGNC', gene2weights=<frozendict {'COPZ1': 1.0, 'C10orf46': 1.0, 'C20orf118': 1.0, 'TMEM181': 1.0, 'CCNL2': 1.0, 'YIPF1': 1.0, 'GTDC1': 1.0, 'OPN3': 1.0, 'RSAD2': 1.0, 'SLC22A1': 1.0, 'LIN37': 1.0, 'RNF10': 1.0, 'RUNX1': 1.0, 'QRICH2': 1.0, 'SEPT9': 1.0, 'ARPC2': 1.0, 'CD2AP': 1.0, 'ACSL6': 1.0, 'ZMYND11': 1.0, 'BBX': 1.0, 'TMEM62': 1.0, 'FOLR1': 1.0, 'TMEM189': 1.0, 'PPIL1': 1.0, 'CYP3A7': 1.0, 'SELM': 1.0, 'ESYT1': 1.0, 'BTBD16': 1.0, 'SPECC1': 1.0}>)

Make connection to a database of whole genome rankings.

In [6]:
DB_GLOB = "/Users/u0043358/Projects/databases/hg19-*.db"

In [7]:
db_fnames = glob.glob(DB_GLOB)
db_fnames

['/Users/u0043358/Projects/databases/hg19-tss-centered-5kb-10species.mc9nr.db',
 '/Users/u0043358/Projects/databases/hg19-500bp-upstream-10species.mc9nr.db',
 '/Users/u0043358/Projects/databases/hg19-tss-centered-10kb-7species.mc9nr.db',
 '/Users/u0043358/Projects/databases/hg19-500bp-upstream-7species.mc9nr.db',
 '/Users/u0043358/Projects/databases/hg19-tss-centered-5kb-7species.mc9nr.db',
 '/Users/u0043358/Projects/databases/hg19-tss-centered-10kb-10species.mc9nr.db']

In [8]:
def name(fname):
    return os.path.basename(fname).split(".")[0]
dbs = [RankingDatabase(fname=fname, name=name(fname), nomenclature="HGNC") for fname in db_fnames]

In [9]:
dbs

[RankingDatabase(name="hg19-tss-centered-5kb-10species",n_features=24453),
 RankingDatabase(name="hg19-500bp-upstream-10species",n_features=24453),
 RankingDatabase(name="hg19-tss-centered-10kb-7species",n_features=24453),
 RankingDatabase(name="hg19-500bp-upstream-7species",n_features=24453),
 RankingDatabase(name="hg19-tss-centered-5kb-7species",n_features=24453),
 RankingDatabase(name="hg19-tss-centered-10kb-10species",n_features=24453)]

Look for enrichment of a gene signature.

In [None]:
df = enrichment(dbs[0], msigdb_c6[0]).sort_values(by=('Enrichment', 'NES'), ascending=False)
df[('Metadata', 'Signature')] = msigdb_c6[0].name
df[('Metadata', 'Database')] = dbs[0].name

In [26]:
len(df)

24453

In [27]:
df.head()

Unnamed: 0_level_0,Enrichment,Enrichment,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,...,Ranking,Ranking,Ranking,Ranking,Ranking,Ranking,Ranking,Ranking,Metadata,Metadata
Unnamed: 0_level_1,AUC,NES,0,1,2,3,4,5,6,7,...,SEPT9,SLC22A1,SPECC1,TMEM181,TMEM189,TMEM62,YIPF1,ZMYND11,Signature,Database
predrem__nrMotif81,0.000223,5.126996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,167,13245,8538,111,13463,85,2483,719,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species
taipale_tf_pairs__E2F3_FOXI1_NNMCACCGCGCCCMN_CAP_repr,0.000212,4.789967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,198,11066,12243,4486,990,250,154,4134,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species
cisbp__M3230,0.000208,4.673257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,43,2423,2552,13525,18634,773,350,4661,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species
hocomoco__ZIM3_HUMAN.H11MO.0.C,0.000203,4.516057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2496,6671,3171,9306,13108,330,3078,12106,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species
transfac_pro__M06436,0.000202,4.502957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,120,17079,6585,19020,12863,4175,3315,7253,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species


In [28]:
len(df[df[('Enrichment', 'NES')] >= 2.5])

386

In [29]:
avgrcc = df['Recovery'].mean(axis=0)
stdrcc = df['Recovery'].std(axis=0)
avg2stdrcc = avgrcc + 2.0 * stdrcc

In [30]:
df[('Enrichment', 'LE')] = df[df[('Enrichment', 'NES')] >= 2.5][['Recovery', 'Ranking']].apply(
    partial(leading_edge, avg2stdrcc=avg2stdrcc, genes=df['Ranking'].columns.values, nomenclature="HGNC"),
    axis=1)

In [31]:
df.head()

Unnamed: 0_level_0,Enrichment,Enrichment,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,...,Ranking,Ranking,Ranking,Ranking,Ranking,Ranking,Ranking,Metadata,Metadata,Enrichment
Unnamed: 0_level_1,AUC,NES,0,1,2,3,4,5,6,7,...,SLC22A1,SPECC1,TMEM181,TMEM189,TMEM62,YIPF1,ZMYND11,Signature,Database,LE
predrem__nrMotif81,0.000223,5.126996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13245,8538,111,13463,85,2483,719,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species,"[(TMEM62, 85.0), (RUNX1, 107.0), (TMEM181, 111..."
taipale_tf_pairs__E2F3_FOXI1_NNMCACCGCGCCCMN_CAP_repr,0.000212,4.789967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11066,12243,4486,990,250,154,4134,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species,"[(BBX, 42.0), (YIPF1, 154.0), (SEPT9, 198.0), ..."
cisbp__M3230,0.000208,4.673257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2423,2552,13525,18634,773,350,4661,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species,"[(ARPC2, 42.0), (SEPT9, 43.0), (GTDC1, 109.0),..."
hocomoco__ZIM3_HUMAN.H11MO.0.C,0.000203,4.516057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6671,3171,9306,13108,330,3078,12106,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species,"[(RUNX1, 132.0), (GTDC1, 204.0), (RNF10, 245.0..."
transfac_pro__M06436,0.000202,4.502957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,17079,6585,19020,12863,4175,3315,7253,GLI1_UP.V1_DN,hg19-tss-centered-5kb-10species,"[(COPZ1, 34.0), (BBX, 42.0), (GTDC1, 85.0), (S..."


Create parallelized version

In [35]:
@delayed
def combine(db, gs):
    df = enrichment(db, gs).sort_values(by=('Enrichment', 'NES'), ascending=False)
    df[('Metadata', 'Signature')] = gs.name
    df[('Metadata', 'Database')] = db.name
    # add tf as metadata if available
    return df[df[('Enrichment', 'NES')] >= 2.5]
    
@delayed
def add_le(df):
    avgrcc = df['Recovery'].mean(axis=0)
    stdrcc = df['Recovery'].std(axis=0)
    avg2stdrcc = avgrcc + 2.0 * stdrcc
    df[('Enrichment', 'LE')] = df[['Recovery', 'Ranking']].apply(
                        partial(leading_edge,
                                avg2stdrcc=avg2stdrcc,
                                genes=df['Ranking'].columns.values,
                                nomenclature="HGNC"),
                        axis=1)
    
@delayed
def add_tf(df):
    # add columns from motif2TF
    # filter on combinations of the proper TFs
    # create regulomes.
    return df

@delayed
def concat(dfs):
    return pd.concat(dfs)

tasks1 = [combine(db, gs) for db in dbs for gs in msigdb_c6]
tasks2 = [add_le(task) for task in tasks1]
res = concat(tasks2)

In [None]:
res.compute()