__Author:__ Bram Van de Sande

__Date:__ 31 JAN 2018

__Outline:__ 

In [1]:
import pickle
import os
import glob
from pyscenic.rnkdb import RankingDatabase
from pyscenic.genesig import GeneSignature, Regulome
from pyscenic.recovery import enrichment, leading_edge
import matplotlib.pyplot as plt
from functools import partial
from dask import delayed
import numpy as np
import pandas as pd
from itertools import repeat

In [2]:
DATA_FOLDER="/Users/bramvandesande/Projects/lcb/tmp"
RESOURCES_FOLDER="/Users/bramvandesande/Projects/lcb/resources"
DB_GLOB = "/Users/bramvandesande/Projects/lcb/databases/mm9-*.db"

Co-expression modules were derived from GENIE3 output.

In [3]:
with open(os.path.join(DATA_FOLDER,'modules.pickle'), 'rb') as f:
    modules = pickle.load(f)

In [4]:
len(modules)

5148

Remove modules that are less than 20 genes.

In [5]:
modules = list(filter(lambda m: len(m) >= 20, modules))

In [6]:
len(modules)

5089

Load whole genome ranking databases.

In [7]:
db_fnames = glob.glob(DB_GLOB)
def name(fname):
    return os.path.basename(fname).split(".")[0]
dbs = [RankingDatabase(fname=fname, name=name(fname), nomenclature="MGI") for fname in db_fnames]

In [8]:
len(dbs)

6

Load TF annotations (using a snapshot from motif2TF).

In [9]:
COLUMN_NAMES = ['gene_name', 'motif_similarity_qvalue', 'orthologous_identity', 'description']

In [10]:
motif2tf = pd.read_csv(os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl"),
                      sep='\t', index_col=0)
motif2tf = motif2tf[COLUMN_NAMES]
motif2tf.columns = list(zip(repeat('Motif2TF'), COLUMN_NAMES))

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
motif2tf.head()

Unnamed: 0_level_0,"(Motif2TF, gene_name)","(Motif2TF, motif_similarity_qvalue)","(Motif2TF, orthologous_identity)","(Motif2TF, description)"
#motif_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bergman__Abd-B,Hoxa9,0.0006,1.0,gene is annotated for similar motif cisbp__M10...
bergman__Aef1,Zfp128,0.0,0.220264,motif is annotated for orthologous gene FBgn00...
bergman__Cf2,Zfp853,0.0,0.166667,motif is annotated for orthologous gene FBgn00...
bergman__EcR_usp,Nr1h2,0.0,0.378924,gene is orthologous to FBgn0000546 in D. melan...
bergman__EcR_usp,Nr1h3,0.0,0.408989,gene is orthologous to FBgn0000546 in D. melan...


In [62]:
def generate_features(db, gs, rank_threshold=1500):
    """
    
    """
    df = enrichment(db, gs, rank_threshold=1500).sort_values(by=('Enrichment', 'NES'), ascending=False)
    df[('Metadata', 'Signature')] = gs.name
    df[('Metadata', 'Database')] = db.name
    if isinstance(gs, Regulome):
        df[('Metadata', 'Factor')] = gs.transcription_factor
    return df

In [63]:
df_features = generate_features(dbs[0], modules[6])

In [64]:
def generate_recovery_curves(df_features):
    """
    """
    avgrcc = df_features['Recovery'].mean(axis=0)
    stdrcc = df_features['Recovery'].std(axis=0)
    return avgrcc + 2.0 * stdrcc

In [65]:
avg2stdrcc = generate_recovery_curves(df_features)

In [66]:
def filter_features(df_features, nes_threshold=3.0):
    return df_features[df_features[('Enrichment', 'NES')] >= nes_threshold]

In [67]:
df_enriched_features = filter_features(df_features)

In [68]:
def add_tf_annotations(df_features, motif2tf):
    df = pd.merge(df_features, motif2tf, how='left', left_index=True, right_index=True)
    df = df[df[('Metadata', 'Factor')] == df['Motif2TF', 'gene_name']]
    return df

In [69]:
df_annotated_features = add_tf_annotations(df_enriched_features, motif2tf)



In [70]:
df_annotated_features.head()

Unnamed: 0_level_0,Enrichment,Enrichment,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,...,Ranking,Ranking,Ranking,Metadata,Metadata,Metadata,Motif2TF,Motif2TF,Motif2TF,Motif2TF
Unnamed: 0_level_1,AUC,NES,0,1,2,3,4,5,6,7,...,Zfp790,Zic1,Zic2,Signature,Database,Factor,gene_name,motif_similarity_qvalue,orthologous_identity,description
cisbp__M0998,0.00302,3.213836,0.0,5.005771,5.005771,5.005771,5.005771,5.005771,5.005771,5.005771,...,20140,961,666,Regulome for Alx1 (target weight >= 0.001),mm9-500bp-upstream-7species,Alx1,Alx1,0.000278,1.0,motif similar to cisbp__M1004 ('Alx1[gene ID: ...
cisbp__M1032,0.002952,3.099688,0.0,0.0,5.005771,5.005771,5.005771,5.005771,5.005771,5.005771,...,20304,1639,575,Regulome for Alx1 (target weight >= 0.001),mm9-500bp-upstream-7species,Alx1,Alx1,0.000199,1.0,gene is annotated for similar motif cisbp__M10...
cisbp__M1058,0.002983,3.151343,0.0,0.0,5.005771,5.009509,5.009509,5.009509,5.009509,5.009509,...,20414,1143,363,Regulome for Alx1 (target weight >= 0.001),mm9-500bp-upstream-7species,Alx1,Alx1,0.000594,1.0,motif similar to cisbp__M1004 ('Alx1[gene ID: ...


In [71]:
def add_targetome(df_features, avg2stdrcc, nomenclature="MGI"):
    df_features[('Enrichment', 'LE')] = df_features[['Recovery', 'Ranking']].apply(
                        partial(leading_edge,
                                avg2stdrcc=avg2stdrcc,
                                genes=df['Ranking'].columns.values,
                                nomenclature=nomenclature),
                        axis=1)
    del df_features['Ranking']
    return df_features

In [72]:
df_results = add_targetome(df_annotated_features, avg2stdrcc)

In [73]:
df_results.head()

Unnamed: 0_level_0,Enrichment,Enrichment,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Recovery,Metadata,Metadata,Metadata,Motif2TF,Motif2TF,Motif2TF,Motif2TF,Enrichment
Unnamed: 0_level_1,AUC,NES,0,1,2,3,4,5,6,7,...,1498,1499,Signature,Database,Factor,gene_name,motif_similarity_qvalue,orthologous_identity,description,LE
cisbp__M0998,0.00302,3.213836,0.0,5.005771,5.005771,5.005771,5.005771,5.005771,5.005771,5.005771,...,126.869505,126.869505,Regulome for Alx1 (target weight >= 0.001),mm9-500bp-upstream-7species,Alx1,Alx1,0.000278,1.0,motif similar to cisbp__M1004 ('Alx1[gene ID: ...,"[(Alg1, 1.0), (9530059O14Rik, 15.0), (1810026J..."
cisbp__M1032,0.002952,3.099688,0.0,0.0,5.005771,5.005771,5.005771,5.005771,5.005771,5.005771,...,117.179191,117.179191,Regulome for Alx1 (target weight >= 0.001),mm9-500bp-upstream-7species,Alx1,Alx1,0.000199,1.0,gene is annotated for similar motif cisbp__M10...,"[(Alg1, 2.0), (9530059O14Rik, 20.0), (Arfgef2,..."
cisbp__M1058,0.002983,3.151343,0.0,0.0,5.005771,5.009509,5.009509,5.009509,5.009509,5.009509,...,131.242117,131.242117,Regulome for Alx1 (target weight >= 0.001),mm9-500bp-upstream-7species,Alx1,Alx1,0.000594,1.0,motif similar to cisbp__M1004 ('Alx1[gene ID: ...,"[(Alg1, 2.0), (9530059O14Rik, 3.0), (Arfgef2, ..."


In [83]:
# TODO: Regulome should have a score property! It should combine
# TODO: Final LE should also incorporate importances from GENIE3: No operation for aggregation is needed
# only when importance is available per gene.

for metadata, group in df_results.groupby(by=[('Metadata', 'Signature'),
                                        ('Metadata', 'Database'), 
                                        ('Metadata', 'Factor')]):
    tf_name = metadata[2]
    regulome_name = "{} on {}".format(metadata[0], metadata[1])
    def combine(row):
        print(row)
    group[('Enrichment', 'LE')].apply(combine)
    print(tf_name)
    print(regulome_name)

[('Alg1', 1.0), ('9530059O14Rik', 15.0), ('1810026J23Rik', 34.0), ('Arfgef2', 35.0), ('Adprh', 60.0), ('Ano1', 92.0), ('9630033F20Rik', 120.0), ('Adrb2', 123.0), ('Arhgap29', 125.0), ('2410137F16Rik', 128.0), ('Ankrd49', 156.0), ('Asrgl1', 165.0), ('Add2', 180.0), ('Amy1', 181.0), ('5330426P16Rik', 210.0), ('AI414108', 224.0), ('Aldh5a1', 240.0), ('4930579G24Rik', 252.0), ('Arntl', 278.0), ('Abcg2', 287.0), ('Adap1', 289.0), ('5430435G22Rik', 327.0), ('Acadvl', 385.0), ('Alkbh5', 409.0), ('Ankrd6', 414.0), ('Arpc1a', 422.0), ('9430076C15Rik', 423.0), ('Acot6', 462.0), ('2700029M09Rik', 480.0), ('A730020M07Rik', 510.0), ('Anapc13', 524.0), ('2210018M11Rik', 565.0), ('Arhgap23', 568.0), ('Amotl1', 588.0), ('Ash2l', 601.0), ('Atf6b', 666.0), ('Abcb1a', 762.0), ('Adarb1', 794.0), ('Abhd14b', 808.0), ('Alx3', 838.0), ('Add1', 847.0), ('AI837181', 849.0), ('A830018L16Rik', 854.0), ('Ap2b1', 860.0)]
[('Alg1', 2.0), ('9530059O14Rik', 20.0), ('Arfgef2', 29.0), ('1810026J23Rik', 36.0), ('Arhgap2

In [None]:
def generate_regulome(df, nomenclature="MGI"):
    df.
    

In [13]:
df = combine(dbs[0], modules[0])

4197
(24453, 4196)


In [15]:
len(df)

102

In [None]:
len(dbs[0].genes)

In [31]:
len(modules[0])

4559

In [33]:
len(set(modules[0].genes).intersection(set(dbs[0].genes)))

4196

In [29]:
dbs[0].genes

('0610007C21Rik',
 '0610007L01Rik',
 '0610007P08Rik',
 '0610007P14Rik',
 '0610007P22Rik',
 '0610008F07Rik',
 '0610009B14Rik',
 '0610009B22Rik',
 '0610009D07Rik',
 '0610009O20Rik',
 '0610010B08Rik',
 '0610010F05Rik',
 '0610010K14Rik',
 '0610010O12Rik',
 '0610011F06Rik',
 '0610011L14Rik',
 '0610012G03Rik',
 '0610012H03Rik',
 '0610030E20Rik',
 '0610031J06Rik',
 '0610037L13Rik',
 '0610037P05Rik',
 '0610038B21Rik',
 '0610039K10Rik',
 '0610040B10Rik',
 '0610040J01Rik',
 '0910001L09Rik',
 '100043387',
 '1100001G20Rik',
 '1110001A16Rik',
 '1110001J03Rik',
 '1110002B05Rik',
 '1110002L01Rik',
 '1110002N22Rik',
 '1110003E01Rik',
 '1110004E09Rik',
 '1110004F10Rik',
 '1110005A03Rik',
 '1110006O24Rik',
 '1110007C09Rik',
 '1110008F13Rik',
 '1110008J03Rik',
 '1110008L16Rik',
 '1110008P14Rik',
 '1110012D08Rik',
 '1110012J17Rik',
 '1110012L19Rik',
 '1110014N23Rik',
 '1110017D15Rik',
 '1110017F19Rik',
 '1110018G07Rik',
 '1110018J18Rik',
 '1110020A21Rik',
 '1110020G09Rik',
 '1110021J02Rik',
 '1110021L09Ri

In [None]:
@delayed
def combine(db, gs, nes_threshold=3.0):
    df = enrichment(db, gs, rank_threshold=1500).sort_values(by=('Enrichment', 'NES'), ascending=False)
    df[('Metadata', 'Signature')] = gs.name
    df[('Metadata', 'Database')] = db.name
    df[('Metadata', 'Factor')] = gs.transcription_factor
    # Data to keep here is AUC distribution
    return df[df[('Enrichment', 'NES')] >= nes_threshold]


@delayed
def add_le(df):
    avgrcc = df['Recovery'].mean(axis=0)
    stdrcc = df['Recovery'].std(axis=0)
    avg2stdrcc = avgrcc + 2.0 * stdrcc
    # Data to keep here avg2stdrcc and avgrcc!
    df[('Enrichment', 'LE')] = df[['Recovery', 'Ranking']].apply(
                        partial(leading_edge,
                                avg2stdrcc=avg2stdrcc,
                                genes=df['Ranking'].columns.values,
                                nomenclature="HGNC"),
                        axis=1)
    del df['Ranking']
    return df

@delayed
def add_tf_and_filter(df, motif2tf):
    mdf = pd.merge(df, motif2tf, how='left', left_index=True, right_index=True)
    mdf = mdf[mdf[('Metadata', 'Factor')] == mdf['Motif2TF', 'gene_name']]
    return mdf
    
@delayed
def create_regulomes(df):
    # TODO: 
    def regulomes():
        for factor, group in df.groupby(by=('Metadata', 'Factor')):
            yield Regulome()
            
    return reduce(Regulome.union, regulomes())

@delayed
def concat(dfs):
    return pd.concat(dfs)

tasks1 = [combine(db, gs) for db in dbs[0:2] for gs in msigdb_c6[0:2]]
tasks2 = [add_le(task) for task in tasks1]
res = concat(tasks2)