# Empty Motif Data Bug

In [1]:
!readlink -f .

/ddn1/vol1/staging/leuven/stg_00002/lcb/dwmax/documents/aertslab/GitHub/pySCENIC/debug


In [None]:
from pyscenic.transc

Code for debugging the empty motifData bug. See https://github.com/aertslab/pySCENIC/issues/70

In [None]:
import os
import pandas as pd
from pyscenic.transform import df2regulons as df2regs
# from pyscenic.cli import pyscenic
from typing import Type, Sequence, Optional
from pyscenic.genesig import Regulon
from functools import reduce
import math
# Index levels of the motif enrichment table.
COLUMN_NAME_TF = 'TF'
COLUMN_NAME_MOTIF_ID = 'MotifID'

# Data columns of the motif enrichment table.
COLUMN_NAME_AUC = "AUC"
COLUMN_NAME_NES = "NES"
COLUMN_NAME_RANK_AT_MAX = "RankAtMax"
COLUMN_NAME_ANNOTATION = 'Annotation'
COLUMN_NAME_MOTIF_SIMILARITY_QVALUE = 'MotifSimilarityQvalue'
COLUMN_NAME_ORTHOLOGOUS_IDENTITY = 'OrthologousIdentity'
COLUMN_NAME_CONTEXT = "Context"
COLUMN_NAME_TARGET_GENES = "TargetGenes"

# Column derived in this notebook from the context, and the two values it can take.
COLUMN_NAME_TYPE = "Type"
ACTIVATING_MODULE = 'activating'
REPRESSING_MODULE = 'repressing'

In [3]:
# Column delimiter to use for each supported file extension (extension includes the leading dot).
FILE_EXTENSION2SEPARATOR = {'.tsv': '\t', '.csv': ','}

In [4]:
def read_motifs_enriched_table(fname):
    """
    Load a table of enriched motifs from a delimited text file.

    The delimiter is looked up in FILE_EXTENSION2SEPARATOR using the extension of the file name. The first
    two columns of the file become the row index and the first two rows become a two-level column header.
    The ('Enrichment', 'Context') and ('Enrichment', 'TargetGenes') columns are stored as text in the file
    and are converted back into Python objects by evaluating each cell.

    :param fname: Path of the file to read.
    :return: The loaded dataframe.
    :raises KeyError: If the extension of fname is not a key of FILE_EXTENSION2SEPARATOR.
    """
    _, extension = os.path.splitext(fname)
    table = pd.read_csv(fname,
                        sep=FILE_EXTENSION2SEPARATOR[extension],
                        index_col=[0, 1],
                        header=[0, 1],
                        skipinitialspace=True)

    # NOTE(review): eval() executes whatever expression a cell contains, so only load files from a trusted
    # source. Presumably the cells hold Python reprs that ast.literal_eval cannot parse (e.g. frozenset(...))
    # -- confirm before swapping in a safer parser.
    for column in (('Enrichment', 'Context'), ('Enrichment', 'TargetGenes')):
        table[column] = table[column].apply(lambda s: eval(s))
    return table

In [5]:
# NOTE(review): absolute, site-specific path; the notebook only runs where this file system is mounted.
MOTIFS_ENRICHED_FNAME = "/ddn1/vol1/staging/leuven/stg_00002/lcb/dwmax/documents/aertslab/REW/10x/exp/ih/combined/20181123/fb.r6_16.xk/10x_REW/results/pySCENIC/Filtered_Raw_Matrix_MultiRuns/scenic/run_1/reg.csv"
df = read_motifs_enriched_table(MOTIFS_ENRICHED_FNAME)

In [6]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment,Enrichment
Unnamed: 0_level_1,Unnamed: 1_level_1,AUC,Annotation,Context,MotifSimilarityQvalue,NES,OrthologousIdentity,RankAtMax,TargetGenes
TF,MotifID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Atf3,taipale__JDP2_full_NATGACGTCAYN,0.064482,gene is annotated for similar motif transfac_p...,"(dm6-5kb-upstream-full-tx-11species, weight>75...",2e-06,4.566469,1.0,3391,"[(LamC, 0.4272245628726853), (CrebB, 3.0106320..."
Atf3,transfac_pro__M01186,0.060153,gene is annotated for similar motif transfac_p...,"(dm6-5kb-upstream-full-tx-11species, weight>75...",0.000101,3.739763,1.0,2119,"[(Proc, 0.4272245628726853), (LamC, 3.01063207..."
Atf3,transfac_public__M00178,0.058939,gene is annotated for similar motif transfac_p...,"(dm6-5kb-upstream-full-tx-11species, weight>75...",8.6e-05,3.507823,1.0,2143,"[(CG10960, 0.4272245628726853), (LamC, 3.01063..."
Atf3,elemento__TGACGTCA,0.059298,gene is annotated for similar motif transfac_p...,"(dm6-5kb-upstream-full-tx-11species, weight>75...",0.000136,3.576546,1.0,2186,"[(Gprk2, 0.4272245628726853), (LamC, 3.0106320..."
Atf3,cisbp__M4307,0.060558,gene is annotated for similar motif transfac_p...,"(dm6-5kb-upstream-full-tx-11species, weight>75...",0.000577,3.817196,1.0,2139,"[(cbt, 0.4272245628726853), (ush, 3.0106320782..."


In [7]:
# df.loc[df.index.levels[0][:2].values]
# df.index.levels[0]
# Column-wise maximum over all enrichment rows of TF 'Atf3'. The columns are maximised independently,
# so the values shown do not belong to one single motif row.
df.loc['Atf3'].max()

Enrichment  AUC                                                               0.280856
            Annotation               motif similar to transfac_pro__M07666 ('I$ATF3...
            Context                  (dm6-5kb-upstream-full-tx-11species, activatin...
            MotifSimilarityQvalue                                              0.00093
            NES                                                                6.92737
            OrthologousIdentity                                                      1
            RankAtMax                                                             4838
            TargetGenes              [(ush, 0.4272245628726853), (cbt, 3.0106320782...
dtype: object

In [8]:
# Context of every enrichment row of TF 'Atf3' for motif 'transfac_pro__M07666'.
# The tuple key addresses the two-level column header, so this lookup only works while the
# 'Enrichment' column level is still present on df.
df.loc['Atf3'].loc['transfac_pro__M07666'][('Enrichment','Context')]

MotifID
transfac_pro__M07666    (dm6-5kb-upstream-full-tx-11species, weight>75...
transfac_pro__M07666    (weight>90.0%, dm6-5kb-upstream-full-tx-11spec...
transfac_pro__M07666    (dm6-5kb-upstream-full-tx-11species, top50, ac...
transfac_pro__M07666    (dm6-5kb-upstream-full-tx-11species, top10perT...
transfac_pro__M07666    (dm6-5kb-upstream-full-tx-11species, activatin...
Name: (Enrichment, Context), dtype: object

In [145]:
# Flatten the two-level column header by removing its outer level. The check on the number of
# levels makes re-running this cell a no-op once the header is flat.
has_two_level_header = df.columns.nlevels == 2
if has_two_level_header:
    df.columns = df.columns.droplevel(level=0)

In [191]:
def _regulon4group(tf_name, context, df_group) -> Optional[Regulon]:
    """
    Build one regulon from all enriched motifs of a single TF / interaction-type group.

    A regulon is created for every row of the group and these regulons are merged with Regulon.union.
    The file name of a motif logo ('<MotifID>.png') is added to the context of the merged regulon. The logo
    is taken from the most enriched (highest NES) directly annotated motif; when the group has no directly
    annotated motif, the most enriched motif of the whole group is used instead, so that an empty string is
    never stored in the context.

    :param tf_name: Name of the transcription factor; used for the regulon name and its transcription factor.
    :param context: Collection of context labels for this group. It determines the "(+)"/"(-)" suffix of the
        regulon name and is extended with the logo file name.
    :param df_group: Enrichment rows of this group. The NES, MotifSimilarityQvalue, OrthologousIdentity,
        TargetGenes and Annotation columns are read, and MotifID must be available after reset_index().
    :return: The merged regulon. The annotation allows None, but this implementation always returns a regulon.
    """
    def score(nes, motif_similarity_qval, orthologuous_identity):
        # The combined score starts from the NES score, which is then corrected for less confidence in the
        # TF annotation in two steps:
        # 1. The orthologous identity (a fraction between 0 and 1.0) is used directly to normalize the NES.
        # 2. The motif similarity q-value is converted to a similar fraction: -log10(q-value)
        # A motif that is directly annotated for the TF in the correct species is not penalized.
        max_value = 10  # A q-value smaller than 10**-10 is considered the same as a q-value of 0.0.
        correction_fraction = 1.0
        if not math.isnan(motif_similarity_qval):
            try:
                correction_fraction = min(-math.log(motif_similarity_qval, 10), max_value) / max_value
            except ValueError:
                # Math domain error (e.g. a q-value of 0.0): deliberately leave the NES uncorrected.
                correction_fraction = 1.0
        combined_score = nes * correction_fraction

        # We assume that a non-existing orthologous identity signifies a direct annotation.
        if math.isnan(orthologuous_identity):
            return combined_score
        return combined_score * orthologuous_identity

    def derive_interaction_type(ctx):
        return "(-)" if REPRESSING_MODULE in ctx else "(+)"

    def row2regulon(row):
        # The target genes as well as their weights/importances are directly taken from the dataframe.
        return Regulon(name="{}{}".format(tf_name, derive_interaction_type(context)),
                       score=score(row[COLUMN_NAME_NES],
                                   row[COLUMN_NAME_MOTIF_SIMILARITY_QVALUE],
                                   row[COLUMN_NAME_ORTHOLOGOUS_IDENTITY]),
                       context=context,
                       transcription_factor=tf_name,
                       gene2weight=row[COLUMN_NAME_TARGET_GENES])

    # Find the most enriched directly annotated motif; its logo is added to the context.
    annotation = df_group[COLUMN_NAME_ANNOTATION]
    is_directly_annotated = ((annotation == 'gene is directly annotated')
                             | (annotation.str.startswith('gene is orthologous to')
                                & annotation.str.endswith('which is directly annotated for motif')))
    df_selected = df_group[is_directly_annotated].sort_values(by=COLUMN_NAME_NES, ascending=False)
    if len(df_selected) == 0:
        # Diagnostic output of this debugging notebook: name the TFs without a directly annotated motif.
        print(tf_name)
        # Fall back to the most enriched motif of the group. Previously the logo became "" here and that
        # empty string ended up in the context of the regulon.
        df_selected = df_group.sort_values(by=COLUMN_NAME_NES, ascending=False)
    motif_logo = '{}.png'.format(df_selected.head(1).reset_index()[COLUMN_NAME_MOTIF_ID].values[0]) if len(df_selected) > 0 else ""

    # First we create a regulon for each enriched and annotated feature and then we aggregate these regulons
    # into a single one using the union operator. This operator combines all target genes into a single set
    # of genes, keeping the maximum weight associated with a gene. In addition, the maximum combined score is
    # kept as the score of the entire regulon.
    merged = reduce(Regulon.union, (row2regulon(row) for _, row in df_group.iterrows()))
    return merged.copy(context=frozenset(set(context).union({motif_logo})))

In [166]:
def get_type(row):
    """
    Classify an enrichment row as repressing or activating.

    :param row: A row of the enrichment table; only its Context entry is inspected.
    :return: REPRESSING_MODULE when that label is contained in the context of the row, otherwise
        ACTIVATING_MODULE (activating is the default).
    """
    if REPRESSING_MODULE in row[COLUMN_NAME_CONTEXT]:
        return REPRESSING_MODULE
    return ACTIVATING_MODULE


# Store the classification of every row in the Type column.
df[COLUMN_NAME_TYPE] = df.apply(get_type, axis=1)

In [160]:
# [print(tf_name, frozenset([interaction_type])) for (tf_name, interaction_type), df_grp in df.groupby(by=['TF','Type'])]

In [192]:
# Every (TF, interaction type) combination forms one group of rows, and each group yields one regulon.
# Groups for which no regulon is returned (None) are left out.
not_none = lambda r: r is not None
groups_by_tf_and_type = df.groupby(by=[COLUMN_NAME_TF, COLUMN_NAME_TYPE])
tmp = [regulon
       for regulon in (_regulon4group(tf_name, frozenset([interaction_type]), df_grp)
                       for (tf_name, interaction_type), df_grp in groups_by_tf_and_type)
       if not_none(regulon)]

Antp
Atac3
Blimp-1
CG16779
CG44247
CG5245
CG8209
CG9727
CHES-1-like
Dif
Doc3
Dr
E(spl)m3-HLH
E(spl)m8-HLH
E(spl)mbeta-HLH
ERR
Eip74EF
Ets97D
GATAd
H2.0
HDAC1
Hnf4
Hr4
Max
Nf-YA
Poxn
Psi
Smox
SoxN
Spps
Tet
Trf2
achi
aop
ap
bowl
crol
crp
da
dar1
disco-r
drm
ems
en
eyg
ftz-f1
grn
gt
inv
l(3)neo38
mor
nej
odd
opa
pan
pdm2
pdm3
peng
pho
pnt
rump
sd
sr
toe
usp
vtd
zfh2


In [202]:
tmp[0]

Regulon(name='Adf1(+)', gene2weight=<frozendict {'CG2556': 1.0, 'Mad': 1.072583681092771, 'CR45643': 3.456320045116416, 'crol': 1.9199835421000075, 'CG6118': 4.8500685006837285, 'HmgD': 0.637917649601581, 'CG14082': 1.434114686395855, 'CG10479': 1.6273701990966611, 'CG32365': 1.4336502922497167, 'Np': 1.7568945917972374, 'Bub1': 1.9187745121121376, 'sprt': 0.9817479735465064, 'spi': 1.2141215738524254, 'thw': 1.2508791487861441, 'CG31522': 1.6161862037250796, 'twi': 0.6674791002347131, 'CG43980': 0.7233327308768884, 'CG1910': 1.2747287361883866, 'CG12769': 1.221888842033957, 'Pex3': 0.1836240934846464, 'CR45361': 1.5996702616663483, 'Sema2b': 0.6337340663232829, 'CG32698': 0.5476029583179107}>, transcription_factor='Adf1', context=frozenset({'activating', 'flyfactorsurvey__Adf1_SANGER_5_FBgn0000054.png'}), score=3.767024527046528)

In [199]:
# Select the regulon(s) named "zfh2(+)".
# NOTE(review): `regulons_1` is not assigned in any cell visible in this notebook (the grouping cell
# stores its result in `tmp`); presumably it is left over from a removed cell -- confirm before re-running.
list(filter(lambda x: x.name == "zfh2(+)", regulons_1))

[Regulon(name='zfh2(+)', gene2weight=<frozendict {'sog': 30.085751895312065, 'CadN': 8.885491375184518, 'Sema2b': 15.956563506958751, 'hth': 10.789997953351353, 'ds': 16.431737116073297, 'CG42747': 30.085751895312065, 'esg': 12.024845947063437, 'kek1': 26.642096833657888, 'CG34355': 19.774529592898627, 'cysu': 9.385556410858312, 'olf413': 10.8165172420487, 'fz': 21.472113848893628, 'Dr': 18.753824719076725, 'zfh2': 10.101429985297044, 'bi': 12.343094454045133, 'ImpE1': 13.938166902208788, 'CG13300': 14.037992089771173, 'Nost': 37.20130889807077, 'sns': 15.956563506958751, 'pgant2': 7.764321360904495, 'elB': 15.956563506958751, 'Rfx': 5.803710264844428, 'CG30377': 16.431737116073297, 'CG34347': 16.431737116073297}>, transcription_factor='zfh2', context=frozenset({'', 'activating'}), score=0.20293746666763052)]

In [200]:
df.loc['zfh2'].loc['transfac_pro__M01314']['Annotation']

'gene is orthologous to ENSG00000136367 in H. sapiens (identity = 15%) which is annotated for similar motif cisbp__M0893 (\'ZFHX2[gene ID: "ENSG00000136367" species: "Homo sapiens" TF status: "direct" TF family: "Homeodomain" DBDs: "Homeobox"]\'; q-value = 5.78e-05)'

In [201]:
df.loc['zfh2']

Unnamed: 0_level_0,AUC,Annotation,Context,MotifSimilarityQvalue,NES,OrthologousIdentity,RankAtMax,TargetGenes,Type
MotifID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
transfac_pro__M01314,0.23529,gene is orthologous to ENSG00000136367 in H. s...,"(activating, dm6-5kb-upstream-full-tx-11specie...",5.8e-05,3.075725,0.15574,624,"[(sog, 30.085751895312065), (CadN, 8.885491375...",activating
cisbp__M6042,0.208494,gene is orthologous to ENSG00000136367 in H. s...,"(activating, dm6-5kb-upstream-full-tx-11specie...",0.000779,3.414248,0.15574,210,"[(CadN, 4.79749118333691), (CG42747, 30.085751...",activating
taipale__Lhx8_DBD_NTAATTANNNNTAATTAN,0.206088,gene is orthologous to ENSG00000136367 in H. s...,"(activating, dm6-5kb-upstream-full-tx-11specie...",0.000779,3.332517,0.15574,294,"[(CadN, 4.79749118333691), (CG42747, 30.085751...",activating
cisbp__M6042,0.149507,gene is orthologous to ENSG00000136367 in H. s...,"(activating, top10perTarget, dm6-5kb-upstream-...",0.000779,3.049716,0.15574,210,"[(CadN, 4.79749118333691), (CG42747, 30.085751...",activating


In [73]:
# list(filter(lambda x: x.name == "Atf3(+)", regulons_1))