In [None]:
# Calculate epigenome-based SERA (super-enhancer regulatory activity)
# and MotifES (motif enrichment score) scores.
# Inputs: super-enhancer data files generated with the ROSE package
# and HOMER motif enrichment analysis result files produced by findMotifsGenome.

In [None]:
import pandas as pd
import glob
import os.path
import numpy as np

In [None]:
# Input: super enhancer data files generated with the ROSE package
# http://younglab.wi.mit.edu/super_enhancer_code.html
rose_SE_dir = "path-to-super-enhancer-folder/superenhancer_rose_out/"
# Collect every <sample>_SuperEnhancers.table.txt one directory level below rose_SE_dir.
# glob returns matches in a file-system dependent order, so the list is sorted
# to keep the sample column order of the exported tables reproducible.
SE_files = sorted(glob.glob(rose_SE_dir + "*/*_SuperEnhancers.table.txt"))

In [None]:
# Load the transcription start site (TSS) table for all genes (data from Gencode).
# Later cells read the geneId, geneName, chrom and TSS columns of this table.
mouse_TSS_file = "data/UCSC_mm10_GencodeCompVM25_wSymbol_TSS.txt"
mouse_TSS_df = pd.read_csv(
    mouse_TSS_file,
    sep="\t",
)
mouse_TSS_df.head()

In [None]:
# Restrict the TSS table to transcription factors (TFs).
# The genome-wide TF annotation was downloaded from the AnimalTFDB database:
# http://bioinfo.life.hust.edu.cn/AnimalTFDB/#!/
mouse_TF_df = pd.read_csv(
    "data/AnimalTFDB_Mus_musculus_TF.txt",
    sep="\t",
    index_col=0,
)
# Unique TF gene symbols
TF_symbols = set(mouse_TF_df.Symbol)
# Keep only the TSS rows whose gene name is one of the TF symbols
is_tf_gene = mouse_TSS_df["geneName"].isin(TF_symbols)
mouse_TSS_df_tf = mouse_TSS_df[is_tf_gene]
mouse_TSS_df_tf.head()

In [None]:
# One entry per TF gene name, ordered by geneName.
# getSERA() emits its per-sample values in this same order.
tf_unique_genes_df = mouse_TSS_df_tf.drop_duplicates("geneName")
tf_unique_genes_sorted_df = tf_unique_genes_df.sort_values("geneName")
mouse_TSS_tf_names_sorted = list(tf_unique_genes_sorted_df["geneName"])

In [None]:
# calculate SERA (super enhancer regulatory activity)
def regulatoryActivity(SE_df, chrom, TSS, upstream=500000, downstream=500000, decay_distance=20000):
    """Sum the distance-weighted super enhancer signal around one TSS.

    Every super enhancer on ``chrom`` whose ``center`` lies strictly inside
    ``(TSS - upstream, TSS + downstream)`` contributes
    ``signal * 2 ** (-|TSS - center| / decay_distance)``, i.e. its weight
    halves for every ``decay_distance`` separating it from the TSS.

    Parameters
    ----------
    SE_df : pandas.DataFrame
        Super enhancer table with ``CHROM`` and ``center`` columns.
        The signal is read positionally from the fourth-from-last column and
        the center from the last column.
        NOTE(review): this presumes ``center`` was appended as the final
        column and that the column three places before it holds the enhancer
        signal -- confirm against the layout of the ROSE table in use.
    chrom : str
        Chromosome of the TSS; compared for equality with ``CHROM``.
    TSS : int or float
        Position of the transcription start site.
    upstream, downstream : int or float
        Half-widths of the window around the TSS (exclusive bounds).
    decay_distance : int or float
        Distance over which a contribution is halved. Defaults to 20000,
        the value that was previously hard-coded.

    Returns
    -------
    int or float
        Sum of the weighted signals; ``0`` when no super enhancer is selected.

    Raises
    ------
    ValueError
        If ``decay_distance`` is not strictly positive.
    """
    if decay_distance <= 0:
        raise ValueError("decay_distance must be > 0, got {0}".format(decay_distance))
    in_window = (
        (SE_df["CHROM"] == chrom)
        & (SE_df["center"] > (TSS - upstream))
        & (SE_df["center"] < (TSS + downstream))
    )
    df_selected = SE_df.loc[in_window]
    signals = df_selected.iloc[:, -4]
    centers = df_selected.iloc[:, -1]
    # Built-in sum keeps the left-to-right accumulation of the original code.
    return sum(
        signal * (2 ** (-(abs(TSS - center) / decay_distance)))
        for signal, center in zip(signals, centers)
    )

In [None]:
# get SERA for all TFs
def getSERA(se_file, tf_df, result_dict):
    """Compute the SERA score of every TF for one super enhancer table.

    Parameters
    ----------
    se_file : str
        Path to a ``*_SuperEnhancers.table.txt`` file. It is read as a
        tab-separated table after skipping its first 5 lines and must provide
        ``CHROM``, ``START`` and ``STOP`` columns.
    tf_df : pandas.DataFrame
        TSS table with ``geneId``, ``geneName``, ``chrom`` and ``TSS`` columns.
        It is copied, not modified.
    result_dict : dict
        Updated in place: the sample name (file name without the
        ``_SuperEnhancers.table.txt`` suffix) is mapped to the list of SERA
        values, one per unique ``geneName`` and ordered by ``geneName``.

    Side effects
    ------------
    Prints the file being processed and writes
    ``*_SuperEnhancers.table.TF_SERA.txt`` next to ``se_file``.
    """
    SE_signal_df = pd.read_csv(se_file, sep="\t", skiprows=5)
    # Vectorized midpoint of each super enhancer (replaces a row-wise apply).
    # It must stay the LAST column: regulatoryActivity reads it positionally.
    SE_signal_df["center"] = (SE_signal_df["START"] + SE_signal_df["STOP"]) / 2
    sampleName = os.path.split(se_file)[-1].replace("_SuperEnhancers.table.txt", "")
    print("processing: {0}".format(se_file))
    tf_df_new = tf_df.copy(deep=True)
    tf_df_new["SE_ra"] = tf_df_new.apply(
        lambda row: regulatoryActivity(SE_signal_df, row.chrom, row.TSS), axis=1
    )
    # A gene can have several TSS rows: keep the one with the highest SERA.
    tf_df_new_max = tf_df_new.sort_values('SE_ra', ascending=False).drop_duplicates("geneName")
    # Order by gene name so every sample lists its values in the same gene order.
    tf_df_new_max = tf_df_new_max.loc[:, ["geneId", "geneName", "SE_ra"]].sort_values('geneName')
    tf_df_new_max.to_csv(
        se_file.replace("_SuperEnhancers.table.txt", "_SuperEnhancers.table.TF_SERA.txt"),
        sep="\t",
        index=False,
    )
    result_dict[sampleName] = list(tf_df_new_max["SE_ra"])

In [None]:
# Run getSERA() on every <sample>_SuperEnhancers.table.txt file.
# The dict starts with the sorted TF names; getSERA adds one entry per sample.
sample_SERA_dict = dict(TFNamesSorted=mouse_TSS_tf_names_sorted)
for se_file in SE_files:
    getSERA(se_file=se_file, tf_df=mouse_TSS_df_tf, result_dict=sample_SERA_dict)

In [None]:
# Assemble the SERA table: one row per TF, one column per sample.
# The TF names are moved into the index first so the arithmetic below only
# touches the numeric score columns; adding 1 to the string column
# "TFNamesSorted" would raise a TypeError.
sample_SERA_raw_df = pd.DataFrame(sample_SERA_dict).set_index("TFNamesSorted")
# Apply log2(v + 1), then restore "TFNamesSorted" as the first column
sample_SERA_df = np.log2(sample_SERA_raw_df + 1).reset_index()
# Export to a tab-separated file
sample_SERA_df.to_csv(rose_SE_dir+"mouse_TF_SERA_scores.txt",sep="\t",index=False)

In [None]:
# Inputs for the motif enrichment score (MotifES):
# HOMER motif enrichment analysis result files produced by findMotifsGenome
# http://homer.ucsd.edu/homer/ngs/peakMotifs.html
# NOTE(review): this cell used `SE_out_dir`, which no cell of the notebook
# defines (NameError on a fresh kernel). It is set to rose_SE_dir here on the
# assumption that the HOMER folders sit inside the per-sample ROSE output
# folders -- point it elsewhere if the HOMER results live in another location.
SE_out_dir = rose_SE_dir
# Sorted because glob order depends on the file system; this keeps the sample
# column order of the exported table reproducible.
homer_result_files = sorted(glob.glob(SE_out_dir + "*/*_homer_wIMAGEmotifs/knownResults.txt"))
# For the HOMER motif enrichment analysis we use the near-complete database of
# position weight matrices (PWMs) from the IMAGE package,
# https://github.com/JesperGrud/IMAGE
IMAGE_motif_SYMBOL_file = "data/motifs/IMAGE_v1/Genename_Motif.txt"
IMAGE_motif_SYMBOL_df = pd.read_csv(
    IMAGE_motif_SYMBOL_file,
    sep="\t",
    names=["Symbol", "motif", "inferOrDirect"],
)
IMAGE_motif_SYMBOL_df.head()

In [None]:
# Lookup table: IMAGE motif -> gene symbol (a repeated motif keeps its last symbol)
IMAGE_motif_SYMBOL_dict = {
    motif: symbol
    for motif, symbol in zip(IMAGE_motif_SYMBOL_df.motif, IMAGE_motif_SYMBOL_df.Symbol)
}

In [None]:
# Read the motif names and enrichment log p-values from one HOMER result file
def getMotifLogPvalue(homer_result_file):
    """Return ``(sampleName, motif names, log p-values)`` for one HOMER result file.

    The sample name is the name of the folder containing the file, with the
    ``_homer_wIMAGEmotifs`` suffix removed. The file is read as a
    tab-separated table and its rows are ordered by ``Motif Name``; the two
    returned lists follow that order.
    """
    result_dir_name = os.path.basename(os.path.dirname(homer_result_file))
    sorted_results = pd.read_csv(homer_result_file, sep="\t").sort_values('Motif Name')
    sampleName = result_dir_name.replace("_homer_wIMAGEmotifs", "")
    motif_names = list(sorted_results["Motif Name"])
    log_pvalues = list(sorted_results["Log P-value"])
    return (sampleName, motif_names, log_pvalues)

In [None]:
# Get MotifES scores for all samples: MotifES = -(HOMER "Log P-value").
# All samples share one "Motifs" column, so every result file must list the
# same motifs in the same (sorted) order; this is checked explicitly instead of
# silently labelling rows with the motif list of the last file.
homer_result_dict = {}
Motifs = None
for homer_result_file in homer_result_files:
    (sampleName, sample_motifs, LogPvalue_list) = getMotifLogPvalue(homer_result_file)
    if Motifs is None:
        Motifs = sample_motifs
    elif sample_motifs != Motifs:
        raise ValueError(
            "Motif list of {0} differs from the first HOMER result file; "
            "rows cannot be aligned across samples".format(homer_result_file)
        )
    # `0 - el` (rather than `-el`) keeps a log p-value of 0 as 0.0 instead of -0.0
    homer_result_dict[sampleName] = [0 - el for el in LogPvalue_list]
if Motifs is None:
    # Previously this surfaced as a NameError on `Motifs`
    raise ValueError("No HOMER knownResults.txt files found; homer_result_files is empty")
homer_result_dict["Motifs"] = Motifs
# create dataframe
homer_result_df = pd.DataFrame(homer_result_dict)
homer_result_df.head()

In [None]:
# Write the MotifES score table as a tab-separated file, without the row index
motifES_out_file = rose_SE_dir + "mouse_TF_MotifES_scores.txt"
homer_result_df.to_csv(
    motifES_out_file,
    sep="\t",
    index=False,
)