In [1]:
import pandas as pd
import numpy as np
import os
import time
import pybedtools
# pybedtools writes its intermediate files into this directory. Create it up
# front so a fresh checkout (where 'tmp' does not exist yet) can still run the
# notebook from the first cell.
os.makedirs('tmp', exist_ok=True)
pybedtools.set_tempdir('tmp')

In [2]:
# Short site-list label -> tab-separated site file for that list.
in_files = dict(
    ER_pos='sites/ER_pos_0_5_FC.new.txt',
    ER_neg='sites/ER_neg_0_5_FC.new.txt',
    ER_pos_heme='sites/ER_pos_heme_0_5_FC.new.txt',
    ER_neg_heme='sites/ER_neg_heme_0_5_FC.new.txt',
)

# Directory whose files are read one at a time as GTRD site tables.
GTRD_dir = '../../GTRD/10000_filtered_sites_CIS_BP/'

In [3]:
# Record how many rows each site list contains, keyed by the site-list label.
length_dict = {}
for key, site_path in in_files.items():
    n_sites = len(pd.read_csv(site_path, sep='\t'))
    print(key, n_sites)
    length_dict[key] = n_sites

ER_pos 15142
ER_neg 12151
ER_pos_heme 12217
ER_neg_heme 12710


In [4]:
# For every (site list, GTRD file) pair, count the sites that overlap at least
# one 500 bp window (position +/- 250) taken from the GTRD file.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
overlap_records = []
start_time = time.time()
gtrd_files = sorted(os.listdir(GTRD_dir))  # same listing for every site list

for key, site_path in in_files.items():
    data = pd.read_csv(site_path, sep='\t')
    bed_data = pybedtools.BedTool.from_dataframe(data[['Chrom','Start','End']])
    for i, file in enumerate(gtrd_files):
        GTRD_data = pd.read_csv(os.path.join(GTRD_dir, file), sep='\t')
        GTRD_data['new_start'] = GTRD_data['position']-250
        GTRD_data['new_end'] = GTRD_data['position']+250
        bed_GTRD = pybedtools.BedTool.from_dataframe(GTRD_data[['Chrom','new_start','new_end']])
        # u=True: each site is reported once if it overlaps any GTRD window
        intersection = bed_data.intersect(bed_GTRD, u = True)
        # count once; the value is needed for both the record and the progress line
        n_overlaps = len(intersection)
        overlap_records.append({'site_list':key,'GTRD':file,'overlaps':n_overlaps})
        if i%100==0:
            print(i, np.round(time.time()-start_time,2), key,file,n_overlaps)

    # drop the temp files accumulated for this site list before starting the next
    pybedtools.cleanup(remove_all=True)

out_df = pd.DataFrame(overlap_records, columns = ['site_list','GTRD','overlaps'])

#reformat: one row per GTRD file, one column per site list
out_df = out_df.set_index(['GTRD','site_list']).unstack()

# unstack() yields ('overlaps', site_list) column tuples; keep only the site-list label
out_df.columns = [column[1] for column in out_df.columns]

#add full name and number of sites to the column names
full_name_dict = {'ER_neg':'ER_negative_non_hematopoietic',
                  'ER_neg_heme':'ER_negative_hematopoietic',
                  'ER_pos':'ER_positive_non_hematopoietic',
                  'ER_pos_heme':'ER_positive_hematopoietic'}
col_name_dict = {}

for key in full_name_dict.keys():
    col_name_dict[key]=full_name_dict[key]+'_'+str(length_dict[key])+'_sites'
out_df = out_df.rename(columns = col_name_dict)

0 0.22 ER_pos AHR.hg38.10000.txt 2054
100 10.6 ER_pos HNF4G.hg38.10000.txt 1529
200 21.27 ER_pos POU2F1.hg38.10000.txt 773
300 31.88 ER_pos ZEB2.hg38.10000.txt 223
0 37.03 ER_neg AHR.hg38.10000.txt 234
100 47.65 ER_neg HNF4G.hg38.10000.txt 156
200 58.21 ER_neg POU2F1.hg38.10000.txt 97
300 68.49 ER_neg ZEB2.hg38.10000.txt 127
0 73.68 ER_pos_heme AHR.hg38.10000.txt 1437
100 84.38 ER_pos_heme HNF4G.hg38.10000.txt 894
200 95.1 ER_pos_heme POU2F1.hg38.10000.txt 521
300 105.72 ER_pos_heme ZEB2.hg38.10000.txt 647
0 111.07 ER_neg_heme AHR.hg38.10000.txt 258
100 122.04 ER_neg_heme HNF4G.hg38.10000.txt 173
200 134.73 ER_neg_heme POU2F1.hg38.10000.txt 218
300 145.09 ER_neg_heme ZEB2.hg38.10000.txt 432


In [5]:
#reformat and export
to_export = out_df.copy()
# cut the index at '.hg38.10000' and keep only the leading part
to_export.index = to_export.index.str.split('.hg38.10000', expand=True).droplevel(1)
to_export.index.name = 'TF_name'

# Take the exported column names from col_name_dict instead of hard-coding the
# site counts, so the export keeps working when the input site lists change.
export_order = ['ER_pos', 'ER_pos_heme', 'ER_neg', 'ER_neg_heme']
export_columns = [col_name_dict[key] for key in export_order]
to_export[export_columns].to_csv('S5_TFBS_overlaps.txt',sep='\t')

In [6]:
# Print the full overlap table once per site-list column, ranked from the most
# to the fewest overlaps in that column.
for site_column in out_df.columns:
    ranked = out_df.sort_values(by=site_column, ascending = False)
    print(site_column)
    print(ranked)

    print('\n')

ER_negative_non_hematopoietic_12151_sites
                      ER_negative_non_hematopoietic_12151_sites  \
GTRD                                                              
GRHL2.hg38.10000.txt                                        930   
TRPS1.hg38.10000.txt                                        709   
STAT3.hg38.10000.txt                                        637   
RBPJ.hg38.10000.txt                                         605   
TEAD4.hg38.10000.txt                                        596   
...                                                         ...   
ZNF486.hg38.10000.txt                                        39   
HMGA1.hg38.10000.txt                                         28   
ZNF146.hg38.10000.txt                                        23   
CTCF.hg38.10000.txt                                          20   
ZBTB2.hg38.10000.txt                                         19   

                      ER_negative_hematopoietic_12710_sites  \
GTRD                   