In [1]:
import pandas as pd
import numpy as np
import os
import time
import pybedtools
# pybedtools writes its intermediate files into this directory. Create it up front so the
# notebook also runs from a fresh checkout where 'tmp' does not exist yet.
os.makedirs('tmp', exist_ok=True)
pybedtools.set_tempdir('tmp')

In [2]:
# Input site lists: label -> path of a tab-separated table that provides
# 'Chrom', 'Start' and 'End' columns.
# The insertion order is kept on purpose; the cells below iterate this dict in order.
in_files = {
    'ER_positive_non_hematopoietic':
        '../../sites/TCGA_ATAC/ER_differential/sites_by_qval/ER_pos_specific.5e-4_qval.DESeq2.txt',
    'ER_positive_hematopoietic':
        '../../sites/TCGA_ATAC/ER_differential/sites_by_qval/ER_pos_heme.5e-4_qval.DESeq2.txt',
    'ER_negative_non_hematopoietic':
        '../../sites/TCGA_ATAC/ER_differential/sites_by_qval/ER_neg_specific.5e-4_qval.DESeq2.txt',
    'ER_negative_hematopoietic':
        '../../sites/TCGA_ATAC/ER_differential/sites_by_qval/ER_neg_heme.5e-4_qval.DESeq2.txt',
}

# Directory holding one tab-separated binding-site table per transcription factor
# (each table provides 'Chrom' and 'position' columns).
GTRD_dir = '../../sites/TFBS/10000_unfiltered_sites_CIS_BP_v2/'

In [3]:
# Record how many rows (sites) each input list contains; the counts are later
# embedded in the output column names.
length_dict = {}
for site_list, site_path in in_files.items():
    n_sites = len(pd.read_csv(site_path, sep='\t'))
    print(site_list, n_sites)
    length_dict[site_list] = n_sites

ER_positive_non_hematopoietic 18240
ER_positive_hematopoietic 9930
ER_negative_non_hematopoietic 19347
ER_negative_hematopoietic 22365


In [4]:
# For every (site list, binding-site file) pair, count how many sites of the list overlap
# at least one window built around the binding-site positions of that file.
#
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
overlap_records = []
start_time = time.time()

for key in in_files.keys():
    data = pd.read_csv(in_files[key], sep='\t')
    bed_data = pybedtools.BedTool.from_dataframe(data[['Chrom','Start','End']])
    for i, gtrd_file in enumerate(sorted(os.listdir(GTRD_dir))):
        GTRD_data = pd.read_csv(os.path.join(GTRD_dir, gtrd_file), sep='\t')
        # turn each single position into a window spanning position-250 .. position+250
        GTRD_data['new_start'] = GTRD_data['position']-250
        GTRD_data['new_end'] = GTRD_data['position']+250
        bed_GTRD = pybedtools.BedTool.from_dataframe(GTRD_data[['Chrom','new_start','new_end']])
        # u=True maps to bedtools' -u flag: each site of the list is reported at most once,
        # so the length of the result is the number of overlapping sites
        intersection = bed_data.intersect(bed_GTRD, u = True)
        n_overlaps = len(intersection)
        overlap_records.append({'site_list':key,'GTRD':gtrd_file,'overlaps':n_overlaps})
        if i%100==0:
            # progress line: file index, elapsed seconds, site list, file name, overlap count
            print(i, np.round(time.time()-start_time,2), key,gtrd_file,n_overlaps)

    # remove the temporary files pybedtools created while processing this site list
    pybedtools.cleanup(remove_all=True)

out_df = pd.DataFrame(overlap_records, columns = ['site_list','GTRD','overlaps'])

#reformat: one row per GTRD file, one column per site list
out_df = out_df.set_index(['GTRD','site_list']).unstack()

# unstack() yields two-level columns ('overlaps', site_list); keep only the site list label
out_df.columns = [site_list for _, site_list in out_df.columns]

#add number of sites to the column names
col_name_dict = {name: name+'_'+str(length_dict[name])+'_sites' for name in in_files}
out_df = out_df.rename(columns = col_name_dict)

0 0.21 ER_positive_non_hematopoietic AHR.hg38.10000.txt 1722
100 10.08 ER_positive_non_hematopoietic HIF3A.hg38.10000.txt 610
200 20.01 ER_positive_non_hematopoietic OTX2.hg38.10000.txt 127
300 29.81 ER_positive_non_hematopoietic TLX1.hg38.10000.txt 40
0 39.19 ER_positive_hematopoietic AHR.hg38.10000.txt 1021
100 48.65 ER_positive_hematopoietic HIF3A.hg38.10000.txt 390
200 58.38 ER_positive_hematopoietic OTX2.hg38.10000.txt 80
300 67.83 ER_positive_hematopoietic TLX1.hg38.10000.txt 208
0 77.47 ER_negative_non_hematopoietic AHR.hg38.10000.txt 211
100 87.29 ER_negative_non_hematopoietic HIF3A.hg38.10000.txt 345
200 97.21 ER_negative_non_hematopoietic OTX2.hg38.10000.txt 185
300 107.07 ER_negative_non_hematopoietic TLX1.hg38.10000.txt 81
0 116.78 ER_negative_hematopoietic AHR.hg38.10000.txt 292
100 126.97 ER_negative_hematopoietic HIF3A.hg38.10000.txt 648
200 137.16 ER_negative_hematopoietic OTX2.hg38.10000.txt 269
300 147.32 ER_negative_hematopoietic TLX1.hg38.10000.txt 482


In [5]:
# Show the overlap-count table: rows are GTRD file names, columns are the site lists
# (column names carry the number of sites in each list).
out_df

Unnamed: 0_level_0,ER_negative_hematopoietic_22365_sites,ER_negative_non_hematopoietic_19347_sites,ER_positive_hematopoietic_9930_sites,ER_positive_non_hematopoietic_18240_sites
GTRD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AHR.hg38.10000.txt,292,211,1021,1722
AR.hg38.10000.txt,124,117,817,1879
ARID3A.hg38.10000.txt,452,196,353,394
ARNT.hg38.10000.txt,568,197,340,317
ARNTL.hg38.10000.txt,598,441,94,60
...,...,...,...,...
ZNF792.hg38.10000.txt,624,243,377,339
ZSCAN16.hg38.10000.txt,246,109,95,87
ZSCAN22.hg38.10000.txt,725,261,132,53
ZSCAN4.hg38.10000.txt,343,120,145,125


In [6]:
#reformat and export
to_export = out_df.copy()
to_export.index = to_export.index.str.split('.hg38.10000', expand=True).droplevel(1)
to_export.index.name = 'TF_name'
to_export[['ER_positive_non_hematopoietic_18240_sites',
    'ER_positive_hematopoietic_9930_sites',
    'ER_negative_non_hematopoietic_19347_sites',
    'ER_negative_hematopoietic_22365_sites']].to_csv('files/S12_ATAC_TFBS_overlaps.tsv',sep='\t')

In [7]:
# For each site list, show the whole table ranked by that list's overlap count
# (largest first), preceded by the column name as a heading.
for sort_col in out_df.columns:
    print(sort_col)
    ranked = out_df.sort_values(by=sort_col, ascending=False)
    display(ranked)

    print('\n')

ER_negative_hematopoietic_22365_sites


Unnamed: 0_level_0,ER_negative_hematopoietic_22365_sites,ER_negative_non_hematopoietic_19347_sites,ER_positive_hematopoietic_9930_sites,ER_positive_non_hematopoietic_18240_sites
GTRD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KDM2B.hg38.10000.txt,1302,266,75,8
NFKB1.hg38.10000.txt,1208,631,128,82
CTCF.hg38.10000.txt,1203,95,119,3
RELA.hg38.10000.txt,1174,395,127,40
KMT2A.hg38.10000.txt,1140,288,30,5
...,...,...,...,...
DUX4.hg38.10000.txt,78,77,38,65
PHOX2B.hg38.10000.txt,71,45,47,109
HMGA1.hg38.10000.txt,23,38,15,43
ZNF486.hg38.10000.txt,21,37,17,29




ER_negative_non_hematopoietic_19347_sites


Unnamed: 0_level_0,ER_negative_hematopoietic_22365_sites,ER_negative_non_hematopoietic_19347_sites,ER_positive_hematopoietic_9930_sites,ER_positive_non_hematopoietic_18240_sites
GTRD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GRHL2.hg38.10000.txt,938,1013,298,549
TRPS1.hg38.10000.txt,939,655,241,252
TEAD4.hg38.10000.txt,839,650,255,218
NFKB1.hg38.10000.txt,1208,631,128,82
STAT3.hg38.10000.txt,1113,599,186,106
...,...,...,...,...
ZBTB2.hg38.10000.txt,835,48,155,17
PHOX2B.hg38.10000.txt,71,45,47,109
HMGA1.hg38.10000.txt,23,38,15,43
ZNF486.hg38.10000.txt,21,37,17,29




ER_positive_hematopoietic_9930_sites


Unnamed: 0_level_0,ER_negative_hematopoietic_22365_sites,ER_negative_non_hematopoietic_19347_sites,ER_positive_hematopoietic_9930_sites,ER_positive_non_hematopoietic_18240_sites
GTRD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FOXA1.hg38.10000.txt,144,80,1369,2750
ESR1.hg38.10000.txt,177,110,1257,1949
GATA3.hg38.10000.txt,216,75,1230,1522
PGR.hg38.10000.txt,367,247,1221,1758
AHR.hg38.10000.txt,292,211,1021,1722
...,...,...,...,...
KMT2A.hg38.10000.txt,1140,288,30,5
ZNF486.hg38.10000.txt,21,37,17,29
ZNF146.hg38.10000.txt,20,14,16,33
HMGA1.hg38.10000.txt,23,38,15,43




ER_positive_non_hematopoietic_18240_sites


Unnamed: 0_level_0,ER_negative_hematopoietic_22365_sites,ER_negative_non_hematopoietic_19347_sites,ER_positive_hematopoietic_9930_sites,ER_positive_non_hematopoietic_18240_sites
GTRD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FOXA1.hg38.10000.txt,144,80,1369,2750
ESR1.hg38.10000.txt,177,110,1257,1949
AR.hg38.10000.txt,124,117,817,1879
PGR.hg38.10000.txt,367,247,1221,1758
AHR.hg38.10000.txt,292,211,1021,1722
...,...,...,...,...
YY1.hg38.10000.txt,746,264,59,5
SP1.hg38.10000.txt,546,306,67,5
KMT2A.hg38.10000.txt,1140,288,30,5
MAX.hg38.10000.txt,879,277,72,4




