In [1]:
import pandas as pd
import numpy as np
import time
import sys
import os


In [2]:
# Tab-separated meta-cluster interval file that the loading cell reads with pandas.
data_path = '../Homo_sapiens_meta_clusters.interval'


In [3]:
# --- outputs ---------------------------------------------------------------
# Base label shared by the output directories and the YAML file names.
output_name = 'unfiltered_sites_CIS_BP_v2'
# YAML that lists every site list that gets written, across all sizes.
out_yaml = 'all_{}.yaml'.format(output_name)

# --- site-list sizes -------------------------------------------------------
# One top-N site list is written per size; a TF needs at least the smallest
# size (min_sites) to be processed at all.
number_of_sites = [1000, 5000, 10000, 20000, 30000, 50000]
min_sites = min(number_of_sites)

# --- CIS-BP reference ------------------------------------------------------
# Tab-separated CIS-BP TF information table.
CIS_BP_list = '../CIS_BP/Homo_sapiens_2021_04_29_Cis-BP_v_2.00/TF_Information.txt'


In [4]:
#may take 2-8 minutes
# Load the interval table, keeping only the columns used downstream, and
# print the elapsed seconds after the read and again after the clean-up.

start_time = time.time()

data = pd.read_csv(
    data_path,
    sep='\t',
    usecols=['#CHROM', 'START', 'END', 'tfTitle', 'uniprotId', 'peak.count', 'exp.count'],
)

print(time.time() - start_time)

# Rename to the column names the rest of the notebook uses.
data = data.rename(columns={'#CHROM': 'Chrom', 'START': 'Start', 'END': 'End', 'tfTitle': 'Gene'})
# Remove any rows without a gene name (these all seem to be a Tet1 splice
# variant with uniprot id A0A023HHK9).
data = data.dropna(subset=['Gene'])
# Replace '/' with '-' so the per-gene file names contain no path separator,
# then tag every gene with the genome build. regex=False pins literal matching:
# the default for `regex` differs between pandas 1.x and 2.x.
data['Gene'] = data['Gene'].str.replace('/', '-', regex=False) + '.hg38'

print(time.time() - start_time)
sys.stdout.flush()
data.head()

131.4678132534027
170.64420127868652


Unnamed: 0,Chrom,Start,End,uniprotId,Gene,exp.count,peak.count
0,chr1,9989,10082,Q9UBN7,HDAC6.hg38,1,2
1,chr1,9992,10073,Q6P4R8,NFRKB.hg38,2,4
2,chr1,10004,10069,Q9NUA8,ZBTB40.hg38,1,3
3,chr1,10020,10117,P03372,ESR1.hg38,4,5
4,chr1,10036,10115,O14646,CHD1.hg38,1,2


In [5]:
# Build the list of TF names accepted downstream: read the CIS-BP table, order
# it by TF name, drop entries whose TF_Status is 'N' (binding site not known),
# and keep each remaining name once.
valid_TFs = (
    pd.read_csv(CIS_BP_list, sep='\t')
    .sort_values(by='TF_Name')
    .loc[lambda tf_info: tf_info['TF_Status'] != 'N', 'TF_Name']
    .unique()
)
# Add the same '.hg38' suffix that the Gene column carries so the names match.
valid_TFs = pd.Series(valid_TFs) + '.hg38'

In [6]:
# Number of distinct TF names kept from the CIS-BP table.
print(valid_TFs.shape[0])

1200


In [7]:
# Autosomes only: chr1 .. chr22 (no chrX / chrY / chrM).
chroms = ['chr{}'.format(number) for number in range(1, 23)]

In [9]:
# For every gene: keep rows on the chromosomes in `chroms`, and if the gene is
# a valid TF with at least `min_sites` rows, write one tab-separated file per
# requested size containing the rows with the highest peak.count. Each written
# file is recorded in `output` as [gene, size, path, total rows for that gene].
# Genes that are skipped are printed with their row count and validity flag.

# Build the lookup once; testing membership against the raw array rescans it
# for every gene.
valid_tf_names = set(valid_TFs.values)

output = []
for name, df in data.groupby('Gene'):
    df = df[df['Chrom'].isin(chroms)].copy()
    is_valid_tf = name in valid_tf_names
    if len(df) >= min_sites and is_valid_tf: #if there were at least min_sites on the list and it is a real TF
        #get the top sites
        # NOTE(review): rows with equal peak.count are ordered by pandas'
        # default sort, which does not promise a stable tie order - confirm
        # this is acceptable for reproducibility.
        df = df.sort_values(by='peak.count', ascending=False)
        # Midpoint of each interval, rounded down.
        df['position'] = (np.floor((df['Start'] + df['End']) / 2)).astype(int)
        for i in number_of_sites:
            # Only write a size-i list when the gene really has i rows.
            if len(df) < i:
                continue
            out_dir = '../' + str(i) + '_' + output_name
            # Create the per-size directory on first use so to_csv cannot fail
            # on a missing folder.
            os.makedirs(out_dir, exist_ok=True)
            out_file = out_dir + '/' + name + '.' + str(i) + '.txt'
            current = df.iloc[0:i].reset_index(drop=True)
            output.append([name, i, out_file, len(df)])
            current.to_csv(out_file, sep='\t', index=False)
    else:
        print(name, '\t', len(df), '\t', is_valid_tf)


ADA2.hg38 	 1449 	 False
ADCYAP1.hg38 	 117 	 False
ADNP.hg38 	 6371 	 False
AEBP2.hg38 	 2151 	 False
AFF1.hg38 	 60016 	 False
AFF4.hg38 	 74042 	 False
AGO1.hg38 	 57951 	 False
AGO2.hg38 	 31222 	 False
AHRR.hg38 	 5073 	 False
ALKBH3.hg38 	 606 	 False
ALX4.hg38 	 232 	 True
ALYREF.hg38 	 31 	 False
AMH.hg38 	 85412 	 False
APC.hg38 	 12604 	 False
APOBEC3B.hg38 	 28581 	 False
ARHGAP35.hg38 	 269 	 False
ARID1A.hg38 	 43331 	 False
ARID1B.hg38 	 32353 	 False
ARID2.hg38 	 121817 	 False
ARID4B.hg38 	 46706 	 False
ARNT2.hg38 	 869 	 True
ARRB1.hg38 	 13759 	 False
ASF1A.hg38 	 10710 	 False
ASH1L.hg38 	 1760 	 False
ASH2L.hg38 	 16926 	 False
ASXL2.hg38 	 489 	 False
ATF7IP.hg38 	 47579 	 False
ATM.hg38 	 224 	 False
ATOH1.hg38 	 214 	 True
ATRX.hg38 	 7119 	 False
AUTS2.hg38 	 2021 	 False
BAHD1.hg38 	 1923 	 False
BANP.hg38 	 725 	 False
BAP1.hg38 	 5411 	 False
BATF2.hg38 	 13 	 False
BBX.hg38 	 67 	 True
BCHE.hg38 	 23710 	 False
BCL3.hg38 	 89906 	 False
BCLAF1.hg38 	 11869 

In [10]:
# Turn the [gene, size, path, total rows] records into a table with one row per
# written site list. Naming the columns up front keeps this cell working when
# `output` is empty (a column-less frame would raise KeyError on the lookups
# below) and avoids opaque positional labels.
output_df = pd.DataFrame(output, columns=['gene', 'number_of_sites', 'path', 'total_sites'])
# Site-list identifier: "<gene>.<size>".
output_df['name'] = output_df['gene'] + '.' + output_df['number_of_sites'].astype(str)
# Store absolute paths so the YAML files do not depend on the working directory.
output_df['path'] = [os.path.abspath(item) for item in output_df['path']]
# Number of distinct genes that produced at least one site list.
print(len(output_df['gene'].unique()))
output_df = output_df[['name', 'path', 'number_of_sites']]



566


In [11]:
# Preview the first rows of the site-list table.
output_df.head(n=5)

Unnamed: 0,name,path,number_of_sites
0,AHR.hg38.1000,/fh/fast/ha_g/user/adoebley/projects/griffin_r...,1000
1,AHR.hg38.5000,/fh/fast/ha_g/user/adoebley/projects/griffin_r...,5000
2,AHR.hg38.10000,/fh/fast/ha_g/user/adoebley/projects/griffin_r...,10000
3,AHR.hg38.20000,/fh/fast/ha_g/user/adoebley/projects/griffin_r...,20000
4,AHR.hg38.30000,/fh/fast/ha_g/user/adoebley/projects/griffin_r...,30000


In [12]:
# Write one YAML file listing every site list as an indented "name: path"
# entry under the top-level `site_lists` key.
with open(out_yaml, 'w+') as f:
    f.write('site_lists:\n  ')
    entries = [
        ': '.join((site_name, site_path))
        for site_name, site_path in zip(output_df['name'], output_df['path'])
    ]
    f.write('\n  '.join(entries))

In [14]:
# Write one YAML file per site-list size, each holding only the site lists of
# that size, and print the size with the number of entries written.
for site_count in number_of_sites:
    yaml_path = '{}_{}.yaml'.format(site_count, output_name)
    with open(yaml_path, 'w+') as f:
        size_df = output_df.loc[output_df['number_of_sites'] == site_count]
        f.write('site_lists:\n  ')
        entries = [
            ': '.join((site_name, site_path))
            for site_name, site_path in zip(size_df['name'], size_df['path'])
        ]
        f.write('\n  '.join(entries))
        print(site_count, len(size_df))

1000 566
5000 446
10000 377
20000 316
30000 270
50000 202
