In [11]:
import os 
import pandas as pd
import numpy as np
import subprocess
import glob
import pybedtools as pbt 
from IPython.display import HTML
pd.set_option('display.min_rows', 100) 
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')
pbt.set_tempdir('/mnt/hpcscratch/jreyna/')
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 5000

# make the directory to save our data
outdir = 'results/main/coloc/sgl_intersect/'
os.makedirs(outdir, exist_ok=True)
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

## Loading the gene data

In [12]:
print('# Load the gene data')

genes_fn = 'results/refs/gencode/v30/gencode.v30.annotation.bed'

# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gname']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode['type'].isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
genes_df = genes_df.iloc[:, [0,1,2,6,5,3]]

# create a copy of the original gene bed before coordinate shrinking
orig_genes_df = genes_df.copy()

# convert the start/end position into start/end for the TSS
# if the gene is + then the start is uses as the tss otherwise
# the end is used as the tss
genes_df.loc[(genes_df.strand == '+'), 'end'] = genes_df.loc[(genes_df.strand == '+'), 'start']
genes_df.loc[(genes_df.strand == '+'), 'start'] = genes_df.loc[(genes_df.strand == '+'), 'start'] - 1
genes_df.loc[(genes_df.strand == '-'), 'end'] = genes_df.loc[(genes_df.strand == '-'), 'end']
genes_df.loc[(genes_df.strand == '-'), 'start'] = genes_df.loc[(genes_df.strand == '-'), 'end'] - 1
genes_df.loc[:, 'chrom'] = genes_df.loc[:, 'chrom'].str.replace('chr', '')
genes_df.loc[:, 'bin_start'] = (np.floor(genes_df.loc[:, 'start'] / res) * res).astype(int)
genes_df.loc[:, 'bin_end'] = genes_df.loc[:, 'bin_start'] + res

# make a genes pbt for intersection
print("# make a genes pbt for intersection")
print(genes_df.head())
genes_pbt = pbt.BedTool.from_dataframe(genes_df).sort()

print('There are {} genes in this GTF-derived file.'.format(genes_df.shape[0]))

# Load the gene data
# make a genes pbt for intersection
   chrom  start    end        gname          gene_id strand  bin_start  \
0      1  11868  11869      DDX11L1  ENSG00000223972      +      10000   
12     1  29569  29570       WASH7P  ENSG00000227232      -      25000   
25     1  17435  17436    MIR6859-1  ENSG00000278267      -      15000   
28     1  29553  29554  MIR1302-2HG  ENSG00000243485      +      25000   
36     1  30365  30366    MIR1302-2  ENSG00000284332      +      30000   

    bin_end  
0     15000  
12    30000  
25    20000  
28    30000  
36    35000  
There are 58825 genes in this GTF-derived file.


## Loading the HiChIP data

In [13]:
# get reference files 
genome_sizes="results/refs/hg19/hg19.chrom.sizes"

# loading the sample sheet
samplesheet = pd.read_table('config/sgl_samplesheets/sgl.samplesheet.tsv')
samplesheet = samplesheet.loc[~samplesheet['eqtl_source'].isna()]

In [14]:
# loading all hichip data
hichip_template = 'results/main/h3k27ac_hichip/{loop_source}/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed'
hichip_data = []
hichip_pbts = {}
curr_id = 0

for hichip_fn in glob.glob(hichip_template.format(loop_source='*')):
    
    print(hichip_fn)
    
    celltype = hichip_fn.split('/')[3]
        
    # load hichip
    curr_hichip = pd.read_table(hichip_fn)
    curr_hichip['celltype'] = celltype
    curr_hichip['loop.id'] = range(curr_id, curr_id + len(curr_hichip))
    curr_id += len(curr_hichip)
    hichip_data.append(curr_hichip)
    
    # create pbt
    hichip_pbt = pbt.BedTool.from_dataframe(curr_hichip[['chr1', 's1', 'e1', 'chr2', 's2', 'e2', 'loop.id']])
    hichip_pbts[celltype] = hichip_pbt

# concat everything together
hichip = pd.concat(hichip_data)

results/main/h3k27ac_hichip/NK-cell_naive/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/CD8_T-cell_naive/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/Th2_memory/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/monocyte_nonclassical_naive/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/CD4_T-cell_naive/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/Tfh_memory/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/monocyte_naive/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/Treg_naive/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/Th1_memory/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/Treg_memory/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01.bed
results/main/h3k27ac_hichip/B-cell_naive/FitHiChIP_S/

## Loading the Coloc data (after adding LD SNPs)

In [15]:
# create the tamplate names for coloc files
coloc_template = 'results/main/GRCh37/coloc/{eqtl_db}/{gwas_source}/{eqtl_source}/{ge_source}/ldpairs/coloc_ld_snps.swapped_ld_cols.tsv'

all_colocs_data = []
sgl_data = []
curr_id = 0

# get the colocs and intersect
for i, sr in samplesheet.iterrows():
    
    print(i)
    
    # these samples have not been processed correctly so I'm skipping them for now
    if i in [120, 124, 125, 126, 127, 128, 129, 130]:
        continue
    
    sample_info = sr.to_dict()
    
    if sr.eqtl_source == 'ImmuNexUT':
        sample_info['eqtl_db'] = 'ImmuNexUT'
    else:
        sample_info['eqtl_db'] = 'eQTL_Catalogue'
    
    coloc_fn = coloc_template.format(**sample_info)

    # processs only if the coloc file exists
    if os.path.exists(coloc_fn) and sr['loop_source'] in hichip_pbts:

        # load and process colocs 
        colocs = pd.read_table(coloc_fn)
        colocs['coloc.id'] = range(curr_id, curr_id + len(colocs))
        colocs['gwas_source'] = sr['gwas_source']
        colocs['eqtl_source'] = sr['eqtl_source']
        colocs['ge_source'] = sr['ge_source']
        colocs['ge_source'] = sr['loop_source']

        # save the current colocs data
        all_colocs_data.append(colocs)
        
        # update the current coloc_id
        curr_id = curr_id + len(colocs)
        
        # start creating the coloc bedpe 
        colocs_bedpe = colocs[['chr', 'pos', 'pos', 'ld_rsID', 'geneName']]
        colocs_bedpe.columns = ['chr', 'start', 'end', 'rsid', 'genename']
        colocs_bedpe.loc[:, 'start'] = colocs_bedpe['start'] - 1
                
        # add gene coordinates 
        colocs_bedpe = colocs_bedpe.merge(genes_df, left_on='genename', right_on='gene_id', suffixes=['_snp', '_gene'])  
        
        # re-organize columns
        colocs_bedpe = colocs_bedpe[['chr', 'start_snp', 'end_snp',
                                     'chrom', 'start_gene', 'end_gene',
                                     'rsid', 'genename', 'gname', 'gene_id',
                                     'strand', 'bin_start', 'bin_end']]
        
        # re-name columns
        colocs_bedpe.columns = ['chr_snp', 'start_snp', 'end_snp', 'chr_gene', 'start_gene', 'end_gene',
                                'rsid', 'geneid', 'genename', 'geneid2', 'strand_gene', 'bin_start', 'bin_end']
        
        # add chr to column entris 
        colocs_bedpe['chr_gene'] = 'chr' + colocs_bedpe['chr_gene'].astype(str)
        
        # add a coloc id for merging
        colocs_bedpe['coloc.id'] = range(len(colocs_bedpe))
        
        # create a pbt with coloc id 
        colocs_pbt = pbt.BedTool.from_dataframe(colocs_bedpe[['chr_snp', 'start_snp', 'end_snp',
                                                              'chr_gene', 'start_gene', 'end_gene',
                                                              'coloc.id']])
        
        # extracting hichip for the current dataset
        hichip_pbt = hichip_pbts[sr['loop_source']]
            
        # intersect the two 
        inter = colocs_pbt.pairtopair(hichip_pbt, slop=res)
        inter = inter.to_dataframe(disable_auto_names=True, header=None)
        
        if len(inter) > 0:
            sgl_data.append(inter)
            print('found overlap')
            

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
found overlap
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
46
47
48
49
50
51
52
53
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
found overlap
147
found overlap
148
149
150
151
152
153
154
155
156
157
158
159
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188


In [16]:
all_colocs = pd.concat(all_colocs_data)

In [17]:
all_colocs

Unnamed: 0,chr,pos,pp_H0_Coloc_Summary,pp_H1_Coloc_Summary,pp_H2_Coloc_Summary,pp_H3_Coloc_Summary,pp_H4_Coloc_Summary,ld_rsID,variant_id,geneName,dist,pvalue,FDR,slope_snp,ref,alt,AC,AF,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,old_pos,was_converted,rsID,main.chr,main.pos,rs_id,LD,coloc.id,gwas_source,eqtl_source,ge_source
0,chr9,4285986,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10758591,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.957052,0,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
1,chr9,4289196,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814914,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.957616,1,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
2,chr9,4290544,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814915,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.885806,2,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
3,chr9,4293150,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814916,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.878018,3,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
4,chr9,4296430,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814917,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,1.000000,4,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
5,chr9,4283682,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10974435,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.930208,5,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
6,chr9,4284961,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs34494309,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.906997,6,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
7,chr9,4282942,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs3892354,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.891468,7,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
8,chr9,4295880,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs4339696,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.956543,8,T1D_34012112_Gaulton,Quach_2016,monocyte_naive
9,chr9,4282536,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs4380994,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.942558,9,T1D_34012112_Gaulton,Quach_2016,monocyte_naive


In [18]:
sgls = pd.concat(sgl_data)
sgls.drop([14, 15, 16], axis=1, inplace=True)
sgls.columns = ['chr_snp', 'start_snp', 'end_snp', 'chr_gene', 'start_gene', 'end_gene',
               'coloc.id', 'chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'loop.id']
sgls = sgls.merge(all_colocs, on='coloc.id', suffixes=['', '.coloc'])
sgls = sgls.merge(hichip, on='loop.id', suffixes=['', '.loop'])

In [19]:
sgls.shape

(6, 73)

In [22]:
sgls

Unnamed: 0,chr_snp,start_snp,end_snp,chr_gene,start_gene,end_gene,coloc.id,chrA,startA,endA,chrB,startB,endB,loop.id,chr,pos,pp_H0_Coloc_Summary,pp_H1_Coloc_Summary,pp_H2_Coloc_Summary,pp_H3_Coloc_Summary,pp_H4_Coloc_Summary,ld_rsID,variant_id,geneName,dist,pvalue,FDR,slope_snp,ref,alt,AC,AF,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,old_pos,was_converted,rsID,main.chr,main.pos,rs_id,LD,gwas_source,eqtl_source,ge_source,chr1,s1,e1,chr2,s2,e2,cc,Coverage1,isPeak1,Bias1,Mapp1,GCContent1,RESites1,Coverage2,isPeak2,Bias2,Mapp2,GCContent2,RESites2,p,exp_cc_Bias,p_Bias,dbinom_Bias,P-Value_Bias,Q-Value_Bias,celltype
0,chr18,12884342,12884343,chr18,11857553,11857554,6,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr9,4284961,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs34494309,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.906997,T1D_34012112_Gaulton,Quach_2016,monocyte_naive,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.41858,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,3e-06,monocyte_naive
1,chr18,12882358,12882359,chr18,11857553,11857554,27,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr18,67538824,0.0,0.0,8.1e-05,0.095307,0.904612,rs1788103,18:69870115:T:C,ENSG00000206052,469227,7.02623e-07,0.000302,-0.407996,T,C,2041,0.407548,5008,0.00473,-0.10064,0.013979,6.04e-13,69870115,1,rs17207042,chr18,67537351,rs17207042,0.996115,T1D_34012112_Gaulton,Quach_2016,monocyte_naive,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.41858,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,3e-06,monocyte_naive
2,chr18,12876392,12876393,chr18,11857553,11857554,30,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr18,67527609,0.0,0.0,8.1e-05,0.095307,0.904612,rs1788232,18:69870115:T:C,ENSG00000206052,469227,7.02623e-07,0.000302,-0.407996,T,C,2041,0.407548,5008,0.00473,-0.10064,0.013979,6.04e-13,69870115,1,rs17207042,chr18,67537351,rs17207042,0.991213,T1D_34012112_Gaulton,Quach_2016,monocyte_naive,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.41858,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,3e-06,monocyte_naive
3,chr18,12876396,12876397,chr18,11857553,11857554,33,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr18,67535184,0.0,0.0,8.1e-05,0.095307,0.904612,rs1790588,18:69870115:T:C,ENSG00000206052,469227,7.02623e-07,0.000302,-0.407996,T,C,2041,0.407548,5008,0.00473,-0.10064,0.013979,6.04e-13,69870115,1,rs17207042,chr18,67537351,rs17207042,0.996115,T1D_34012112_Gaulton,Quach_2016,monocyte_naive,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.41858,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,3e-06,monocyte_naive
4,chr6,91014028,91014029,chr6,89673468,89673469,9,chr6,89670000,89675000,chr6,91005000,91010000,469518,chr9,4282536,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs4380994,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.942558,T1D_34012112_Gaulton,Quach_2016,monocyte_naive,chr6,89670000,89675000,chr6,91005000,91010000,18,3005,1,1.802139,0,0,0,17414,1,10.443413,0,0,0,1.130191e-07,6.483465,4.502776e-08,9.782614e-05,0.0001467332,0.009584,CD4_T-cell_naive
5,chr6,91002493,91002494,chr6,89673468,89673469,15,chr6,89670000,89675000,chr6,91005000,91010000,469518,chr9,4287466,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs7041847,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.94929,T1D_34012112_Gaulton,Quach_2016,monocyte_naive,chr6,89670000,89675000,chr6,91005000,91010000,18,3005,1,1.802139,0,0,0,17414,1,10.443413,0,0,0,1.130191e-07,6.483465,4.502776e-08,9.782614e-05,0.0001467332,0.009584,CD4_T-cell_naive


In [10]:
sgl_fn = os.path.join(outdir, 'coloc_ld_sgls.tsv')
sgls.to_csv(sgl_fn, sep='\t', index=False)