In [1]:
import os 
import pandas as pd
import numpy as np
import subprocess
import glob
import pybedtools as pbt 
from IPython.display import HTML
pd.set_option('display.min_rows', 100) 
pd.set_option('display.max_columns', None)

pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')
pbt.set_tempdir('/mnt/hpcscratch/jreyna/')
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 5000

# make the directory to save our data
outdir = 'results/main/Intersect_T1D_Finemap_GWAS_SNPs_with_HiChIP/'
os.makedirs(outdir, exist_ok=True)
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

## Load Fine Mapped GWAS

In [2]:
major_gwas = ['T1D_32005708', 'T1D_34594039_GCST90018925', 'T1D_34012112_Gaulton']

In [3]:
gwas_glob = 'results/main/finemapping/*/GRCh37/offset_1000000/Summary/sss/FINAL_top_snp_credible_set.txt'
gwas_glob = glob.glob(gwas_glob)
data = []
for fn in gwas_glob:
    
    # get meta data from the path
    path_info = fn.split('/')
    
    if path_info[3] not in major_gwas:
        print('skipped: {}'.format(path_info[3]))
        continue
    
    # get the bin coordinates
    df = pd.read_table(fn)
    df.loc[:, 'bin_start'] = np.floor(df.loc[:, 'position'] / res).astype(int) * res
    df.loc[:, 'bin_end'] = df.loc[:, 'bin_start'] + res
    df = df.loc[(df.allele1.str.len() == 1 ) & (df.allele2.str.len() == 1)]
    df.loc[:, 'gwas_source'] = path_info[3]
    data.append(df)

gwas_df = pd.concat(data)

In [4]:
gwas_df.head()

Unnamed: 0,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start,bin_end,gwas_source
0,1,chr1:113310083-115099755,2336,1:114089649,1,114089649,A,G,0.016251,-0.1324,0.0749,-1.76769,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,0.9614436,114085000,114090000,T1D_34594039_GCST90018925
1,1,chr1:113310083-115099755,2829,1:114270326,1,114270326,A,C,0.260388,-0.1634,0.0209,-7.81818,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.0,114270000,114275000,T1D_34594039_GCST90018925
2,1,chr1:113310083-115099755,5046,1:114909703,1,114909703,T,C,0.131631,-0.0046,0.0261,-0.176245,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,0.5699493,114905000,114910000,T1D_34594039_GCST90018925
3,1,chr1:113310083-115099755,3035,1:114377568,1,114377568,G,A,0.114168,-0.4287,0.0286,-14.9895,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.0,114375000,114380000,T1D_34594039_GCST90018925
4,1,chr1:113310083-115099755,3131,1:114420328,1,114420328,T,C,0.325922,0.132,0.0189,6.98413,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.433164e-12,114420000,114425000,T1D_34594039_GCST90018925


In [5]:
# create a pybedtools for finemap data
gwas_bed = gwas_df.loc[:, ['chromosome','bin_start','bin_end', 'position', 'gwas_source']]
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

In [6]:
gwas_bed.head()

Unnamed: 0,chromosome,bin_start,bin_end,position,gwas_source
0,1,114085000,114090000,114089649,T1D_34594039_GCST90018925
1,1,114270000,114275000,114270326,T1D_34594039_GCST90018925
2,1,114905000,114910000,114909703,T1D_34594039_GCST90018925
3,1,114375000,114380000,114377568,T1D_34594039_GCST90018925
4,1,114420000,114425000,114420328,T1D_34594039_GCST90018925


## Load HiChIP Loops

In [7]:
def parse_seB(x): 
    s,e = x.split(':')[1].split('-')
    e = e.split(',')[0]
    return((s,e))

In [8]:
loops = 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/'
loops += '*/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz'
loops = glob.glob(loops)

# only analyze loop data from main cell types 
loops = ['results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NK/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz']

In [9]:
loop_data = []
for loop in loops:
    print(loop)
    
    # extract cell line
    cline = loop.split('/')[5]
    
    # load and parse the data
    df = pd.read_table(loop, header=None)    
    df.columns = ['chrom', 'startA', 'endA', 'seB', 'e1', 'e2']
    df['chrom'] = df['chrom'].str.replace('chr', '')
    df['startB'], df['endB'] = zip(*df['seB'].apply(parse_seB))
    df['startB'] = df['startB'].astype(int)
    df['startA'] = df['startA'] + 1 - int(res / 2)
    df['endA'] = df['startA'] + res
    df['startB'] = df['startB'] + 1 - int(res / 2)
    df['endB'] = df['startB'] + res
    
    # re-organize the data into bedpe-like
    df = df.iloc[:, [0,1,2,0,6,7,3,4,5]]

    # add cell type
    df['cline'] = cline

    loop_data.append(df) 
loop_df = pd.concat(loop_data)

results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NK/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz


In [10]:
# create a pybedtools for the looping data
loop_bed = loop_df.iloc[:, [0,1,2,3,4,5,-1]]
loop_pbt = pbt.BedTool.from_dataframe(loop_bed)

In [11]:
loop_bed.head()

Unnamed: 0,chrom,startA,endA,chrom.1,startB,endB,cline
0,1,710000,715000,1,1305000,1310000,CD4N
1,1,710000,715000,1,755000,760000,CD4N
2,1,710000,715000,1,760000,765000,CD4N
3,1,710000,715000,1,775000,780000,CD4N
4,1,710000,715000,1,805000,810000,CD4N


## Intersect Fine Mapped GWAS and loops

#### Perform the intersection

In [12]:
intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt, type='either')
gwas_hichip = intersect_pbt.to_dataframe(header=None, disable_auto_names=True)
gwas_hichip = gwas_hichip.iloc[:, [7,8,9,10,0,1,2,3,4,5,6,11]]
loop_cols = ['{}_loop'.format(x) for x in bedpe_cols]
gwas_hichip.columns = ['chr_snp', 'bin_start', 'bin_end', 'pos'] + loop_cols + ['cline_loop', 'gwas_source']

#### Add back fields from the original gwas data

In [13]:
gwas_hichip = gwas_hichip.merge(gwas_df.drop('gwas_source', axis=1),
                                left_on=['chr_snp', 'pos'],
                                right_on=['chromosome', 'position'])
# add the sid
gwas_hichip['sid'] = 'chr' +  gwas_hichip['chr_snp'].astype(str) + ':' + gwas_hichip['position'].astype(str)

In [14]:
gwas_hichip.head()

Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid
0,1,19970000,19975000,19972330,1,19535000,19540000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330
1,1,19970000,19975000,19972330,1,19715000,19720000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330
2,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330
3,1,19970000,19975000,19972330,1,19920000,19925000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330
4,1,19970000,19975000,19972330,1,19925000,19930000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330


#### Add loop ids which are used for unique set analysis downstream

In [15]:
def make_lid(sr, cols):
    lid = sr[cols].tolist()
    lid = [str(x) for x in lid]
    lid = ':'.join(lid)
    return(lid)
lid_cols = [2,3,4,5,6,7]
lids = []
for sr in gwas_hichip.values: 
    new_lid = make_lid(sr, lid_cols)
    lids.append(new_lid)
gwas_hichip['loop_id'] = lids

In [16]:
gwas_hichip.head()

Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id
0,1,19970000,19975000,19972330,1,19535000,19540000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19535000:19540000:1
1,1,19970000,19975000,19972330,1,19715000,19720000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19715000:19720000:1
2,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19810000:19815000:1
3,1,19970000,19975000,19972330,1,19920000,19925000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19920000:19925000:1
4,1,19970000,19975000,19972330,1,19925000,19930000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19925000:19930000:1


In [17]:
gwas_hichip.shape

(2240, 35)

## Intersect PC-HiC with HiChIP Loops

In [18]:
# creating a dictionary which matches 
# hichip and pc-hic data
hichip_pchic_matches = {'monocytes': 'CM',
                        'naive-b': 'NB',
                        'total-b': 'NB',
                        'gms_merged': 'NB', 
                        'naive-cd4': 'CD4N', 
                        'total-cd4': 'CD4N', 
                        'non-activated-total-cd4': 'CD4N',
                        'naive-cd8': 'CD8N',
                        'total-cd8': 'CD8N'} 

In [19]:
# getting a list of pc_hic files 
pc_hics = glob.glob('results/main/pc_hic/2016_javierre/processing/*.bedpe')
pc_hics += glob.glob('results/main/pc_hic/2019_jung/processing/*.bedpe')

# adding a merge id for post fixing
gwas_hichip['mid'] = range(gwas_hichip.shape[0])

# initializing the merge list and column names
pchic_merge_data = []
pchic_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'score', 'pchic_cline']
hichip_bedpe_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                    'chrB_loop', 'startB_loop', 'endB_loop', 'mid']
for fn in pc_hics:
    pchic_cline = os.path.basename(fn).split('.')[0]
    if pchic_cline in hichip_pchic_matches: 
        
        # getting hichip data for the current cell line
        hichip_cline = hichip_pchic_matches[pchic_cline]
        hichip_cline_df = gwas_hichip.loc[gwas_hichip.cline_loop == hichip_cline, hichip_bedpe_cols]
        
        # loading pc-hic data for the current cell line
        
        pchic_cline_df = pd.read_table(fn, names=pchic_cols)
        pchic_cline_df.chrA = pchic_cline_df.chrA.str.replace('chr', '')
        pchic_cline_df.chrB = pchic_cline_df.chrB.str.replace('chr', '')
        
        # intersecting hichip and pc-hic bedpes
        hichip_cline_pbt = pbt.BedTool.from_dataframe(hichip_cline_df)
        pchic_cline_pbt = pbt.BedTool.from_dataframe(pchic_cline_df)
        both_loops = hichip_cline_pbt.pairtopair(pchic_cline_pbt)        
        both_loops = both_loops.to_dataframe(disable_auto_names=True, header=None).iloc[:, 0:15]
        
        if both_loops.shape[0] > 0: 
            pchic_merge_data.append(both_loops)
            print('overlap found: {}'.format(fn))
        else:
            print('overlap not found: {}'.format(fn))
        

overlap found: results/main/pc_hic/2016_javierre/processing/non-activated-total-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/monocytes.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-b.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-cd8.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-b.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-cd8.bedpe
overlap found: results/main/pc_hic/2019_jung/processing/gms_merged.bedpe


In [20]:
pchic_merge_df = pd.concat(pchic_merge_data, axis=0)
pchic_merge_df.columns = ['hichip_chrA', 'hichip_startA', 'hichip_endA',
                    'hichip_chrB', 'hichip_startB', 'hichip_endB', 'hichip_mid', 
                    'pchic_chrA', 'pchic_startA', 'pchic_endA',
                    'pchic_chrB', 'pchic_startB', 'pchic_endB', 'pchic_score', 'pchic_cline']
pchic_merge_df = pchic_merge_df.sort_values('pchic_score', ascending=False)
pchic_merge_df = pchic_merge_df.drop_duplicates(subset=['hichip_mid'], keep='first')

# creating a table of HiChIP ID versus PC-HiC cell versus score (entries) 
pchic_merge_clean = pchic_merge_df[['hichip_mid', 'pchic_score', 'pchic_cline']]
pchic_merge_clean = pchic_merge_clean.pivot(index='hichip_mid', columns='pchic_cline', values='pchic_score')
pchic_merge_clean.columns = 'pchic.' + pchic_merge_clean.columns

In [21]:
# merging and cleaning the merged dataset 
gwas_hichip = gwas_hichip.merge(pchic_merge_clean, left_on='mid', right_on='hichip_mid')
gwas_hichip.drop('mid', axis=1, inplace=True)

## Integrate genes 

### Load the gene data

In [24]:
print('# Load the gene data')

genes_fn = 'results/refs/gencode/v30/gencode.v30.annotation.bed'

# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gname']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode['type'].isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
genes_df = genes_df.iloc[:, [0,1,2,6,5,3]]

# create a copy of the original gene bed before coordinate shrinking
orig_genes_df = genes_df.copy()

# convert the start/end position into start/end for the TSS
# if the gene is + then the start is uses as the tss otherwise
# the end is used as the tss
genes_df.loc[(genes_df.strand == '+'), 'end'] = genes_df.loc[(genes_df.strand == '+'), 'start']
genes_df.loc[(genes_df.strand == '+'), 'start'] = genes_df.loc[(genes_df.strand == '+'), 'start'] - 1
genes_df.loc[(genes_df.strand == '-'), 'end'] = genes_df.loc[(genes_df.strand == '-'), 'end']
genes_df.loc[(genes_df.strand == '-'), 'start'] = genes_df.loc[(genes_df.strand == '-'), 'end'] - 1
genes_df.loc[:, 'chrom'] = genes_df.loc[:, 'chrom'].str.replace('chr', '')
genes_df.loc[:, 'bin_start'] = (np.floor(genes_df.loc[:, 'start'] / res) * res).astype(int)
genes_df.loc[:, 'bin_end'] = genes_df.loc[:, 'bin_start'] + res

# make a genes pbt for intersection
print("# make a genes pbt for intersection")
print(genes_df.head())
genes_pbt = pbt.BedTool.from_dataframe(genes_df).sort()

print('There are {} genes in this GTF-derived file.'.format(genes_df.shape[0]))

# Load the gene data
# make a genes pbt for intersection
   chrom  start    end        gname          gene_id strand  bin_start  \
0      1  11868  11869      DDX11L1  ENSG00000223972      +      10000   
12     1  29569  29570       WASH7P  ENSG00000227232      -      25000   
25     1  17435  17436    MIR6859-1  ENSG00000278267      -      15000   
28     1  29553  29554  MIR1302-2HG  ENSG00000243485      +      25000   
36     1  30365  30366    MIR1302-2  ENSG00000284332      +      30000   

    bin_end  
0     15000  
12    30000  
25    20000  
28    30000  
36    35000  
There are 58825 genes in this GTF-derived file.


### Determine which anchor the SNP falls into

In [25]:
snp_anchor = []
for i, sr in gwas_hichip.iterrows():
    if (sr.startA_loop <= sr.position) & (sr.position <= sr.endA_loop):
        snp_anchor.append('AnchorA')
    elif (sr.startB_loop <= sr.position) & (sr.position <= sr.endB_loop):
        snp_anchor.append('AnchorB')
    else:
        snp_anchor.append('bug')
        print('bug')
        break
gwas_hichip.loc[:, 'snp_anchor'] = snp_anchor

In [26]:
print('SNP anchor designation:', gwas_hichip['snp_anchor'].unique().tolist())

SNP anchor designation: ['AnchorB', 'AnchorA']


### Extract anchors opposite of a SNP anchor

In [27]:
# using a basic serial id for merging post bedtools intersection
gwas_hichip['gh_id'] = range(gwas_hichip.shape[0])

anchor_cols = ['chrB_loop', 'startB_loop', 'endB_loop', 'gh_id']
nonsnp_anchorsA = gwas_hichip.loc[gwas_hichip['snp_anchor'] == 'AnchorA', anchor_cols]
anchor_cols =  ['chrA_loop', 'startA_loop', 'endA_loop', 'gh_id']
nonsnp_anchorsB = gwas_hichip.loc[gwas_hichip['snp_anchor'] == 'AnchorB', anchor_cols]

nonsnp_anchorsA.columns = ['chr', 'start', 'end', 'gh_id']
nonsnp_anchorsB.columns = ['chr', 'start', 'end', 'gh_id']
nonsnp_anchors = pd.concat([nonsnp_anchorsA, nonsnp_anchorsB], axis=0)
nonsnp_anchors_pbt = pbt.BedTool.from_dataframe(nonsnp_anchors)

In [28]:
nonsnp_anchors.head()

Unnamed: 0,chr,start,end,gh_id
9,1,36020000,36025000,9
11,1,36020000,36025000,11
17,1,63985000,63990000,17
18,1,114445000,114450000,18
19,1,114470000,114475000,19


### Intersecting genes on anchors opposing a SNP anchor

In [29]:
gene_overlaps = nonsnp_anchors_pbt.intersect(genes_pbt, wa=True, wb=True)
gene_overlaps = gene_overlaps.to_dataframe(header=None, disable_auto_names=True)

In [30]:
print('The number of anchor gene overlaps is:', gene_overlaps.shape)

The number of anchor gene overlaps is: (142, 12)


In [31]:
gene_overlaps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,10,124765000,124770000,70,10,124766195,124766196,RF00019,ENSG00000199466,+,124765000,124770000
1,12,10360000,10365000,110,12,10361044,10361045,LINC02598,ENSG00000256155,-,10360000,10365000
2,12,10360000,10365000,110,12,10363768,10363769,AC022075.1,ENSG00000245648,+,10360000,10365000
3,12,12850000,12855000,117,12,12850981,12850982,AC007688.1,ENSG00000241352,+,12850000,12855000
4,12,12875000,12880000,120,12,12875498,12875499,RPL13AP20,ENSG00000234498,+,12875000,12880000


### Add gene overlaps to SNP-Loop Pairs

In [32]:
gene_overlaps.columns = ['chrSNP', 'startSNP', 'endSNP', 'gh_id',
                         'chrGene', 'startGene', 'endGene',
                         'genename', 'geneid', 'strand', 'bin_start', 'bin_end']
gwas_hichip_genes = gwas_hichip.merge(gene_overlaps,
                                      on=['gh_id'],
                                      how='left')
gwas_hichip_genes = gwas_hichip_genes.loc[~gwas_hichip_genes.chrSNP.isna()]

#### Make a table of uniq SNPs and Genes

In [36]:
# find the unique SNPs
uniq_snps_by_cells = gwas_hichip_genes.groupby('cline_loop').sid.nunique()
uniq_snps_by_cells = uniq_snps_by_cells.to_frame()

# find the unique genes 
uniq_genes_by_cells = gwas_hichip_genes.groupby('cline_loop').geneid.nunique()
uniq_genes_by_cells = uniq_genes_by_cells.to_frame()

# merge snps and genes
uniq_counts_by_cells = pd.merge(uniq_snps_by_cells, uniq_genes_by_cells, left_index=True, right_index=True)
uniq_counts_by_cells.columns = ['Number of Unique SNPs', 'Number of Unique Genes']
uniq_counts_by_cells.index.name = 'Cell Line'

# save the file
excel_analysis = os.path.join(outdir, 'Unique_Counts_By_Cell_Line.xlsx')
uniq_counts_by_cells.to_excel(excel_analysis, sheet_name='finemapping')

In [37]:
excel_analysis

'results/main/Intersect_T1D_Finemap_GWAS_SNPs_with_HiChIP/Unique_Counts_By_Cell_Line.xlsx'

#### Write the gene list as well

In [38]:
gh_list = gwas_hichip_genes.geneid.unique()
gh_fn = os.path.join(outdir, 'gene_list.txt')
with open(gh_fn, 'w') as fw:
    for x in gh_list:
        fw.write('{}\n'.format(x))

#### Get the unique genes per cell type 

In [39]:
genes_by_cell = gwas_hichip_genes[['cline_loop', 'geneid']].drop_duplicates()
genes_by_cell.sort_values(['cline_loop', 'geneid'], inplace=True)
genes_by_cell['source'] = 'finemap_with_hichip'
genes_by_cell.columns = ['cline', 'geneid', 'source']
fn = os.path.join(outdir, 'genes_by_cell.xlsx')
genes_by_cell.to_excel(fn, index=False)

## Summarize the SNPs, Loops and Intersection

In [40]:
total_gwas = gwas_df.shape[0]
# # summarize the total number of GWAS loops per cell # DOESN't MAKE SENSE
# cell_summary['total_gwas'] = gwas_hichip.groupby('cline_loop').nunique('sid')['chr_snp']
# cell_summary['total_gwas'] = cell_summary['total_gwas'].to_frame()
# cell_summary['total_gwas'].columns = ['Total GWAS SNPs']
# cell_summary['total_gwas']

In [41]:
cell_summary = {}

### Summarize the Number of Loops per Cell (pre-intersection)

In [42]:
cell_summary['total_loops'] = loop_df.groupby('cline').count()['startA'].to_frame()
cell_summary['total_loops'].columns = ['total_hichip']
cell_summary['total_loops']

Unnamed: 0_level_0,total_hichip
cline,Unnamed: 1_level_1
CD4N,114421
CD8N,84599
CM,84298
NB,128288
NK,129890


### Summarize the Number of SNP-Loop (SL) Pairs per Cell

In [43]:
cell_summary['sl_pairs'] = gwas_hichip['cline_loop'].value_counts().to_frame()
cell_summary['sl_pairs'].columns = ['sl_pairs']
cell_summary['sl_pairs']

Unnamed: 0,sl_pairs
NB,443
CD4N,162
CD8N,119
CM,109


### Summarize the Number of Unique GWAS SNPs which Overlap a HiChIP Loop per Cell

In [54]:
cell_summary['uniq_gwas'] = gwas_hichip.groupby('cline_loop')['sid'].nunique().to_frame()
cell_summary['uniq_gwas'].columns = ['uniq_gwas_in_slpairs']
cell_summary['uniq_gwas']

Unnamed: 0_level_0,uniq_gwas_in_slpairs
cline_loop,Unnamed: 1_level_1
CD4N,48
CD8N,48
CM,39
NB,118


### Summarize the Number of Loops with GWAS Overlaps (per cell)

In [55]:
loop_cols = ['chrA_loop', 'startA_loop', 'endA_loop', 'chrB_loop', 'startB_loop', 'endB_loop']
cell_summary['uniq_loops'] = gwas_hichip.groupby('cline_loop')['loop_id'].nunique().to_frame()
cell_summary['uniq_loops'].columns = ['uniq_loops_in_slpairs']
cell_summary['uniq_loops']

Unnamed: 0_level_0,uniq_loops_in_slpairs
cline_loop,Unnamed: 1_level_1
CD4N,117
CD8N,90
CM,95
NB,325


### Construct a large summary table

In [56]:
concat_list = [cell_summary['total_loops'], cell_summary['sl_pairs'],
               cell_summary['uniq_gwas'], cell_summary['uniq_loops']]
summary = pd.concat(concat_list, axis=1)
summary['pct_uniq_gwas_in_slpairs'] = summary['uniq_gwas_in_slpairs'] / total_gwas * 100
summary['pct_uniq_loops_in_slpairs'] = summary['uniq_loops_in_slpairs'] / summary['total_hichip'] * 100

In [57]:
summary

Unnamed: 0,total_hichip,sl_pairs,uniq_gwas_in_slpairs,uniq_loops_in_slpairs,pct_uniq_gwas_in_slpairs,pct_uniq_loops_in_slpairs
CD4N,114421,162.0,48.0,117.0,6.760563,0.102254
CD8N,84599,119.0,48.0,90.0,6.760563,0.106384
CM,84298,109.0,39.0,95.0,5.492958,0.112695
NB,128288,443.0,118.0,325.0,16.619718,0.253336
NK,129890,,,,,


In [58]:
final_summary = summary.copy()

In [59]:
final_colnames = ['Total\\nHiChIP Loops', 
                  'Number of\\nGWAS-Loop Pairs',
                  'Number of\\nUnique GWAS SNPs in GL Pairs', 
                  'Number of\\nUnique loops in GL Pairs',
                  'Percentage of\\nUnique GWAS SNPs in GL Pairs', 
                  'Percentage of\\nUnique loops in GL Pairs']
final_colnames = ['Total HiChIP Loops', 
                  'Number of GWAS-Loop Pairs',
                  'Number of Unique GWAS SNPs in GL Pairs', 
                  'Number of Unique loops in GL Pairs',
                  'Percentage of Unique GWAS SNPs in GL Pairs', 
                  'Percentage of Unique loops in GL Pairs']
final_summary.columns = final_colnames

In [60]:
display(HTML(final_summary.to_html().replace("\\n","<br>")))

Unnamed: 0,Total HiChIP Loops,Number of GWAS-Loop Pairs,Number of Unique GWAS SNPs in GL Pairs,Number of Unique loops in GL Pairs,Percentage of Unique GWAS SNPs in GL Pairs,Percentage of Unique loops in GL Pairs
CD4N,114421,162.0,48.0,117.0,6.760563,0.102254
CD8N,84599,119.0,48.0,90.0,6.760563,0.106384
CM,84298,109.0,39.0,95.0,5.492958,0.112695
NB,128288,443.0,118.0,325.0,16.619718,0.253336
NK,129890,,,,,


## Summarize SGLs with PC-HiC Support

In [62]:
for hichip_cline, cline_df in gwas_hichip.groupby('cline_loop'):
    
    print(hichip_cline)

CD4N
CD8N
CM
NB


In [66]:
gwas_hichip_genes.shape

(142, 57)

In [72]:
pchic_support = []
for i, sr in gwas_hichip_genes.iterrows():
    
    pchic_supp = 0 
    if sr.cline_loop == 'CD4N':
        if sr['pchic.naive-cd4'] >= 5 or sr['pchic.total-cd4'] >= 5 or sr['pchic.non-activated-total-cd4'] > 5:
            pchic_supp = 1 
    elif sr.cline_loop == 'CD8N':
        if sr['pchic.naive-cd8'] >= 5 or sr['pchic.total-cd8'] >= 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'CM':
        if sr['pchic.monocytes'] >= 5:
            pchic_supp = 1             
            
    elif sr.cline_loop == 'NB':
        if sr['pchic.naive-b'] >= 5 or sr['pchic.total-b'] >= 5 or sr['pchic.gms_merged'] >= 5:
            pchic_supp = 1        
            
    pchic_support.append(pchic_supp)
        

In [75]:
gwas_hichip_genes['pchic_support'] = pchic_support

In [79]:
for grp, grp_df in gwas_hichip_genes.groupby('cline_loop'):
    print(grp)
    
    display(grp_df.loc[grp_df.pchic_support == 1])

CD4N


Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id,pchic.gms_merged,pchic.monocytes,pchic.naive-b,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-b,pchic.total-cd4,pchic.total-cd8,snp_anchor,gh_id,chrSNP,startSNP,endSNP,chrGene,startGene,endGene,genename,geneid,strand,bin_start,bin_end,pchic_support
1,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.632917e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19810000:19815000:1,,,,16.519501,,,,,,AnchorB,1,1.0,19810000.0,19815000.0,1.0,19814366.0,19814367.0,AL391883.1,ENSG00000235434,+,19810000.0,19815000.0,1
110,12,9910000,9915000,9914005,12,9910000,9915000,12,10360000,10365000,CD4N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,,,,,9.746942,,AnchorA,110,12.0,10360000.0,10365000.0,12.0,10361044.0,10361045.0,LINC02598,ENSG00000256155,-,10360000.0,10365000.0,1
111,12,9910000,9915000,9914005,12,9910000,9915000,12,10360000,10365000,CD4N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,,,,,9.746942,,AnchorA,110,12.0,10360000.0,10365000.0,12.0,10363768.0,10363769.0,AC022075.1,ENSG00000245648,+,10360000.0,10365000.0,1
118,12,9910000,9915000,9914005,12,9910000,9915000,12,12850000,12855000,CD4N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,,,5.199689,,,,AnchorA,117,12.0,12850000.0,12855000.0,12.0,12850981.0,12850982.0,AC007688.1,ENSG00000241352,+,12850000.0,12855000.0,1
121,12,9910000,9915000,9914005,12,9910000,9915000,12,12875000,12880000,CD4N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,,,13.522526,,,,AnchorA,120,12.0,12875000.0,12880000.0,12.0,12875498.0,12875499.0,RPL13AP20,ENSG00000234498,+,12875000.0,12880000.0,1
123,12,9910000,9915000,9914005,12,9910000,9915000,12,9950000,9955000,CD4N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,25.476183,,,,,,AnchorA,122,12.0,9950000.0,9955000.0,12.0,9951315.0,9951316.0,CLEC12A,ENSG00000172322,+,9950000.0,9955000.0,1
124,12,9910000,9915000,9914005,12,9910000,9915000,12,9950000,9955000,CD4N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,25.476183,,,,,,AnchorA,122,12.0,9950000.0,9955000.0,12.0,9953335.0,9953336.0,CLEC12A-AS1,ENSG00000231560,-,9950000.0,9955000.0,1
189,12,56435000,56440000,56435929,12,56110000,56115000,12,56435000,56440000,CD4N,T1D_32005708,26,chr12:55868078-57109885,1001,12:56435929,12,56435929,G,C,0.4197,-0.2461,0.0238,-10.3403,1.0,11.4253,0.838226,2.39604,0.838226,2.39604,1.0,56435000,56440000,chr12:56435929,56440000:56435929:12:56110000:56115000:12,,,,11.726741,,,,,,AnchorB,184,12.0,56110000.0,56115000.0,12.0,56113904.0,56113905.0,AC034102.4,ENSG00000257553,-,56110000.0,56115000.0,1
194,12,56435000,56440000,56435929,12,56435000,56440000,12,56860000,56865000,CD4N,T1D_32005708,26,chr12:55868078-57109885,1001,12:56435929,12,56435929,G,C,0.4197,-0.2461,0.0238,-10.3403,1.0,11.4253,0.838226,2.39604,0.838226,2.39604,1.0,56435000,56440000,chr12:56435929,56440000:56435929:12:56435000:56440000:12,,,,5.284738,,,,,,AnchorA,189,12.0,56860000.0,56865000.0,12.0,56861374.0,56861375.0,RF00554,ENSG00000212383,+,56860000.0,56865000.0,1
210,12,56435000,56440000,56435412,12,56110000,56115000,12,56435000,56440000,CD4N,T1D_32005708,26,chr12:55868078-57109885,1262,12:56435412,12,56435412,A,G,0.3351,0.2353,0.0244,9.64344,0.909011,4.14824,0.733566,2.27471,0.806994,2.37339,2.620026e-22,56435000,56440000,chr12:56435412,56440000:56435412:12:56110000:56115000:12,,,,11.726741,,,,,,AnchorB,204,12.0,56110000.0,56115000.0,12.0,56113904.0,56113905.0,AC034102.4,ENSG00000257553,-,56110000.0,56115000.0,1


CD8N


Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id,pchic.gms_merged,pchic.monocytes,pchic.naive-b,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-b,pchic.total-cd4,pchic.total-cd8,snp_anchor,gh_id,chrSNP,startSNP,endSNP,chrGene,startGene,endGene,genename,geneid,strand,bin_start,bin_end,pchic_support
5,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD8N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.632917e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19810000:19815000:1,,,,,14.945686,,,,,AnchorB,5,1.0,19810000.0,19815000.0,1.0,19814366.0,19814367.0,AL391883.1,ENSG00000235434,+,19810000.0,19815000.0,1
70,10,124125000,124130000,124128690,10,124125000,124130000,10,124765000,124770000,CD8N,T1D_32005708,18,chr10:123412149-124412149,2031,10:124128690,10,124128690,C,T,0.1877,-0.1123,0.0311,-3.61093,0.195227,2.77323,0.375469,0.868274,1.92325,0.940696,0.9998475,124125000,124130000,chr10:124128690,124130000:124128690:10:124125000:124130000:10,,,,,7.879563,,,,,AnchorA,70,10.0,124765000.0,124770000.0,10.0,124766195.0,124766196.0,RF00019,ENSG00000199466,+,124765000.0,124770000.0,1
131,12,9910000,9915000,9914005,12,9910000,9915000,12,10360000,10365000,CD8N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,,8.932532,,,,,AnchorA,129,12.0,10360000.0,10365000.0,12.0,10361044.0,10361045.0,LINC02598,ENSG00000256155,-,10360000.0,10365000.0,1
132,12,9910000,9915000,9914005,12,9910000,9915000,12,10360000,10365000,CD8N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,,8.932532,,,,,AnchorA,129,12.0,10360000.0,10365000.0,12.0,10363768.0,10363769.0,AC022075.1,ENSG00000245648,+,10360000.0,10365000.0,1
145,12,9910000,9915000,9914005,12,9910000,9915000,12,12875000,12880000,CD8N,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,,,,,,6.026704,AnchorA,142,12.0,12875000.0,12880000.0,12.0,12875498.0,12875499.0,RPL13AP20,ENSG00000234498,+,12875000.0,12880000.0,1
237,12,111880000,111885000,111884608,12,111840000,111845000,12,111880000,111885000,CD8N,T1D_34594039_GCST90018925,6,chr12:111326477-113406415,1112,12:111884608,12,111884608,C,T,0.358577,-0.1433,0.0189,-7.58201,0.993045,5.58789,-0.137929,0.221493,-0.138895,0.221965,1.0,111880000,111885000,chr12:111884608,111885000:111884608:12:111840000:111845000:12,,,,,5.869683,,,,,AnchorB,228,12.0,111840000.0,111845000.0,12.0,111842227.0,111842228.0,MAPKAPK5,ENSG00000089022,+,111840000.0,111845000.0,1
238,12,111880000,111885000,111884608,12,111840000,111845000,12,111880000,111885000,CD8N,T1D_34594039_GCST90018925,6,chr12:111326477-113406415,1112,12:111884608,12,111884608,C,T,0.358577,-0.1433,0.0189,-7.58201,0.993045,5.58789,-0.137929,0.221493,-0.138895,0.221965,1.0,111880000,111885000,chr12:111884608,111885000:111884608:12:111840000:111845000:12,,,,,5.869683,,,,,AnchorB,228,12.0,111840000.0,111845000.0,12.0,111842901.0,111842902.0,MAPKAPK5-AS1,ENSG00000234608,-,111840000.0,111845000.0,1
284,14,69260000,69265000,69260849,14,68735000,68740000,14,69260000,69265000,CD8N,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:68735000:68740000:14,,,,,13.729248,,,,,AnchorB,272,14.0,68735000.0,68740000.0,14.0,68736027.0,68736028.0,RNU6-921P,ENSG00000207089,-,68735000.0,68740000.0,1
375,14,69245000,69250000,69247359,14,69150000,69155000,14,69245000,69250000,CD8N,T1D_32005708,38,chr14:68763341-69811424,243,14:69247359,14,69247359,T,G,0.04,-0.3439,0.0684,-5.02778,0.760816,3.80751,-0.404174,0.24964,-0.531237,0.120047,0.9999998,69245000,69250000,chr14:69247359,69250000:69247359:14:69150000:69155000:14,,,,,7.957766,,,,,AnchorB,361,14.0,69150000.0,69155000.0,14.0,69153149.0,69153150.0,DCAF5,ENSG00000139990,-,69150000.0,69155000.0,1
376,14,69245000,69250000,69247359,14,69150000,69155000,14,69245000,69250000,CD8N,T1D_32005708,38,chr14:68763341-69811424,243,14:69247359,14,69247359,T,G,0.04,-0.3439,0.0684,-5.02778,0.760816,3.80751,-0.404174,0.24964,-0.531237,0.120047,0.9999998,69245000,69250000,chr14:69247359,69250000:69247359:14:69150000:69155000:14,,,,,7.957766,,,,,AnchorB,361,14.0,69150000.0,69155000.0,14.0,69154310.0,69154311.0,AL391262.1,ENSG00000276063,+,69150000.0,69155000.0,1


CM


Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id,pchic.gms_merged,pchic.monocytes,pchic.naive-b,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-b,pchic.total-cd4,pchic.total-cd8,snp_anchor,gh_id,chrSNP,startSNP,endSNP,chrGene,startGene,endGene,genename,geneid,strand,bin_start,bin_end,pchic_support
184,12,9910000,9915000,9914005,12,9910000,9915000,12,10335000,10340000,CM,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,7.315664,,,,,,,,AnchorA,180,12.0,10335000.0,10340000.0,12.0,10338291.0,10338292.0,LINC02617,ENSG00000256288,-,10335000.0,10340000.0,1
185,12,9910000,9915000,9914005,12,9910000,9915000,12,10360000,10365000,CM,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,8.450608,,,,,,,,AnchorA,181,12.0,10360000.0,10365000.0,12.0,10361044.0,10361045.0,LINC02598,ENSG00000256155,-,10360000.0,10365000.0,1
186,12,9910000,9915000,9914005,12,9910000,9915000,12,10360000,10365000,CM,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,8.450608,,,,,,,,AnchorA,181,12.0,10360000.0,10365000.0,12.0,10363768.0,10363769.0,AC022075.1,ENSG00000245648,+,10360000.0,10365000.0,1
368,14,69260000,69265000,69260849,14,68735000,68740000,14,69260000,69265000,CM,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:68735000:68740000:14,,8.479451,,,,,,,,AnchorB,354,14.0,68735000.0,68740000.0,14.0,68736027.0,68736028.0,RNU6-921P,ENSG00000207089,-,68735000.0,68740000.0,1
422,16,28505000,28510000,28505660,16,28505000,28510000,16,28545000,28550000,CM,T1D_32005708,46,chr16:28005660-29115708,734,16:28505660,16,28505660,C,G,0.1402,0.2099,0.0354,5.92938,0.564297,3.14221,3.35152,3.1343,5.93928,1.42815,1.520417e-09,28505000,28510000,chr16:28505660,28510000:28505660:16:28505000:28510000:16,,5.87488,,,,,,,,AnchorA,403,16.0,28545000.0,28550000.0,16.0,28548657.0,28548658.0,AC020765.1,ENSG00000271495,-,28545000.0,28550000.0,1
423,16,28505000,28510000,28505660,16,28505000,28510000,16,28550000,28555000,CM,T1D_32005708,46,chr16:28005660-29115708,734,16:28505660,16,28505660,C,G,0.1402,0.2099,0.0354,5.92938,0.564297,3.14221,3.35152,3.1343,5.93928,1.42815,1.520417e-09,28505000,28510000,chr16:28505660,28510000:28505660:16:28505000:28510000:16,,6.885638,,,,,,,,AnchorA,404,16.0,28550000.0,28555000.0,16.0,28553914.0,28553915.0,SGF29,ENSG00000176476,+,28550000.0,28555000.0,1
424,16,28505000,28510000,28505660,16,28505000,28510000,16,28550000,28555000,CM,T1D_32005708,46,chr16:28005660-29115708,734,16:28505660,16,28505660,C,G,0.1402,0.2099,0.0354,5.92938,0.564297,3.14221,3.35152,3.1343,5.93928,1.42815,1.520417e-09,28505000,28510000,chr16:28505660,28510000:28505660:16:28505000:28510000:16,,6.885638,,,,,,,,AnchorA,404,16.0,28550000.0,28555000.0,16.0,28554139.0,28554140.0,AC020765.2,ENSG00000275441,-,28550000.0,28555000.0,1
455,19,10510000,10515000,10512911,19,10380000,10385000,19,10510000,10515000,CM,T1D_32005708,57,chr19:9963118-10963118,1041,19:10512911,19,10512911,A,G,0.2019,-0.116,0.0297,-3.90572,0.99271,5.40365,-0.032114,0.598005,-0.03235,0.60019,0.999953,10510000,10515000,chr19:10512911,10515000:10512911:19:10380000:10385000:19,,23.980259,,,,,,,,AnchorB,435,19.0,10380000.0,10385000.0,19.0,10380675.0,10380676.0,TYK2,ENSG00000105397,-,10380000.0,10385000.0,1
462,19,10515000,10520000,10516198,19,10380000,10385000,19,10515000,10520000,CM,T1D_32005708,57,chr19:9963118-10963118,2527,19:10516198,19,10516198,G,T,0.1276,0.0765,0.0391,1.95652,0.4232,3.13507,-0.011856,0.405811,-0.028015,0.623445,0.02520186,10515000,10520000,chr19:10516198,10520000:10516198:19:10380000:10385000:19,,19.819178,,,,,,,,AnchorB,442,19.0,10380000.0,10385000.0,19.0,10380675.0,10380676.0,TYK2,ENSG00000105397,-,10380000.0,10385000.0,1
644,6,138245000,138250000,138248841,6,137540000,137545000,6,138245000,138250000,CM,T1D_34012112_Gaulton,72,chr6:137665744-138665744,3571,6:138248841,6,138248841,A,G,0.00142,-0.7314,0.6344,-1.1529,1.0,13.5847,-1.10134,0.168098,-1.10134,0.168098,0.8755244,138245000,138250000,chr6:138248841,138250000:138248841:6:137540000:137545000:6,,11.519827,,,,,,,,AnchorB,621,6.0,137540000.0,137545000.0,6.0,137544371.0,137544372.0,BTF3L4P3,ENSG00000213108,-,137540000.0,137545000.0,1


NB


Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id,pchic.gms_merged,pchic.monocytes,pchic.naive-b,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-b,pchic.total-cd4,pchic.total-cd8,snp_anchor,gh_id,chrSNP,startSNP,endSNP,chrGene,startGene,endGene,genename,geneid,strand,bin_start,bin_end,pchic_support
159,12,9910000,9915000,9914005,12,9910000,9915000,12,10080000,10085000,NB,T1D_32005708,23,chr12:9323140-10724336,3660,12:9914005,12,9914005,A,T,0.2755,0.1526,0.0258,5.91473,0.39425,3.27701,0.32566,0.434477,0.826025,0.25592,1.662115e-09,9910000,9915000,chr12:9914005,9915000:9914005:12:9910000:9915000:12,,,,,,,5.903635,,,AnchorA,156,12.0,10080000.0,10085000.0,12.0,10084185.0,10084186.0,RN7SKP161,ENSG00000223042,+,10080000.0,10085000.0,1
298,14,69260000,69265000,69260849,14,68420000,68425000,14,69260000,69265000,NB,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:68420000:68425000:14,,,,,,,11.474515,,,AnchorB,286,14.0,68420000.0,68425000.0,14.0,68422195.0,68422196.0,PPIAP6,ENSG00000258477,-,68420000.0,68425000.0,1
309,14,69260000,69265000,69260849,14,68735000,68740000,14,69260000,69265000,NB,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:68735000:68740000:14,,,,,,,23.788561,,,AnchorB,297,14.0,68735000.0,68740000.0,14.0,68736027.0,68736028.0,RNU6-921P,ENSG00000207089,-,68735000.0,68740000.0,1
330,14,69260000,69265000,69260849,14,68860000,68865000,14,69260000,69265000,NB,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:68860000:68865000:14,,,,,,,6.44059,,,AnchorB,318,14.0,68860000.0,68865000.0,14.0,68861761.0,68861762.0,MAGOH3P,ENSG00000270975,+,68860000.0,68865000.0,1
331,14,69260000,69265000,69260849,14,68865000,68870000,14,69260000,69265000,NB,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:68865000:68870000:14,,,,,,,13.498287,,,AnchorB,319,14.0,68865000.0,68870000.0,14.0,68869950.0,68869951.0,BLZF2P,ENSG00000258565,-,68865000.0,68870000.0,1
340,14,69260000,69265000,69260849,14,68935000,68940000,14,69260000,69265000,NB,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:68935000:68940000:14,,,5.756092,,,,,,,AnchorB,328,14.0,68935000.0,68940000.0,14.0,68936601.0,68936602.0,BANF1P1,ENSG00000258531,+,68935000.0,68940000.0,1
354,14,69260000,69265000,69260849,14,69150000,69155000,14,69260000,69265000,NB,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:69150000:69155000:14,,,,,,,6.451721,,,AnchorB,342,14.0,69150000.0,69155000.0,14.0,69153149.0,69153150.0,DCAF5,ENSG00000139990,-,69150000.0,69155000.0,1
355,14,69260000,69265000,69260849,14,69150000,69155000,14,69260000,69265000,NB,T1D_32005708,38,chr14:68763341-69811424,1179,14:69260849,14,69260849,T,C,0.0022,0.2212,0.259,0.854054,1.0,13.305,-0.527538,0.118946,-0.527538,0.118946,0.1965375,69260000,69265000,chr14:69260849,69265000:69260849:14:69150000:69155000:14,,,,,,,6.451721,,,AnchorB,342,14.0,69150000.0,69155000.0,14.0,69154310.0,69154311.0,AL391262.1,ENSG00000276063,+,69150000.0,69155000.0,1
446,19,10510000,10515000,10512911,19,10380000,10385000,19,10510000,10515000,NB,T1D_32005708,57,chr19:9963118-10963118,1041,19:10512911,19,10512911,A,G,0.2019,-0.116,0.0297,-3.90572,0.99271,5.40365,-0.032114,0.598005,-0.03235,0.60019,0.999953,10510000,10515000,chr19:10512911,10515000:10512911:19:10380000:10385000:19,,,,,,,18.161489,,,AnchorB,426,19.0,10380000.0,10385000.0,19.0,10380675.0,10380676.0,TYK2,ENSG00000105397,-,10380000.0,10385000.0,1
458,19,10515000,10520000,10516198,19,10380000,10385000,19,10515000,10520000,NB,T1D_32005708,57,chr19:9963118-10963118,2527,19:10516198,19,10516198,G,T,0.1276,0.0765,0.0391,1.95652,0.4232,3.13507,-0.011856,0.405811,-0.028015,0.623445,0.02520186,10515000,10520000,chr19:10516198,10520000:10516198:19:10380000:10385000:19,,,,,,,18.161489,,,AnchorB,438,19.0,10380000.0,10385000.0,19.0,10380675.0,10380676.0,TYK2,ENSG00000105397,-,10380000.0,10385000.0,1


In [80]:
finemap_sgls_fn = os.path.join(outdir, 'finemap_sgls_with_pchic_support.xlsx')
gwas_hichip_genes.to_excel(finemap_sgls_fn, index=False)

In [81]:
gwas_hichip_genes

Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id,pchic.gms_merged,pchic.monocytes,pchic.naive-b,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-b,pchic.total-cd4,pchic.total-cd8,snp_anchor,gh_id,chrSNP,startSNP,endSNP,chrGene,startGene,endGene,genename,geneid,strand,bin_start,bin_end,pchic_support
1,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.084300,0.161800,0.041900,3.861580,0.049140,2.07697,-0.001938,0.063380,-0.039446,0.283315,5.632917e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19810000:19815000:1,,,,16.519501,,,,,,AnchorB,1,1.0,19810000.0,19815000.0,1.0,19814366.0,19814367.0,AL391883.1,ENSG00000235434,+,19810000.0,19815000.0,1
5,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD8N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.084300,0.161800,0.041900,3.861580,0.049140,2.07697,-0.001938,0.063380,-0.039446,0.283315,5.632917e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19810000:19815000:1,,,,,14.945686,,,,,AnchorB,5,1.0,19810000.0,19815000.0,1.0,19814366.0,19814367.0,AL391883.1,ENSG00000235434,+,19810000.0,19815000.0,1
8,1,19970000,19975000,19972330,1,19920000,19925000,1,19970000,19975000,NB,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.084300,0.161800,0.041900,3.861580,0.049140,2.07697,-0.001938,0.063380,-0.039446,0.283315,5.632917e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19920000:19925000:1,0.465962,,,,,,,,,AnchorB,8,1.0,19920000.0,19925000.0,1.0,19923616.0,19923617.0,PLA2G2E,ENSG00000188784,-,19920000.0,19925000.0,0
54,10,6090000,6095000,6092093,10,5885000,5890000,10,6090000,6095000,NB,T1D_34594039_GCST90018925,2,chr10:5598824-6598824,2835,10:6092093,10,6092093,T,C,0.261555,0.083200,0.019300,4.310880,0.074834,2.45529,-0.019558,0.070273,-0.261346,0.052910,8.130276e-06,6090000,6095000,chr10:6092093,6095000:6092093:10:5885000:5890000:10,1.893419,,,,,,,,,AnchorB,54,10.0,5885000.0,5890000.0,10.0,5889905.0,5889906.0,ANKRD16,ENSG00000134461,-,5885000.0,5890000.0,0
56,10,6090000,6095000,6092093,10,6015000,6020000,10,6090000,6095000,NB,T1D_34594039_GCST90018925,2,chr10:5598824-6598824,2835,10:6092093,10,6092093,T,C,0.261555,0.083200,0.019300,4.310880,0.074834,2.45529,-0.019558,0.070273,-0.261346,0.052910,8.130276e-06,6090000,6095000,chr10:6092093,6095000:6092093:10:6015000:6020000:10,1.175976,,,,,,,,,AnchorB,56,10.0,6015000.0,6020000.0,10.0,6017193.0,6017194.0,RF00397,ENSG00000251922,-,6015000.0,6020000.0,0
59,10,6110000,6115000,6113666,10,5885000,5890000,10,6110000,6115000,NB,T1D_32005708,10,chr10:5561479-6623716,811,10:6113666,10,6113666,A,G,0.017400,0.065100,0.087700,0.742303,1.000000,13.43540,-9.320980,0.037785,-9.320980,0.037785,2.289518e-01,6110000,6115000,chr10:6113666,6115000:6113666:10:5885000:5890000:10,0.339606,,,,,,,,,AnchorB,59,10.0,5885000.0,5890000.0,10.0,5889905.0,5889906.0,ANKRD16,ENSG00000134461,-,5885000.0,5890000.0,0
60,10,6110000,6115000,6113666,10,6015000,6020000,10,6110000,6115000,NB,T1D_32005708,10,chr10:5561479-6623716,811,10:6113666,10,6113666,A,G,0.017400,0.065100,0.087700,0.742303,1.000000,13.43540,-9.320980,0.037785,-9.320980,0.037785,2.289518e-01,6110000,6115000,chr10:6113666,6115000:6113666:10:6015000:6020000:10,1.712008,,,,,,,,,AnchorB,60,10.0,6015000.0,6020000.0,10.0,6017193.0,6017194.0,RF00397,ENSG00000251922,-,6015000.0,6020000.0,0
70,10,124125000,124130000,124128690,10,124125000,124130000,10,124765000,124770000,CD8N,T1D_32005708,18,chr10:123412149-124412149,2031,10:124128690,10,124128690,C,T,0.187700,-0.112300,0.031100,-3.610930,0.195227,2.77323,0.375469,0.868274,1.923250,0.940696,9.998475e-01,124125000,124130000,chr10:124128690,124130000:124128690:10:124125000:124130000:10,,,,,7.879563,,,,,AnchorA,70,10.0,124765000.0,124770000.0,10.0,124766195.0,124766196.0,RF00019,ENSG00000199466,+,124765000.0,124770000.0,1
71,11,2020000,2025000,2021075,11,1855000,1860000,11,2020000,2025000,CD4N,T1D_34594039_GCST90018925,4,chr11:1630620-2734690,2057,11:2021075,11,2021075,T,C,0.297199,0.048100,0.029900,1.608700,1.000000,13.52460,-0.096501,0.000001,-0.096501,0.000001,5.384145e-02,2020000,2025000,chr11:2021075,2025000:2021075:11:1855000:1860000:11,,,,3.748032,,,,,,AnchorB,71,11.0,1855000.0,1860000.0,11.0,1859535.0,1859536.0,MIR4298,ENSG00000264493,-,1855000.0,1860000.0,0
74,11,2020000,2025000,2021075,11,1695000,1700000,11,2020000,2025000,NB,T1D_34594039_GCST90018925,4,chr11:1630620-2734690,2057,11:2021075,11,2021075,T,C,0.297199,0.048100,0.029900,1.608700,1.000000,13.52460,-0.096501,0.000001,-0.096501,0.000001,5.384145e-02,2020000,2025000,chr11:2021075,2025000:2021075:11:1695000:1700000:11,,,3.759727,,,,,,,AnchorB,74,11.0,1695000.0,1700000.0,11.0,1697194.0,1697195.0,KRTAP5-6,ENSG00000205864,+,1695000.0,1700000.0,0
