In [39]:
import os 
import pandas as pd
import numpy as np
import subprocess
import glob
import pybedtools as pbt 
from IPython.display import HTML

pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 5000

# make the directory to save our data
outdir = 'results/main/Intersect_T1D_Finemap_GWAS_SNPs_with_HiChIP'
os.makedirs(outdir, exist_ok=True)
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

## Load Fine Mapped GWAS

In [49]:
gwas = 'results/main/finemapping/T1D_34012112_Gaulton/GRCh37/offset_1000000/Summary/sss/FINAL_top_snp_credible_set.txt'
gwas_df = pd.read_table(gwas)
gwas_df.loc[:, 'bin_start'] = np.floor(gwas_df.loc[:, 'position'] / res).astype(int) * res
gwas_df.loc[:, 'bin_end'] = gwas_df.loc[:, 'bin_start'] + res

gwas_bed = gwas_df.iloc[:, [4,5,5]]
gwas_bed.columns = ['chrom', 'start', 'end']
gwas_bed['start'] = gwas_bed['start'] - 1
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

## Load HiChIP Loops

In [50]:
def parse_seB(x): 
    s,e = x.split(':')[1].split('-')
    e = e.split(',')[0]
    return((s,e))

In [51]:
loops = 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/'
loops += '*/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz'
loops = glob.glob(loops)

In [52]:
loops = ['results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NCM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NK/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz']

In [53]:
loop_data = []
for loop in loops:
    print(loop)
    
    cline = loop.split('/')[5]
    df = pd.read_table(loop, header=None)    
    df.columns = ['chrom', 'startA', 'endA', 'seB', 'e1', 'e2']
    
    df['chrom'] = df['chrom'].str.replace('chr', '')
    
    df['startB'], df['endB'] = zip(*df['seB'].apply(parse_seB))
    df['startB'] = df['startB'].astype(int)
    
    df['startA'] = df['startA'] + 1 
    df['endA'] = df['startA'] + res

    df['startB'] = df['startB'] + 1 
    df['endB'] = df['startB'] + res
    
    # re-organize the data into bedpe-like
    df = df.iloc[:, [0,1,2,0,6,7,3,4,5]]

    # add cell type
    df['cline'] = cline

    loop_data.append(df) 

results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NCM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NK/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz


In [54]:
df

Unnamed: 0,chrom,startA,endA,chrom.1,startB,endB,seB,e1,e2,cline
0,1,712500,717500,1,757500,762500,"chr1:757499-757501,39.9592",1,.,NK
1,1,712500,717500,1,762500,767500,"chr1:762499-762501,18.9118",2,.,NK
2,1,712500,717500,1,892500,897500,"chr1:892499-892501,2.57933",3,.,NK
3,1,757500,762500,1,782500,787500,"chr1:782499-782501,3.26331",4,.,NK
4,1,757500,762500,1,947500,952500,"chr1:947499-947501,3.43179",5,.,NK
...,...,...,...,...,...,...,...,...,...,...
129885,9,140602500,140607500,9,140652500,140657500,"chr9:140652499-140652501,2.79712",129886,.,NK
129886,9,140612500,140617500,9,140627500,140632500,"chr9:140627499-140627501,2.12146",129887,.,NK
129887,9,140612500,140617500,9,140632500,140637500,"chr9:140632499-140632501,2.63493",129888,.,NK
129888,9,140612500,140617500,9,140792500,140797500,"chr9:140792499-140792501,3.83825",129889,.,NK


In [8]:
loop_df = pd.concat(loop_data)
loop_bed = loop_df.iloc[:, [0,1,2,3,4,5,-1]]
loop_pbt = pbt.BedTool.from_dataframe(loop_bed)

## Intersect Fine Mapped GWAS and loops

In [9]:
snp_slop = 5000
intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt.slop(b=snp_slop, g=gsizes), type='either')
#intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt.slop(l=snp_slop, r=0, g=gsizes), type='either')
#intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt.slop(b=100000, g=gsizes), type='either')
gwas_hichip = intersect_pbt.to_dataframe()

gwas_hichip = gwas_hichip.iloc[:, [7,8,9,0,1,2,3,4,5,6]]

loop_cols = ['{}_loop'.format(x) for x in bedpe_cols]
gwas_hichip.columns = ['chr_snp', 'start_snp', 'end_snp'] + loop_cols + ['cline_loop']
gwas_hichip['start_snp'] += snp_slop
gwas_hichip['end_snp'] -= snp_slop
gwas_hichip = gwas_hichip.merge(gwas_df, left_on=['chr_snp', 'end_snp'], right_on=['chromosome', 'position'])
gwas_hichip.drop('start_snp', axis=1, inplace=True)
gwas_hichip.rename(columns={'end_snp': 'position_snp'}, inplace=True)
gwas_hichip['sid'] = 'chr' +  gwas_hichip['chr_snp'].astype(str) + ':' + gwas_hichip['position_snp'].astype(str)

# add loop ids
def make_lid(sr, cols):
    lid = sr[cols].tolist()
    lid = [str(x) for x in lid]
    lid = ':'.join(lid)
    return(lid)

lid_cols = [2,3,4,5,6,7]
lids = []
for sr in gwas_hichip.values: 
    new_lid = make_lid(sr, lid_cols)
    lids.append(new_lid)
gwas_hichip['loop_id'] = lids

In [10]:
gwas_hichip.head()

Unnamed: 0,chr_snp,position_snp,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,regionID,...,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,sid,loop_id
0,11,128624171,11,127902500,127912500,11,128627500,128637500,NCM,20,...,2.15558,0.952259,4.80804,0.278615,0.548181,0.292583,0.558105,0.015558,chr11:128624171,11:127902500:127912500:11:128627500:128637500
1,11,128624171,11,128142500,128152500,11,128617500,128627500,NCM,20,...,2.15558,0.952259,4.80804,0.278615,0.548181,0.292583,0.558105,0.015558,chr11:128624171,11:128142500:128152500:11:128617500:128627500
2,11,128624171,11,128142500,128152500,11,128627500,128637500,NCM,20,...,2.15558,0.952259,4.80804,0.278615,0.548181,0.292583,0.558105,0.015558,chr11:128624171,11:128142500:128152500:11:128627500:128637500
3,11,128624171,11,128162500,128172500,11,128622500,128632500,NCM,20,...,2.15558,0.952259,4.80804,0.278615,0.548181,0.292583,0.558105,0.015558,chr11:128624171,11:128162500:128172500:11:128622500:128632500
4,11,128624171,11,128392500,128402500,11,128617500,128627500,NCM,20,...,2.15558,0.952259,4.80804,0.278615,0.548181,0.292583,0.558105,0.015558,chr11:128624171,11:128392500:128402500:11:128617500:128627500


In [11]:
gwas_hichip.shape

(2033, 30)

## Integrate genes 

### Load the gene data

In [12]:
print('# Load the gene data')

genes_fn = 'results/refs/gencode/v30/gencode.v30.annotation.bed'

# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gname']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode['type'].isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
genes_df = genes_df.iloc[:, [0,1,2,6,5,3]]

# create a copy of the original gene bed before coordinate shrinking
orig_genes_df = genes_df.copy()

# convert the start/end position into start/end for the TSS
# if the gene is + then the start is uses as the tss otherwise
# the end is used as the tss
genes_df.loc[(genes_df.strand == '+'), 'end'] = genes_df.loc[(genes_df.strand == '+'), 'start']
genes_df.loc[(genes_df.strand == '+'), 'start'] = genes_df.loc[(genes_df.strand == '+'), 'start'] - 1
genes_df.loc[(genes_df.strand == '-'), 'end'] = genes_df.loc[(genes_df.strand == '-'), 'end']
genes_df.loc[(genes_df.strand == '-'), 'start'] = genes_df.loc[(genes_df.strand == '-'), 'end'] - 1
genes_df.loc[:, 'chrom'] = genes_df.loc[:, 'chrom'].str.replace('chr', '')

# make a genes pbt for intersection
print("# make a genes pbt for intersection")
print(genes_df.head())
genes_pbt = pbt.BedTool.from_dataframe(genes_df).sort()

print('There are {} genes in this GTF-derived file.'.format(genes_df.shape[0]))

# Load the gene data
# make a genes pbt for intersection
   chrom  start    end        gname          gene_id strand
0      1  11868  11869      DDX11L1  ENSG00000223972      +
12     1  29569  29570       WASH7P  ENSG00000227232      -
25     1  17435  17436    MIR6859-1  ENSG00000278267      -
28     1  29553  29554  MIR1302-2HG  ENSG00000243485      +
36     1  30365  30366    MIR1302-2  ENSG00000284332      +
There are 58825 genes in this GTF-derived file.


### Extract anchors opposite of a SNP anchor

In [13]:
snp_anchor = []
for i, sr in gwas_hichip.iterrows():
    if (sr.startA_loop - snp_slop <= sr.position_snp) & (sr.position_snp <= sr.endA_loop + snp_slop):
        snp_anchor.append('AnchorA')
    elif (sr.startB_loop - snp_slop <= sr.position_snp) & (sr.position_snp <= sr.endB_loop + snp_slop):
        snp_anchor.append('AnchorB')
    else:
        snp_anchor.append('bug')
        print('bug')
        break
gwas_hichip.loc[:, 'snp_anchor'] = snp_anchor

In [14]:
nonsnp_anchorsA = gwas_hichip.loc[gwas_hichip['snp_anchor'] == 'AnchorA', 
                                  ['chrB_loop', 'startB_loop', 'endB_loop', 'sid', 'cline_loop']]
nonsnp_anchorsB = gwas_hichip.loc[gwas_hichip['snp_anchor'] == 'AnchorB',
                                  ['chrA_loop', 'startA_loop', 'endA_loop', 'sid', 'cline_loop']]
nonsnp_anchorsA.columns = ['chr', 'start', 'end', 'sid', 'cline_loop']
nonsnp_anchorsB.columns = ['chr', 'start', 'end', 'sid', 'cline_loop']
nonsnp_anchors = pd.concat([nonsnp_anchorsA, nonsnp_anchorsB], axis=0)
nonsnp_anchors_pbt = pbt.BedTool.from_dataframe(nonsnp_anchors)

### Intersecting genes on anchors opposing a SNP anchor

In [15]:
gene_overlaps = nonsnp_anchors_pbt.slop(l=5000, r=0, g=gsizes).intersect(genes_pbt, wa=True, wb=True)
gene_overlaps = gene_overlaps.to_dataframe()

gene_overlaps.columns = ['chrSNP', 'startSNP', 'endSNP', 'sid', 'cline_loop',
                         'chrGene', 'startGene', 'endGene',
                         'genename', 'geneid', 'other']

gwas_hichip_genes = gwas_hichip.merge(gene_overlaps, how='left', on=['sid', 'cline_loop'])
gwas_hichip_genes = gwas_hichip_genes.loc[~gwas_hichip_genes.chrSNP.isna()]

In [16]:
gwas_hichip_genes

Unnamed: 0,chr_snp,position_snp,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,regionID,...,snp_anchor,chrSNP,startSNP,endSNP,chrGene,startGene,endGene,genename,geneid,other
0,11,128624171,11,127902500,127912500,11,128627500,128637500,NCM,20,...,AnchorB,11.0,128507500.0,128522500.0,11.0,128522389.0,128522390.0,MIR6090,ENSG00000276176,+
1,11,128624171,11,127902500,127912500,11,128627500,128637500,NCM,20,...,AnchorB,11.0,128582500.0,128597500.0,11.0,128587557.0,128587558.0,ETS1,ENSG00000134954,-
2,11,128624171,11,127902500,127912500,11,128627500,128637500,NCM,20,...,AnchorB,11.0,128582500.0,128597500.0,11.0,128587557.0,128587558.0,ETS1,ENSG00000134954,-
3,11,128624171,11,127902500,127912500,11,128627500,128637500,NCM,20,...,AnchorB,11.0,128587500.0,128602500.0,11.0,128587557.0,128587558.0,ETS1,ENSG00000134954,-
4,11,128624171,11,128142500,128152500,11,128617500,128627500,NCM,20,...,AnchorB,11.0,128507500.0,128522500.0,11.0,128522389.0,128522390.0,MIR6090,ENSG00000276176,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8846,6,137963070,6,137542500,137552500,6,137962500,137972500,NB,72,...,AnchorB,6.0,137537500.0,137552500.0,6.0,137544371.0,137544372.0,BTF3L4P3,ENSG00000213108,-
8847,6,137963070,6,137542500,137552500,6,137957500,137967500,CM,72,...,AnchorB,6.0,137537500.0,137552500.0,6.0,137544371.0,137544372.0,BTF3L4P3,ENSG00000213108,-
8848,6,137963070,6,137542500,137552500,6,137957500,137967500,CM,72,...,AnchorB,6.0,137537500.0,137552500.0,6.0,137544371.0,137544372.0,BTF3L4P3,ENSG00000213108,-
8849,6,137963070,6,137542500,137552500,6,137962500,137972500,CM,72,...,AnchorB,6.0,137537500.0,137552500.0,6.0,137544371.0,137544372.0,BTF3L4P3,ENSG00000213108,-


In [17]:
# find the unique SNPs
uniq_snps_by_cells = gwas_hichip_genes.groupby('cline_loop').sid.nunique()
uniq_snps_by_cells = uniq_snps_by_cells.to_frame()

# find the unique genes 
uniq_genes_by_cells = gwas_hichip_genes.groupby('cline_loop').genename.nunique()
uniq_genes_by_cells = uniq_genes_by_cells.to_frame()

# merge snps and genes
uniq_counts_by_cells = pd.merge(uniq_snps_by_cells, uniq_genes_by_cells, left_index=True, right_index=True)
uniq_counts_by_cells.columns = ['Number of Unique SNPs', 'Number of Unique Genes']
uniq_counts_by_cells.index.name = 'Cell Line'

# save the file
excel_analysis = os.path.join(outdir, 'Unique_Counts_By_Cell_Line.xlsx')
uniq_counts_by_cells.to_excel(excel_analysis, sheet_name='finemapping')

In [18]:
uniq_genes_by_cells

Unnamed: 0_level_0,genename
cline_loop,Unnamed: 1_level_1
CD4N,31
CD8N,30
CM,7
NB,38
NCM,30
NK,32


#### Write the gene list as well

In [19]:
gh_list = gwas_hichip_genes.geneid.unique()
gh_fn = os.path.join(outdir, 'gene_list.txt')
with open(gh_fn, 'w') as fw:
    for x in gh_list:
        fw.write('{}\n'.format(x))

In [31]:
gene_overlaps

Unnamed: 0,chrSNP,startSNP,endSNP,sid,cline_loop,chrGene,startGene,endGene,genename,geneid,other
0,11,128582500,128597500,chr11:128138867,NCM,11,128587557,128587558,ETS1,ENSG00000134954,-
1,11,128587500,128602500,chr11:128138867,NCM,11,128587557,128587558,ETS1,ENSG00000134954,-
2,11,128582500,128597500,chr11:128138867,NB,11,128587557,128587558,ETS1,ENSG00000134954,-
3,11,128582500,128597500,chr11:128138867,CM,11,128587557,128587558,ETS1,ENSG00000134954,-
4,11,128582500,128597500,chr11:128131013,NB,11,128587557,128587558,ETS1,ENSG00000134954,-
...,...,...,...,...,...,...,...,...,...,...,...
706,6,137537500,137552500,chr6:137963367,CM,6,137544371,137544372,BTF3L4P3,ENSG00000213108,-
707,6,137537500,137552500,chr6:137963367,CM,6,137544371,137544372,BTF3L4P3,ENSG00000213108,-
708,6,137537500,137552500,chr6:137963070,NB,6,137544371,137544372,BTF3L4P3,ENSG00000213108,-
709,6,137537500,137552500,chr6:137963070,CM,6,137544371,137544372,BTF3L4P3,ENSG00000213108,-


## Summarize the SNPs, Loops and Intersection

In [20]:
total_gwas = gwas_df.shape[0]
# # summarize the total number of GWAS loops per cell # DOESN't MAKE SENSE
# cell_summary['total_gwas'] = gwas_hichip.groupby('cline_loop').nunique('sid')['chr_snp']
# cell_summary['total_gwas'] = cell_summary['total_gwas'].to_frame()
# cell_summary['total_gwas'].columns = ['Total GWAS SNPs']
# cell_summary['total_gwas']

In [21]:
cell_summary = {}

### Summarize the Number of Loops per Cell (pre-intersection)

In [22]:
cell_summary['total_loops'] = loop_df.groupby('cline').count()['startA'].to_frame()
cell_summary['total_loops'].columns = ['total_hichip']
cell_summary['total_loops']

Unnamed: 0_level_0,total_hichip
cline,Unnamed: 1_level_1
CD4N,114421
CD8N,84599
CM,84298
NB,128288
NCM,103342
NK,129890


### Summarize the Number of SNP-Loop (SL) Pairs per Cell

In [23]:
cell_summary['sl_pairs'] = gwas_hichip['cline_loop'].value_counts().to_frame()
cell_summary['sl_pairs'].columns = ['sl_pairs']
cell_summary['sl_pairs']

Unnamed: 0,sl_pairs
NB,624
NK,452
CD4N,317
CD8N,304
NCM,217
CM,119


### Summarize the Number of Unique GWAS SNPs which Overlap a HiChIP Loop Cell

In [24]:
cell_summary['uniq_gwas'] = gwas_hichip.groupby('cline_loop')['sid'].nunique().to_frame()
cell_summary['uniq_gwas'].columns = ['uniq_gwas_in_slpairs']
cell_summary['uniq_gwas']

Unnamed: 0_level_0,uniq_gwas_in_slpairs
cline_loop,Unnamed: 1_level_1
CD4N,46
CD8N,39
CM,44
NB,74
NCM,44
NK,53


### Summarize the Number of Loops with GWAS Overlaps (per cell)

In [25]:
loop_cols = ['chrA_loop', 'startA_loop', 'endA_loop', 'chrB_loop', 'startB_loop', 'endB_loop']
cell_summary['uniq_loops'] = gwas_hichip.groupby('cline_loop')['loop_id'].nunique().to_frame()
cell_summary['uniq_loops'].columns = ['uniq_loops_in_slpairs']
cell_summary['uniq_loops']

Unnamed: 0_level_0,uniq_loops_in_slpairs
cline_loop,Unnamed: 1_level_1
CD4N,222
CD8N,186
CM,71
NB,314
NCM,122
NK,289


In [26]:
concat_list = [cell_summary['total_loops'], cell_summary['sl_pairs'],
               cell_summary['uniq_gwas'], cell_summary['uniq_loops']]
summary = pd.concat(concat_list, axis=1)
summary['pct_uniq_gwas_in_slpairs'] = summary['uniq_gwas_in_slpairs'] / total_gwas * 100
summary['pct_uniq_loops_in_slpairs'] = summary['uniq_loops_in_slpairs'] / summary['total_hichip'] * 100

In [27]:
summary

Unnamed: 0,total_hichip,sl_pairs,uniq_gwas_in_slpairs,uniq_loops_in_slpairs,pct_uniq_gwas_in_slpairs,pct_uniq_loops_in_slpairs
CD4N,114421,317,46,222,46.464646,0.19402
CD8N,84599,304,39,186,39.393939,0.219861
CM,84298,119,44,71,44.444444,0.084225
NB,128288,624,74,314,74.747475,0.244762
NCM,103342,217,44,122,44.444444,0.118055
NK,129890,452,53,289,53.535354,0.222496


In [28]:
final_summary = summary.copy()

In [29]:
final_colnames = ['Total\\nHiChIP Loops', 
                  'Number of\\nGWAS-Loop Pairs',
                  'Number of\\nUnique GWAS SNPs in GL Pairs', 
                  'Number of\\nUnique loops in GL Pairs',
                  'Percentage of\\nUnique GWAS SNPs in GL Pairs', 
                  'Percentage of\\nUnique loops in GL Pairs']
final_colnames = ['Total HiChIP Loops', 
                  'Number of GWAS-Loop Pairs',
                  'Number of Unique GWAS SNPs in GL Pairs', 
                  'Number of Unique loops in GL Pairs',
                  'Percentage of Unique GWAS SNPs in GL Pairs', 
                  'Percentage of Unique loops in GL Pairs']
final_summary.columns = final_colnames

In [30]:
display(HTML(final_summary.to_html().replace("\\n","<br>")))

Unnamed: 0,Total HiChIP Loops,Number of GWAS-Loop Pairs,Number of Unique GWAS SNPs in GL Pairs,Number of Unique loops in GL Pairs,Percentage of Unique GWAS SNPs in GL Pairs,Percentage of Unique loops in GL Pairs
CD4N,114421,317,46,222,46.464646,0.19402
CD8N,84599,304,39,186,39.393939,0.219861
CM,84298,119,44,71,44.444444,0.084225
NB,128288,624,74,314,74.747475,0.244762
NCM,103342,217,44,122,44.444444,0.118055
NK,129890,452,53,289,53.535354,0.222496
