In [1]:
import os 
import pandas as pd
import numpy as np
import subprocess
import glob
import pybedtools as pbt 
from IPython.display import HTML
pd.set_option('display.min_rows', 100) 
pd.set_option('display.max_columns', None)

pbt.set_bedtools_path('/mnt/BioHome/jreyna/software/anaconda3/envs/hic_tls/bin/')
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

gsizes = 'results/refs/hg19/hg19.chrom.sizes'
res = 5000

# make the directory to save our data
outdir = 'results/main/Intersect_T1D_Finemap_GWAS_SNPs_with_HiChIP/'
os.makedirs(outdir, exist_ok=True)
bedpe_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']

## Load Fine Mapped GWAS

In [2]:
major_gwas = ['T1D_32005708', 'T1D_34594039_GCST90018925', 'T1D_34012112_Gaulton']

In [3]:
gwas_glob = 'results/main/finemapping/*/GRCh37/offset_1000000/Summary/sss/FINAL_top_snp_credible_set.txt'
gwas_glob = glob.glob(gwas_glob)
data = []
for fn in gwas_glob:
    
    # get meta data from the path
    path_info = fn.split('/')
    
    if path_info[3] not in major_gwas:
        print('skipped: {}'.format(path_info[3]))
        continue
    
    # get the bin coordinates
    df = pd.read_table(fn)
    df.loc[:, 'bin_start'] = np.floor(df.loc[:, 'position'] / res).astype(int) * res
    df.loc[:, 'bin_end'] = df.loc[:, 'bin_start'] + res
    df = df.loc[(df.allele1.str.len() == 1 ) & (df.allele2.str.len() == 1)]
    df.loc[:, 'gwas_source'] = path_info[3]
    data.append(df)

gwas_df = pd.concat(data)

In [4]:
gwas_df.head()

Unnamed: 0,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start,bin_end,gwas_source
0,1,chr1:113310083-115099755,2336,1:114089649,1,114089649,A,G,0.016251,-0.1324,0.0749,-1.76769,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,0.9614436,114085000,114090000,T1D_34594039_GCST90018925
1,1,chr1:113310083-115099755,2829,1:114270326,1,114270326,A,C,0.260388,-0.1634,0.0209,-7.81818,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.0,114270000,114275000,T1D_34594039_GCST90018925
2,1,chr1:113310083-115099755,5046,1:114909703,1,114909703,T,C,0.131631,-0.0046,0.0261,-0.176245,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,0.5699493,114905000,114910000,T1D_34594039_GCST90018925
3,1,chr1:113310083-115099755,3035,1:114377568,1,114377568,G,A,0.114168,-0.4287,0.0286,-14.9895,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.0,114375000,114380000,T1D_34594039_GCST90018925
4,1,chr1:113310083-115099755,3131,1:114420328,1,114420328,T,C,0.325922,0.132,0.0189,6.98413,1.0,13.5586,0.001264,8e-06,0.001264,8e-06,1.433164e-12,114420000,114425000,T1D_34594039_GCST90018925


In [5]:
# create a pybedtools for finemap data
gwas_bed = gwas_df.loc[:, ['chromosome','bin_start','bin_end', 'position', 'gwas_source']]
gwas_pbt = pbt.BedTool.from_dataframe(gwas_bed)

In [6]:
gwas_bed.head()

Unnamed: 0,chromosome,bin_start,bin_end,position,gwas_source
0,1,114085000,114090000,114089649,T1D_34594039_GCST90018925
1,1,114270000,114275000,114270326,T1D_34594039_GCST90018925
2,1,114905000,114910000,114909703,T1D_34594039_GCST90018925
3,1,114375000,114380000,114377568,T1D_34594039_GCST90018925
4,1,114420000,114425000,114420328,T1D_34594039_GCST90018925


## Load HiChIP Loops

In [7]:
def parse_seB(x): 
    s,e = x.split(':')[1].split('-')
    e = e.split(',')[0]
    return((s,e))

In [8]:
loops = 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/'
loops += '*/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz'
loops = glob.glob(loops)

# only analyze loop data from main cell types 
loops = ['results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz',
 'results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NK/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz']

In [9]:
loop_data = []
for loop in loops:
    print(loop)
    
    # extract cell line
    cline = loop.split('/')[5]
    
    # load and parse the data
    df = pd.read_table(loop, header=None)    
    df.columns = ['chrom', 'startA', 'endA', 'seB', 'e1', 'e2']
    df['chrom'] = df['chrom'].str.replace('chr', '')
    df['startB'], df['endB'] = zip(*df['seB'].apply(parse_seB))
    df['startB'] = df['startB'].astype(int)
    df['startA'] = df['startA'] + 1 - int(res / 2)
    df['endA'] = df['startA'] + res
    df['startB'] = df['startB'] + 1 - int(res / 2)
    df['endB'] = df['startB'] + res
    
    # re-organize the data into bedpe-like
    df = df.iloc[:, [0,1,2,0,6,7,3,4,5]]

    # add cell type
    df['cline'] = cline

    loop_data.append(df) 
loop_df = pd.concat(loop_data)

results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD4N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CD8N/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NB/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/CM/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz
results/main/2021_Nikhil_eQTL/Data/FitHiChIP_Loops/NK/FitHiChIP_S/FitHiChIP.interactions_FitHiC_Q0.01_WashU.bed.gz


In [10]:
# create a pybedtools for the looping data
loop_bed = loop_df.iloc[:, [0,1,2,3,4,5,-1]]
loop_pbt = pbt.BedTool.from_dataframe(loop_bed)

In [11]:
loop_bed.head()

Unnamed: 0,chrom,startA,endA,chrom.1,startB,endB,cline
0,1,710000,715000,1,1305000,1310000,CD4N
1,1,710000,715000,1,755000,760000,CD4N
2,1,710000,715000,1,760000,765000,CD4N
3,1,710000,715000,1,775000,780000,CD4N
4,1,710000,715000,1,805000,810000,CD4N


## Intersect Fine Mapped GWAS and loops

#### Perform the intersection

In [12]:
intersect_pbt = loop_pbt.pair_to_bed(gwas_pbt, type='either')
gwas_hichip = intersect_pbt.to_dataframe(header=None, disable_auto_names=True)
gwas_hichip = gwas_hichip.iloc[:, [7,8,9,10,0,1,2,3,4,5,6,11]]
loop_cols = ['{}_loop'.format(x) for x in bedpe_cols]
gwas_hichip.columns = ['chr_snp', 'bin_start', 'bin_end', 'pos'] + loop_cols + ['cline_loop', 'gwas_source']

#### Add back fields from the original gwas data

In [13]:
gwas_hichip = gwas_hichip.merge(gwas_df.drop('gwas_source', axis=1),
                                left_on=['chr_snp', 'pos'],
                                right_on=['chromosome', 'position'])
# add the sid
gwas_hichip['sid'] = 'chr' +  gwas_hichip['chr_snp'].astype(str) + ':' + gwas_hichip['position'].astype(str)

In [14]:
gwas_hichip.head()

Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid
0,1,19970000,19975000,19972330,1,19535000,19540000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330
1,1,19970000,19975000,19972330,1,19715000,19720000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330
2,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330
3,1,19970000,19975000,19972330,1,19920000,19925000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330
4,1,19970000,19975000,19972330,1,19925000,19930000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330


#### Add loop ids which are used for unique set analysis downstream

In [15]:
def make_lid(sr, cols):
    lid = sr[cols].tolist()
    lid = [str(x) for x in lid]
    lid = ':'.join(lid)
    return(lid)
lid_cols = [2,3,4,5,6,7]
lids = []
for sr in gwas_hichip.values: 
    new_lid = make_lid(sr, lid_cols)
    lids.append(new_lid)
gwas_hichip['loop_id'] = lids

In [16]:
gwas_hichip.head()

Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id
0,1,19970000,19975000,19972330,1,19535000,19540000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19535000:19540000:1
1,1,19970000,19975000,19972330,1,19715000,19720000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19715000:19720000:1
2,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19810000:19815000:1
3,1,19970000,19975000,19972330,1,19920000,19925000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19920000:19925000:1
4,1,19970000,19975000,19972330,1,19925000,19930000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19925000:19930000:1


In [17]:
gwas_hichip.shape

(2240, 35)

## Integrate genes 

### Load the gene data

In [18]:
print('# Load the gene data')

genes_fn = 'results/refs/gencode/v30/gencode.v30.annotation.bed'

# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gname']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode['type'].isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
genes_df = genes_df.iloc[:, [0,1,2,6,5,3]]

# create a copy of the original gene bed before coordinate shrinking
orig_genes_df = genes_df.copy()

# convert the start/end position into start/end for the TSS
# if the gene is + then the start is uses as the tss otherwise
# the end is used as the tss
genes_df.loc[(genes_df.strand == '+'), 'end'] = genes_df.loc[(genes_df.strand == '+'), 'start']
genes_df.loc[(genes_df.strand == '+'), 'start'] = genes_df.loc[(genes_df.strand == '+'), 'start'] - 1
genes_df.loc[(genes_df.strand == '-'), 'end'] = genes_df.loc[(genes_df.strand == '-'), 'end']
genes_df.loc[(genes_df.strand == '-'), 'start'] = genes_df.loc[(genes_df.strand == '-'), 'end'] - 1
genes_df.loc[:, 'chrom'] = genes_df.loc[:, 'chrom'].str.replace('chr', '')
genes_df.loc[:, 'bin_start'] = (np.floor(genes_df.loc[:, 'start'] / res) * res).astype(int)
genes_df.loc[:, 'bin_end'] = genes_df.loc[:, 'bin_start'] + res

# make a genes pbt for intersection
print("# make a genes pbt for intersection")
print(genes_df.head())
genes_pbt = pbt.BedTool.from_dataframe(genes_df).sort()

print('There are {} genes in this GTF-derived file.'.format(genes_df.shape[0]))

# Load the gene data
# make a genes pbt for intersection
   chrom  start    end        gname          gene_id strand  bin_start  \
0      1  11868  11869      DDX11L1  ENSG00000223972      +      10000   
12     1  29569  29570       WASH7P  ENSG00000227232      -      25000   
25     1  17435  17436    MIR6859-1  ENSG00000278267      -      15000   
28     1  29553  29554  MIR1302-2HG  ENSG00000243485      +      25000   
36     1  30365  30366    MIR1302-2  ENSG00000284332      +      30000   

    bin_end  
0     15000  
12    30000  
25    20000  
28    30000  
36    35000  
There are 58825 genes in this GTF-derived file.


### Determine the which anchor the SNP falls into

In [19]:
snp_anchor = []
for i, sr in gwas_hichip.iterrows():
    if (sr.startA_loop <= sr.position) & (sr.position <= sr.endA_loop):
        snp_anchor.append('AnchorA')
    elif (sr.startB_loop <= sr.position) & (sr.position <= sr.endB_loop):
        snp_anchor.append('AnchorB')
    else:
        snp_anchor.append('bug')
        print('bug')
        break
gwas_hichip.loc[:, 'snp_anchor'] = snp_anchor

In [20]:
print('SNP anchor designation:', gwas_hichip['snp_anchor'].unique().tolist())

SNP anchor designation: ['AnchorB', 'AnchorA']


### Extract anchors opposite of a SNP anchor

In [21]:
# using a basic serial id for merging post bedtools intersection
gwas_hichip['gh_id'] = range(gwas_hichip.shape[0])

anchor_cols = ['chrB_loop', 'startB_loop', 'endB_loop', 'gh_id']
nonsnp_anchorsA = gwas_hichip.loc[gwas_hichip['snp_anchor'] == 'AnchorA', anchor_cols]
anchor_cols =  ['chrA_loop', 'startA_loop', 'endA_loop', 'gh_id']
nonsnp_anchorsB = gwas_hichip.loc[gwas_hichip['snp_anchor'] == 'AnchorB', anchor_cols]

nonsnp_anchorsA.columns = ['chr', 'start', 'end', 'gh_id']
nonsnp_anchorsB.columns = ['chr', 'start', 'end', 'gh_id']
nonsnp_anchors = pd.concat([nonsnp_anchorsA, nonsnp_anchorsB], axis=0)
nonsnp_anchors_pbt = pbt.BedTool.from_dataframe(nonsnp_anchors)

In [22]:
nonsnp_anchors.head()

Unnamed: 0,chr,start,end,gh_id
22,1,36020000,36025000,22
24,1,36020000,36025000,24
34,1,59760000,59765000,34
35,1,65530000,65535000,35
36,1,63925000,63930000,36


### Intersecting genes on anchors opposing a SNP anchor

In [23]:
gene_overlaps = nonsnp_anchors_pbt.intersect(genes_pbt, wa=True, wb=True)
gene_overlaps = gene_overlaps.to_dataframe(header=None, disable_auto_names=True)

In [24]:
print('The number of anchor gene overlaps is:', gene_overlaps.shape)

The number of anchor gene overlaps is: (375, 12)


In [25]:
gene_overlaps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1,115640000,115645000,68,1,115641969,115641970,VANGL1,ENSG00000173218,+,115640000,115645000
1,10,7830000,7835000,151,10,7833956,7833957,AL353754.1,ENSG00000233990,-,7830000,7835000
2,10,42660000,42665000,166,10,42660727,42660728,VN1R54P,ENSG00000232109,+,42660000,42665000
3,10,42660000,42665000,173,10,42660727,42660728,VN1R54P,ENSG00000232109,+,42660000,42665000
4,10,42660000,42665000,178,10,42660727,42660728,VN1R54P,ENSG00000232109,+,42660000,42665000


### Add gene overlaps to SNP-Loop Pairs

In [26]:
gene_overlaps.columns = ['chrSNP', 'startSNP', 'endSNP', 'gh_id',
                         'chrGene', 'startGene', 'endGene',
                         'genename', 'geneid', 'strand', 'bin_start', 'bin_end']
gwas_hichip_genes = gwas_hichip.merge(gene_overlaps,
                                      on=['gh_id'],
                                      how='left')
gwas_hichip_genes = gwas_hichip_genes.loc[~gwas_hichip_genes.chrSNP.isna()]

In [27]:
gwas_hichip.head()

Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id,snp_anchor,gh_id
0,1,19970000,19975000,19972330,1,19535000,19540000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19535000:19540000:1,AnchorB,0
1,1,19970000,19975000,19972330,1,19715000,19720000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19715000:19720000:1,AnchorB,1
2,1,19970000,19975000,19972330,1,19810000,19815000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19810000:19815000:1,AnchorB,2
3,1,19970000,19975000,19972330,1,19920000,19925000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19920000:19925000:1,AnchorB,3
4,1,19970000,19975000,19972330,1,19925000,19930000,1,19970000,19975000,CD4N,T1D_32005708,1,chr1:19579228-20579228,982,1:19972330,1,19972330,A,G,0.0843,0.1618,0.0419,3.86158,0.04914,2.07697,-0.001938,0.06338,-0.039446,0.283315,5.6e-05,19970000,19975000,chr1:19972330,19975000:19972330:1:19925000:19930000:1,AnchorB,4


In [28]:
gwas_hichip.shape

(2240, 37)

In [29]:
gwas_hichip_genes.gwas_source.unique().tolist()

['T1D_32005708', 'T1D_34594039_GCST90018925', 'T1D_34012112_Gaulton']

#### Make a table of uniq SNPs and Genes

In [30]:
# find the unique SNPs
uniq_snps_by_cells = gwas_hichip_genes.groupby('cline_loop').sid.nunique()
uniq_snps_by_cells = uniq_snps_by_cells.to_frame()

# find the unique genes 
uniq_genes_by_cells = gwas_hichip_genes.groupby('cline_loop').geneid.nunique()
uniq_genes_by_cells = uniq_genes_by_cells.to_frame()

# merge snps and genes
uniq_counts_by_cells = pd.merge(uniq_snps_by_cells, uniq_genes_by_cells, left_index=True, right_index=True)
uniq_counts_by_cells.columns = ['Number of Unique SNPs', 'Number of Unique Genes']
uniq_counts_by_cells.index.name = 'Cell Line'

# save the file
excel_analysis = os.path.join(outdir, 'Unique_Counts_By_Cell_Line.xlsx')
uniq_counts_by_cells.to_excel(excel_analysis, sheet_name='finemapping')

In [31]:
excel_analysis

'results/main/Intersect_T1D_Finemap_GWAS_SNPs_with_HiChIP/Unique_Counts_By_Cell_Line.xlsx'

#### Write the gene list as well

In [32]:
gh_list = gwas_hichip_genes.geneid.unique()
gh_fn = os.path.join(outdir, 'gene_list.txt')
with open(gh_fn, 'w') as fw:
    for x in gh_list:
        fw.write('{}\n'.format(x))

#### Get the unique genes per cell type 

In [33]:
genes_by_cell = gwas_hichip_genes[['cline_loop', 'geneid']].drop_duplicates()
genes_by_cell.sort_values(['cline_loop', 'geneid'], inplace=True)
genes_by_cell['source'] = 'finemap_with_hichip'
genes_by_cell.columns = ['cline', 'geneid', 'source']
fn = os.path.join(outdir, 'genes_by_cell.xlsx')
genes_by_cell.to_excel(fn, index=False)

## Summarize the SNPs, Loops and Intersection

In [34]:
total_gwas = gwas_df.shape[0]
# # summarize the total number of GWAS loops per cell # DOESN't MAKE SENSE
# cell_summary['total_gwas'] = gwas_hichip.groupby('cline_loop').nunique('sid')['chr_snp']
# cell_summary['total_gwas'] = cell_summary['total_gwas'].to_frame()
# cell_summary['total_gwas'].columns = ['Total GWAS SNPs']
# cell_summary['total_gwas']

In [35]:
cell_summary = {}

### Summarize the Number of Loops per Cell (pre-intersection)

In [36]:
cell_summary['total_loops'] = loop_df.groupby('cline').count()['startA'].to_frame()
cell_summary['total_loops'].columns = ['total_hichip']
cell_summary['total_loops']

Unnamed: 0_level_0,total_hichip
cline,Unnamed: 1_level_1
CD4N,114421
CD8N,84599
CM,84298
NB,128288
NK,129890


### Summarize the Number of SNP-Loop (SL) Pairs per Cell

In [37]:
cell_summary['sl_pairs'] = gwas_hichip['cline_loop'].value_counts().to_frame()
cell_summary['sl_pairs'].columns = ['sl_pairs']
cell_summary['sl_pairs']

Unnamed: 0,sl_pairs
NB,712
NK,529
CD4N,411
CM,295
CD8N,293


### Summarize the Number of Unique GWAS SNPs which Overlap a HiChIP Loop Cell

In [38]:
cell_summary['uniq_gwas'] = gwas_hichip.groupby('cline_loop')['sid'].nunique().to_frame()
cell_summary['uniq_gwas'].columns = ['uniq_gwas_in_slpairs']
cell_summary['uniq_gwas']

Unnamed: 0_level_0,uniq_gwas_in_slpairs
cline_loop,Unnamed: 1_level_1
CD4N,91
CD8N,74
CM,79
NB,147
NK,111


### Summarize the Number of Loops with GWAS Overlaps (per cell)

In [39]:
loop_cols = ['chrA_loop', 'startA_loop', 'endA_loop', 'chrB_loop', 'startB_loop', 'endB_loop']
cell_summary['uniq_loops'] = gwas_hichip.groupby('cline_loop')['loop_id'].nunique().to_frame()
cell_summary['uniq_loops'].columns = ['uniq_loops_in_slpairs']
cell_summary['uniq_loops']

Unnamed: 0_level_0,uniq_loops_in_slpairs
cline_loop,Unnamed: 1_level_1
CD4N,287
CD8N,211
CM,226
NB,470
NK,342


In [40]:
concat_list = [cell_summary['total_loops'], cell_summary['sl_pairs'],
               cell_summary['uniq_gwas'], cell_summary['uniq_loops']]
summary = pd.concat(concat_list, axis=1)
summary['pct_uniq_gwas_in_slpairs'] = summary['uniq_gwas_in_slpairs'] / total_gwas * 100
summary['pct_uniq_loops_in_slpairs'] = summary['uniq_loops_in_slpairs'] / summary['total_hichip'] * 100

In [41]:
summary

Unnamed: 0,total_hichip,sl_pairs,uniq_gwas_in_slpairs,uniq_loops_in_slpairs,pct_uniq_gwas_in_slpairs,pct_uniq_loops_in_slpairs
CD4N,114421,411,91,287,12.816901,0.250828
CD8N,84599,293,74,211,10.422535,0.249412
CM,84298,295,79,226,11.126761,0.268097
NB,128288,712,147,470,20.704225,0.366363
NK,129890,529,111,342,15.633803,0.2633


In [42]:
final_summary = summary.copy()

In [43]:
final_colnames = ['Total\\nHiChIP Loops', 
                  'Number of\\nGWAS-Loop Pairs',
                  'Number of\\nUnique GWAS SNPs in GL Pairs', 
                  'Number of\\nUnique loops in GL Pairs',
                  'Percentage of\\nUnique GWAS SNPs in GL Pairs', 
                  'Percentage of\\nUnique loops in GL Pairs']
final_colnames = ['Total HiChIP Loops', 
                  'Number of GWAS-Loop Pairs',
                  'Number of Unique GWAS SNPs in GL Pairs', 
                  'Number of Unique loops in GL Pairs',
                  'Percentage of Unique GWAS SNPs in GL Pairs', 
                  'Percentage of Unique loops in GL Pairs']
final_summary.columns = final_colnames

In [44]:
display(HTML(final_summary.to_html().replace("\\n","<br>")))

Unnamed: 0,Total HiChIP Loops,Number of GWAS-Loop Pairs,Number of Unique GWAS SNPs in GL Pairs,Number of Unique loops in GL Pairs,Percentage of Unique GWAS SNPs in GL Pairs,Percentage of Unique loops in GL Pairs
CD4N,114421,411,91,287,12.816901,0.250828
CD8N,84599,293,74,211,10.422535,0.249412
CM,84298,295,79,226,11.126761,0.268097
NB,128288,712,147,470,20.704225,0.366363
NK,129890,529,111,342,15.633803,0.2633


## Investigate 

In [53]:
gwas_hichip_genes.loc[gwas_hichip_genes.genename == 'ZKSCAN4']

Unnamed: 0,chr_snp,bin_start_x,bin_end_x,pos,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,cline_loop,gwas_source,regionID,GWASLoci,index,rsid,chromosome,position,allele1,allele2,maf,beta,se,z,prob,log10bf,mean,sd,mean_incl,sd_incl,pval,bin_start_y,bin_end_y,sid,loop_id,snp_anchor,gh_id,chrSNP,startSNP,endSNP,chrGene,startGene,endGene,genename,geneid,strand,bin_start,bin_end
