In [1]:
import os 
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
import subprocess
import glob
import pybedtools as pbt 
pbt.set_tempdir('/mnt/hpcscratch/jreyna/')

pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')
import numpy as np

os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

genome_sizes = 'results/refs/hg19/hg19.chrom.sizes'

# make the directory to save our data
outdir = 'results/main/compiled_sgl_approaches/'
os.makedirs(outdir, exist_ok=True)

## Loading SGLs from different sources

In [2]:
coloc_fn = 'results/main/coloc/sgl_intersect/coloc_ld_sgls.tsv'

In [3]:
pieqtl_fn = 'results/main/pieqtls/sgl_intersect/pieqtls_sgls.tsv'

In [4]:
finemap_fn = 'results/main/finemapping/sgl_intersect/finemap_sgls.tsv'

In [5]:
coloc = pd.read_table(coloc_fn)
coloc['sgl_type'] = 'coloc'

pieqtl = pd.read_table(pieqtl_fn)
pieqtl['sgl_type'] = 'pieqtl'

finemap = pd.read_table(finemap_fn)
finemap['sgl_type'] = 'finemap'

# Adding chr to the chromosome columns 
finemap['chrA_loop'] = 'chr' + finemap['chrA_loop'].astype(str)
finemap['chrB_loop'] = 'chr' + finemap['chrB_loop'].astype(str)

## Loading Gene Data

In [6]:
print('# Load the gene data')

res = 5000

genes_fn = 'results/refs/gencode/v30/gencode.v30.annotation.bed'

# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gname']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode['type'].isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
genes_df = genes_df.iloc[:, [0,1,2,6,5,3]]

print(genes_df.head())

print('There are {} genes in this GTF-derived file.'.format(genes_df.shape[0]))

# Load the gene data
   chrom  start    end        gname          gene_id strand
0   chr1  11869  14409      DDX11L1  ENSG00000223972      +
12  chr1  14404  29570       WASH7P  ENSG00000227232      -
25  chr1  17369  17436    MIR6859-1  ENSG00000278267      -
28  chr1  29554  31109  MIR1302-2HG  ENSG00000243485      +
36  chr1  30366  30503    MIR1302-2  ENSG00000284332      +
There are 58825 genes in this GTF-derived file.


In [7]:
coloc

Unnamed: 0,chr_snp,start_snp,end_snp,chr_gene,start_gene,end_gene,coloc.id,chrA,startA,endA,chrB,startB,endB,loop.id,chr,pos,pp_H0_Coloc_Summary,pp_H1_Coloc_Summary,pp_H2_Coloc_Summary,pp_H3_Coloc_Summary,pp_H4_Coloc_Summary,ld_rsID,variant_id,geneName,dist,pvalue,FDR,slope_snp,ref,alt,AC,AF,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,old_pos,was_converted,rsID,main.chr,main.pos,rs_id,LD,chr1,s1,e1,chr2,s2,e2,cc,Coverage1,isPeak1,Bias1,Mapp1,GCContent1,RESites1,Coverage2,isPeak2,Bias2,Mapp2,GCContent2,RESites2,p,exp_cc_Bias,p_Bias,dbinom_Bias,P-Value_Bias,Q-Value_Bias,celltype,sgl_type
0,chr18,12830537,12830538,chr18,11857553,11857554,3,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr9,4293150,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814916,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.878018,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.418580,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,0.000003,monocyte_naive,coloc
1,chr18,12838765,12838766,chr18,11857553,11857554,4,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr9,4296430,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10814917,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,1.000000,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.418580,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,0.000003,monocyte_naive,coloc
2,chr18,12866435,12866436,chr18,11857553,11857554,5,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr9,4283682,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs10974435,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.930208,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.418580,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,0.000003,monocyte_naive,coloc
3,chr18,12884342,12884343,chr18,11857553,11857554,6,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr9,4284961,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs34494309,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.906997,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.418580,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,0.000003,monocyte_naive,coloc
4,chr18,12856907,12856908,chr18,11857553,11857554,15,chr18,11850000,11855000,chr18,12880000,12885000,572433,chr9,4287466,4.728672e-289,9.059926e-287,0.001165,0.222490,0.776345,rs7041847,9:4296430:A:G,ENSG00000107249,51962,2.764420e-02,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.180000e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.949290,chr18,11850000,11855000,chr18,12880000,12885000,22,5889,1,4.418580,0,0,0,6594,1,4.947549,0,0,0,9.957719e-08,4.878204,4.563685e-08,9.383704e-09,1.187546e-08,0.000003,monocyte_naive,coloc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,chr4,123067807,123067808,chr4,122884711,122884712,78,chr4,122870000,122875000,chr4,123070000,123075000,457354,chr8,120096814,6.662446e-83,2.825264e-82,0.018490,0.077506,0.904003,rs2450065,8:119071462:G:C,ENSG00000164761,119577,1.869290e-02,0.517206,-0.262816,G,C,3209,0.640775,5008,0.013207,-0.091563,0.014668,4.310000e-10,119071462,1,rs1485294,chr8,120083701,rs1485294,0.844994,chr4,122870000,122875000,chr4,123070000,123075000,14,226,1,0.135535,0,0,0,1468,1,0.880380,0,0,0,8.717059e-07,2.125524,1.476179e-08,5.260317e-08,6.118760e-08,0.000007,CD4_T-cell_naive,coloc
439,chr4,123087956,123087957,chr4,122884711,122884712,49,chr4,122870000,122875000,chr4,123070000,123075000,457354,chr18,67529964,0.000000e+00,0.000000e+00,0.000081,0.095307,0.904612,rs35977166,18:69870115:T:C,ENSG00000206052,469227,7.026230e-07,0.000302,-0.407996,T,C,2041,0.407548,5008,0.004730,-0.100640,0.013979,6.040000e-13,69870115,1,rs17207042,chr18,67537351,rs17207042,0.995607,chr4,122870000,122875000,chr4,123070000,123075000,14,226,1,0.135535,0,0,0,1468,1,0.880380,0,0,0,8.717059e-07,2.125524,1.476179e-08,5.260317e-08,6.118760e-08,0.000007,CD4_T-cell_naive,coloc
440,chr4,123116994,123116995,chr4,122884711,122884712,76,chr4,122870000,122875000,chr4,123070000,123075000,457354,chr8,120103850,6.662446e-83,2.825264e-82,0.018490,0.077506,0.904003,rs201978307,8:119071462:G:C,ENSG00000164761,119577,1.869290e-02,0.517206,-0.262816,G,C,3209,0.640775,5008,0.013207,-0.091563,0.014668,4.310000e-10,119071462,1,rs1485294,chr8,120083701,rs1485294,0.882144,chr4,122870000,122875000,chr4,123070000,123075000,14,226,1,0.135535,0,0,0,1468,1,0.880380,0,0,0,8.717059e-07,2.125524,1.476179e-08,5.260317e-08,6.118760e-08,0.000007,CD4_T-cell_naive,coloc
441,chr4,123116861,123116862,chr4,122884711,122884712,77,chr4,122870000,122875000,chr4,123070000,123075000,457354,chr8,120097144,6.662446e-83,2.825264e-82,0.018490,0.077506,0.904003,rs2450064,8:119071462:G:C,ENSG00000164761,119577,1.869290e-02,0.517206,-0.262816,G,C,3209,0.640775,5008,0.013207,-0.091563,0.014668,4.310000e-10,119071462,1,rs1485294,chr8,120083701,rs1485294,0.848834,chr4,122870000,122875000,chr4,123070000,123075000,14,226,1,0.135535,0,0,0,1468,1,0.880380,0,0,0,8.717059e-07,2.125524,1.476179e-08,5.260317e-08,6.118760e-08,0.000007,CD4_T-cell_naive,coloc


## Intersecting PC-HiC Data

#### Coloc Approach

In [8]:
# creating a dictionary which matches 
# hichip and pc-hic data
hichip_pchic_matches = {'monocytes': 'monocyte_naive',
                        'naive-b': 'B-cell_naive',
                        'total-b': 'B-cell_naive',
                        'gms_merged': 'B-cell_naive', 
                        'naive-cd4': 'CD4_T-cell_naive', 
                        'total-cd4': 'CD4_T-cell_naive', 
                        'non-activated-total-cd4': 'CD4_T-cell_naive',
                        'naive-cd8': 'CD8_T-cell_naive',
                        'total-cd8': 'CD8_T-cell_naive'} 

In [9]:
# getting a list of pc_hic files 
pc_hics = glob.glob('results/main/pc_hic/2016_javierre/processing/*.bedpe')
pc_hics += glob.glob('results/main/pc_hic/2019_jung/processing/*.bedpe')

# adding a merge id for post fixing
coloc['mid'] = range(coloc.shape[0])

# renaming loop columns
coloc.rename(columns={'chrA': 'chrA_loop', 'startA': 'startA_loop', 'endA': 'endA_loop',
                      'chrB': 'chrB_loop', 'startB': 'startB_loop', 'endB': 'endB_loop',
                      'celltype': 'cline_loop'}, inplace=True)

# initializing the merge list and column names
pchic_merge_data = []
pchic_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'score', 'pchic_cline']
hichip_bedpe_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                    'chrB_loop', 'startB_loop', 'endB_loop', 'mid']
for fn in pc_hics:
    pchic_cline = os.path.basename(fn).split('.')[0]
    
    if pchic_cline in hichip_pchic_matches: 
        
        # getting hichip data for the current cell line
        hichip_cline = hichip_pchic_matches[pchic_cline]
        hichip_cline_df = coloc.loc[coloc.cline_loop == hichip_cline, hichip_bedpe_cols]
        
        # loading pc-hic data for the current cell line
        pchic_cline_df = pd.read_table(fn, names=pchic_cols)
        #pchic_cline_df.chrA = pchic_cline_df.chrA.str.replace('chr', '')
        #pchic_cline_df.chrB = pchic_cline_df.chrB.str.replace('chr', '')
        
        # intersecting hichip and pc-hic bedpes
        hichip_cline_pbt = pbt.BedTool.from_dataframe(hichip_cline_df)
        pchic_cline_pbt = pbt.BedTool.from_dataframe(pchic_cline_df)
        
        
        both_loops = hichip_cline_pbt.pairtopair(pchic_cline_pbt)        
        both_loops = both_loops.to_dataframe(disable_auto_names=True, header=None).iloc[:, 0:15]
        
        if both_loops.shape[0] > 0: 
            pchic_merge_data.append(both_loops)
            print('overlap found: {}'.format(fn))
        else:
            print('overlap not found: {}'.format(fn))
        

overlap found: results/main/pc_hic/2016_javierre/processing/non-activated-total-cd4.bedpe
overlap not found: results/main/pc_hic/2016_javierre/processing/monocytes.bedpe
overlap not found: results/main/pc_hic/2016_javierre/processing/naive-b.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-cd8.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-cd4.bedpe
overlap not found: results/main/pc_hic/2016_javierre/processing/total-b.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-cd8.bedpe
overlap not found: results/main/pc_hic/2019_jung/processing/gms_merged.bedpe


In [10]:
pchic_merge_df = pd.concat(pchic_merge_data, axis=0)
pchic_merge_df.columns = ['hichip_chrA', 'hichip_startA', 'hichip_endA',
                    'hichip_chrB', 'hichip_startB', 'hichip_endB', 'hichip_mid', 
                    'pchic_chrA', 'pchic_startA', 'pchic_endA',
                    'pchic_chrB', 'pchic_startB', 'pchic_endB', 'pchic_score', 'pchic_cline']
pchic_merge_df = pchic_merge_df.sort_values('pchic_score', ascending=False)
pchic_merge_df = pchic_merge_df.drop_duplicates(subset=['hichip_mid'], keep='first')

# creating a table of HiChIP ID versus PC-HiC cell versus score (entries) 
pchic_merge_clean = pchic_merge_df[['hichip_mid', 'pchic_score', 'pchic_cline']]
pchic_merge_clean = pchic_merge_clean.pivot(index='hichip_mid', columns='pchic_cline', values='pchic_score')
pchic_merge_clean.columns = 'pchic.' + pchic_merge_clean.columns

# merging and cleaning the merged dataset 
coloc = coloc.merge(pchic_merge_clean, left_on='mid', right_on='hichip_mid')
coloc.drop('mid', axis=1, inplace=True)

# no overlap with gms_merged so manually adding 
coloc['pchic.monocytes'] = np.nan
coloc['pchic.naive-b'] = np.nan
coloc['pchic.total-b'] = np.nan
coloc['pchic.gms_merged'] = np.nan
coloc['pchic.total-cd8'] = np.nan

In [11]:
# adding column to specify PC-HiC Support
pchic_support = []
for i, sr in coloc.iterrows():
    
    pchic_supp = 0 
    if sr.cline_loop == 'CD4_T-cell_naive':
        if sr['pchic.naive-cd4'] >= 5 or sr['pchic.total-cd4'] >= 5 or sr['pchic.non-activated-total-cd4'] > 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'CD8_T-cell_naive':
        if sr['pchic.naive-cd8'] >= 5 or sr['pchic.total-cd8'] >= 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'monocyte_naive':
        if sr['pchic.monocytes'] >= 5:
            pchic_supp = 1             
            
    elif sr.cline_loop == 'B-cell_naive':
        if sr['pchic.naive-b'] >= 5 or sr['pchic.total-b'] >= 5 or sr['pchic.gms_merged'] >= 5:
            pchic_supp = 1        
    
    pchic_support.append(pchic_supp)
        
coloc['pchic_support'] = pchic_support

In [12]:
coloc = coloc.merge(genes_df[['gene_id', 'gname']], left_on='geneName', right_on='gene_id')

In [13]:
coloc

Unnamed: 0,chr_snp,start_snp,end_snp,chr_gene,start_gene,end_gene,coloc.id,chrA_loop,startA_loop,endA_loop,chrB_loop,startB_loop,endB_loop,loop.id,chr,pos,pp_H0_Coloc_Summary,pp_H1_Coloc_Summary,pp_H2_Coloc_Summary,pp_H3_Coloc_Summary,pp_H4_Coloc_Summary,ld_rsID,variant_id,geneName,dist,pvalue,FDR,slope_snp,ref,alt,AC,AF,AN,slope_se_snp,slope_gwas,slope_se_gwas,pval_nominal,old_pos,was_converted,rsID,main.chr,main.pos,rs_id,LD,chr1,s1,e1,chr2,s2,e2,cc,Coverage1,isPeak1,Bias1,Mapp1,GCContent1,RESites1,Coverage2,isPeak2,Bias2,Mapp2,GCContent2,RESites2,p,exp_cc_Bias,p_Bias,dbinom_Bias,P-Value_Bias,Q-Value_Bias,cline_loop,sgl_type,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-cd4,pchic.monocytes,pchic.naive-b,pchic.total-b,pchic.gms_merged,pchic.total-cd8,pchic_support,gene_id,gname
0,chr6,91002493,91002494,chr6,90320442,90320443,3,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4293150,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs10814916,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.878018,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
1,chr6,90986319,90986320,chr6,90320442,90320443,4,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4296430,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs10814917,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,1.0,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
2,chr6,90995979,90995980,chr6,90320442,90320443,5,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4283682,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs10974435,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.930208,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
3,chr6,90996768,90996769,chr6,90320442,90320443,5,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4283682,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs10974435,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.930208,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
4,chr6,90996768,90996769,chr6,90320442,90320443,5,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4283682,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs10974435,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.930208,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
5,chr6,90968948,90968949,chr6,90320442,90320443,6,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4284961,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs34494309,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.906997,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
6,chr6,90985158,90985159,chr6,90320442,90320443,6,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4284961,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs34494309,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.906997,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
7,chr6,90985158,90985159,chr6,90320442,90320443,6,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4284961,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs34494309,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.906997,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
8,chr6,91002493,91002494,chr6,90320442,90320443,15,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4287466,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs7041847,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.94929,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3
9,chr6,91002493,91002494,chr6,90320442,90320443,15,chr6,90345000,90350000,chr6,91005000,91010000,469593,chr9,4287466,4.728672e-289,9.059926000000001e-287,0.001165,0.22249,0.776345,rs7041847,9:4296430:A:G,ENSG00000107249,51962,0.0276442,0.568634,-0.325938,A,G,1700,0.339457,5008,0.008901,-0.119714,0.013994,1.18e-17,4296430,1,rs10814917,chr9,4296430,rs10814917,0.94929,chr6,90345000,90350000,chr6,91005000,91010000,46,6242,1,3.743412,0,0,0,17414,1,10.443413,0,0,0,2.448258e-07,20.417326,1.417986e-07,4.489987e-07,7.849391e-07,8.028595e-05,CD4_T-cell_naive,coloc,3.418659,,,,,,,,,0,ENSG00000107249,GLIS3


In [14]:
coloc_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                'chrB_loop', 'startB_loop', 'endB_loop', 
                'pos', 'ld_rsID', 'gname', 'gene_id', 'cline_loop', 'sgl_type', 'pchic.monocytes', 
                'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                'pchic.non-activated-total-cd4', 'pchic.total-b',
                'pchic.total-cd4', 'pchic.total-cd8','pchic_support', 'rs_id']

In [15]:
# coloc_cols = ['chrA', 'startA', 'endA',
#                 'chrB', 'startB', 'endB', 
#                 'pos', 'ld_rsID', 'gname', 'gene_id', 'cline_loop', 'sgl_type', 'pchic.monocytes', 
#                 'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
#                 'pchic.non-activated-total-cd4', 'pchic.total-b',
#                 'pchic.total-cd4', 'pchic.total-cd8','pchic_support', 'rs_id']

In [16]:
coloc_min = coloc.loc[:, coloc_cols]

In [17]:
coloc_min.columns = ['chrA', 'startA', 'endA',
                      'chrB', 'startB', 'endB',
                      'pos', 'rsid', 'genename', 'geneid', 
                      'cline', 'sgl_type', 'pchic.monocytes',
                      'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                      'pchic.non-activated-total-cd4', 'pchic.total-b',
                      'pchic.total-cd4', 'pchic.total-cd8','pchic_support', 'lead.rs_id']

In [18]:
coloc_min

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,pos,rsid,genename,geneid,cline,sgl_type,pchic.monocytes,pchic.naive-b,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-b,pchic.total-cd4,pchic.total-cd8,pchic_support,lead.rs_id
0,chr6,90345000,90350000,chr6,91005000,91010000,4293150,rs10814916,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
1,chr6,90345000,90350000,chr6,91005000,91010000,4296430,rs10814917,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
2,chr6,90345000,90350000,chr6,91005000,91010000,4283682,rs10974435,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
3,chr6,90345000,90350000,chr6,91005000,91010000,4283682,rs10974435,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
4,chr6,90345000,90350000,chr6,91005000,91010000,4283682,rs10974435,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
5,chr6,90345000,90350000,chr6,91005000,91010000,4284961,rs34494309,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
6,chr6,90345000,90350000,chr6,91005000,91010000,4284961,rs34494309,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
7,chr6,90345000,90350000,chr6,91005000,91010000,4284961,rs34494309,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
8,chr6,90345000,90350000,chr6,91005000,91010000,4287466,rs7041847,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917
9,chr6,90345000,90350000,chr6,91005000,91010000,4287466,rs7041847,GLIS3,ENSG00000107249,CD4_T-cell_naive,coloc,,,3.418659,,,,,,0,rs10814917


#### Finemap Approach

In [19]:
# creating a dictionary which matches 
# hichip and pc-hic data
hichip_pchic_matches = {'monocytes': 'CM',
                        'naive-b': 'NB',
                        'total-b': 'NB',
                        'gms_merged': 'NB', 
                        'naive-cd4': 'CD4N', 
                        'total-cd4': 'CD4N', 
                        'non-activated-total-cd4': 'CD4N',
                        'naive-cd8': 'CD8N',
                        'total-cd8': 'CD8N'} 

In [20]:
# getting a list of pc_hic files 
pc_hics = glob.glob('results/main/pc_hic/2016_javierre/processing/*.bedpe')
pc_hics += glob.glob('results/main/pc_hic/2019_jung/processing/*.bedpe')

# adding a merge id for post fixing
finemap['mid'] = range(finemap.shape[0])

# initializing the merge list and column names
pchic_merge_data = []
pchic_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'score', 'pchic_cline']
hichip_bedpe_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                    'chrB_loop', 'startB_loop', 'endB_loop', 'mid']
for fn in pc_hics:
    pchic_cline = os.path.basename(fn).split('.')[0]
    if pchic_cline in hichip_pchic_matches: 
        
        # getting hichip data for the current cell line
        hichip_cline = hichip_pchic_matches[pchic_cline]
        hichip_cline_df = finemap.loc[finemap.cline_loop == hichip_cline, hichip_bedpe_cols]
        
        # loading pc-hic data for the current cell line
        pchic_cline_df = pd.read_table(fn, names=pchic_cols)
        #         pchic_cline_df.chrA = pchic_cline_df.chrA.str.replace('chr', '')
        #         pchic_cline_df.chrB = pchic_cline_df.chrB.str.replace('chr', '')
        
        # intersecting hichip and pc-hic bedpes
        hichip_cline_pbt = pbt.BedTool.from_dataframe(hichip_cline_df)
        pchic_cline_pbt = pbt.BedTool.from_dataframe(pchic_cline_df)
        both_loops = hichip_cline_pbt.pairtopair(pchic_cline_pbt)        
        both_loops = both_loops.to_dataframe(disable_auto_names=True, header=None).iloc[:, 0:15]
        
        if both_loops.shape[0] > 0: 
            pchic_merge_data.append(both_loops)
            print('overlap found: {}'.format(fn))
        else:
            print('overlap not found: {}'.format(fn))
        

overlap found: results/main/pc_hic/2016_javierre/processing/non-activated-total-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/monocytes.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-b.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-cd8.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-b.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-cd8.bedpe
overlap not found: results/main/pc_hic/2019_jung/processing/gms_merged.bedpe


In [21]:
pchic_merge_df = pd.concat(pchic_merge_data, axis=0)
pchic_merge_df.columns = ['hichip_chrA', 'hichip_startA', 'hichip_endA',
                    'hichip_chrB', 'hichip_startB', 'hichip_endB', 'hichip_mid', 
                    'pchic_chrA', 'pchic_startA', 'pchic_endA',
                    'pchic_chrB', 'pchic_startB', 'pchic_endB', 'pchic_score', 'pchic_cline']
pchic_merge_df = pchic_merge_df.sort_values('pchic_score', ascending=False)
pchic_merge_df = pchic_merge_df.drop_duplicates(subset=['hichip_mid'], keep='first')

# creating a table of HiChIP ID versus PC-HiC cell versus score (entries) 
pchic_merge_clean = pchic_merge_df[['hichip_mid', 'pchic_score', 'pchic_cline']]
pchic_merge_clean = pchic_merge_clean.pivot(index='hichip_mid', columns='pchic_cline', values='pchic_score')
pchic_merge_clean.columns = 'pchic.' + pchic_merge_clean.columns

# merging and cleaning the merged dataset 
finemap = finemap.merge(pchic_merge_clean, left_on='mid', right_on='hichip_mid')
finemap.drop('mid', axis=1, inplace=True)

# no overlap with gms_merged so manually adding 
finemap['pchic.gms_merged'] = np.nan

In [22]:
# adding column to specify PC-HiC Support
pchic_support = []
for i, sr in finemap.iterrows():
    
    pchic_supp = 0 
    if sr.cline_loop == 'CD4N':
        if sr['pchic.naive-cd4'] >= 5 or sr['pchic.total-cd4'] >= 5 or sr['pchic.non-activated-total-cd4'] > 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'CD8N':
        if sr['pchic.naive-cd8'] >= 5 or sr['pchic.total-cd8'] >= 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'CM':
        if sr['pchic.monocytes'] >= 5:
            pchic_supp = 1             
            
    elif sr.cline_loop == 'NB':
        if sr['pchic.naive-b'] >= 5 or sr['pchic.total-b'] >= 5 or sr['pchic.gms_merged'] >= 5:
            pchic_supp = 1        
    
    pchic_support.append(pchic_supp)
        
finemap['pchic_support'] = pchic_support

In [23]:
# generating the hgvs id to query myvariant 
finemap['hgvs_id'] = finemap['chrA_loop'].astype(str) + ":g." + \
                        finemap['pos'].astype(str) + finemap['allele2'] + '>' + finemap['allele1']

In [24]:
finemap_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                'chrB_loop', 'startB_loop', 'endB_loop', 
                'pos', 'hgvs_id', 'genename', 'geneid', 'cline_loop', 'sgl_type', 'pchic.monocytes', 
                'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                'pchic.non-activated-total-cd4', 'pchic.total-b',
                'pchic.total-cd4', 'pchic.total-cd8','pchic_support']

In [25]:
finemap_min = finemap[finemap_cols]

In [26]:
finemap_min.columns = ['chrA', 'startA', 'endA',
                      'chrB', 'startB', 'endB',
                      'pos', 'hgvs_id', 'genename', 'geneid', 
                      'cline', 'sgl_type', 'pchic.monocytes',
                      'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                      'pchic.non-activated-total-cd4', 'pchic.total-b',
                      'pchic.total-cd4', 'pchic.total-cd8','pchic_support']

#### Pieqtl Approach

In [27]:
# create locus A columns which require startA and endA
def pos_to_bin(pos, res):
    start = int(np.floor(pos / res) * res)
    end = start + res 
    return([start, end])

lociA = pieqtl['pieQTL.Position'].apply(pos_to_bin, res=5000)
lociA = pd.DataFrame(lociA.values.tolist())
lociA.columns = ['startA', 'endA']

# create locus B columns which require startB and endB
def tss_to_bin(tss, res, slop=0):
    start = int(np.floor(tss / res) * res)
    end = start + res 
    return([start, end])

lociB = pieqtl['TSS'].apply(tss_to_bin, res=5000)
lociB = pd.DataFrame(lociB.values.tolist())
lociB.columns = ['startB', 'endB']

# create locus columns to main intersect df 
pieqtl = pd.concat([pieqtl, lociA, lociB], axis=1)

# creating a dictionary which matches 
# hichip and pc-hic data
hichip_pchic_matches = {'monocytes': 'monocyte_naive',
                        'naive-b': 'B-cell_naive',
                        'total-b': 'B-cell_naive',
                        'gms_merged': 'B-cell_naive', 
                        'naive-cd4': 'CD4_T-cell_naive', 
                        'total-cd4': 'CD4_T-cell_naive', 
                        'non-activated-total-cd4': 'CD4_T-cell_naive',
                        'naive-cd8': 'CD8_T-cell_naive',
                        'total-cd8': 'CD8_T-cell_naive'} 

In [28]:
# getting a list of pc_hic files 
pc_hics = glob.glob('results/main/pc_hic/2016_javierre/processing/*.bedpe')
pc_hics += glob.glob('results/main/pc_hic/2019_jung/processing/*.bedpe')

# adding a merge id for post fixing
pieqtl['mid'] = range(pieqtl.shape[0])

# initializing the merge list and column names
pchic_merge_data = []
pchic_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'score', 'pchic_cline']
intersect_bedpe_cols = ['CHR', 'startA', 'endA', 'CHR', 'startB', 'endB', 'mid']

for fn in pc_hics:
    
    pchic_cline = os.path.basename(fn).split('.')[0]
    if pchic_cline in hichip_pchic_matches: 
        
        # getting pieqtl data for the current cell line
        pieqtl_cline = hichip_pchic_matches[pchic_cline]
        pieqtl_cline_df = pieqtl.loc[pieqtl.cline == pieqtl_cline, intersect_bedpe_cols]
        pieqtl_cline_df.iloc[:, 0] = pieqtl_cline_df.iloc[:, 0].str.replace('chr', '')
        pieqtl_cline_df.iloc[:, 3] = pieqtl_cline_df.iloc[:, 0].str.replace('chr', '')
        
        # loading pc-hic data for the current cell line
        pchic_cline_df = pd.read_table(fn, names=pchic_cols)
        pchic_cline_df.chrA = pchic_cline_df.chrA.str.replace('chr', '')
        pchic_cline_df.chrB = pchic_cline_df.chrB.str.replace('chr', '')
        
        # intersecting pieqtl and pc-hic bedpes
        pieqtl_cline_pbt = pbt.BedTool.from_dataframe(pieqtl_cline_df)
        pchic_cline_pbt = pbt.BedTool.from_dataframe(pchic_cline_df)
        both_loops = pieqtl_cline_pbt.pairtopair(pchic_cline_pbt)        
        both_loops = both_loops.to_dataframe(disable_auto_names=True, header=None).iloc[:, 0:15]
        
        if both_loops.shape[0] > 0: 
            pchic_merge_data.append(both_loops)
            print('overlap found: {}'.format(fn))
        else:
            print('overlap not found: {}'.format(fn))
        

overlap found: results/main/pc_hic/2016_javierre/processing/non-activated-total-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/monocytes.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-b.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-cd8.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/total-b.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-cd4.bedpe
overlap found: results/main/pc_hic/2016_javierre/processing/naive-cd8.bedpe
overlap not found: results/main/pc_hic/2019_jung/processing/gms_merged.bedpe


In [29]:
pchic_merge_df = pd.concat(pchic_merge_data, axis=0)
pchic_merge_df.columns = ['hichip_chrA', 'hichip_startA', 'hichip_endA',
                    'hichip_chrB', 'hichip_startB', 'hichip_endB', 'hichip_mid', 
                    'pchic_chrA', 'pchic_startA', 'pchic_endA',
                    'pchic_chrB', 'pchic_startB', 'pchic_endB', 'pchic_score', 'pchic_cline']
pchic_merge_df = pchic_merge_df.sort_values('pchic_score', ascending=False)
pchic_merge_df = pchic_merge_df.drop_duplicates(subset=['hichip_mid'], keep='first')

# creating a table of HiChIP ID versus PC-HiC cell versus score (entries) 
pchic_merge_clean = pchic_merge_df[['hichip_mid', 'pchic_score', 'pchic_cline']]
pchic_merge_clean = pchic_merge_clean.pivot(index='hichip_mid', columns='pchic_cline', values='pchic_score')
pchic_merge_clean.columns = 'pchic.' + pchic_merge_clean.columns

# merging and cleaning the merged dataset 
pieqtl = pieqtl.merge(pchic_merge_clean, left_on='mid', right_on='hichip_mid')
pieqtl.drop('mid', axis=1, inplace=True)

pchic_support = []
for i, sr in pieqtl.iterrows():
    
    pchic_supp = 0 
    if sr.cline == 'CD4_T-cell_naive':
        if sr['pchic.naive-cd4'] >= 5 or sr['pchic.total-cd4'] >= 5 or sr['pchic.non-activated-total-cd4'] > 5:
            pchic_supp = 1 
    elif sr.cline == 'CD8_T-cell_naive':
        if sr['pchic.naive-cd8'] >= 5 or sr['pchic.total-cd8'] >= 5:
            pchic_supp = 1 
    elif sr.cline == 'monocyte_naive':
        if sr['pchic.monocytes'] >= 5:
            pchic_supp = 1             
            
    elif sr.cline == 'B-cell_naive':
        if sr['pchic.naive-b'] >= 5 or sr['pchic.total-b'] >= 5: #or sr['pchic.gms_merged'] >= 5:
            pchic_supp = 1        
            
    pchic_support.append(pchic_supp)
        
pieqtl['pchic_support'] = pchic_support

In [30]:
# generating the hgvs id to query myvariant 
pieqtl['hgvs_id'] = pieqtl['CHR'].astype(str) + ":g." + \
                        pieqtl['POS'].astype(str) + pieqtl['ref'] + '>' + pieqtl['alt']

In [31]:
pieqtl_cols = ['CHR', 'startA', 'endA',
               'CHR', 'startB', 'endB',
               'POS', 'hgvs_id', 'genename', 'geneid', 'cline', 'sgl_type', 'pchic.monocytes',
                'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                'pchic.non-activated-total-cd4', 'pchic.total-b',
                'pchic.total-cd4', 'pchic.total-cd8','pchic_support']

In [32]:
pieqtl_min = pieqtl[pieqtl_cols]

In [33]:
pieqtl_min.columns = ['chrA', 'startA', 'endA',
                      'chrB', 'startB', 'endB',
                      'pos', 'hgvs_id', 'genename', 'geneid', 
                      'cline', 'sgl_type', 'pchic.monocytes',
                      'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                      'pchic.non-activated-total-cd4', 'pchic.total-b',
                      'pchic.total-cd4', 'pchic.total-cd8','pchic_support']

## Concating and comparing

In [34]:
genes_fn = 'results/refs/gencode/v30/gencode.v30.annotation.grch37.bed'

# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gene_name', 'subtype']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode['type'].isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
# genes_df = genes_df.iloc[:, [0,1,2,6,5,3]]

In [35]:
# combining finemap nd pieqtls first
agg_sgls = pd.concat([finemap_min, pieqtl_min])
agg_sgls = agg_sgls.merge(genes_df[['gene_id', 'subtype']], left_on='geneid', right_on='gene_id')

In [36]:
import myvariant

# generate a dictionary of hgvs id to rsid
mv = myvariant.MyVariantInfo()
query = mv.getvariants(agg_sgls.hgvs_id, fields=['dbsnp.rsid'])

hgvs_to_rsid = {}
for rec in query:
    
    if 'dbsnp' in rec:
        hgvs_to_rsid[rec['query']] = rec['dbsnp']['rsid']
    else:
        hgvs_to_rsid[rec['query']] = 'Not Found'

agg_sgls['rsid'] = agg_sgls['hgvs_id'].replace(hgvs_to_rsid)

querying 1-355...done.


In [37]:
agg_sgls

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,pos,hgvs_id,genename,geneid,cline,sgl_type,pchic.monocytes,pchic.naive-b,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-b,pchic.total-cd4,pchic.total-cd8,pchic_support,gene_id,subtype,rsid
0,chr1,19810000,19815000,chr1,19970000,19975000,19972330,chr1:g.19972330G>A,AL391883.1,ENSG00000235434,CD4N,finemap,,,16.519501,,,,,,1,ENSG00000235434,antisense,rs79540600
1,chr1,19810000,19815000,chr1,19970000,19975000,19972330,chr1:g.19972330G>A,AL391883.1,ENSG00000235434,CD8N,finemap,,,,14.945686,,,,,1,ENSG00000235434,antisense,rs79540600
2,chr10,124125000,124130000,chr10,124765000,124770000,124128690,chr10:g.124128690T>C,RF00019,ENSG00000199466,CD8N,finemap,,,,7.879563,,,,,1,ENSG00000199466,misc_RNA,rs11816578
3,chr11,1855000,1860000,chr11,2020000,2025000,2021075,chr11:g.2021075C>T,MIR4298,ENSG00000264493,CD4N,finemap,,,3.748032,,,,,,0,ENSG00000264493,miRNA,rs2107425
4,chr11,1855000,1860000,chr11,2020000,2025000,2021075,chr11:g.2021075C>T,MIR4298,ENSG00000264493,NB,finemap,,,,,,3.428174,,,0,ENSG00000264493,miRNA,rs2107425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,chr6,31145000,31150000,chr6,31125000,31130000,31149520,chr6:g.31149520G>A,CCHCR1,ENSG00000204536,CD4_T-cell_naive,pieqtl,,,,,2.386576,,,,0,ENSG00000204536,protein_coding,rs3130508
351,chr6,32220000,32225000,chr6,32335000,32340000,32224388,chr6:g.32224388C>A,TSBP1,ENSG00000204296,B-cell_naive,pieqtl,,6.253983,,,,,,,1,ENSG00000204296,protein_coding,rs9268005
352,chr6,32220000,32225000,chr6,32335000,32340000,32224388,chr6:g.32224388C>A,TSBP1,ENSG00000204296,B-cell_naive,pieqtl,,6.253983,,,,,,,1,ENSG00000204296,protein_coding,rs9268005
353,chr6,26535000,26540000,chr6,26420000,26425000,26537801,chr6:g.26537801G>A,BTN2A3P,ENSG00000124549,CD8_T-cell_naive,pieqtl,,,,2.169706,,,,,0,ENSG00000124549,transcribed_unprocessed_pseudogene,rs6920256


In [38]:
# add the colocs 
agg_sgls.drop('hgvs_id', axis=1, inplace=True)
tmp_coloc_min = coloc_min.merge(genes_df[['gene_id', 'subtype']], left_on='geneid', right_on='gene_id')
agg_sgls = pd.concat([agg_sgls, tmp_coloc_min])

In [39]:
agg_gene_grps = agg_sgls.groupby('subtype')

In [40]:
list(agg_gene_grps.groups.keys())

['antisense',
 'lincRNA',
 'miRNA',
 'misc_RNA',
 'processed_pseudogene',
 'protein_coding',
 'snRNA',
 'snoRNA',
 'transcribed_unitary_pseudogene',
 'transcribed_unprocessed_pseudogene',
 'unprocessed_pseudogene']

In [41]:
# creating a dictionary which matches 
# hichip and pc-hic data
dice_to_sgl_names = {'CM': 'monocyte_naive',
                        'NB': 'B-cell_naive', 
                        'CD4N': 'CD4_T-cell_naive',
                        'CD8N': 'CD8_T-cell_naive'}

In [42]:
agg_sgls.cline = agg_sgls.cline.replace(dice_to_sgl_names)

## Focusing on protein coding genes

In [43]:
prots_only = agg_gene_grps.get_group('protein_coding')
prots_only.sort_values(['geneid', 'cline', 'sgl_type'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prots_only.sort_values(['geneid', 'cline', 'sgl_type'], inplace=True)


In [44]:
sig_prots = prots_only.loc[prots_only.pchic_support == 1]

In [45]:
g = sig_prots.groupby(['cline', 'sgl_type'])
sgl_count_per_cell = g.genename.value_counts()
sgl_count_per_cell = sgl_count_per_cell.to_frame()
sgl_count_per_cell.columns = ['sgl_counts']
sgl_count_per_cell.reset_index(inplace=True)

In [46]:
sgl_count_per_cell.sort_values('sgl_counts', ascending=False)

Unnamed: 0,cline,sgl_type,genename,sgl_counts
30,CD8_T-cell_naive,pieqtl,LST1,25
31,CD8_T-cell_naive,pieqtl,BACH2,18
32,CD8_T-cell_naive,pieqtl,ZKSCAN4,13
4,B-cell_naive,pieqtl,BTN2A1,12
8,CD4_T-cell_naive,coloc,GLIS3,10
33,CD8_T-cell_naive,pieqtl,LY6G5B,9
42,monocyte_naive,pieqtl,TRIM27,8
5,B-cell_naive,pieqtl,BTNL2,7
43,monocyte_naive,pieqtl,LY6G5C,4
18,CD4_T-cell_naive,pieqtl,LY6G5C,2


In [47]:
sgl_count_per_cell.genename.nunique()

35

## Intersecting Dice Gene Expression

In [48]:
ge_list = ['results/main/dice_gene_expression/CD4N_TPM.csv', 
           'results/main/dice_gene_expression/CD8N_TPM.csv', 
           'results/main/dice_gene_expression/CM_TPM.csv', 
           'results/main/dice_gene_expression/NB_TPM.csv']

In [49]:
dice_expr_list = []
for ge_fn in ge_list:
    print(ge_fn)

    df = pd.read_table(ge_fn, sep=',')
    cline = ge_fn.split('/')[-1].split('_')[0]
    df['cline'] = dice_to_sgl_names[cline]
    
    df.drop(['Transcript_Length(bp)', 'Additional_annotations'], axis=1, inplace=True)
    df.set_index('Feature_name', inplace=True)
    df = df.median(axis=1).to_frame()
    df.columns = ['ge.' + dice_to_sgl_names[cline]]
    
    dice_expr_list.append(df)

# concat all of the gene expression data
dice_expr = pd.concat(dice_expr_list, axis=1)
dice_expr.index = dice_expr.index.str.replace('\.[A-Za-z0-9_-]*$', '')

results/main/dice_gene_expression/CD4N_TPM.csv


  df = df.median(axis=1).to_frame()


results/main/dice_gene_expression/CD8N_TPM.csv


  df = df.median(axis=1).to_frame()


results/main/dice_gene_expression/CM_TPM.csv


  df = df.median(axis=1).to_frame()


results/main/dice_gene_expression/NB_TPM.csv


  df = df.median(axis=1).to_frame()
  dice_expr.index = dice_expr.index.str.replace('\.[A-Za-z0-9_-]*$', '')


In [50]:
sig_prots = sig_prots.merge(dice_expr, left_on='geneid', right_index=True)

In [51]:
# calculating the specfic expression approach
ge_cell_specific = []
for i, sr in sig_prots.iterrows():
    
    ge_spec = 0
    if sr.cline == 'CD4_T-cell_naive' and sr['ge.CD4_T-cell_naive'] > 8:
        ge_spec = 1 
        
    elif sr.cline == 'CD8_T-cell_naive' and sr['ge.CD8_T-cell_naive'] > 8:
        ge_spec = 1 
        
    elif sr.cline == 'monocyte_naive' and sr['ge.monocyte_naive'] > 8:
        ge_spec = 1 
        
    elif sr.cline == 'B-cell_naive' and sr['ge.B-cell_naive'] > 8:
        ge_spec = 1 
    
    ge_cell_specific.append(ge_spec)
        
sig_prots['ge_support'] = ge_cell_specific

In [52]:
# making a final table
expr_sig_prots = sig_prots.loc[sig_prots.ge_support == 1]
expr_sig_prots.sort_values(['genename', 'cline'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expr_sig_prots.sort_values(['genename', 'cline'], inplace=True)


In [53]:
single_reps = expr_sig_prots.drop_duplicates(['genename', 'cline'])
pd.set_option('display.max_rows', None)
display(single_reps)
pd.set_option('display.max_rows', 100)

Unnamed: 0,chrA,startA,endA,chrB,startB,endB,pos,genename,geneid,cline,sgl_type,pchic.monocytes,pchic.naive-b,pchic.naive-cd4,pchic.naive-cd8,pchic.non-activated-total-cd4,pchic.total-b,pchic.total-cd4,pchic.total-cd8,pchic_support,gene_id,subtype,rsid,lead.rs_id,ge.CD4_T-cell_naive,ge.CD8_T-cell_naive,ge.monocyte_naive,ge.B-cell_naive,ge_support
79,chr19,47110000,47115000,chr19,47220000,47225000,47223949,AC008755.1,ENSG00000130749,B-cell_naive,finemap,,8.163704,,,,,,,1,ENSG00000130749,protein_coding,rs313841,,22.902918,24.631782,13.804148,19.079346,1
240,chr6,31800000,31805000,chr6,31580000,31585000,31801233,AIF1,ENSG00000204472,CD4_T-cell_naive,pieqtl,,,6.593487,,,,,,1,ENSG00000204472,protein_coding,rs2736430,,101.321612,135.829169,1410.817016,1.253391,1
265,chr6,90975000,90980000,chr6,91005000,91010000,90976768,BACH2,ENSG00000112182,CD4_T-cell_naive,pieqtl,,,9.281679,,,,,,1,ENSG00000112182,protein_coding,rs72928038,,111.125622,111.144486,0.129748,236.421361,1
261,chr6,90885000,90890000,chr6,91005000,91010000,90886824,BACH2,ENSG00000112182,CD8_T-cell_naive,pieqtl,,,,14.328445,,,,,1,ENSG00000112182,protein_coding,rs10806423,,111.125622,111.144486,0.129748,236.421361,1
320,chr6,26530000,26535000,chr6,26455000,26460000,26533664,BTN2A1,ENSG00000112763,B-cell_naive,pieqtl,,,,,,5.462889,,,1,ENSG00000112763,protein_coding,rs10946834,,113.796337,119.28766,63.928445,83.106333,1
337,chr6,26500000,26505000,chr6,26365000,26370000,26501768,BTN3A2,ENSG00000186470,CD4_T-cell_naive,pieqtl,,,,,11.277539,,,,1,ENSG00000186470,protein_coding,rs2295593,,186.095117,175.180214,40.001593,50.623978,1
339,chr6,26595000,26600000,chr6,26365000,26370000,26598004,BTN3A2,ENSG00000186470,CD8_T-cell_naive,pieqtl,,,,,,,,11.091761,1,ENSG00000186470,protein_coding,rs4634439,,186.095117,175.180214,40.001593,50.623978,1
338,chr6,26595000,26600000,chr6,26365000,26370000,26598004,BTN3A2,ENSG00000186470,monocyte_naive,pieqtl,10.592087,,,,,,,,1,ENSG00000186470,protein_coding,rs4634439,,186.095117,175.180214,40.001593,50.623978,1
107,chr12,9870000,9875000,chr12,10280000,10285000,10282239,CLEC2B,ENSG00000110852,monocyte_naive,finemap,10.448108,,,,,,,,1,ENSG00000110852,protein_coding,rs113895413,,176.405352,124.095507,123.771034,166.212713,1
112,chr12,10130000,10135000,chr12,10280000,10285000,10282239,CLEC7A,ENSG00000172243,monocyte_naive,finemap,7.144294,,,,,,,,1,ENSG00000172243,protein_coding,rs113895413,,1.685681,1.659372,614.073122,0.651149,1


## Comparing this final list to a consensus gene set

In [54]:
consensus_genes = pd.read_table('results/main/gene_lists/consensus_gene_list.txt', squeeze=True)
consensus_genes = set(consensus_genes.values.tolist())



  consensus_genes = pd.read_table('results/main/gene_lists/consensus_gene_list.txt', squeeze=True)


In [55]:
consensus_genes

{"'-",
 'ABCC8',
 'AC002378.1',
 'AC080079.1',
 'ACAP1',
 'ACE',
 'ACOXL',
 'ACP1',
 'ACSL1',
 'ADAD1',
 'ADAM30',
 'ADAMTS14',
 'ADCY7',
 'ADGRL2',
 'ADIPOQ',
 'AFF3',
 'AGO2',
 'AGT',
 'AGTR1',
 'AHR',
 'AIMP1P2',
 'AIRE',
 'AKR1B1',
 'AKT2',
 'AL049612.1',
 'AL163932.1',
 'AL596442.1',
 'ALB',
 'ALDH7A1P4',
 'AMBP',
 'ANKRD50',
 'ANKRD55',
 'AOC3',
 'AP4B1-AS1',
 'APOA1',
 'ARHGAP27P2',
 'ARL8B',
 'ASCL2',
 'ATG16L1',
 'ATP6V1G3',
 'ATXN2',
 'ATXN2L',
 'BACH2',
 'BAD',
 'BATF3',
 'BCAR1',
 'BCL2L15',
 'BDNF',
 'BGLAP',
 'BTN2A3P',
 'BTNL2',
 'C1QTNF6',
 'C1orf141',
 'C4A',
 'C6orf15',
 'CALCR',
 'CAMK4',
 'CAMSAP2',
 'CAPN10',
 'CAPSL',
 'CARD9',
 'CBX3P1',
 'CCL2',
 'CCR12P',
 'CCR5',
 'CCR7',
 'CCRL2',
 'CD226',
 'CD6',
 'CD69',
 'CD79A',
 'CDC34',
 'CDKAL1',
 'CDKN1C',
 'CDKN2B-AS1',
 'CEL',
 'CENPU',
 'CENPW',
 'CETP',
 'CFB',
 'CFTR',
 'CLEC16A',
 'CLEC2D',
 'CLN3',
 'COBL',
 'COL11A2P1',
 'COLEC10',
 'CPE',
 'CRB1',
 'CRP',
 'CSN2',
 'CST3',
 'CTLA4',
 'CTRB1',
 'CTRB2',
 'CTS

In [56]:
single_reps_genes = set(single_reps.genename.values.tolist())

In [57]:
highly_studied_genes = single_reps_genes.intersection(consensus_genes)

In [58]:
highly_studied_genes

{'BACH2', 'TYK2'}

In [59]:
understudied_genes = single_reps_genes.difference(consensus_genes)

In [60]:
understudied_genes

{'AC008755.1',
 'AIF1',
 'BTN2A1',
 'BTN3A2',
 'CLEC2B',
 'CLEC7A',
 'DCAF5',
 'FLOT1',
 'FYB1',
 'LST1',
 'LY6G5B',
 'MAPKAPK5',
 'PTGER4',
 'RICTOR',
 'RPL37',
 'SAE1',
 'SEH1L',
 'SGF29',
 'TRIM27',
 'ZKSCAN4'}

#### Investigating lesser studied genes

In [61]:
understudied_sgl_genes = single_reps.loc[single_reps.genename.isin(understudied_genes)]

In [62]:
understudied_sgl_genes_fn = os.path.join(outdir, 'understudied_sgl_genes.tsv')
understudied_sgl_genes.to_csv(understudied_sgl_genes_fn, sep='\t')

In [63]:
understudied_sgl_genes_fn

'results/main/compiled_sgl_approaches//understudied_sgl_genes.tsv'