In [23]:
import os 
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
import subprocess
import glob
import pybedtools as pbt 
pbt.set_tempdir('/mnt/hpcscratch/jreyna/')

pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')
import numpy as np

os.chdir('/mnt/bioadhoc-temp/Groups/vd-ay/jreyna-temp/projects/dchallenge/')

genome_sizes = 'results/refs/hg19/hg19.chrom.sizes'

# make the directory to save our data
outdir = 'results/main/compiled_sgl_approaches//'
os.makedirs(outdir, exist_ok=True)

In [24]:
rm /mnt/hpcscratch/jreyna/*

rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory


## Loading SGLs from different sources

In [25]:
coloc_fn = 'results/main/coloc/sgl_intersect/coloc_ld_sgls.tsv'

In [26]:
pieqtl_fn = 'results/main/pieqtls/sgl_intersect/pieqtls_sgls.tsv'

In [27]:
finemap_fn = 'results/main/finemapping/sgl_intersect/finemap_sgls.tsv'

In [28]:
coloc = pd.read_table(coloc_fn)
coloc['sgl_type'] = 'coloc'

pieqtl = pd.read_table(pieqtl_fn)
pieqtl['sgl_type'] = 'pieqtl'

finemap = pd.read_table(finemap_fn)
finemap['sgl_type'] = 'finemap'

# Adding chr to the chromosome columns 
finemap['chrA_loop'] = 'chr' + finemap['chrA_loop'].astype(str)
finemap['chrB_loop'] = 'chr' + finemap['chrB_loop'].astype(str)

In [29]:
coloc

ModuleNotFoundError: No module named 'pandas.io.formats.html'

  chr_snp  start_snp   end_snp chr_gene  start_gene  end_gene  coloc.id  chrA  \
0    chr6   91014028  91014029     chr6    89673468  89673469         9  chr6   
1    chr6   91002493  91002494     chr6    89673468  89673469        15  chr6   

     startA      endA  chrB    startB      endB  loop.id   chr      pos  \
0  89670000  89675000  chr6  91005000  91010000   520647  chr9  4282536   
1  89670000  89675000  chr6  91005000  91010000   520647  chr9  4287466   

   pp_H0_Coloc_Summary  pp_H1_Coloc_Summary  pp_H2_Coloc_Summary  \
0        4.728672e-289        9.059926e-287             0.001165   
1        4.728672e-289        9.059926e-287             0.001165   

   pp_H3_Coloc_Summary  pp_H4_Coloc_Summary    ld_rsID     variant_id  \
0              0.22249             0.776345  rs4380994  9:4296430:A:G   
1              0.22249             0.776345  rs7041847  9:4296430:A:G   

          geneName   dist    pvalue       FDR  slope_snp ref alt    AC  \
0  ENSG00000107249  51962  0.02

## Loading Gene Data

In [8]:
print('# Load the gene data')

res = 5000

genes_fn = 'results/refs/gencode/v30/gencode.v30.annotation.bed'

# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gname']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode['type'].isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
genes_df = genes_df.iloc[:, [0,1,2,6,5,3]]

print(genes_df.head())

print('There are {} genes in this GTF-derived file.'.format(genes_df.shape[0]))

# Load the gene data
   chrom  start    end        gname          gene_id strand
0   chr1  11869  14409      DDX11L1  ENSG00000223972      +
12  chr1  14404  29570       WASH7P  ENSG00000227232      -
25  chr1  17369  17436    MIR6859-1  ENSG00000278267      -
28  chr1  29554  31109  MIR1302-2HG  ENSG00000243485      +
36  chr1  30366  30503    MIR1302-2  ENSG00000284332      +
There are 58825 genes in this GTF-derived file.


## Intersecting PC-HiC Data

#### Coloc Approach

In [9]:
# creating a dictionary which matches 
# hichip and pc-hic data
hichip_pchic_matches = {'monocytes': 'monocyte_naive',
                        'naive-b': 'B-cell_naive',
                        'total-b': 'B-cell_naive',
                        'gms_merged': 'B-cell_naive', 
                        'naive-cd4': 'CD4_T-cell_naive', 
                        'total-cd4': 'CD4_T-cell_naive', 
                        'non-activated-total-cd4': 'CD4_T-cell_naive',
                        'naive-cd8': 'CD8_T-cell_naive',
                        'total-cd8': 'CD8_T-cell_naive'} 

In [19]:
# getting a list of pc_hic files 
pc_hics = glob.glob('results/main/pc_hic/2016_javierre/processing/*.bedpe')
pc_hics += glob.glob('results/main/pc_hic/2019_jung/processing/*.bedpe')

# adding a merge id for post fixing
coloc['mid'] = range(coloc.shape[0])

# renaming loop columns
coloc.rename(columns={'chrA': 'chrA_loop', 'startA': 'startA_loop', 'endA': 'endA_loop',
                      'chrB': 'chrB_loop', 'startB': 'startB_loop', 'endB': 'endB_loop',
                      'celltype': 'cline_loop'}, inplace=True)

# initializing the merge list and column names
pchic_merge_data = []
pchic_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'score', 'pchic_cline']
hichip_bedpe_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                    'chrB_loop', 'startB_loop', 'endB_loop', 'mid']
for fn in pc_hics:
    pchic_cline = os.path.basename(fn).split('.')[0]
    
    os.system('rm /mnt/hpcscratch/jreyna/*')
    
    if pchic_cline in hichip_pchic_matches: 
        
        # getting hichip data for the current cell line
        hichip_cline = hichip_pchic_matches[pchic_cline]
        hichip_cline_df = coloc.loc[coloc.cline_loop == hichip_cline, hichip_bedpe_cols]
        
        # loading pc-hic data for the current cell line
        pchic_cline_df = pd.read_table(fn, names=pchic_cols)
        #pchic_cline_df.chrA = pchic_cline_df.chrA.str.replace('chr', '')
        #pchic_cline_df.chrB = pchic_cline_df.chrB.str.replace('chr', '')
        
        # intersecting hichip and pc-hic bedpes
        hichip_cline_pbt = pbt.BedTool.from_dataframe(hichip_cline_df)
        pchic_cline_pbt = pbt.BedTool.from_dataframe(pchic_cline_df)
        
    
        if len(hichip_cline_pbt) > 0 and len(pchic_cline_pbt) > 0: 
        
            both_loops = hichip_cline_pbt.pairtopair(pchic_cline_pbt)        
            both_loops = both_loops.to_dataframe(disable_auto_names=True, header=None).iloc[:, 0:15]

            if both_loops.shape[0] > 0: 
                pchic_merge_data.append(both_loops)
                print('overlap found: {}'.format(fn))
            else:
                print('overlap not found: {}'.format(fn))
        else:
            print('overlap not found: {}'.format(fn))


rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory
rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory


overlap not found: results/main/pc_hic/2016_javierre/processing/non-activated-total-cd4.bedpe
overlap not found: results/main/pc_hic/2016_javierre/processing/monocytes.bedpe


rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory


overlap not found: results/main/pc_hic/2016_javierre/processing/naive-b.bedpe
overlap not found: results/main/pc_hic/2016_javierre/processing/total-cd8.bedpe


rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory
rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory


overlap not found: results/main/pc_hic/2016_javierre/processing/total-cd4.bedpe


rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory
rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory


overlap not found: results/main/pc_hic/2016_javierre/processing/total-b.bedpe
overlap not found: results/main/pc_hic/2016_javierre/processing/naive-cd4.bedpe


rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory


overlap not found: results/main/pc_hic/2016_javierre/processing/naive-cd8.bedpe


rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory
rm: cannot remove ‘/mnt/hpcscratch/jreyna/*’: No such file or directory


overlap not found: results/main/pc_hic/2019_jung/processing/gms_merged.bedpe


In [20]:
pchic_merge_df = pd.concat(pchic_merge_data, axis=0)
pchic_merge_df.columns = ['hichip_chrA', 'hichip_startA', 'hichip_endA',
                    'hichip_chrB', 'hichip_startB', 'hichip_endB', 'hichip_mid', 
                    'pchic_chrA', 'pchic_startA', 'pchic_endA',
                    'pchic_chrB', 'pchic_startB', 'pchic_endB', 'pchic_score', 'pchic_cline']
pchic_merge_df = pchic_merge_df.sort_values('pchic_score', ascending=False)
pchic_merge_df = pchic_merge_df.drop_duplicates(subset=['hichip_mid'], keep='first')

# creating a table of HiChIP ID versus PC-HiC cell versus score (entries) 
pchic_merge_clean = pchic_merge_df[['hichip_mid', 'pchic_score', 'pchic_cline']]
pchic_merge_clean = pchic_merge_clean.pivot(index='hichip_mid', columns='pchic_cline', values='pchic_score')
pchic_merge_clean.columns = 'pchic.' + pchic_merge_clean.columns

# merging and cleaning the merged dataset 
coloc = coloc.merge(pchic_merge_clean, left_on='mid', right_on='hichip_mid')
coloc.drop('mid', axis=1, inplace=True)

# no overlap with gms_merged so manually adding 
coloc['pchic.monocytes'] = np.nan
coloc['pchic.naive-b'] = np.nan
coloc['pchic.total-b'] = np.nan
coloc['pchic.gms_merged'] = np.nan
coloc['pchic.total-cd8'] = np.nan

ValueError: No objects to concatenate

In [None]:
# adding column to specify PC-HiC Support
pchic_support = []
for i, sr in coloc.iterrows():
    
    pchic_supp = 0 
    if sr.cline_loop == 'CD4_T-cell_naive':
        if sr['pchic.naive-cd4'] >= 5 or sr['pchic.total-cd4'] >= 5 or sr['pchic.non-activated-total-cd4'] > 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'CD8_T-cell_naive':
        if sr['pchic.naive-cd8'] >= 5 or sr['pchic.total-cd8'] >= 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'monocyte_naive':
        if sr['pchic.monocytes'] >= 5:
            pchic_supp = 1             
            
    elif sr.cline_loop == 'B-cell_naive':
        if sr['pchic.naive-b'] >= 5 or sr['pchic.total-b'] >= 5 or sr['pchic.gms_merged'] >= 5:
            pchic_supp = 1        
    
    pchic_support.append(pchic_supp)
        
coloc['pchic_support'] = pchic_support

In [None]:
coloc = coloc.merge(genes_df[['gene_id', 'gname']], left_on='geneName', right_on='gene_id')

In [None]:
coloc

In [None]:
coloc_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                'chrB_loop', 'startB_loop', 'endB_loop', 
                'pos', 'ld_rsID', 'gname', 'gene_id', 'cline_loop', 'sgl_type', 'pchic.monocytes', 
                'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                'pchic.non-activated-total-cd4', 'pchic.total-b',
                'pchic.total-cd4', 'pchic.total-cd8','pchic_support', 'rs_id']

In [None]:
# coloc_cols = ['chrA', 'startA', 'endA',
#                 'chrB', 'startB', 'endB', 
#                 'pos', 'ld_rsID', 'gname', 'gene_id', 'cline_loop', 'sgl_type', 'pchic.monocytes', 
#                 'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
#                 'pchic.non-activated-total-cd4', 'pchic.total-b',
#                 'pchic.total-cd4', 'pchic.total-cd8','pchic_support', 'rs_id']

In [None]:
coloc_min = coloc.loc[:, coloc_cols]

In [None]:
coloc_min.columns = ['chrA', 'startA', 'endA',
                      'chrB', 'startB', 'endB',
                      'pos', 'rsid', 'genename', 'geneid', 
                      'cline', 'sgl_type', 'pchic.monocytes',
                      'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                      'pchic.non-activated-total-cd4', 'pchic.total-b',
                      'pchic.total-cd4', 'pchic.total-cd8','pchic_support', 'lead.rs_id']

In [None]:
coloc_min

#### Finemap Approach

In [None]:
# creating a dictionary which matches 
# hichip and pc-hic data
hichip_pchic_matches = {'monocytes': 'CM',
                        'naive-b': 'NB',
                        'total-b': 'NB',
                        'gms_merged': 'NB', 
                        'naive-cd4': 'CD4N', 
                        'total-cd4': 'CD4N', 
                        'non-activated-total-cd4': 'CD4N',
                        'naive-cd8': 'CD8N',
                        'total-cd8': 'CD8N'} 

In [None]:
# getting a list of pc_hic files 
pc_hics = glob.glob('results/main/pc_hic/2016_javierre/processing/*.bedpe')
pc_hics += glob.glob('results/main/pc_hic/2019_jung/processing/*.bedpe')

# adding a merge id for post fixing
finemap['mid'] = range(finemap.shape[0])

# initializing the merge list and column names
pchic_merge_data = []
pchic_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'score', 'pchic_cline']
hichip_bedpe_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                    'chrB_loop', 'startB_loop', 'endB_loop', 'mid']
for fn in pc_hics:
    pchic_cline = os.path.basename(fn).split('.')[0]
    if pchic_cline in hichip_pchic_matches: 
        
        # getting hichip data for the current cell line
        hichip_cline = hichip_pchic_matches[pchic_cline]
        hichip_cline_df = finemap.loc[finemap.cline_loop == hichip_cline, hichip_bedpe_cols]
        
        # loading pc-hic data for the current cell line
        pchic_cline_df = pd.read_table(fn, names=pchic_cols)
        #         pchic_cline_df.chrA = pchic_cline_df.chrA.str.replace('chr', '')
        #         pchic_cline_df.chrB = pchic_cline_df.chrB.str.replace('chr', '')
        
        # intersecting hichip and pc-hic bedpes
        hichip_cline_pbt = pbt.BedTool.from_dataframe(hichip_cline_df)
        pchic_cline_pbt = pbt.BedTool.from_dataframe(pchic_cline_df)
        both_loops = hichip_cline_pbt.pairtopair(pchic_cline_pbt)        
        both_loops = both_loops.to_dataframe(disable_auto_names=True, header=None).iloc[:, 0:15]
        
        if both_loops.shape[0] > 0: 
            pchic_merge_data.append(both_loops)
            print('overlap found: {}'.format(fn))
        else:
            print('overlap not found: {}'.format(fn))
        

In [None]:
pchic_merge_df = pd.concat(pchic_merge_data, axis=0)
pchic_merge_df.columns = ['hichip_chrA', 'hichip_startA', 'hichip_endA',
                    'hichip_chrB', 'hichip_startB', 'hichip_endB', 'hichip_mid', 
                    'pchic_chrA', 'pchic_startA', 'pchic_endA',
                    'pchic_chrB', 'pchic_startB', 'pchic_endB', 'pchic_score', 'pchic_cline']
pchic_merge_df = pchic_merge_df.sort_values('pchic_score', ascending=False)
pchic_merge_df = pchic_merge_df.drop_duplicates(subset=['hichip_mid'], keep='first')

# creating a table of HiChIP ID versus PC-HiC cell versus score (entries) 
pchic_merge_clean = pchic_merge_df[['hichip_mid', 'pchic_score', 'pchic_cline']]
pchic_merge_clean = pchic_merge_clean.pivot(index='hichip_mid', columns='pchic_cline', values='pchic_score')
pchic_merge_clean.columns = 'pchic.' + pchic_merge_clean.columns

# merging and cleaning the merged dataset 
finemap = finemap.merge(pchic_merge_clean, left_on='mid', right_on='hichip_mid')
finemap.drop('mid', axis=1, inplace=True)

# no overlap with gms_merged so manually adding 
finemap['pchic.gms_merged'] = np.nan

In [None]:
# adding column to specify PC-HiC Support
pchic_support = []
for i, sr in finemap.iterrows():
    
    pchic_supp = 0 
    if sr.cline_loop == 'CD4N':
        if sr['pchic.naive-cd4'] >= 5 or sr['pchic.total-cd4'] >= 5 or sr['pchic.non-activated-total-cd4'] > 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'CD8N':
        if sr['pchic.naive-cd8'] >= 5 or sr['pchic.total-cd8'] >= 5:
            pchic_supp = 1 
            
    elif sr.cline_loop == 'CM':
        if sr['pchic.monocytes'] >= 5:
            pchic_supp = 1             
            
    elif sr.cline_loop == 'NB':
        if sr['pchic.naive-b'] >= 5 or sr['pchic.total-b'] >= 5 or sr['pchic.gms_merged'] >= 5:
            pchic_supp = 1        
    
    pchic_support.append(pchic_supp)
        
finemap['pchic_support'] = pchic_support

In [None]:
# generating the hgvs id to query myvariant 
finemap['hgvs_id'] = finemap['chrA_loop'].astype(str) + ":g." + \
                        finemap['pos'].astype(str) + finemap['allele2'] + '>' + finemap['allele1']

In [None]:
finemap_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                'chrB_loop', 'startB_loop', 'endB_loop', 
                'pos', 'hgvs_id', 'genename', 'geneid', 'cline_loop', 'sgl_type', 'pchic.monocytes', 
                'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                'pchic.non-activated-total-cd4', 'pchic.total-b',
                'pchic.total-cd4', 'pchic.total-cd8','pchic_support']

In [None]:
finemap_min = finemap[finemap_cols]

In [None]:
finemap_min.columns = ['chrA', 'startA', 'endA',
                      'chrB', 'startB', 'endB',
                      'pos', 'hgvs_id', 'genename', 'geneid', 
                      'cline', 'sgl_type', 'pchic.monocytes',
                      'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                      'pchic.non-activated-total-cd4', 'pchic.total-b',
                      'pchic.total-cd4', 'pchic.total-cd8','pchic_support']

#### Pieqtl Approach

In [None]:
# create locus A columns which require startA and endA
def pos_to_bin(pos, res):
    start = int(np.floor(pos / res) * res)
    end = start + res 
    return([start, end])

lociA = pieqtl['pieQTL.Position'].apply(pos_to_bin, res=5000)
lociA = pd.DataFrame(lociA.values.tolist())
lociA.columns = ['startA', 'endA']

# create locus B columns which require startB and endB
def tss_to_bin(tss, res, slop=0):
    start = int(np.floor(tss / res) * res)
    end = start + res 
    return([start, end])

lociB = pieqtl['TSS'].apply(tss_to_bin, res=5000)
lociB = pd.DataFrame(lociB.values.tolist())
lociB.columns = ['startB', 'endB']

# create locus columns to main intersect df 
pieqtl = pd.concat([pieqtl, lociA, lociB], axis=1)

# creating a dictionary which matches 
# hichip and pc-hic data
hichip_pchic_matches = {'monocytes': 'monocyte_naive',
                        'naive-b': 'B-cell_naive',
                        'total-b': 'B-cell_naive',
                        'gms_merged': 'B-cell_naive', 
                        'naive-cd4': 'CD4_T-cell_naive', 
                        'total-cd4': 'CD4_T-cell_naive', 
                        'non-activated-total-cd4': 'CD4_T-cell_naive',
                        'naive-cd8': 'CD8_T-cell_naive',
                        'total-cd8': 'CD8_T-cell_naive'} 

In [None]:
# getting a list of pc_hic files 
pc_hics = glob.glob('results/main/pc_hic/2016_javierre/processing/*.bedpe')
pc_hics += glob.glob('results/main/pc_hic/2019_jung/processing/*.bedpe')

# adding a merge id for post fixing
pieqtl['mid'] = range(pieqtl.shape[0])

# initializing the merge list and column names
pchic_merge_data = []
pchic_cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB', 'score', 'pchic_cline']
intersect_bedpe_cols = ['CHR', 'startA', 'endA', 'CHR', 'startB', 'endB', 'mid']

for fn in pc_hics:
    
    pchic_cline = os.path.basename(fn).split('.')[0]
    if pchic_cline in hichip_pchic_matches: 
        
        # getting pieqtl data for the current cell line
        pieqtl_cline = hichip_pchic_matches[pchic_cline]
        pieqtl_cline_df = pieqtl.loc[pieqtl.cline == pieqtl_cline, intersect_bedpe_cols]
        pieqtl_cline_df.iloc[:, 0] = pieqtl_cline_df.iloc[:, 0].str.replace('chr', '')
        pieqtl_cline_df.iloc[:, 3] = pieqtl_cline_df.iloc[:, 0].str.replace('chr', '')
        
        # loading pc-hic data for the current cell line
        pchic_cline_df = pd.read_table(fn, names=pchic_cols)
        pchic_cline_df.chrA = pchic_cline_df.chrA.str.replace('chr', '')
        pchic_cline_df.chrB = pchic_cline_df.chrB.str.replace('chr', '')
        
        # intersecting pieqtl and pc-hic bedpes
        pieqtl_cline_pbt = pbt.BedTool.from_dataframe(pieqtl_cline_df)
        pchic_cline_pbt = pbt.BedTool.from_dataframe(pchic_cline_df)
        both_loops = pieqtl_cline_pbt.pairtopair(pchic_cline_pbt)        
        both_loops = both_loops.to_dataframe(disable_auto_names=True, header=None).iloc[:, 0:15]
        
        if both_loops.shape[0] > 0: 
            pchic_merge_data.append(both_loops)
            print('overlap found: {}'.format(fn))
        else:
            print('overlap not found: {}'.format(fn))
        

In [None]:
pchic_merge_df = pd.concat(pchic_merge_data, axis=0)
pchic_merge_df.columns = ['hichip_chrA', 'hichip_startA', 'hichip_endA',
                    'hichip_chrB', 'hichip_startB', 'hichip_endB', 'hichip_mid', 
                    'pchic_chrA', 'pchic_startA', 'pchic_endA',
                    'pchic_chrB', 'pchic_startB', 'pchic_endB', 'pchic_score', 'pchic_cline']
pchic_merge_df = pchic_merge_df.sort_values('pchic_score', ascending=False)
pchic_merge_df = pchic_merge_df.drop_duplicates(subset=['hichip_mid'], keep='first')

# creating a table of HiChIP ID versus PC-HiC cell versus score (entries) 
pchic_merge_clean = pchic_merge_df[['hichip_mid', 'pchic_score', 'pchic_cline']]
pchic_merge_clean = pchic_merge_clean.pivot(index='hichip_mid', columns='pchic_cline', values='pchic_score')
pchic_merge_clean.columns = 'pchic.' + pchic_merge_clean.columns

# merging and cleaning the merged dataset 
pieqtl = pieqtl.merge(pchic_merge_clean, left_on='mid', right_on='hichip_mid')
pieqtl.drop('mid', axis=1, inplace=True)

pchic_support = []
for i, sr in pieqtl.iterrows():
    
    pchic_supp = 0 
    if sr.cline == 'CD4_T-cell_naive':
        if sr['pchic.naive-cd4'] >= 5 or sr['pchic.total-cd4'] >= 5 or sr['pchic.non-activated-total-cd4'] > 5:
            pchic_supp = 1 
    elif sr.cline == 'CD8_T-cell_naive':
        if sr['pchic.naive-cd8'] >= 5 or sr['pchic.total-cd8'] >= 5:
            pchic_supp = 1 
    elif sr.cline == 'monocyte_naive':
        if sr['pchic.monocytes'] >= 5:
            pchic_supp = 1             
            
    elif sr.cline == 'B-cell_naive':
        if sr['pchic.naive-b'] >= 5 or sr['pchic.total-b'] >= 5: #or sr['pchic.gms_merged'] >= 5:
            pchic_supp = 1        
            
    pchic_support.append(pchic_supp)
        
pieqtl['pchic_support'] = pchic_support

In [None]:
# generating the hgvs id to query myvariant 
pieqtl['hgvs_id'] = pieqtl['CHR'].astype(str) + ":g." + \
                        pieqtl['POS'].astype(str) + pieqtl['ref'] + '>' + pieqtl['alt']

In [None]:
pieqtl_cols = ['CHR', 'startA', 'endA',
               'CHR', 'startB', 'endB',
               'POS', 'hgvs_id', 'genename', 'geneid', 'cline', 'sgl_type', 'pchic.monocytes',
                'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                'pchic.non-activated-total-cd4', 'pchic.total-b',
                'pchic.total-cd4', 'pchic.total-cd8','pchic_support']

In [None]:
pieqtl_min = pieqtl[pieqtl_cols]

In [None]:
pieqtl_min.columns = ['chrA', 'startA', 'endA',
                      'chrB', 'startB', 'endB',
                      'pos', 'hgvs_id', 'genename', 'geneid', 
                      'cline', 'sgl_type', 'pchic.monocytes',
                      'pchic.naive-b', 'pchic.naive-cd4', 'pchic.naive-cd8',
                      'pchic.non-activated-total-cd4', 'pchic.total-b',
                      'pchic.total-cd4', 'pchic.total-cd8','pchic_support']

## Concating and comparing

In [None]:
genes_fn = 'results/refs/gencode/v30/gencode.v30.annotation.grch37.bed'

# load the gencode coords
cols = ['chrom', 'start', 'end', 'strand', 'type', 'gene_id', 'gene_name', 'subtype']
gencode = pd.read_table(genes_fn, header=None, names=cols)

# extract just the genes
genes_df = gencode.loc[gencode['type'].isin(['gene'])]
genes_df = genes_df.loc[~genes_df.duplicated(subset='gene_id'), :]
genes_df.loc[:, 'chrom'] = genes_df['chrom'].astype(str)
# genes_df = genes_df.iloc[:, [0,1,2,6,5,3]]

In [None]:
# combining finemap nd pieqtls first
agg_sgls = pd.concat([finemap_min, pieqtl_min])
agg_sgls = agg_sgls.merge(genes_df[['gene_id', 'subtype']], left_on='geneid', right_on='gene_id')

In [None]:
import myvariant

# generate a dictionary of hgvs id to rsid
mv = myvariant.MyVariantInfo()
query = mv.getvariants(agg_sgls.hgvs_id, fields=['dbsnp.rsid'])

hgvs_to_rsid = {}
for rec in query:
    
    if 'dbsnp' in rec:
        hgvs_to_rsid[rec['query']] = rec['dbsnp']['rsid']
    else:
        hgvs_to_rsid[rec['query']] = 'Not Found'

agg_sgls['rsid'] = agg_sgls['hgvs_id'].replace(hgvs_to_rsid)

In [None]:
agg_sgls

In [None]:
# add the colocs 
agg_sgls.drop('hgvs_id', axis=1, inplace=True)
tmp_coloc_min = coloc_min.merge(genes_df[['gene_id', 'subtype']], left_on='geneid', right_on='gene_id')
agg_sgls = pd.concat([agg_sgls, tmp_coloc_min])

In [None]:
agg_gene_grps = agg_sgls.groupby('subtype')

In [None]:
list(agg_gene_grps.groups.keys())

In [None]:
# creating a dictionary which matches 
# hichip and pc-hic data
dice_to_sgl_names = {'CM': 'monocyte_naive',
                        'NB': 'B-cell_naive', 
                        'CD4N': 'CD4_T-cell_naive',
                        'CD8N': 'CD8_T-cell_naive'}

In [None]:
agg_sgls.cline = agg_sgls.cline.replace(dice_to_sgl_names)

## Focusing on protein coding genes

In [None]:
prots_only = agg_gene_grps.get_group('protein_coding')
prots_only.sort_values(['geneid', 'cline', 'sgl_type'], inplace=True)

In [None]:
sig_prots = prots_only.loc[prots_only.pchic_support == 1]

In [None]:
g = sig_prots.groupby(['cline', 'sgl_type'])
sgl_count_per_cell = g.genename.value_counts()
sgl_count_per_cell = sgl_count_per_cell.to_frame()
sgl_count_per_cell.columns = ['sgl_counts']
sgl_count_per_cell.reset_index(inplace=True)

In [None]:
sgl_count_per_cell.sort_values('sgl_counts', ascending=False)

In [None]:
sgl_count_per_cell.genename.nunique()

## Intersecting Dice Gene Expression

In [None]:
ge_list = ['results/main/dice_gene_expression/CD4N_TPM.csv', 
           'results/main/dice_gene_expression/CD8N_TPM.csv', 
           'results/main/dice_gene_expression/CM_TPM.csv', 
           'results/main/dice_gene_expression/NB_TPM.csv']

In [None]:
dice_expr_list = []
for ge_fn in ge_list:
    print(ge_fn)

    df = pd.read_table(ge_fn, sep=',')
    cline = ge_fn.split('/')[-1].split('_')[0]
    df['cline'] = dice_to_sgl_names[cline]
    
    df.drop(['Transcript_Length(bp)', 'Additional_annotations'], axis=1, inplace=True)
    df.set_index('Feature_name', inplace=True)
    df = df.median(axis=1).to_frame()
    df.columns = ['ge.' + dice_to_sgl_names[cline]]
    
    dice_expr_list.append(df)

# concat all of the gene expression data
dice_expr = pd.concat(dice_expr_list, axis=1)
dice_expr.index = dice_expr.index.str.replace('\.[A-Za-z0-9_-]*$', '')

In [None]:
sig_prots = sig_prots.merge(dice_expr, left_on='geneid', right_index=True)

In [None]:
# calculating the specfic expression approach
ge_cell_specific = []
for i, sr in sig_prots.iterrows():
    
    ge_spec = 0
    if sr.cline == 'CD4_T-cell_naive' and sr['ge.CD4_T-cell_naive'] > 8:
        ge_spec = 1 
        
    elif sr.cline == 'CD8_T-cell_naive' and sr['ge.CD8_T-cell_naive'] > 8:
        ge_spec = 1 
        
    elif sr.cline == 'monocyte_naive' and sr['ge.monocyte_naive'] > 8:
        ge_spec = 1 
        
    elif sr.cline == 'B-cell_naive' and sr['ge.B-cell_naive'] > 8:
        ge_spec = 1 
    
    ge_cell_specific.append(ge_spec)
        
sig_prots['ge_support'] = ge_cell_specific

In [None]:
# making a final table
expr_sig_prots = sig_prots.loc[sig_prots.ge_support == 1]
expr_sig_prots.sort_values(['genename', 'cline'], inplace=True)

In [None]:
single_reps = expr_sig_prots.drop_duplicates(['genename', 'cline'])
pd.set_option('display.max_rows', None)
display(single_reps)
pd.set_option('display.max_rows', 100)

## Comparing this final list to a consensus gene set

In [None]:
consensus_genes = pd.read_table('results/main/gene_lists/consensus_gene_list.txt', squeeze=True)
consensus_genes = set(consensus_genes.values.tolist())

In [None]:
consensus_genes

In [None]:
single_reps_genes = set(single_reps.genename.values.tolist())

In [None]:
highly_studied_genes = single_reps_genes.intersection(consensus_genes)

In [None]:
highly_studied_genes

In [None]:
understudied_genes = single_reps_genes.difference(consensus_genes)

In [None]:
understudied_genes

#### Investigating lesser studied genes

In [None]:
understudied_sgl_genes = single_reps.loc[single_reps.genename.isin(understudied_genes)]

In [None]:
understudied_sgl_genes_fn = os.path.join(outdir, 'understudied_sgl_genes.tsv')
understudied_sgl_genes.to_csv(understudied_sgl_genes_fn, sep='\t')

In [None]:
understudied_sgl_genes_fn

## Visualizing loops in an SGL only

In [None]:
# initializing the merge list and column names
pchic_merge_data = []
hichip_bedpe_cols = ['chrA_loop', 'startA_loop', 'endA_loop',
                    'chrB_loop', 'startB_loop', 'endB_loop', 'mid']
for hichip_cline in hichip_clines:
        
    # getting hichip data for the current cell line
    hichip_cline_df = coloc.loc[coloc.cline_loop == hichip_cline, hichip_bedpe_cols]

    # intersecting hichip and prioritized SGLS
    hichip_cline_pbt = pbt.BedTool.from_dataframe(hichip_cline_df)

    both_loops = hichip_cline_pbt.pairtobed(pchic_cline_pbt)       
    both_loops = both_loops.to_dataframe(disable_auto_names=True, header=None).iloc[:, 0:15]


In [16]:
# Saving some simple loops for visualization of examples