# Loop Catalog hg38 Tracks

Initially I started this notebook with the idea that I would focus on understanding loops from the loop catalog but then I also tried making WashU tracks with this data. The loops are in hg38 so that's actually a problem for my analysis since I am focusing on hg19. For now, this notebook is starts from the raw Loop Catalog data and generates hg38 longrange tracks. Check out the `Loop-Catalog-hg38-to-hg19` and `Loop-Catalog-Samples-and-Mcool-to-5kb-Cool` for more details on lifting over. 

In [1]:
import os 
import numpy as np
import pandas as pd
import glob
import subprocess as sp
import json
import warnings
warnings.filterwarnings('ignore')

# change the working directory
os.chdir('<project-dir>')

pd.set_option('display.max_columns', 1000)

In [2]:
outdir = 'results/main/loop_catalog/processing/hg38_tracks/'
os.makedirs(outdir, exist_ok=True)

In [3]:
resolutions = [5000, 10000, 25000]

stringency_no = [0, 1]
stringency_str = ['S5', 'S10', 'S25', 'L5', 'L10', 'L25']

# getting the template paths for different loops
paths = {}
paths['HiCCUPS'] = '<loops-dir>/loops/hiccups/{sample_name}/postprocessed_pixels_{res}.bedpe'
paths['FitHiChIP using ChIP-Seq peaks'] = '<loops-dir>/loops/fithichip/{sample_name}_chipseq.peaks/{stg_str}/FitHiChIP_Peak2ALL_b{res}_L20000_U2000000/P2PBckgr_{stg_no}/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-{stg_str}.interactions_FitHiC_Q0.01.bed'
paths['FitHiChIP using HiChIP-Peaks peaks'] = '<loops-dir>/loops/fithichip/{sample_name}_hichip-peaks.peaks/{stg_str}/FitHiChIP_Peak2ALL_b{res}_L20000_U2000000/P2PBckgr_{stg_no}/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-{stg_str}.interactions_FitHiC_Q0.01.bed'

In [4]:
# loading sample data
samples = pd.read_excel('results/main/loop_catalog/master-table.immune-samples-only.xlsx')
cline, gse, species, pulldown, biorep = zip(*samples['full.sample.name'].str.split('.'))

samples['cline'] = cline
samples['gse'] = gse
samples['pulldown'] = pulldown
samples['biorep'] = biorep

In [5]:
samples.pulldown.value_counts()

H3K27ac    22
CTCF       11
SMC1A       2
H3K4me1     1
KLF4        1
NANOG       1
OCT4        1
Rad21       1
Name: pulldown, dtype: int64

In [6]:
enhancer_samples = samples.loc[samples['pulldown'] == 'H3K27ac']

In [7]:
enhancer_samples.cline.unique().tolist()

['GM12878',
 'HARA',
 'MyLa',
 'Naive_Tcells',
 'Th17',
 'Treg',
 'lgs000379',
 'lgs101645',
 'lgs102580',
 'lgs102943',
 'lgs301283',
 'lgs301315']

## Making HiCCUPs Hub

In [8]:
hiccups_dir = os.path.join(outdir, 'hiccups/')
os.makedirs(hiccups_dir, exist_ok=True)

In [9]:
# extra FitHiChIP loops for best filtering parameters
res = 5000
stg_no = 1
stg_str = 'S5'
for sample_name in enhancer_samples['full.sample.name']:
    
    hiccups = paths['HiCCUPS'].format(sample_name=sample_name,
                                      stg_str=stg_str,
                                      stg_no=stg_no,
                                      res=res)
    
    if os.path.exists(hiccups):
        
        hiccups_df = pd.read_table(hiccups)
        hiccups_df = hiccups_df.loc[~hiccups_df['#chr1'].str.match('^#')]
        hiccups_df.loc[:, '#chr1'] = 'chr' + hiccups_df.loc[:, '#chr1']
        hiccups_df.loc[:, 'chr2'] = 'chr' + hiccups_df.loc[:, 'chr2']
        hiccups_df.loc[:, 'x1'] = hiccups_df['x1'].astype(int)
        hiccups_df.loc[:, 'x2'] = hiccups_df['x2'].astype(int)
        hiccups_df.loc[:, 'y1'] = hiccups_df['y1'].astype(int)
        hiccups_df.loc[:, 'y2'] = hiccups_df['y2'].astype(int)

        hiccups_lr = hiccups_df[['#chr1', 'x1', 'x2', 'chr2', 'y1', 'y2', 'fdrH']]
        #hiccups_lr.loc[hiccups_lr['fdrH'] == 0, 'fdrH'] = 1 * 10**(-8) # assign super sign to 0 p-value regions
        
        hiccups_lr['-log10(fdrH)'] =  -1 * np.log10(hiccups_lr['fdrH'])

        # assign max score to inf
        s = hiccups_lr['-log10(fdrH)']
        s = s[s != np.inf]     
        hiccups_lr.loc[hiccups_lr['-log10(fdrH)'] == np.inf, '-log10(fdrH)'] = max(s)
         
            
        hiccups_lr['right'] = hiccups_lr['chr2'] + ':' +  hiccups_lr['y1'].astype(str) \
                                    + '-' + hiccups_lr['y2'].astype(str) + ',' \
                                    + hiccups_lr['-log10(fdrH)'].apply(lambda x: '{:.2f}'.format(x))
        
        # sort based on coordinates (lexicographically)
        hiccups_lr.sort_values(['#chr1', 'x1', 'x2', 'right'], inplace=True)
        
        save_lr = hiccups_lr[['#chr1', 'x1', 'x2', 'right']]
        
        outfn = os.path.join(hiccups_dir, '{}.bed'.format(sample_name))
        save_lr.to_csv(outfn, sep='\t', header=False, index=False)


In [10]:
tabix = '<software-dir>/tabix/tabix-0.2.6/tabix'
bgzip = '<software-dir>/tabix/tabix-0.2.6/bgzip'

for fn in glob.glob(os.path.join(hiccups_dir, '*.bed')):
    
    # make a gz file
    cmd = '{0} -c {1} > {1}.gz'.format(bgzip, fn)
    j = sp.check_output(cmd, shell=True)
    
    # make an index file
    cmd = '{0} -f {1}.gz'.format(tabix, fn)
    g = sp.check_output(cmd, shell=True)

In [11]:
# collect json formatted information
url_root = 'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/hiccups/'
hub = []
for fn in glob.glob(os.path.join(hiccups_dir, '*.bed.gz')):
    
    # get basename for url
    bn = os.path.basename(fn)
    
    # create the whole URL
    full_url = os.path.join(url_root, bn)                       
    print(full_url)
    
    # get cell name 
    sample_name = bn.replace('.bed.gz', '')
        
    # use cell name to get cell metadata info
    #cell_info = t1d_cells.loc[cell_name]
        
    # fill in the JSON details
    entry = {}
    entry['type'] = 'longrange'
    entry['name'] = 'hiccups Loops peaks for {}'.format(sample_name)
    entry['url'] = full_url
        
    
    entry['showOnHubLoad'] = True
    #color = cell_colors.loc[cell_info['color-name'], 'hex']
    color = 'red'
    entry['options'] = {'ensembleStyle': True, 'displayMode': 'arc', 'height': 200, 'color': color}

    hub.append(entry)

https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/hiccups/MyLa.GSE101498.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/hiccups/MyLa.GSE101498.Homo_Sapiens.H3K27ac.b2.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/hiccups/GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/hiccups/Th17.GSE101498.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/hiccups/MyLa.GSE151193.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/

In [12]:
## save the hub
hub_fn = os.path.join(hiccups_dir, 'hub.config.txt')
with open(hub_fn, 'w') as fw:
    s = json.dumps(hub,  indent=True)
    fw.write(s) 

In [13]:
hub_link = os.path.join(url_root, os.path.basename(hub_fn))

In [14]:
hub_link

'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/hiccups/hub.config.txt'

## Making FitHiChIP using ChIP-Seq peaks Hub

In [15]:
fit_cp_dir = os.path.join(outdir, 'fit_cp/')
os.makedirs(fit_cp_dir, exist_ok=True)

In [16]:
# processing data from FitHiChIP using ChIP-Seq peaks
for sample_name in enhancer_samples['full.sample.name']:

    fit_cp = paths['FitHiChIP using ChIP-Seq peaks'].format(sample_name=sample_name,
                                      stg_str=stg_str,
                                      stg_no=stg_no,
                                      res=res)
    if os.path.exists(fit_cp):
        
        fit_cp_df = pd.read_table(fit_cp)
                
        fit_cp_lr = fit_cp_df[['chr1', 's1', 'e1', 'chr2', 's2', 'e2', 'p']]
        
        fit_cp_lr['-log10(p)'] =  -1 * np.log10(fit_cp_lr['p'])

        # assign max score to inf
        s = fit_cp_lr['-log10(p)']
        s = s[s != np.inf]     
        fit_cp_lr.loc[fit_cp_lr['-log10(p)'] == np.inf, '-log10(p)'] = max(s)
         
            
        fit_cp_lr['right'] = fit_cp_lr['chr2'] + ':' +  fit_cp_lr['s2'].astype(str) \
                                    + '-' + fit_cp_lr['e2'].astype(str) + ',' \
                                    + fit_cp_lr['-log10(p)'].apply(lambda x: '{:.2f}'.format(x))
        
        # sort based on coordinates (lexicographically)
        fit_cp_lr.sort_values(['chr1', 's1', 'e1', 'right'], inplace=True)
        
        save_lr = fit_cp_lr[['chr1', 's1', 'e1', 'right']]
        
        outfn = os.path.join(fit_cp_dir, '{}.bed'.format(sample_name))
        save_lr.to_csv(outfn, sep='\t', header=False, index=False)

    

In [17]:
for fn in glob.glob(os.path.join(fit_cp_dir, '*.bed')):
    
    # make a gz file
    cmd = '{0} -c {1} > {1}.gz'.format(bgzip, fn)
    j = sp.check_output(cmd, shell=True)
    
    # make an index file
    cmd = '{0} -f {1}.gz'.format(tabix, fn)
    g = sp.check_output(cmd, shell=True)

In [18]:
# collect json formatted information
url_root = 'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_cp/'
hub = []
for fn in glob.glob(os.path.join(fit_cp_dir, '*.bed.gz')):
    
    # get basename for url
    bn = os.path.basename(fn)
    
    # create the whole URL
    full_url = os.path.join(url_root, bn)                       
    print(full_url)
    
    # get cell name 
    sample_name = bn.replace('.bed.gz', '')
        
    # use cell name to get cell metadata info
    #cell_info = t1d_cells.loc[cell_name]
        
    # fill in the JSON details
    entry = {}
    entry['type'] = 'longrange'
    entry['name'] = 'Loops peaks for {}'.format(sample_name)
    entry['url'] = full_url
        
    
    entry['showOnHubLoad'] = True
    #color = cell_colors.loc[cell_info['color-name'], 'hex']
    color = 'red'
    entry['options'] = {'ensembleStyle': True, 'displayMode': 'arc', 'height': 200, 'color': color}

    hub.append(entry)

https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_cp/lgs102580.GSE116193.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_cp/lgs101645.GSE116193.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_cp/lgs301283.GSE116193.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_cp/lgs102943.GSE116193.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_cp/lgs301315.GSE116193.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.o

In [19]:
## save the hub
hub_fn = os.path.join(fit_cp_dir, 'hub.config.txt')
with open(hub_fn, 'w') as fw:
    s = json.dumps(hub,  indent=True)
    fw.write(s) 

In [20]:
hub_link = os.path.join(url_root, os.path.basename(hub_fn))

In [21]:
hub_link

'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_cp/hub.config.txt'

In [22]:
save_lr

Unnamed: 0,chr1,s1,e1,right
0,chr1,24960000,24965000,"chr1:25020000-25025000,6.07"
1,chr1,24960000,24965000,"chr1:25030000-25035000,6.12"
2,chr1,24965000,24970000,"chr1:25020000-25025000,6.04"
3,chr1,160425000,160430000,"chr1:160545000-160550000,6.30"
4,chr1,160430000,160435000,"chr1:160535000-160540000,6.25"
...,...,...,...,...
241,chr9,131670000,131675000,"chr9:131730000-131735000,6.07"
242,chr9,137675000,137680000,"chr9:137760000-137765000,6.18"
243,chrX,12950000,12955000,"chrX:13085000-13090000,6.35"
244,chrX,12955000,12960000,"chrX:13085000-13090000,6.33"


## Making FitHiChIP using HiChIP-Peaks peaks

In [23]:
fit_hp_dir = os.path.join(outdir, 'fit_hp/')
os.makedirs(fit_hp_dir, exist_ok=True)

In [24]:
# processing data from FitHiChIP using ChIP-Seq peaks
for sample_name in enhancer_samples['full.sample.name']:

    fit_hp = paths['FitHiChIP using HiChIP-Peaks peaks'].format(sample_name=sample_name,
                                      stg_str=stg_str,
                                      stg_no=stg_no,
                                      res=res)
    if os.path.exists(fit_hp):
        
        fit_hp_df = pd.read_table(fit_hp)
                
        fit_hp_lr = fit_hp_df[['chr1', 's1', 'e1', 'chr2', 's2', 'e2', 'p']]
        
        fit_hp_lr['-log10(p)'] =  -1 * np.log10(fit_hp_lr['p'])

        # assign max score to inf
        s = fit_hp_lr['-log10(p)']
        s = s[s != np.inf]     
        fit_hp_lr.loc[fit_hp_lr['-log10(p)'] == np.inf, '-log10(p)'] = max(s)
         
            
        fit_hp_lr['right'] = fit_hp_lr['chr2'] + ':' +  fit_hp_lr['s2'].astype(str) \
                                    + '-' + fit_hp_lr['e2'].astype(str) + ',' \
                                    + fit_hp_lr['-log10(p)'].apply(lambda x: '{:.2f}'.format(x))
        
        # sort based on coordinates (lexicographically)
        fit_hp_lr.sort_values(['chr1', 's1', 'e1', 'right'], inplace=True)
        
        save_lr = fit_hp_lr[['chr1', 's1', 'e1', 'right']]
        
        outfn = os.path.join(fit_hp_dir, '{}.bed'.format(sample_name))
        save_lr.to_csv(outfn, sep='\t', header=False, index=False)

    

In [25]:
for fn in glob.glob(os.path.join(fit_hp_dir, '*.bed')):
    
    # make a gz file
    cmd = '{0} -c {1} > {1}.gz'.format(bgzip, fn)
    j = sp.check_output(cmd, shell=True)
    
    # make an index file
    cmd = '{0} -f {1}.gz'.format(tabix, fn)
    g = sp.check_output(cmd, shell=True)

In [26]:
# collect json formatted information
url_root = 'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_hp/'
hub = []
for fn in glob.glob(os.path.join(fit_hp_dir, '*.bed.gz')):
    
    # get basename for url
    bn = os.path.basename(fn)
    
    # create the whole URL
    full_url = os.path.join(url_root, bn)                       
    print(full_url)
    
    # get cell name 
    sample_name = bn.replace('.bed.gz', '')
        
    # use cell name to get cell metadata info
    #cell_info = t1d_cells.loc[cell_name]
        
    # fill in the JSON details
    entry = {}
    entry['type'] = 'longrange'
    entry['name'] = 'Loops peaks for {}'.format(sample_name)
    entry['url'] = full_url
        
    
    entry['showOnHubLoad'] = True
    #color = cell_colors.loc[cell_info['color-name'], 'hex']
    color = 'red'
    entry['options'] = {'ensembleStyle': True, 'displayMode': 'arc', 'height': 200, 'color': color}

    hub.append(entry)

https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_hp/lgs301315.GSE116193.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_hp/MyLa.GSE151193.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_hp/MyLa.GSE137906.Homo_Sapiens.H3K27ac.b1.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_hp/Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b3.bed.gz
https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_hp/Th17.GSE101498.Homo_Sapiens.H3K27ac.b3.bed.gz
https://informaticsdata.liai.org/bioadhoc-

In [27]:
## save the hub
hub_fn = os.path.join(fit_hp_dir, 'hub.config.txt')
with open(hub_fn, 'w') as fw:
    s = json.dumps(hub,  indent=True)
    fw.write(s) 

In [28]:
hub_link = os.path.join(url_root, os.path.basename(hub_fn))

In [29]:
hub_link

'https://informaticsdata.liai.org/bioadhoc-temp/Groups/vd-ay/jreyna/projects/dchallenge/results/main/loop_catalog/processing/hg38_tracks/fit_hp/hub.config.txt'