# Linking loop data

In [24]:
import os
import shutil
import glob
import pandas as pd

In [25]:
# Root of the loop-calling project; link sources and targets are built from it.
project_dir = '/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/'
# Hub directory (relative to project_dir) that receives the symlinks.
lji_lcsd_hub = 'results/lji_lcsd_hub/release-0.1/hub/'
# Work from the project root so the relative glob patterns below resolve against it.
os.chdir(project_dir)

## Making a shortcut for FitHiChIP loops (All types)

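Symlink FitHiChIP loops of every type (both stringencies, all resolutions and peak sources) into the hub; loops called on HiChIP-Peaks peaks are dropped in a later cell.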

In [26]:
# Glob pattern for the Q0.01 FitHiChIP interaction files, written one path
# component per entry; paths are relative to the current working directory.
glob_path = '/'.join([
    'results/biorep_merged/results/loops/fithichip',
    '*_*',  # <sample name>_<peak type>
    '*',    # stringency letter followed by the resolution in kb
    'FitHiChIP_Peak2ALL_b*_L20000_U2000000',
    'P2PBckgr_*',
    'Coverage_Bias',
    'FitHiC_BiasCorr',
    'FitHiChIP-*.interactions_FitHiC_Q0.01.bed',
])
fns = glob.glob(glob_path)

In [27]:
# Loops called on HiChIP-Peaks peaks will not be on the LJI-LCSD, so leave
# out every path that mentions that peak type.
excluded_peak_type = 'hichip-peaks.peaks'
fns = [path for path in fns if excluded_peak_type not in path]

In [28]:
# number of FitHiChIP loop files remaining after the filter above
len(fns)

2118

In [29]:
# Symlink every FitHiChIP loop file into the hub as
#   <ref>/loops/hichip/<peak source>/<stringency>/<sample>.<res>.fithichip_q0.01.loops.bed
#
# Lookup tables replace the if/elif chains so that an unrecognised species or
# peak type raises immediately instead of silently reusing the `ref` /
# `sample_outdir` left over from the previous iteration (which would place the
# link in the wrong directory).
ref_by_species = {'Homo_Sapien': 'hg38', 'Mus_Musculus': 'mm10'}
outdir_by_peak_type = {
    'chipseq.peaks': '{ref}/loops/hichip/chip-seq/macs2/{stringency}',
    'hichip-peaks.peaks': '{ref}/loops/hichip/hichip/hichip-peaks/{stringency}',
    'fithichip.peaks': '{ref}/loops/hichip/hichip/fithichip-utility/{stringency}',
}

for fn in fns:

    # get sample information; fn is relative and looks like
    # results/biorep_merged/results/loops/fithichip/<sample>_<peak type>/<S|L><res in kb>/...
    file_info = fn.split('/')
    sample_name, peak_type = file_info[5].rsplit('_', 1)
    stringency = file_info[6][0]
    resolution = file_info[6][1:] + '000'
    # anything other than 'S' is treated as loose
    stringency_long = 'stringent' if stringency == 'S' else 'loose'

    # determine the reference genome (first matching species wins)
    ref = next((genome for species, genome in ref_by_species.items()
                if species in sample_name), None)
    if ref is None:
        raise ValueError('cannot determine the reference genome for sample '
                         '{!r} (from {!r})'.format(sample_name, fn))

    # determine the shortcut directory
    outdir_template = outdir_by_peak_type.get(peak_type)
    if outdir_template is None:
        raise ValueError('unrecognised peak type {!r} (from {!r})'.format(peak_type, fn))
    sample_outdir = outdir_template.format(ref=ref, stringency=stringency_long)

    # get the link name
    new_fn = '{sample_name}.{res}.fithichip_q0.01.loops.bed'
    new_fn = new_fn.format(sample_name=sample_name, res=resolution)
    new_path = os.path.join(project_dir, lji_lcsd_hub, sample_outdir, new_fn)

    # create the link; existing links are left untouched so the cell can be re-run
    if not os.path.islink(new_path):

        # make sure the destination directory exists, otherwise os.symlink
        # fails with FileNotFoundError on a fresh hub tree
        os.makedirs(os.path.dirname(new_path), exist_ok=True)

        # link to the absolute path so the link does not depend on the cwd
        abs_fn = os.path.join(project_dir, fn)
        os.symlink(abs_fn, new_path)


In [32]:
# spot check: show the link path built in the final iteration of the loop above
new_path

'/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/results/lji_lcsd_hub/release-0.1/hub/mm10/loops/hichip/hichip/fithichip-utility/stringent/Atria-TBX5AKO.GSE222370.Mus_Musculus.H3K27ac.biorep_merged.10000.fithichip_q0.01.loops.bed'

#### Account of loop types

In [30]:
# Collect one [sample_name, peak_type, stringency, resolution, ref] record per
# loop file so the loop types can be tallied afterwards.
#
# The reference genome comes from a lookup table; a sample whose name matches
# no known species raises instead of silently inheriting `ref` from the
# previous iteration, which would miscount it.
ref_by_species = {'Homo_Sapien': 'hg38', 'Mus_Musculus': 'mm10'}

acct_data = []
for fn in fns:

    # get sample information; fn is relative and looks like
    # results/biorep_merged/results/loops/fithichip/<sample>_<peak type>/<S|L><res in kb>/...
    file_info = fn.split('/')
    sample_name, peak_type = file_info[5].rsplit('_', 1)
    stringency = file_info[6][0]
    resolution = file_info[6][1:] + '000'

    # determine the reference genome (first matching species wins)
    ref = next((genome for species, genome in ref_by_species.items()
                if species in sample_name), None)
    if ref is None:
        raise ValueError('cannot determine the reference genome for sample '
                         '{!r} (from {!r})'.format(sample_name, fn))

    acct_data.append([sample_name, peak_type, stringency, resolution, ref])
    

In [31]:
# Tally how many samples exist for every (ref, peak_type, stringency, resolution)
# combination.
#
# The column names go to the constructor rather than being assigned afterwards:
# with an empty `acct_data` the frame has zero columns, and assigning five
# names to it raises a length-mismatch ValueError.
acct_columns = ['sample_name', 'peak_type', 'stringency', 'resolution', 'ref']
acct_df = pd.DataFrame(acct_data, columns=acct_columns)
acct_agg = acct_df.groupby(['ref', 'peak_type', 'stringency', 'resolution'])
# the only non-key column is sample_name, so count() is the number of samples per group
acct_agg.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sample_name
ref,peak_type,stringency,resolution,Unnamed: 4_level_1
hg38,chipseq.peaks,L,10000,41
hg38,chipseq.peaks,L,25000,41
hg38,chipseq.peaks,L,5000,41
hg38,chipseq.peaks,S,10000,41
hg38,chipseq.peaks,S,25000,41
hg38,chipseq.peaks,S,5000,41
hg38,fithichip.peaks,L,10000,174
hg38,fithichip.peaks,L,25000,174
hg38,fithichip.peaks,L,5000,174
hg38,fithichip.peaks,S,10000,174


## Making a shortcut for HiCCUPS loops

In [20]:
# Glob pattern for the post-processed HiCCUPS pixel files, written one path
# component per entry; paths are relative to the current working directory.
# NOTE(review): the recorded output of the following cell is 0 matches -
# confirm the pattern and the working directory before relying on it.
glob_path = '/'.join([
    'results/biorep_merged/results/loops/hiccups/whole_genome',
    '*',                             # presumably one directory per sample
    'postprocessed_pixels_*.bedpe',  # presumably one file per resolution
])
fns = glob.glob(glob_path)

In [21]:
# number of HiCCUPS loop files matched by the glob above
len(fns)

0

In [22]:
# Symlink every post-processed HiCCUPS loop file into the hub as
#   <ref>/loops/hichip/hiccups/<sample>.<res>.post_processed.hiccups.loops.bed
#
# A sample whose name matches no known species raises instead of silently
# reusing the `ref` left over from the previous iteration.
ref_by_species = {'Homo_Sapien': 'hg38', 'Mus_Musculus': 'mm10'}

for fn in fns:

    # get sample information; fn ends with
    #   .../whole_genome/<sample>/postprocessed_pixels_<res>.bedpe
    # Both fields are read from the tail of the path. Fixed indices 4 and 5
    # into fn.split('/') point at 'hiccups' and 'whole_genome' for the glob
    # pattern used here, which makes the resolution lookup raise IndexError.
    sample_dir, base_fn = os.path.split(fn)
    sample_name = os.path.basename(sample_dir)
    resolution = base_fn.split('_')[2].split('.')[0]

    # determine the reference genome (first matching species wins)
    ref = next((genome for species, genome in ref_by_species.items()
                if species in sample_name), None)
    if ref is None:
        raise ValueError('cannot determine the reference genome for sample '
                         '{!r} (from {!r})'.format(sample_name, fn))

    # determine the shortcut directory
    sample_outdir = '{ref}/loops/hichip/hiccups/'.format(ref=ref)

    # get the link name
    # NOTE(review): the source file is .bedpe while the link is named .bed - confirm this is intended
    new_fn = '{sample_name}.{res}.post_processed.hiccups.loops.bed'
    new_fn = new_fn.format(sample_name=sample_name, res=resolution)

    new_path = os.path.join(project_dir, lji_lcsd_hub, sample_outdir, new_fn)

    # create the link; existing links are left untouched so the cell can be re-run
    if not os.path.islink(new_path):

        # make sure the destination directory exists, otherwise os.symlink
        # fails with FileNotFoundError on a fresh hub tree
        os.makedirs(os.path.dirname(new_path), exist_ok=True)

        # link to the absolute path so the link does not depend on the cwd
        abs_fn = os.path.join(project_dir, fn)
        os.symlink(abs_fn, new_path)
    