# Linking loop data

In [1]:
import os
import shutil
import glob
import pandas as pd

In [2]:
# Project root. The notebook works from here so that the relative
# 'results/...' paths used in later cells resolve against it.
project_dir = '/mnt/bioadhoc-temp/Groups/vd-ay/hichip-db-loop-calling/'
os.chdir(path=project_dir)

## Make the directory structure

In [3]:
# Root directory (relative to the current working directory) under which
# all shortcut symlinks are created.
shortcut_dir = 'results/shortcuts/'

# Create the shortcut root itself. The previous call created `project_dir`,
# which must already exist because the notebook has chdir'd into it.
os.makedirs(shortcut_dir, exist_ok=True)

In [87]:
# Sub-directories created for every reference genome. Each template is
# filled with the reference name and appended to `shortcut_dir`.
ref_subdir_templates = [
    # loops called with ChIP-seq (macs2) peaks
    '{}/loops/hichip/chip-seq/macs2/stringent/',
    '{}/loops/hichip/chip-seq/macs2/loose/',
    # loops called with FitHiChIP-utility peaks
    '{}/loops/hichip/hichip/fithichip-utility/stringent/',
    '{}/loops/hichip/hichip/fithichip-utility/loose/',
    # loops called with HiChIP-Peaks peaks
    '{}/loops/hichip/hichip/hichip-peaks/stringent/',
    '{}/loops/hichip/hichip/hichip-peaks/loose/',
    # peak files, one directory per peak caller
    '{}/peaks/chip-seq/macs2/',
    '{}/peaks/hichip/fithichip-utility/',
    '{}/peaks/hichip/hichip-peaks/',
]

for ref in ['hg38', 'mm10', 't2t-chm13-v2.0']:
    for subdir_template in ref_subdir_templates:
        os.makedirs(shortcut_dir + subdir_template.format(ref), exist_ok=True)

# Hi-C (mustache) loop directories are currently not created:
#os.makedirs(shortcut_dir + '{}/loops/hic/mustache'.format(ref), exist_ok=True)


## Making a shortcut for loops

Include all loop types

In [5]:
# Pattern matching every '*.interactions_FitHiC_Q0.01.bed' file below
# results/loops/fithichip/<run dir>/<config dir>/.
glob_path = (
    'results/loops/fithichip/*_*/*/FitHiChIP_Peak2ALL_b*_L20000_U2000000/'
    'P2PBckgr_*/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-*.interactions_FitHiC_Q0.01.bed'
)
fns = glob.glob(glob_path)

In [6]:
# Preview the first few matched loop files.
fns[:5]

['results/loops/fithichip/Nonclassical_Monocyte_1814.phs001703v4p1.Homo_Sapiens.H3K27ac.b4_fithichip.peaks/L25/FitHiChIP_Peak2ALL_b25000_L20000_U2000000/P2PBckgr_0/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-L25.interactions_FitHiC_Q0.01.bed',
 'results/loops/fithichip/Nonclassical_Monocyte_1814.phs001703v4p1.Homo_Sapiens.H3K27ac.b4_fithichip.peaks/S10/FitHiChIP_Peak2ALL_b10000_L20000_U2000000/P2PBckgr_1/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-S10.interactions_FitHiC_Q0.01.bed',
 'results/loops/fithichip/Nonclassical_Monocyte_1814.phs001703v4p1.Homo_Sapiens.H3K27ac.b4_fithichip.peaks/S25/FitHiChIP_Peak2ALL_b25000_L20000_U2000000/P2PBckgr_1/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-S25.interactions_FitHiC_Q0.01.bed',
 'results/loops/fithichip/Nonclassical_Monocyte_1814.phs001703v4p1.Homo_Sapiens.H3K27ac.b4_fithichip.peaks/L10/FitHiChIP_Peak2ALL_b10000_L20000_U2000000/P2PBckgr_0/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-L10.interactions_FitHiC_Q0.01.bed',
 'results/loops/fithichip/Nonclassical_M

In [7]:
# Number of loop files matched by the glob.
len(fns)

9708

In [15]:
# Shortcut sub-directory template for each peak-set suffix found at the end
# of the run directory name (the text after the last '_').
loop_outdir_templates = {
    'chipseq.peaks': '{ref}/loops/hichip/chip-seq/macs2/{stringency}',
    'hichip-peaks.peaks': '{ref}/loops/hichip/hichip/hichip-peaks/{stringency}',
    'fithichip.peaks': '{ref}/loops/hichip/hichip/fithichip-utility/{stringency}',
}

# Files whose organism or peak type is not recognised. They are reported
# below instead of being linked into another sample's directory.
unlinked_fns = []

for fn in fns:

    # get sample information from the path components:
    #   results/loops/fithichip/<sample_name>_<peak_type>/<S|L><resolution>/...
    file_info = fn.split('/')
    sample_name, peak_type = file_info[3].rsplit('_', 1)
    stringency = file_info[4][0]
    resolution = file_info[4][1:] + '000'  # e.g. 'L25' -> '25000'
    stringency_long = 'stringent' if stringency == 'S' else 'loose'

    # determine the reference genome; None marks an unrecognised organism so
    # that the value from the previous iteration is never reused
    if 'Homo_Sapien' in sample_name:
        ref = 'hg38'
    elif 'Mus_Musculus' in sample_name:
        ref = 'mm10'
    else:
        ref = None

    # determine the shortcut directory
    outdir_template = loop_outdir_templates.get(peak_type)
    if ref is None or outdir_template is None:
        unlinked_fns.append(fn)
        continue
    sample_outdir = outdir_template.format(ref=ref, stringency=stringency_long)

    # get the link name
    new_fn = '{sample_name}.{res}.interactions_FitHiC_Q0.01.bed'
    new_fn = new_fn.format(sample_name=sample_name, res=resolution)
    new_path = os.path.join(project_dir, shortcut_dir, sample_outdir, new_fn)

    # lexists is also true for a dangling link or a regular file, either of
    # which would make os.symlink raise FileExistsError
    if not os.path.lexists(new_path):
        abs_fn = os.path.join(project_dir, fn)
        os.symlink(abs_fn, new_path)

if unlinked_fns:
    print('Skipped {} loop file(s) with an unrecognised organism or peak type'.format(len(unlinked_fns)))
    

#### Accounting of loop types

In [21]:
# One record per loop file: [sample_name, peak_type, stringency, resolution, ref].
acct_data = []
for fn in fns:

    # get sample information from the path components:
    #   results/loops/fithichip/<sample_name>_<peak_type>/<S|L><resolution>/...
    file_info = fn.split('/')
    sample_name, peak_type = file_info[3].rsplit('_', 1)
    stringency = file_info[4][0]
    resolution = file_info[4][1:] + '000'  # e.g. 'L25' -> '25000'
    stringency_long = 'stringent' if stringency == 'S' else 'loose'

    # determine the reference genome; an explicit fallback keeps an
    # unrecognised sample from being counted under the previous sample's
    # genome. A string is used rather than None because groupby drops
    # missing keys by default, which would hide these rows.
    if 'Homo_Sapien' in sample_name:
        ref = 'hg38'
    elif 'Mus_Musculus' in sample_name:
        ref = 'mm10'
    else:
        ref = 'unknown'

    acct_data.append([sample_name, peak_type, stringency, resolution, ref])
    

In [30]:
# Tabulate the per-file records. Column names are passed to the constructor
# so that an empty `acct_data` still yields a correctly labelled (empty)
# frame instead of raising on a column-count mismatch.
acct_df = pd.DataFrame(
    acct_data,
    columns=['sample_name', 'peak_type', 'stringency', 'resolution', 'ref'],
)

# Count the loop files for every (ref, peak_type, stringency, resolution) combination.
acct_agg = acct_df.groupby(['ref', 'peak_type', 'stringency', 'resolution'])
acct_agg.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sample_name
ref,peak_type,stringency,resolution,Unnamed: 4_level_1
hg38,chipseq.peaks,L,10000,243
hg38,chipseq.peaks,L,25000,243
hg38,chipseq.peaks,L,5000,243
hg38,chipseq.peaks,S,10000,243
hg38,chipseq.peaks,S,25000,243
hg38,chipseq.peaks,S,5000,243
hg38,fithichip.peaks,L,10000,478
hg38,fithichip.peaks,L,25000,478
hg38,fithichip.peaks,L,5000,478
hg38,fithichip.peaks,S,10000,478


In [38]:
# Pull out one group for inspection: hg38, FitHiChIP peaks, stringent ('S'), 5000 resolution.
inspect_key = ('hg38', 'fithichip.peaks', 'S', '5000')
tdf = acct_agg.get_group(inspect_key)

## Making a shortcut for peak-associated data

### Original peak files

In [66]:
# Column names for the header-less samplesheets. 'hp', 'fp' and 'cp' hold
# peak file paths; NOTE(review): the meaning of 'vp' is not visible in this
# notebook - confirm against the samplesheet generator.
cols = ['sample_name', 'vp', 'hp', 'fp', 'cp']


def load_peak_samplesheet(samplesheet_fn, ref):
    """Read a header-less peaks samplesheet and tag every row with `ref`.

    Parameters
    ----------
    samplesheet_fn : str
        Path to a tab-separated samplesheet without a header row.
    ref : str
        Reference genome label stored in the new 'ref' column.

    Returns
    -------
    pandas.DataFrame
        The samplesheet with the columns in `cols` plus 'ref'.
    """
    sheet = pd.read_table(samplesheet_fn, header=None, names=cols)
    sheet['ref'] = ref
    return sheet


# one samplesheet per reference genome
hg38 = load_peak_samplesheet(
    'results/samplesheets/post-hicpro/human.peaks_files.samplesheet.without_header.tsv', 'hg38')
mm10 = load_peak_samplesheet(
    'results/samplesheets/post-hicpro/mouse.peaks_files.samplesheet.without_header.tsv', 'mm10')
t2t = load_peak_samplesheet(
    'results/samplesheets/post-hicpro/human_t2t.peaks_files.samplesheet.without_header.tsv', 't2t-chm13-v2.0')

# stack the three sheets; the per-sheet row index is kept as-is
peaks = pd.concat([hg38, mm10, t2t])

#### Accounting of samples

In [138]:
# Flag, per peak column, whether the entry starts with '/mnt/'; the flag is
# stored next to the data as '<col>.present'.
for present_col in ['cp', 'fp', 'hp']:
    peaks[present_col + '.present'] = peaks[present_col].str.startswith('/mnt/')

In [156]:
# Long format: one row per (sample, ref, presence flag).
acct_df = peaks.melt(
    id_vars=['sample_name', 'ref'],
    value_vars=['cp.present', 'fp.present', 'hp.present'],
)

# Sum the flags per reference genome and peak type.
acct_grps = acct_df.groupby(['ref', 'variable'])
acct_summary = (
    acct_grps['value']
    .sum()
    .to_frame(name='count')
    .rename_axis(['ref', 'peak_type'])
)

In [170]:
# Display the number of flagged peak files per reference genome and peak type.
acct_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,count
ref,peak_type,Unnamed: 2_level_1
hg38,cp.present,243
hg38,fp.present,473
hg38,hp.present,473
mm10,cp.present,31
mm10,fp.present,194
mm10,hp.present,194
t2t-chm13-v2.0,cp.present,243
t2t-chm13-v2.0,fp.present,486
t2t-chm13-v2.0,hp.present,0


#### Processing

In [101]:
# Shortcut sub-directory template for each peak column of the samplesheet.
peak_outdir_templates = {
    'hp': '{ref}/peaks/hichip/hichip-peaks/',
    'fp': '{ref}/peaks/hichip/fithichip-utility/',
    'cp': '{ref}/peaks/chip-seq/macs2/',
}

for i, sr in peaks.iterrows():

    sample_name = sr.sample_name
    ref = sr.ref

    for peak_col, outdir_template in peak_outdir_templates.items():

        # process only if the peak file exists; an empty samplesheet cell is
        # read as NaN, which os.path.exists rejects with a TypeError
        peak_fn = sr[peak_col]
        if not isinstance(peak_fn, str) or not os.path.exists(peak_fn):
            continue

        # assigning the correct directory
        sample_outdir = outdir_template.format(ref=ref)

        # assigning merged status
        single_or_merged = 'merged' if 'merged' in peak_fn else 'single'

        # get the link name; each piece is formatted before joining so that
        # only the templates are interpreted as format strings
        new_fn = '{sample_name}.{single_or_merged}.peaks.bed'
        new_fn = new_fn.format(sample_name=sample_name, single_or_merged=single_or_merged)
        new_path = os.path.join(project_dir, shortcut_dir, sample_outdir, new_fn)

        # lexists is also true for a dangling link or a regular file, either
        # of which would make os.symlink raise FileExistsError
        if not os.path.lexists(new_path):
            # link to an absolute target so the link resolves from its own directory
            os.symlink(os.path.abspath(peak_fn), new_path)


### Bigbed peak files

In [112]:
# Samplesheet peak column -> peak-type name used in the WashU bigbed paths.
jr_to_kf_peak_mapper = dict(
    hp='hichip-peaks',
    fp='fithichip',
    cp='chipseq',
)

In [117]:
# Shortcut sub-directory template for each peak column of the samplesheet.
bigbed_outdir_templates = {
    'hp': '{ref}/peaks/hichip/hichip-peaks/',
    'fp': '{ref}/peaks/hichip/fithichip-utility/',
    'cp': '{ref}/peaks/chip-seq/macs2/',
}

# (sample_name, peak_col) pairs whose peak file exists but whose bigbed does not.
missing_bigbeds = []

for i, sr in peaks.iterrows():

    sample_name = sr.sample_name
    ref = sr.ref

    for peak_col, outdir_template in bigbed_outdir_templates.items():

        # process only if the peak file exists; an empty samplesheet cell is
        # read as NaN, which os.path.exists rejects with a TypeError
        peak_fn = sr[peak_col]
        if not isinstance(peak_fn, str) or not os.path.exists(peak_fn):
            continue

        # get the path to the bigbed. The mapper is keyed by the samplesheet
        # column (`peak_col`); the old lookup used `peak_type`, a variable
        # left over from the loop-linking cell.
        bigbed = 'results/visualizations/washu/{peak_type}_peaks/{sample_name}.{peak_type}.peaks.bed.bb'
        bigbed = bigbed.format(peak_type=jr_to_kf_peak_mapper[peak_col], sample_name=sample_name)
        abs_bigbed = os.path.join(project_dir, bigbed)

        # do not create a dangling shortcut when the bigbed is missing
        if not os.path.exists(abs_bigbed):
            missing_bigbeds.append((sample_name, peak_col))
            continue

        # assigning the correct directory
        sample_outdir = outdir_template.format(ref=ref)

        # get the link name
        new_fn = '{sample_name}.peaks.bed.bb'.format(sample_name=sample_name)
        new_path = os.path.join(project_dir, shortcut_dir, sample_outdir, new_fn)

        # Link to the bigbed itself; the old code linked the BED peak file
        # under the '.bb' name. Links left by an earlier run are not
        # replaced - remove them manually if they point at the wrong target.
        if not os.path.lexists(new_path):
            os.symlink(abs_bigbed, new_path)

if missing_bigbeds:
    print('No bigbed found for {} peak file(s)'.format(len(missing_bigbeds)))