# Linking loop data

In [3]:
import os
import shutil
import glob
import pandas as pd

In [4]:
# root of the project; the relative paths used in later cells resolve against it
# because the working directory is switched to it below
project_dir = "/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/"

# relative path to the release-0.1 hub directory
lji_lcsd_hub = "results/lji_lcsd_hub/release-0.1/hub/"

os.chdir(project_dir)

## Making a shortcut for peak-associated data

### Original peak files

In [5]:
# the samplesheets carry no header row, so the column names are supplied here
cols = ['sample_name', 'vp', 'hp', 'fp', 'cp']

# hg38 peaks, tagged with their reference genome
hg38 = pd.read_csv(
    'results/samplesheets/post-hicpro/human.peaks_files.samplesheet.without_header.tsv',
    sep='\t', header=None, names=cols,
).assign(ref='hg38')

# mm10 peaks, tagged with their reference genome
mm10 = pd.read_csv(
    'results/samplesheets/post-hicpro/mouse.peaks_files.samplesheet.without_header.tsv',
    sep='\t', header=None, names=cols,
).assign(ref='mm10')

# t2t peaks, tagged with their reference genome
t2t = pd.read_csv(
    'results/samplesheets/post-hicpro/human_t2t.peaks_files.samplesheet.without_header.tsv',
    sep='\t', header=None, names=cols,
).assign(ref='t2t-chm13-v2.0')

# stack the three samplesheets into one table (row labels are kept as-is)
peaks = pd.concat((hg38, mm10, t2t))

#### Accounting of samples

In [6]:
# flag, per peak column, whether the entry is a path under /mnt/
for peak_col in ('cp', 'fp', 'hp'):
    peaks[peak_col + '.present'] = peaks[peak_col].str.startswith('/mnt/')

In [7]:
# long format: one row per (sample, reference, presence flag)
acct_df = pd.melt(
    peaks,
    id_vars=['sample_name', 'ref'],
    value_vars=['cp.present', 'fp.present', 'hp.present'],
)

# number of present files per reference genome and peak type
acct_grps = acct_df.groupby(['ref', 'variable'])
acct_summary = acct_grps['value'].sum().to_frame(name='count')
acct_summary.index = acct_summary.index.set_names(['ref', 'peak_type'])

In [8]:
# show the count of present peak files per reference genome and peak type
acct_summary

ref,peak_type,count
hg38,cp.present,243
hg38,fp.present,473
hg38,hp.present,473
mm10,cp.present,31
mm10,fp.present,194
mm10,hp.present,194
t2t-chm13-v2.0,cp.present,243
t2t-chm13-v2.0,fp.present,486
t2t-chm13-v2.0,hp.present,0


#### Processing

# Symlink every existing original peak file into the shortcut tree as
#   <project_dir>/<shortcut_dir>/<ref>/peaks/<...>/<sample_name>.<single|merged>.peaks.bed
#
# NOTE(review): `shortcut_dir` is not defined in any visible cell, while
# `lji_lcsd_hub` is defined above but never used -- confirm which is intended.

# output sub-directory (below the shortcut dir) for each samplesheet peak column
peak_outdirs = {
    'hp': '{ref}/peaks/hichip/hichip-peaks/',
    'fp': '{ref}/peaks/hichip/fithichip-utility/',
    'cp': '{ref}/peaks/chip-seq/macs2/',
}

for i, sr in peaks.iterrows():

    sample_name = sr.sample_name
    ref = sr.ref

    for peak_col, sample_outdir in peak_outdirs.items():

        # process only if the peak file exists; an empty samplesheet cell is
        # read as NaN (a float), which os.path.exists rejects with a TypeError
        peak_fn = sr[peak_col]
        if not isinstance(peak_fn, str) or not os.path.exists(peak_fn):
            continue

        # assigning merged status
        single_or_merged = 'merged' if 'merged' in peak_fn else 'single'

        # get the link name
        new_fn = '{sample_name}.{single_or_merged}.peaks.bed'
        new_path = os.path.join(project_dir, shortcut_dir, sample_outdir, new_fn)
        new_path = new_path.format(sample_name=sample_name, single_or_merged=single_or_merged, ref=ref)

        # os.symlink does not create missing parent directories
        os.makedirs(os.path.dirname(new_path), exist_ok=True)

        # lexists also catches a regular file or a dangling link at the
        # destination, either of which would make os.symlink raise
        if not os.path.lexists(new_path):
            os.symlink(peak_fn, new_path)


### Bigbed peak files

# Symlink the bigbed version of every existing peak file into the shortcut tree as
#   <project_dir>/<shortcut_dir>/<ref>/peaks/<...>/<sample_name>.peaks.bed.bb
#
# NOTE(review): `shortcut_dir` is not defined in any visible cell, while
# `lji_lcsd_hub` is defined above but never used -- confirm which is intended.

# samplesheet peak column -> peak type name used in the washu bigbed paths
jr_to_kf_peak_mapper = {'hp': 'hichip-peaks', 'fp': 'fithichip', 'cp': 'chipseq'}

# output sub-directory (below the shortcut dir) for each samplesheet peak column
bigbed_outdirs = {
    'hp': '{ref}/peaks/hichip/hichip-peaks/',
    'fp': '{ref}/peaks/hichip/fithichip-utility/',
    'cp': '{ref}/peaks/chip-seq/macs2/',
}

for i, sr in peaks.iterrows():

    sample_name = sr.sample_name
    ref = sr.ref

    for peak_col, sample_outdir in bigbed_outdirs.items():

        # process only if the peak file exists; an empty samplesheet cell is
        # read as NaN (a float), which os.path.exists rejects with a TypeError
        peak_fn = sr[peak_col]
        if not isinstance(peak_fn, str) or not os.path.exists(peak_fn):
            continue

        # get the path to the bigbed; the mapper is keyed by the samplesheet
        # column, i.e. the loop variable `peak_col`
        bigbed = 'results/visualizations/washu/{peak_type}_peaks/{sample_name}.{peak_type}.peaks.bed.bb'
        bigbed = bigbed.format(peak_type=jr_to_kf_peak_mapper[peak_col], sample_name=sample_name)
        bigbed = os.path.join(project_dir, bigbed)

        # get the link name (it does not encode the single/merged status)
        new_fn = '{sample_name}.peaks.bed.bb'
        new_path = os.path.join(project_dir, shortcut_dir, sample_outdir, new_fn)
        new_path = new_path.format(sample_name=sample_name, ref=ref)

        print(new_path)

        # link only when the bigbed exists and nothing occupies the link path;
        # lexists also catches a regular file or a dangling link there
        if not os.path.lexists(new_path) and os.path.exists(bigbed):
            print(bigbed)
            # os.symlink does not create missing parent directories
            os.makedirs(os.path.dirname(new_path), exist_ok=True)
            os.symlink(bigbed, new_path)