# Linking loop data

In [1]:
import os
import shutil
import glob
import pandas as pd

In [2]:
# Root of the project; the working directory is switched to it so that the
# relative paths used below resolve against the project.
project_dir = '<project-dir>'
os.chdir(project_dir)

# Hub release directory, relative to project_dir.
lji_lcsd_hub = 'results/lji_lcsd_hub/release-0.1/hub/'

## Making a shortcut for peak-associated data

In [3]:
# loading cp peaks: the samplesheet has no header row, so the three columns are
# named here and the valid-pair path column is discarded
cp_fn = 'results/samplesheets/post-hicpro/2024.2.15.10.52.peaks_files_chipseq.all_batches.samplesheet.without_header.tsv'
cp_peaks = (
    pd.read_table(cp_fn, header=None)
    .set_axis(['hicpro_std_sample_name', 'validpair_path', 'cp'], axis=1)
    .drop(columns='validpair_path')
)

# add the peak std sample name, which can differ from the hicpro_std_sample_name due to the GEO ID
def get_chipseq_peak_std_sample_name(x):
    """Return the standardized sample name encoded in a peak file path.

    The name is the basename of ``x`` with up to three trailing
    dot-separated fields removed (fewer when the basename has fewer dots).
    """
    std_name, *_ = os.path.basename(x).rsplit('.', 3)
    return std_name
cp_peaks['cp_peak_std_sample_name'] = cp_peaks['cp'].map(get_chipseq_peak_std_sample_name)

# only need to transfer each file once, so keep the first row per peak sample name
cp_peaks = cp_peaks.drop_duplicates(subset='cp_peak_std_sample_name')

In [4]:
# loading hp peaks: same header-less layout as the cp samplesheet
hp_fn = 'results/samplesheets/post-hicpro/2024.2.1.10.52.peaks_files_fithichip.all_batches.samplesheet.without_header.tsv'
hp_peaks = (
    pd.read_table(hp_fn, header=None)
    .set_axis(['hicpro_std_sample_name', 'validpair_path', 'hp'], axis=1)
    .drop(columns='validpair_path')
)

# adding the peak std sample name to be consistent with cp peaks
hp_peaks['hp_peak_std_sample_name'] = hp_peaks['hicpro_std_sample_name']

# merging the two: every hp row is kept and cp columns are attached where a
# match exists (the two frames share only the key column, so the empty
# suffixes never come into play)
peaks = pd.merge(hp_peaks, cp_peaks, how='left', on='hicpro_std_sample_name', suffixes=('', ''))

# samples without a matching cp row get a placeholder instead of NaN
peaks = peaks.fillna('No-Peak-File')

In [5]:
def get_ref(x):
    """Map a standardized sample name to its reference genome build.

    The organism is taken from the second element of ``x.rsplit('.', 3)``
    (the third-from-last dot-separated field when the name has at least
    three dots). 'Homo_Sapiens' maps to 'hg38', 'Mus_Musculus' to 'mm10',
    and any other organism yields the sentinel string 'error'.
    """
    ref_by_organism = {'Homo_Sapiens': 'hg38', 'Mus_Musculus': 'mm10'}
    organism = x.rsplit('.', 3)[1]
    return ref_by_organism.get(organism, 'error')
# reference genome build of every sample, derived from its standardized name
peaks['ref'] = peaks['hicpro_std_sample_name'].map(get_ref)

In [6]:
# sanity check: rows whose organism was not recognized by get_ref
peaks[peaks['ref'] == 'error'].shape

(0, 6)

#### Accounting of samples

In [7]:
# flag, per peak type, whether the sample has a real peak file (True) or only
# the 'No-Peak-File' placeholder (False)
for col in ('cp', 'hp'):
    peaks['{}.present'.format(col)] = ~peaks[col].str.contains('No-Peak-File')

In [8]:
# long format: one row per (sample, peak type) holding the presence flag
acct_df = pd.melt(
    peaks,
    id_vars=['hicpro_std_sample_name', 'ref'],
    value_vars=['cp.present', 'hp.present'],
)
acct_grps = acct_df.groupby(['ref', 'variable'])

# summing the boolean flags counts the samples with a peak file per group
acct_summary = (
    acct_grps['value']
    .sum()
    .rename('count')
    .to_frame()
    .rename_axis(['ref', 'peak_type'])
)

In [9]:
# number of samples with a peak file, per reference build and peak type
acct_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,count
ref,peak_type,Unnamed: 2_level_1
hg38,cp.present,149
hg38,hp.present,763
mm10,cp.present,39
mm10,hp.present,281


#### Symlinking the raw data

In [10]:
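# NOTE: this cell is disabled. As written it would not run against the current `peaks`
# table: it reads `sr.sample_name` and an 'fp' column that `peaks` does not have, and
# uses `shortcut_dir`, which is not defined in the cells shown above.
# for i, sr in peaks.iterrows():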
    
#     sample_name = sr.sample_name
#     ref = sr.ref
    
#     for peak_col in ['hp', 'fp', 'cp']:
        
#         # process only if the peak file exists 
#         peak_fn = sr[peak_col]
#         if os.path.exists(peak_fn):
            
#             # assigning the correct directory
#             if peak_col == 'hp':
#                 sample_outdir = '{ref}/peaks/hichip/hichip-peaks/'
#             elif peak_col == 'fp':
#                 sample_outdir = '{ref}/peaks/hichip/fithichip-utility/'
#             elif peak_col == 'cp':
#                 sample_outdir = '{ref}/peaks/chip-seq/macs2/'
            
#             # assigning merged status
#             if 'merged' in peak_fn:
#                 single_or_merged = 'merged'
#             else:
#                 single_or_merged = 'single'
    
#             # get the link name
#             new_fn = '{sample_name}.{single_or_merged}.peaks.bed'
#             new_path = os.path.join(project_dir, shortcut_dir, sample_outdir, new_fn) 
#             new_path = new_path.format(sample_name=sample_name, single_or_merged=single_or_merged, ref=ref)

#             if not os.path.islink(new_path):
#                 os.symlink(peak_fn, new_path)


### Symlinking the bigBed peak files

In [11]:
# mapping between jr and kf names, keyed by peak column
jr_to_kf_peak_mapper = dict(hp='fithichip-utility', cp='chipseq')

# mapping between hp and cp peak directory paths; '{ref}' is a placeholder
# that is filled in per sample
peak_target_dirs = dict(
    hp='{ref}/peaks/hichip/fithichip-utility/',
    cp='{ref}/peaks/chip-seq/macs2/',
)

# helper function to determine the source bigbed
def get_bb_source_path(peak_col, sample_name, control_type):
    """Return the relative path of the source bigBed for a peak file.

    Parameters
    ----------
    peak_col : str
        Peak column key. Only 'cp' has a known source layout.
    sample_name : str
        Sample name used both as directory name and as file-name prefix.
    control_type : str
        Name of the sub-directory that holds the bigBed.

    Returns
    -------
    str
        Path of the form
        'results/peaks/chipline_v2/<sample>/<control>/<sample>_bigNarrowPeak_Q0.05filt_MACS2_Ext.bb'.

    Raises
    ------
    ValueError
        If ``peak_col`` is anything other than 'cp'. Previously this case
        fell through to the return statement and failed with an
        UnboundLocalError.
    """
    if peak_col != 'cp':
        raise ValueError(
            "no bigBed source path is defined for peak_col={!r}; only 'cp' is supported".format(peak_col)
        )
    return f'results/peaks/chipline_v2/{sample_name}/{control_type}/{sample_name}_bigNarrowPeak_Q0.05filt_MACS2_Ext.bb'

In [12]:
# preview the last rows of the merged peaks table
peaks.tail()

Unnamed: 0,hicpro_std_sample_name,hp,hp_peak_std_sample_name,cp,cp_peak_std_sample_name,ref,cp.present,hp.present
1039,LCL-LWK.PRJNA898623.Homo_Sapiens.H3K27ac.b2,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,LCL-LWK.PRJNA898623.Homo_Sapiens.H3K27ac.b2,No-Peak-File,No-Peak-File,hg38,False,True
1040,LCL-TSI.PRJNA898623.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,LCL-TSI.PRJNA898623.Homo_Sapiens.H3K27ac.b1,No-Peak-File,No-Peak-File,hg38,False,True
1041,LCL-TSI.PRJNA898623.Homo_Sapiens.H3K27ac.b2,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,LCL-TSI.PRJNA898623.Homo_Sapiens.H3K27ac.b2,No-Peak-File,No-Peak-File,hg38,False,True
1042,LCL-YRI.PRJNA898623.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,LCL-YRI.PRJNA898623.Homo_Sapiens.H3K27ac.b1,No-Peak-File,No-Peak-File,hg38,False,True
1043,LCL-YRI.PRJNA898623.Homo_Sapiens.H3K27ac.b2,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,LCL-YRI.PRJNA898623.Homo_Sapiens.H3K27ac.b2,No-Peak-File,No-Peak-File,hg38,False,True


In [23]:
# Symlink the bigBed that belongs to each available peak file into the hub and
# record the outcome of every attempt in `report`. Each entry is
# [hicpro_std_sample_name, peak_std_sample_name, src_bigbed, target_path, status].
report = []

# only 'cp' is processed: get_bb_source_path only builds a source path for 'cp'
#peak_cols = ['hp', 'cp']
peak_cols = ['cp']

for i, sr in peaks.iterrows():

    hicpro_std_sample_name = sr.hicpro_std_sample_name
    ref = sr.ref

    for peak_col in peak_cols:

        # get the peak_std_sample_name and the peak file of this peak type
        peak_std_sample_name = sr['{}_peak_std_sample_name'.format(peak_col)]
        peak_fn = sr[peak_col]

        # process only if the peak file exists; rows holding the 'No-Peak-File'
        # placeholder normally fail this check and are left out of the report
        if not os.path.exists(peak_fn):
            continue

        # get the path to the bigbed; the control type is the name of the
        # directory that contains the peak file
        control_type = os.path.basename(os.path.dirname(peak_fn))
        src_bigbed = get_bb_source_path(peak_col, peak_std_sample_name, control_type)
        src_bigbed = os.path.join(project_dir, src_bigbed)

        # get the link name; each piece is formatted before joining so that
        # braces in project_dir or the hub path cannot break str.format
        target_outdir = peak_target_dirs[peak_col].format(ref=ref)
        target_fn = '{sample_name}.peaks.bed.bb'.format(sample_name=peak_std_sample_name)
        target_path = os.path.join(project_dir, lji_lcsd_hub, target_outdir, target_fn)

        # perform the symlinking
        if not os.path.exists(src_bigbed):
            status = 'source-file-not-present'
        elif os.path.islink(target_path):
            status = 'previously-transfer'
        elif os.path.lexists(target_path):
            # something that is not a symlink already occupies the target;
            # os.symlink would raise FileExistsError, so report it instead
            status = 'other-problem'
        else:
            # make sure the hub sub-directory exists before linking
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            os.symlink(src_bigbed, target_path)
            status = 'transfer-performed'

        report.append([hicpro_std_sample_name, peak_std_sample_name, src_bigbed, target_path, status])


In [24]:
# one row per attempted link, with the outcome in `status`
report_columns = ['hicpro_std_sample_name', 'peak_std_sample_name', 'src_bigbed', 'target_path', 'status']
report_df = pd.DataFrame(data=report, columns=report_columns)

In [25]:
# how many link attempts ended in each status
report_df['status'].value_counts()

status
previously-transfer        187
source-file-not-present      1
Name: count, dtype: int64

In [26]:
# group the report rows by outcome for per-status inspection
report_grps = report_df.groupby(by='status')

In [27]:
# show full paths instead of truncating long strings
pd.options.display.max_colwidth = None

# First report row whose source bigBed was missing, transposed for readability.
# A boolean mask is used instead of GroupBy.get_group so that a run with no
# missing sources shows an empty table rather than raising a KeyError.
missing_src = report_df.loc[report_df['status'] == 'source-file-not-present']
missing_src.head(1).T

Unnamed: 0,127
hicpro_std_sample_name,HCT116-NUP93-mAC.GSE165463.Homo_Sapiens.H3K27ac.b1
peak_std_sample_name,HCT116-NUP93-mAC.GSE165463.Homo_Sapiens.H3K27ac.b1
src_bigbed,/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/results/peaks/chipline_v2/HCT116-NUP93-mAC.GSE165463.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_No_Control/HCT116-NUP93-mAC.GSE165463.Homo_Sapiens.H3K27ac.b1_bigNarrowPeak_Q0.05filt_MACS2_Ext.bb
target_path,/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/results/lji_lcsd_hub/release-0.1/hub/hg38/peaks/chip-seq/macs2/HCT116-NUP93-mAC.GSE165463.Homo_Sapiens.H3K27ac.b1.peaks.bed.bb
status,source-file-not-present
