In [1]:
import glob
import os
import sys
import pandas as pd
# Display options: show up to 1000 rows/columns and truncate cell text at 50 characters.
# Options are addressed by their full names; pandas also accepts a unique substring
# such as 'max_colwidth', but that shorthand can break if similarly named options are added.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', 50)
# Make the project directory the working directory so relative 'results/...' paths resolve against it.
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling')

In [2]:
# Decide where the samplesheet is written.
# Script mode takes <input file> <output prefix> from the command line;
# inside a Jupyter kernel a fixed output prefix is used instead.
if 'ipykernel_launcher.py' not in sys.argv[0]:
    # launched as a plain script
    input_fn, output_prefix = sys.argv[1], sys.argv[2]
else:
    # launched from a Jupyter notebook
    output_prefix = 'results/samplesheets/post-hicpro/peaks_files.samplesheet'

In [3]:
# GSE IDs whose ChIP-seq status is already settled: either the ChIP-seq samples
# were processed, or the study has no ChIP-seq samples at all.
# The combined list is used to determine which HiChIP samples do not have matching ChIP-seq data.
#
# Note: every ID below is the GSE ID of the HiChIP dataset, not necessarily the
# ID under which the ChIP-seq data themselves were deposited.

# HiChIP studies with processed, matching ChIP-seq samples
matching_chipseq_human = [
    "GSE136090", "GSE116193", "GSE105028", "GSE128106", "GSE147646",
    "GSE179544", "GSE108869", "GSE117888", "GSE116869", "GSE116495", "GSE115492",
    "GSE111537", "GSE131054", "GSE131651", "GSE136629", "GSE137849", "GSE149420",
    "GSE151001", "GSE154513", "GSE156650", "GSE159985", "GSE157107", "GSE157381",
    "GSE165207", "GSE173843", "GSE179666", "GSE180194", "GSE97585", "GSE166232",
]

# HiChIP studies without any matching ChIP-seq samples
no_matching_chipseq_human = [
    "GSE178598", "GSE101498", "GSE147854", "GSE151193", "GSE179755", "GSE173699",
    "GSE92881", "GSE100856", "GSE111930", "GSE115524", "GSE115896", "GSE116872",
    "GSE116875", "GSE126792", "GSE137168", "GSE137906", "GSE137936", "GSE139466",
    "GSE148958", "GSE152900", "GSE155184", "GSE155328", "GSE156772",
    "GSE158642", "GSE168881", "GSE171591", "GSE173871", "GSE181971", "GSE183797",
    "GSE188380", "GSE196235", "GSE80820", "GSE150906", "GSE133227", "GSE99519", "GSE120294",
]

# every GSE ID that has been looked at, matching IDs first
processed_gse_ids = [*matching_chipseq_human, *no_matching_chipseq_human]

#### Function to generate pandas dataframe

In [4]:
# Create a dataframe with one row per HiC-Pro sample and these columns, in order:
# 1) Sample Name
# 2) HiC-Pro .allValidPairs file
# 3) HiChIP-Peaks peak file (if it has been processed yet)
# 4) FitHiChIP peak file (if it has been processed yet)
# 5) Matching ChIP-seq peaks (if it exists)

def _is_nonempty_file(path):
    """Return True when `path` is an existing regular file larger than 0 bytes."""
    return os.path.isfile(path) and os.path.getsize(path) > 0


def _resolve_chipseq_peaks(sample_name, peaks_root, gse_ids):
    """Choose the ChIP-seq peak file for one HiChIP sample.

    Lookup order: merged IDR peaks, then the replicate-1 Q0.01 narrowPeak file.
    When the chosen file is empty, fall back to replicate 1 / replicate 2
    Q0.01 peaks (merged case) and finally to the replicate-1 Q0.05 file.

    Returns a path to a non-empty peak file, "Not_available" when nothing usable
    exists for a sample whose GSE ID is in `gse_ids` (or when every fallback is
    empty or missing), or "Not_processed_or_not_available" when nothing is on
    disk and the GSE ID is not in `gse_ids`.
    """
    unresolved = "Not_processed_or_not_available"
    not_available = "Not_available"

    # ChIP-seq results are looked up under the sample name with its last
    # character replaced by "1" (so it is .b1 to match the ChIP-seq naming).
    rep1_name = sample_name[:-1] + "1"
    rep2_name = sample_name[:-1] + "2"

    merged_dir = peaks_root + "/merged_chipline/" + rep1_name
    rep1_pattern = (peaks_root + "/chipline/" + rep1_name + "/MACS2_Ext*/"
                    + rep1_name + ".macs2_peaks.narrowPeak_Q0.01filt")
    rep1_hits = glob.glob(rep1_pattern)

    # merged peaks take precedence over single-replicate peaks
    from_merged = os.path.exists(merged_dir)
    if from_merged:
        peak_file = merged_dir + "/FINAL_IDR_Peaks_FDR0.05.txt"
    elif rep1_hits:
        peak_file = rep1_hits[0]
    else:
        peak_file = None

    # Nothing on disk: never call getsize() on a placeholder or missing path.
    if peak_file is None or not os.path.isfile(peak_file):
        already_reviewed = any(gse_id in rep1_name for gse_id in gse_ids)
        return not_available if already_reviewed else unresolved

    if os.path.getsize(peak_file) > 0:
        return peak_file

    # The assigned file exists but contains no peak calls: look for a replacement.
    print("sample:", sample_name)
    print("old path:", peak_file)

    new_path = ''
    if from_merged and rep1_hits:
        new_path = rep1_hits[0]
        if os.path.getsize(new_path) == 0:
            print("rep 1 was empty")
            # Swap only the replicate-1 sample name for the replicate-2 name, so
            # a "b1" elsewhere in the sample name or path is left untouched.
            rep2_path = new_path.replace(rep1_name, rep2_name)
            new_path = rep2_path if _is_nonempty_file(rep2_path) else ''

    if new_path == '':
        q05_path = '{root}/chipline/{sn}/MACS2_Ext_with_Control/{sn}.macs2_peaks.narrowPeak_Q0.05filt'.format(
            root=peaks_root, sn=rep1_name)
        if _is_nonempty_file(q05_path):
            print("reg chipline 0.05 filt is NOT empty")
            new_path = q05_path
        else:
            # covers both an empty and a missing Q0.05 file
            print("reg chipline 0.05 filt is also empty")
            new_path = not_available

    print("new path:", new_path)
    if new_path != not_available:
        print("new path size:", os.path.getsize(new_path))
    print(os.path.exists(new_path))
    print()

    return new_path


def generate_df(gse_ids=None, chipseq_peaks_root=None):
    """Collect the peak-file samplesheet for every HiC-Pro sample.

    HiC-Pro, FitHiChIP and HiChIP-Peaks outputs are searched relative to the
    current working directory ('results/...'); found paths are reported with the
    project's absolute prefix.

    Parameters
    ----------
    gse_ids : iterable of str, optional
        GSE IDs whose ChIP-seq status has already been reviewed. Defaults to the
        module-level `processed_gse_ids`.
    chipseq_peaks_root : str, optional
        Directory holding the 'merged_chipline' and 'chipline' ChIP-seq peak
        folders. Defaults to the project's shared peaks directory.

    Returns
    -------
    pandas.DataFrame
        One row per '*.allValidPairs' file with unnamed (positional) columns:
        sample name, validpairs path, HiChIP-Peaks path, FitHiChIP path,
        ChIP-seq path. Missing tool outputs are reported as "Not_processed";
        see `_resolve_chipseq_peaks` for the ChIP-seq placeholders.
    """
    if gse_ids is None:
        gse_ids = processed_gse_ids
    if chipseq_peaks_root is None:
        chipseq_peaks_root = "/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks"
    project_root = '/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/'

    data = []

    # iterate through each HiC-Pro output directory
    for fn in glob.glob('results/hicpro/*/hic_results/data/*/*.allValidPairs'):

        # 'results/hicpro/<sample>/...' -> the third path component is the sample name
        sample_name = fn.split('/')[2]

        # hicpro validpairs file
        hicpro_path = project_root + fn

        chipseq_peak_file = _resolve_chipseq_peaks(sample_name, chipseq_peaks_root, gse_ids)

        # check whether peaks for this sample exist in the fithichip folder
        fithichip_peak_file = "Not_processed"
        fithichip_peak_path = 'results/peaks/fithichip/{sn}/MACS2_ExtSize/out_macs2_peaks.narrowPeak'.format(sn=sample_name)
        if os.path.exists(fithichip_peak_path):
            fithichip_peak_file = project_root + fithichip_peak_path

        # check whether peaks for this sample exist in the hichip-peaks folder
        hichippeaks_peak_file = "Not_processed"
        hichippeaks_path = 'results/peaks/hichip-peaks/{sn}/out_peaks.bed'.format(sn=sample_name)
        if os.path.exists(hichippeaks_path):
            hichippeaks_peak_file = project_root + hichippeaks_path

        # add all the information for this sample to the 2D array
        data.append([sample_name, hicpro_path, hichippeaks_peak_file, fithichip_peak_file, chipseq_peak_file])

    return pd.DataFrame(data)
    

#### Display the pandas dataframe

In [5]:
# Build the samplesheet; generate_df() prints a diagnostic block for every sample
# whose assigned ChIP-seq peak file turned out to be empty.
df = generate_df()

sample: MDA-MB-231-sgRNA-Ctrl.GSE97585.Homo_Sapiens.H3K27ac.b2
old path: /mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks/merged_chipline/MDA-MB-231-sgRNA-Ctrl.GSE97585.Homo_Sapiens.H3K27ac.b1/FINAL_IDR_Peaks_FDR0.05.txt
rep 1 was empty
new path: /mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks/chipline/MDA-MB-231-sgRNA-Ctrl.GSE97585.Homo_Sapiens.H3K27ac.b2/MACS2_Ext_with_Control/MDA-MB-231-sgRNA-Ctrl.GSE97585.Homo_Sapiens.H3K27ac.b2.macs2_peaks.narrowPeak_Q0.01filt
new path size: 15047967
True

sample: HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol-II.b1
old path: /mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks/merged_chipline/HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol-II.b1/FINAL_IDR_Peaks_FDR0.05.txt
new path: /mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks/chipline/HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol-II.b1/MACS2_Ext_with_Co

In [6]:
# Name the positional columns returned by generate_df(), then order the rows by sample name.
df.columns = [
    "std_sample_name",
    "validpairs_path",
    "hichip_peaks_path",
    "fithichip_path",
    "chipseq_path",
]
df = df.sort_values(by="std_sample_name").reset_index(drop=True)
df

Unnamed: 0,std_sample_name,validpairs_path,hichip_peaks_path,fithichip_path,chipseq_path
0,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...
1,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...
2,293T.GSE128106.Homo_Sapiens.YY1.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...
3,A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sap...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_processed,Not_available
4,A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sap...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_processed,Not_available
5,A673-siSA2-Dh6-72h-D347-D362.GSE156650.Homo_Sa...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_processed,Not_available
6,A673-siSA2-Dh6-72h-D347-D362.GSE156650.Homo_Sa...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_processed,Not_available
7,A673-siSA2-Dh8-72h-D347-D362.GSE156650.Homo_Sa...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_processed,Not_available
8,A673-siSA2-Dh8-72h-D347-D362.GSE156650.Homo_Sa...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_processed,Not_available
9,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_processed,Not_available


In [7]:
# Number of samples whose ChIP-seq column holds a path (contains "/mnt") rather than a placeholder.
int(df['chipseq_path'].str.contains("/mnt").sum())

101

#### Output the dataframe as a tsv file

In [8]:
# Write the samplesheet as a tab-separated file without header row or index column.
without_header_output = '{}.without_header.tsv'.format(output_prefix)
# to_csv does not create missing parent directories, so make sure the target folder exists.
output_dir = os.path.dirname(without_header_output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(without_header_output, header=False, index=False, sep='\t')