In [1]:
import glob
import os
import pandas as pd

In [7]:
# Lists of GSE IDs whose ChIP-seq status has already been resolved: either the
# ChIP-seq samples were processed, or the GSE has no ChIP-seq samples at all.
# These lists are used to determine which HiChIP samples do not have matching
# ChIP-seq data.
#
# Note: the GSE IDs are those corresponding to the matching HiChIP data, not
# necessarily the IDs with ChIP-seq data themselves.

# Human HiChIP studies with matching ChIP-seq data
matching_chipseq_human = [
    "GSE136090", "GSE154512", "GSE116193", "GSE105028", "GSE128106", "GSE147646",
    "GSE179544", "GSE108869", "GSE117888", "GSE116869", "GSE116495", "GSE115492",
    "GSE111537", "GSE131054", "GSE131651", "GSE136629", "GSE137849", "GSE149420",
    "GSE151001", "GSE154513", "GSE156650", "GSE159985", "GSE157107", "GSE157381",
    "GSE165207", "GSE173843", "GSE179666", "GSE180194", "GSE97585", "GSE166232",
    "GSE165303",
]

# Human HiChIP studies without matching ChIP-seq data
no_matching_chipseq_human = [
    "GSE178598", "GSE101498", "GSE147854", "GSE151193", "GSE179755", "GSE173699",
    "GSE92881", "GSE100856", "GSE111930", "GSE115524", "GSE115896", "GSE116872",
    "GSE116875", "GSE126792", "GSE137168", "GSE137906", "GSE137936", "GSE139466",
    "GSE148958", "GSE152900", "GSE155184", "GSE155328", "GSE156772",
    "GSE158642", "GSE168881", "GSE171591", "GSE173871", "GSE181971", "GSE183797",
    "GSE188380", "GSE196235", "GSE80820", "GSE150906", "GSE133227", "GSE99519",
    "GSE120294", "GSE147672",
]

# Non-human HiChIP studies with matching ChIP-seq data
matching_chipseq_nonhuman = [
    "GSE112717", "GSE141113", "GSE110898", "GSE141847", "GSE142004", "GSE159629",
    "GSE178344", "GSE147919",
]

# Non-human HiChIP studies without matching ChIP-seq data.
# "GSE148691" used to be listed twice in this list; the repeated entry is dropped.
# NOTE(review): confirm the second occurrence was not a typo for a different accession.
no_matching_chipseq_nonhuman = [
    "GSE101498", "GSE148691", "GSE112176", "GSE113339", "GSE115524",
    "GSE121671", "GSE126362", "GSE135296", "GSE150536",
    "GSE150906", "GSE153884", "GSE157666", "GSE160656", "GSE162617",
    "GSE189442", "GSE192387", "GSE193079", "GSE194068", "GSE80820",
    "GSE99519", "GSE145793", "GSE166177",
]

# Every GSE ID that has been looked at, regardless of species or outcome.
# An ID may appear more than once here when it is listed for both species;
# this is harmless because the list is only used for membership tests.
processed_gse_ids = (
    matching_chipseq_human
    + no_matching_chipseq_human
    + matching_chipseq_nonhuman
    + no_matching_chipseq_nonhuman
)

In [9]:
# Check if a particular ChIP-seq GSE ID has been processed
gse_id = "GSE166177"

# Pair each curated list with the message printed when gse_id is found in it.
# A GSE ID can legitimately be in more than one list (e.g. one per species),
# so every matching message is reported.
gse_id_reports = [
    (matching_chipseq_human, "has matching human data."),
    (no_matching_chipseq_human, "does not have matching human data."),
    (matching_chipseq_nonhuman, "has matching nonhuman data."),
    (no_matching_chipseq_nonhuman, "does not have matching nonhuman data."),
]
gse_id_hits = [message for id_list, message in gse_id_reports if gse_id in id_list]

for message in gse_id_hits:
    print(gse_id, message)

# Previously an unknown ID produced no output at all, which was
# indistinguishable from the cell not having run.
if not gse_id_hits:
    print(gse_id, "is not in any of the processed GSE ID lists.")

#### Function to generate pandas dataframe

In [4]:
# Create a dataframe with the following columns:
# 1) Sample Name
# 2) HiC-Pro .allValidPairs file
# 3) Matching ChIP-seq peaks (if it exists)
# 4) Matching FitHiChIP peaks (if it has been processed yet)
# 5) FitHiChIP Loop Calling from ChIPseq Peaks Complete?
# 6) FitHiChIP Loop Calling from FitHiChIP Peaks Complete?
# 7) FitHiChIP Loop Calling from HiChIP-Peaks Complete?
# 8) Number of 5kb loops from HICCUPS
# 9) Number of 10kb loops from HICCUPS
# 10) Number of 25kb loops from HICCUPS

def _count_hiccups_loops(bedpe_path):
    """Count the loops recorded in one HICCUPS ``.bedpe`` file.

    Returns the number of lines in the file minus 2, never less than 0, or the
    string "Not processed" when the file does not exist.
    NOTE(review): the "- 2" is kept from the original bookkeeping; presumably
    the two leading lines of a HICCUPS output are header lines -- confirm.
    """
    if not os.path.isfile(bedpe_path):
        return "Not processed"
    # a context manager closes the handle; the original leaked one per file
    with open(bedpe_path) as bedpe:
        num_lines = sum(1 for _ in bedpe)
    return max(num_lines - 2, 0)


def generate_df(results_dir=None, chipseq_peaks_dir=None, processed_ids=None):
    """Build the loop-calling tracker table, one row per HiC-Pro sample.

    Parameters
    ----------
    results_dir : str, optional
        Root of the loop-calling results tree containing ``hicpro/``,
        ``peaks/fithichip/``, ``loops/fithichip/`` and ``loops/hiccups/``.
        Defaults to the kfetter results directory used originally.
    chipseq_peaks_dir : str, optional
        Directory containing the ``merged_chipline/`` and ``chipline/``
        ChIP-seq peak folders. Defaults to the nrao peaks directory used
        originally.
    processed_ids : iterable of str, optional
        GSE IDs whose ChIP-seq status has been resolved. Defaults to the
        notebook-level ``processed_gse_ids`` list.

    Returns
    -------
    pandas.DataFrame
        Ten columns labelled 0-9 (also when no sample is found):
        0) sample name, 1) HiC-Pro .allValidPairs file, 2) matching ChIP-seq
        peaks, 3) matching FitHiChIP peaks, 4-6) "Yes"/"No" flags for FitHiChIP
        loop calling from ChIP-seq / FitHiChIP / HiChIP-Peaks peaks,
        7-9) number of HICCUPS loops at 5kb / 10kb / 25kb.
    """
    if results_dir is None:
        results_dir = "/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results"
    if chipseq_peaks_dir is None:
        chipseq_peaks_dir = "/mnt/bioadhoc-temp/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks"
    if processed_ids is None:
        processed_ids = processed_gse_ids

    data = []

    hicpro_glob = results_dir + "/hicpro/" + "*/hic_results/data/*/*.allValidPairs"

    # iterate through each HiC-Pro .allValidPairs file
    for fn in glob.glob(hicpro_glob):

        # <results_dir>/hicpro/<sample>/hic_results/data/<sample>/<file>:
        # the sample folder is always five components from the end, whatever
        # the depth of results_dir (the original used a fixed index from the start)
        sample_name = fn.split('/')[-5]

        # IMPORTANT: Replace the last character of the sample name with 1 (so it is .b1 to match with the ChIP-seq loops)
        sample_name_chipseq = sample_name[:-1] + "1"

        hicpro_output = fn

        # --- matching ChIP-seq peaks -------------------------------------
        chipseq_peak_file = "Not processed or not available"

        # a folder in merged_chipline takes precedence over the chipline folder
        merged_chipline_path = chipseq_peaks_dir + "/merged_chipline/" + sample_name_chipseq
        if os.path.exists(merged_chipline_path):
            chipseq_peak_file = merged_chipline_path + "/FINAL_IDR_Peaks_FDR0.05.txt"
        else:
            chipline_glob = (chipseq_peaks_dir + "/chipline/" + sample_name_chipseq
                             + "/MACS2_Ext*/" + sample_name_chipseq
                             + ".macs2_peaks.narrowPeak_Q0.01filt")
            chipline_matches = glob.glob(chipline_glob)
            if chipline_matches:
                chipseq_peak_file = chipline_matches[0]

        # the GSE has been looked at and no peak file is on disk: the matching
        # ChIP-seq sample does not exist
        if any(gse in sample_name_chipseq for gse in processed_ids) and not os.path.exists(chipseq_peak_file):
            chipseq_peak_file = "Not available"

        # --- FitHiChIP peaks ---------------------------------------------
        # Reset for every sample: previously this was only assigned when the
        # path existed, so a missing path raised NameError on the first sample
        # or silently reused the previous sample's path.
        fithichip_peak_file = "Not processed"
        fithichip_peak_path = results_dir + "/peaks/fithichip/" + sample_name
        if os.path.exists(fithichip_peak_path):
            fithichip_peak_file = fithichip_peak_path

        # --- FitHiChIP loop calling status ---------------------------------
        fithichip_loops_path = results_dir + "/loops/fithichip/" + sample_name
        loop_flags = [
            "Yes" if os.path.exists(fithichip_loops_path + suffix) else "No"
            for suffix in ("_chipseq.peaks", "_fithichip.peaks", "_hichip-peaks.peaks")
        ]

        # --- HICCUPS loop counts at each resolution -----------------------
        hiccups_loops_path = results_dir + "/loops/hiccups/" + sample_name
        hiccups_counts = [
            _count_hiccups_loops(hiccups_loops_path + "/postprocessed_pixels_" + resolution + ".bedpe")
            for resolution in ("5000", "10000", "25000")
        ]

        # add all the information for this sample to the 2D array
        data.append([sample_name, hicpro_output, chipseq_peak_file, fithichip_peak_file]
                    + loop_flags + hiccups_counts)

    # explicit column labels keep the frame at ten columns even when no sample was found
    df = pd.DataFrame(data, columns=range(10))
    return df
    

#### Display the pandas dataframe

In [5]:
# Legend for the placeholder strings that appear in the table:
#   "Not available" -> the input needed to create the file does not exist
#   "Not processed" -> the input exists, but the file has not been generated yet

df = generate_df()

# Human-readable headers, in the same order as the columns built by generate_df()
columns = [
    "Sample Name",
    "HiC-Pro Valid Pairs Path",
    "ChIP-seq Peaks Path",
    "FitHiChIP Peaks Path",
    "FitHiChIP Loop Calling from ChIPseq Peaks Complete?",
    "FitHiChIP Loop Calling from FitHiChIP Peaks Complete?",
    "FitHiChIP Loop Calling from HiChIP-Peaks Complete?",
    "Number of Loops from HICCUPS (5kb)",
    "Number of Loops from HICCUPS (10kb)",
    "Number of Loops from HICCUPS (25kb)",
]
df.columns = columns

# Renumber the rows so that the first sample is labelled 2.
# NOTE(review): presumably this lines the labels up with spreadsheet row
# numbers (header on row 1) -- confirm.
df = df.reset_index(drop=True)
df.index = df.index + 2

# Lift pandas' display limits so the complete table is rendered
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

Unnamed: 0,Sample Name,HiC-Pro Valid Pairs Path,ChIP-seq Peaks Path,FitHiChIP Peaks Path,FitHiChIP Loop Calling from ChIPseq Peaks Complete?,FitHiChIP Loop Calling from FitHiChIP Peaks Complete?,FitHiChIP Loop Calling from HiChIP-Peaks Complete?,Number of Loops from HICCUPS (5kb),Number of Loops from HICCUPS (10kb),Number of Loops from HICCUPS (25kb)
2,H9-HS.GSE105028.Homo_Sapiens.OCT4.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/bioadhoc-temp/Groups/vd-ay/nrao/hichip_da...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes,Yes,Yes,Not processed,Not processed,Not processed
3,H9-HS.GSE105028.Homo_Sapiens.KLF4.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/bioadhoc-temp/Groups/vd-ay/nrao/hichip_da...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes,Yes,Yes,Not processed,Not processed,Not processed
4,DEX.GSE117888.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/bioadhoc-temp/Groups/vd-ay/nrao/hichip_da...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes,Yes,Yes,Not processed,Not processed,Not processed
5,SUCCS1-siEA.GSE180194.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/bioadhoc-temp/Groups/vd-ay/nrao/hichip_da...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes,Yes,Yes,889,1818,2733
6,hESC-WT-D0.GSE120294.Homo_Sapiens.SMC1A.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No,Yes,Yes,291,1176,3177
7,THP1-125D.GSE188380.Homo_Sapiens.CTCF.b3,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No,Yes,Yes,4301,6659,5583
8,BC1.GSE136090.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/bioadhoc-temp/Groups/vd-ay/nrao/hichip_da...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes,Yes,Yes,Not processed,Not processed,Not processed
9,HK2.GSE147646.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/bioadhoc-temp/Groups/vd-ay/nrao/hichip_da...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes,Yes,Yes,Not processed,Not processed,Not processed
10,HaCaT-Stimulated.GSE137906.Homo_Sapiens.H3K27a...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No,Yes,Yes,452,1436,2598
11,OCI-Ly7-Control.GSE183797.Homo_Sapiens.H3K27ac.b2,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No,Yes,Yes,Not processed,Not processed,Not processed


#### Check the values for any specific row

In [6]:
# `row` is looked up as an index label of df
row = 170

# Sample name and file paths: printed as "<column>: <value>" followed by a blank line
for column in ["Sample Name", "HiC-Pro Valid Pairs Path", "ChIP-seq Peaks Path", "FitHiChIP Peaks Path"]:
    print(column + ":", df.at[row, column], "\n")

# Loop-calling status flags: the column name already ends with "?", so no colon is added
for column in [
    "FitHiChIP Loop Calling from ChIPseq Peaks Complete?",
    "FitHiChIP Loop Calling from FitHiChIP Peaks Complete?",
    "FitHiChIP Loop Calling from HiChIP-Peaks Complete?",
]:
    print(column, df.at[row, column], "\n")

# HICCUPS loop counts: grouped on consecutive lines, blank line only after the last one
for column in ["Number of Loops from HICCUPS (5kb)", "Number of Loops from HICCUPS (10kb)"]:
    print(column + ":", df.at[row, column])
print("Number of Loops from HICCUPS (25kb):", df.at[row, "Number of Loops from HICCUPS (25kb)"], "\n")


Sample Name: Aortic-VIC.GSE154513.Homo_Sapiens.H3K27ac.b1 

HiC-Pro Valid Pairs Path: /mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/Aortic-VIC.GSE154513.Homo_Sapiens.H3K27ac.b1/hic_results/data/Aortic-VIC.GSE154513.Homo_Sapiens.H3K27ac.b1/Aortic-VIC.GSE154513.Homo_Sapiens.H3K27ac.b1.allValidPairs 

ChIP-seq Peaks Path: /mnt/bioadhoc-temp/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks/chipline/Aortic-VIC.GSE154513.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_with_Control/Aortic-VIC.GSE154513.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt 

FitHiChIP Peaks Path: /mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/fithichip/Aortic-VIC.GSE154513.Homo_Sapiens.H3K27ac.b1 

FitHiChIP Loop Calling from ChIPseq Peaks Complete? Yes 

FitHiChIP Loop Calling from FitHiChIP Peaks Complete? Yes 

FitHiChIP Loop Calling from HiChIP-Peaks Complete? Yes 

Number of Loops from HICCUPS (5kb): 242
Number of Loops from HICCUPS (10kb): 11

#### Output the dataframe as a tsv file

In [7]:
# Destination of the tracker sheet (relative to the notebook's working directory)
tsv_path = "../../../results/samplesheets/loop_calling_tracker.tsv"

# set_index returns a new frame keyed by sample name; df itself keeps its
# integer row labels
df2 = df.set_index("Sample Name")

# write the table as tab-separated values
df2.to_csv(tsv_path, sep="\t")