In [70]:
import glob
import os
import pandas as pd

In [71]:
# list of GSE IDs of processed ChIP-seq samples, or GSE IDs without any ChIP-seq samples
# this list is used to determine which HiChIP samples do not have matching ChIP-seq data
processed_chipseq_gse_ids = ["GSE136090", "GSE105028", "GSE116193", "GSE178598", "GSE101498", "GSE147854", "GSE151193"]
processed_chipseq_gse_ids.extend(["GSE179755", "GSE173699", "GSE92881", "GSE148691"])

#### Function to generate pandas dataframe

In [72]:
# Create a dataframe with the following columns:
# 1) Sample Name
# 2) HiC-Pro .allValidPairs file
# 3) Matching ChIP-seq peaks (if it exists)
# 4) Matching FitHiChIP peaks (if it has been processed yet)
# 5) Whether loop calling has been completed for this sample

def generate_df():
    data = []
    
    hicpro_path = "/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/hicpro/"
    hicpro_path += "*/hic_results/data/*/*.allValidPairs"
    
    # iterate through each HiC-Pro output directory
    for fn in glob.glob(hicpro_path):
        
        sample_name = fn.split('/')[10]
        
        # set the path of the HiC-Pro output folder
        hicpro_output=fn
        
        chipseq_peak_file="Not processed or not available"
        chipline_path="/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks"
        
        # check whether a folder matching sample_name exists in the merged_chipline folder
        merged_chipline_path = chipline_path + "/merged_chipline/" + sample_name
        if os.path.exists(merged_chipline_path):
            chipseq_peak_file=merged_chipline_path + "/FINAL_IDR_Peaks_FDR0.05.txt"
        
        # check whether a folder matching sample_name exists in the chipline folder
        chipline_path += "/chipline/" + sample_name + "/MACS2_Ext*/" + sample_name + ".macs2_peaks.narrowPeak_Q0.01filt"
        if len(glob.glob(chipline_path))>0:
            chipseq_peak_file = glob.glob(chipline_path)[0]
            
        # check if the matching ChIP-seq sample does not exist
        if any(gse_id in sample_name for gse_id in processed_chipseq_gse_ids):
            if not os.path.exists(chipseq_peak_file):
                chipseq_peak_file = "Not available"
            
        
        # check whether a folder matching the sample name exists in the fithichip folder
        fithichip_peak_file="Not processed"
        
        fithichip_peak_path="/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/fithichip/" + sample_name
        if os.path.exists(fithichip_peak_path):
            fithichip_peak_file = fithichip_peak_path + "/MACS2_ExtSize/out_macs2_peaks.narrowPeak"
            
        # check whether loop calling for this sample is complete
        loop_calling_complete = "No"
        fithichip_loops_path="/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/loops/fithichip/" + sample_name
        if os.path.exists(fithichip_loops_path):
            loop_calling_complete = "Yes"
        
        data.append([sample_name, hicpro_output, chipseq_peak_file, fithichip_peak_file, loop_calling_complete])
        
    df = pd.DataFrame(data)
    return df
    

#### Display the pandas dataframe

In [73]:
df = generate_df()
df.columns = ["Sample Name", "HiC-Pro Valid Pairs Path", "ChIP-seq Peaks Path", "FitHiChIP Peaks Path", "Loop Calling Complete?"]

df = df.reset_index(drop=True)
df.index += 2

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

Unnamed: 0,Sample Name,HiC-Pro Valid Pairs Path,ChIP-seq Peaks Path,FitHiChIP Peaks Path,Loop Calling Complete?
2,BCBL1.GSE136090.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
3,lgs000379.GSE116193.Homo_Sapiens.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
4,lgs301283.GSE116193.Homo_Sapiens.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
5,AoSMC.GSE178598.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
6,H9-HS-KI.GSE105028.Homo_Sapiens.Rad21.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
7,HAVIC.GSE154512.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
8,Th17.GSE101498.Homo_Sapiens.H3K27ac.b3,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
9,GM.GSE101498.Homo_Sapiens.H3K27ac.b2,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
10,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b2,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not available,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
11,BC3.GSE136090.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes


#### Check the values for any specific row

In [7]:
row=48

print("Sample Name:", df["Sample Name"][row], "\n")
print("HiC-Pro Valid Pairs Path:", df["HiC-Pro Valid Pairs Path"][row], "\n")
print("ChIP-seq Peaks Path:", df["ChIP-seq Peaks Path"][row], "\n")
print("FitHiChIP Peaks Path:", df["FitHiChIP Peaks Path"][row], "\n")
print("Loop Calling Complete?:", df["Loop Calling Complete?"][row], "\n")


Sample Name: H9-HS.GSE105028.Homo_Sapiens.Rad21.b1 

HiC-Pro Valid Pairs Path: /mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/hicpro/H9-HS.GSE105028.Homo_Sapiens.Rad21.b1/hic_results/data/H9-HS.GSE105028.Homo_Sapiens.Rad21.b1/H9-HS.GSE105028.Homo_Sapiens.Rad21.b1.allValidPairs 

ChIP-seq Peaks Path: /mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks/chipline/H9-HS.GSE105028.Homo_Sapiens.Rad21.b1/MACS2_Ext_No_Control/H9-HS.GSE105028.Homo_Sapiens.Rad21.b1.macs2_peaks.narrowPeak_Q0.01filt 

FitHiChIP Peaks Path: /mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/fithichip/H9-HS.GSE105028.Homo_Sapiens.Rad21.b1/MACS2_ExtSize/out_macs2_peaks.narrowPeak 

Loop Calling Complete?: Yes 



#### Output the dataframe as a tsv file

In [8]:
df2 = df.copy(deep=True)
df2.set_index("Sample Name", inplace=True)

# set path of file
tsv_path = "../../../results/samplesheets/loop_calling_tracker.tsv"
df2.to_csv(tsv_path, sep="\t")