In [93]:
import glob
import os
import pandas as pd

#### Method to generate pandas dataframe

In [94]:
# Create a dataframe with the following columns:
# 1) Sample Name
# 2) HiC-Pro .allValidPairs file
# 3) Matching ChIP-seq peaks (if it exists)
# 4) Matching FitHiChIP peaks (if it has been processed yet)

def generate_df(hicpro_path, sample_name_index):
    data = []
    glob_str = hicpro_path.format(sample_name='*')
    
    # iterate through each HiC-Pro output directory
    for fn in glob.glob(glob_str):
        
        sample_name = fn.split('/')[sample_name_index]
        info = []
        
        # determine the path of the HiC-Pro output folder
        hicpro_output=fn
        
        chipseq_peak_file="Not processed or does not exist"
        chipline_path="/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/peaks"
        
        # check whether a folder matching sample_name exists in the merged_chipline folder
        merged_chipline_path = chipline_path + "/merged_chipline/" + sample_name
        if os.path.exists(merged_chipline_path):
            chipseq_peak_file=merged_chipline_path + "/FINAL_IDR_Peaks_FDR0.05.txt"
        
        # check whether a folder matching sample_name exists in the chipline folder
        chipline_path += "/chipline/" + sample_name
        if os.path.exists(chipline_path):
            chipseq_peak_file=chipline_path + "/MACS2_Ext_with_Control/" + sample_name + ".macs2_peaks.narrowPeak_Q0.01filt"
        
        # check whether a folder matching the sample name exists in the fithichip folder
        fithichip_peak_file="Not processed yet"
        
        fithichip_peak_path="/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/fithichip/" + sample_name
        if os.path.exists(fithichip_peak_path):
            fithichip_peak_path += "/MACS2_ExtSize/out_macs2_peaks.narrowPeak"
            
        # check whether loop calling for this sample is complete
        loop_calling_complete = "No"
        fithichip_loops_path="/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/loops/fithichip/" + sample_name
        if os.path.exists(fithichip_loops_path):
            loop_calling_complete = "Yes"
        
        
        info.extend([sample_name, hicpro_output, chipseq_peak_file, fithichip_peak_path, loop_calling_complete])
        data.append(info)
        
    df = pd.DataFrame(data)
    return df
    

#### Display the pandas dataframe

In [95]:
hicpro_path = "/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/hicpro/"
hicpro_path += "{sample_name}/hic_results/data/{sample_name}/{sample_name}.allValidPairs"

df = generate_df(hicpro_path, 10)
df.columns = ["Sample Name", "HiC-Pro Valid Pairs Path", "ChIP-seq Peaks Path", "FitHiChIP Peaks Path", "Loop Calling Complete"]

df = df.sort_values("ChIP-seq Peaks Path").reset_index(drop=True)
df.index += 2

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df)

Unnamed: 0,Sample Name,HiC-Pro Valid Pairs Path,ChIP-seq Peaks Path,FitHiChIP Peaks Path,Loop Calling Complete
2,BC1.GSE136090.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No
3,BC3.GSE136090.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No
4,BCBL1.GSE136090.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No
5,JSC.GSE136090.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No
6,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not processed or does not exist,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No
7,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not processed or does not exist,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Yes
8,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b3,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not processed or does not exist,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No
9,Th17.GSE101498.Homo_Sapiens.H3K27ac.b2,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not processed or does not exist,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No
10,lgs101645.GSE116193.Homo_Sapiens.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not processed or does not exist,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No
11,lgs102943.GSE116193.Homo_Sapiens.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_databas...,Not processed or does not exist,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,No


#### Check the values for any specific row

In [96]:
# Note: This cell cannot be run if the next cell has already been run. If so, you must rerun the previous cell first.

row=79

print("Sample Name:", df["Sample Name"][row], "\n")
print("HiC-Pro Valid Pairs Path:", df["HiC-Pro Valid Pairs Path"][row], "\n")
print("ChIP-seq Peaks Path:", df["ChIP-seq Peaks Path"][row], "\n")
print("FitHiChIP Peaks Path:", df["FitHiChIP Peaks Path"][row], "\n")
print("Loop Calling Complete:", df["Loop Calling Complete"][row], "\n")


Sample Name: mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2 

HiC-Pro Valid Pairs Path: /mnt/BioAdHoc/Groups/vd-ay/nrao/hichip_database/hichip-db-loop-calling/results/hicpro/mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2/hic_results/data/mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2/mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2.allValidPairs 

ChIP-seq Peaks Path: Not processed or does not exist 

FitHiChIP Peaks Path: /mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/fithichip/mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2 

Loop Calling Complete: No 



#### Output the dataframe as a tsv file

In [90]:
df.set_index("Sample Name", inplace=True)

# set path of file
tsv_path = "../../../results/samplesheets/loop_calling_tracker.tsv"
df.to_csv(tsv_path, sep="\t")