In [1]:
import pandas as pd
import csv
import string
import glob
import os

In [2]:
# how many reads
# whether there was a control or not and whether it is used in peak calling
# how many peaks at what setting and threshold

In [3]:
def generate_summary_df(path):
    samples = []
    with open(path,'r') as file:

        #reader = csv.reader(file, delimiter='\t')
        #next(reader, None)
        reader = csv.DictReader(file, delimiter='\t')
        for line in reader:
            sample_info = []
            
            # if there is no sample name, skip this line
            if not line['Sample Name']:
                continue

            ##### CONSTRUCT THE SAMPLE NAME ######

            name = line['Sample Name']
            gse = line['GSE ID of Corresponding HiChIP Data']

            organism = line['Organism']
            organism = string.capwords(organism).replace(" ", "_")

            target_of_antibody = line['ChIP-seq Pull Down']
            if (target_of_antibody=="N/A"):
                continue
            
            replicate = line['Replicate Serial No']

            sample_name = name + "." + gse + "." + organism + "." + target_of_antibody + ".b" + replicate

            ###########

            # determine whether a control is present
            control_info = "no control for this sample"
            if line['Control GSM(s)']!="N/A":
                control_info = "control present and used in peak calling"

            
            # determine peak sizes and settings
            narrowpeak1 = "peak calling not completed"
            narrowpeak2 = "peak calling not completed"

            idrpeak1 = "peak calling not completed"
            idrpeak2 = "peak calling not completed"

            # check for merged replicates number of peaks
            if line['Merged replicates (if there are replicates)']=="1":
                merged_sample_name = sample_name[:-1] + "1"
                idrpeakpath = "../../../results/peaks/merged_chipline/" + merged_sample_name
                idrpeakpath1 = idrpeakpath + "/FINAL_IDR_Peaks_FDR0.1.txt"
                idrpeakpath2 = idrpeakpath + "/FINAL_IDR_Peaks_FDR0.05.txt"

                if os.path.exists(idrpeakpath):
                    idrpeak1 = sum(1 for line in open(idrpeakpath1))
                    idrpeak2 = sum(1 for line in open(idrpeakpath2))
                else:
                    idrpeak1 = "N/A"
                    idrpeak2 = "N/A"
            
            # check for peaks per replicate
            if line['Ran peak calling']=="1":
                narrowpeakpath = "../../../results/peaks/chipline/" + sample_name + "/MACS2_Ext*/"
                narrowpeakpath1 = narrowpeakpath + sample_name + ".macs2_peaks.narrowPeak_Q0.05filt"
                narrowpeakpath2 = narrowpeakpath + sample_name + ".macs2_peaks.narrowPeak_Q0.01filt"

                if len(glob.glob(narrowpeakpath1))==1:
                    narrowpeak1 = sum(1 for line in open(glob.glob(narrowpeakpath1)[0]))
                else:
                    narrowpeak1 = "error in peak calling"

                
                if len(glob.glob(narrowpeakpath2))==1:
                    narrowpeak2 = sum(1 for line in open(glob.glob(narrowpeakpath2)[0]))
                else:
                    narrowpeak2 = "error in peak calling"


            sample_info = [sample_name, control_info, narrowpeak1, narrowpeak2, idrpeak1, idrpeak2]
            samples.append(sample_info)
    
    df = pd.DataFrame(samples)
    return df

In [5]:
human_path = "../../../results/samplesheets/fastq/chipseq_human_tracker.tsv"
mouse_path = "../../../results/samplesheets/fastq/chipseq_mouse_tracker.tsv"

table = []
mouse_df = generate_summary_df(mouse_path)
human_df = generate_summary_df(human_path)

df = pd.concat([mouse_df, human_df])

columns = ["Sample Name", "Control Used?"]
columns.extend(["Narrow peaks with Q-value threshold of 0.01", "Narrow peaks with Q-value threshold of 0.05"])
columns.extend(["IDR peaks with respect to FDR 0.1", "IDR peaks with respect to FDR 0.05"])
df.columns = columns

# reset the row count
df = df.reset_index(drop=True)
df.index += 2

display(df)

# samples = pd.concat([human_samples, mouse_samples])
# samples = samples.dropna(how='all', subset=['Sample Name'])
# samples = samples.reset_index(drop=True)

# display(samples)

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     display(samples)

Unnamed: 0,Sample Name,Control Used?,Narrow peaks with Q-value threshold of 0.01,Narrow peaks with Q-value threshold of 0.05,IDR peaks with respect to FDR 0.1,IDR peaks with respect to FDR 0.05
2,EryP.GSE112717.Mus_Musculus.Gata1.b1,control present and used in peak calling,152539,146931,,
3,EryD.GSE112717.Mus_Musculus.Gata1.b1,control present and used in peak calling,50066,48138,,
4,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1,no control for this sample,180440,180440,23118,18701
5,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b2,no control for this sample,207431,203757,23118,18701
6,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1,no control for this sample,241450,238302,24860,19835
...,...,...,...,...,...,...
211,DCM_10.GSE165303.Homo_Sapiens.H3K27ac.b1,no control for this sample,124318,124318,,
212,hCM-Ctrl.GSE165303.Homo_Sapiens.H3K27ac.b1,no control for this sample,98693,98529,,
213,hCM-Ctrl.GSE165303.Homo_Sapiens.H3K27ac.b2,no control for this sample,92482,92332,,
214,hCM-HAND1OE.GSE165303.Homo_Sapiens.H3K27ac.b1,no control for this sample,87092,87092,,
