In [63]:
import glob
import os
import sys
import pandas as pd
# Raise the pandas display limits so long/wide samplesheet frames are shown without truncation.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
# 'max_colwidth' is resolved by pandas to the 'display.max_colwidth' option.
pd.set_option('max_colwidth', 50)
# Every relative path used below ('results/...') is resolved against this project directory.
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling')

In [64]:
# Resolve input/output locations depending on how the code is being executed.
#
# `output_prefix` is a prefix: the export cell appends '.without_header.tsv' to it,
# so the value set here must not already end with that suffix (otherwise the file
# is written as '<...>.without_header.tsv.without_header.tsv').
if 'ipykernel_launcher.py' in sys.argv[0]:
    # running inside a Jupyter kernel: use the notebook defaults
    input_fn = None  # bound in both branches so later references never raise NameError
    output_prefix = 'results/samplesheets/post-hicpro/human.11.6.22.peaks_files.samplesheet'
else:
    # running as a script: <input_fn> <output_prefix> come from the command line
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]

In [65]:
# Column layout of the header-less HiC-Pro samplesheet.
columns = [
    'std_sample_name',
    'gse_id',
    'organism',
    'bio_rep',
    'antibody_target',
    'restriction_enzyme',
    'sample_name',
]

# The file carries no header row: read it positionally, then label the columns.
samples = pd.read_table(
    'results/samplesheets/hicpro/current.hicpro.samplesheet.without_header.tsv',
    header=None,
)
samples.columns = columns

In [66]:
# GSE IDs that have already been looked at: either their ChIP-seq samples were processed,
# or the study has no ChIP-seq samples at all. The combined list (processed_gse_ids) is
# used to decide which HiChIP samples have no matching ChIP-seq data.
#
# Note: the GSE IDs are those corresponding to the matching HiChIP data, not necessarily
# the IDs with ChIP-seq data themselves.
# Order and duplicates are kept exactly as originally entered.
matching_chipseq_human = [
    "GSE136090", "GSE116193", "GSE105028", "GSE128106", "GSE147646",
    "GSE179545", "GSE108869", "GSE117888", "GSE116876", "GSE165783", "GSE115494",
    "GSE111537", "GSE131052", "GSE131651", "GSE136889", "GSE137848", "GSE149420",
    "GSE151002", "GSE154513", "GSE156649", "GSE159972", "GSE157105", "GSE157222",
    "GSE107147", "GSE173842", "GSE179666", "GSE180198", "GSE97584", "GSE166234",
    "GSE165303", "phs001703v3p1", "phs001703v4p1", "GSE99518",
]

no_matching_chipseq_human = [
    "GSE178598", "GSE101498", "GSE147854", "GSE151193", "GSE179755", "GSE173699",
    "GSE92881", "GSE100856", "GSE111930", "GSE115524", "GSE115896", "GSE116872",
    "GSE116875", "GSE126792", "GSE137168", "GSE137906", "GSE137936", "GSE139466",
    "GSE148958", "GSE152900", "GSE155184", "GSE155328", "GSE156772",
    "GSE158642", "GSE168881", "GSE171591", "GSE173871", "GSE181971", "GSE183797",
    "GSE188380", "GSE196235", "GSE80820", "GSE150906", "GSE133227", "GSE120294",
    "GSE147672",
]

matching_chipseq_mouse = [
    "GSE112717", "GSE141113", "GSE110898", "GSE141847", "GSE142004", "GSE159629",
    "GSE178344", "GSE147919",
]

no_matching_chipseq_mouse = [
    "GSE101498", "GSE148691", "GSE112176", "GSE113339", "GSE115524",
    "GSE121671", "GSE126362", "GSE135296", "GSE148691", "GSE150536",
    "GSE150906", "GSE153884", "GSE157666", "GSE160656", "GSE162617",
    "GSE189442", "GSE192387", "GSE193079", "GSE194068", "GSE80820",
    "GSE99519", "GSE145793", "GSE166177",
]

# human (matching, non-matching) followed by mouse (matching, non-matching)
processed_gse_ids = [
    *matching_chipseq_human,
    *no_matching_chipseq_human,
    *matching_chipseq_mouse,
    *no_matching_chipseq_mouse,
]

In [67]:
# ChIP-seq GSE IDs (human).
# This rebinds matching_chipseq_human; processed_gse_ids was built from the earlier
# binding and is not recomputed here.
matching_chipseq_human = [
    "GSE147646", "GSE136090", "GSE116193",
    "GSE105028", "GSE179545", "GSE128106",
    "GSE108869", "GSE111537", "GSE115494",
    "GSE165783", "GSE116876", "GSE117888",
    "GSE131052", "GSE131651", "GSE136889",
    "GSE137848", "GSE149420", "GSE151002",
    "GSE154513", "GSE156649", "GSE157105",
    "GSE157222", "GSE159972", "GSE107147",
    "GSE173842", "GSE179666", "GSE180198",
    "GSE97584", "GSE99518", "GSE166234",
    "GSE165303", "phs001703v3p1", "phs001703v4p1",
]

In [68]:
# ChIP-seq GSE IDs (mouse).
# This rebinds matching_chipseq_mouse; processed_gse_ids was built from the earlier
# binding and is not recomputed here.
matching_chipseq_mouse = [
    "GSE112717",
    "GSE141104",
    "GSE110896",
    "GSE141847",
    "GSE142003",
    "GSE147919",
    "GSE159627",
    "GSE178346",
]

In [69]:
human_gse_map = {
    "GSE147646": "GSE147646", "GSE136090": "GSE136090", "GSE116193": "GSE116193",
    "GSE105028": "GSE105028", "GSE179544": "GSE179545", "GSE128106": "GSE128106",
    "GSE108869": "GSE108869", "GSE111537": "GSE111537", "GSE115492": "GSE115494",
    "GSE116495": "GSE165783", "GSE116869": "GSE116876", "GSE117888": "GSE117888",
    "GSE131054": "GSE131052", "GSE131651": "GSE131651", "GSE136629": "GSE136889",
    "GSE137849": "GSE137848", "GSE149420": "GSE149420", "GSE151001": "GSE151002",
    "GSE154513": "GSE154513", "GSE156650": "GSE156649", "GSE157107": "GSE157105",
    "GSE157381": "GSE157222", "GSE159985": "GSE159972", "GSE165207": "GSE107147",
    "GSE173843": "GSE173842", "GSE179666": "GSE179666", "GSE180194": "GSE180198",
    "GSE97585": "GSE97584", "GSE99519": "GSE99518", "GSE166232": "GSE166234",
    "GSE165303": "GSE165303", "phs001703v3p1": "phs001703v3p1", "phs001703v4p1": "phs001703v4p1"}

In [70]:
# Mouse lookup used by get_gse(): the GSE ID found in a HiChIP sample name (key)
# is replaced by the GSE ID used in the ChIP-seq sample name (value).
mouse_gse_map = {
    "GSE112717": "GSE112717",
    "GSE141113": "GSE141104",
    "GSE110898": "GSE110896",
    "GSE141847": "GSE141847",
    "GSE142004": "GSE142003",
    "GSE147919": "GSE147919",
    "GSE159629": "GSE159627",
    "GSE178344": "GSE178346",
}

#### Function to generate pandas dataframe

In [71]:
def read_chipseq_files(file):
    """Return the number of lines in `file`.

    Callers use the result as the number of peaks in a peak file, i.e. they
    rely on the file holding one record per line.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to count.

    Returns
    -------
    int
        Number of lines; 0 for an empty file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(file) as fr:
        # iterate lazily so large peak files are never held in memory
        return sum(1 for _ in fr)

In [72]:
def get_gse(ID, org):
    """Translate a GSE ID through the organism-specific lookup table.

    `org` selects the table: "Homo" uses human_gse_map, "Mus" uses mouse_gse_map.
    The ID is returned unchanged when it has no entry in the selected table or
    when `org` is neither of those two values.
    """
    if org == "Homo":
        return human_gse_map.get(ID, ID)
    if org == "Mus":
        return mouse_gse_map.get(ID, ID)
    return ID

In [73]:
# Text files listing the ChIPLine / merged-ChIPLine sample names, one per line.
chipline_sample_list = "results/peaks/chipline_samples.new.txt"
merged_chipline_sample_list = "results/peaks/merged_chipline_samples.new.txt"

# Load both lists, stripping surrounding whitespace (including the newline) from each entry.
with open(chipline_sample_list) as csl:
    chipline_all_samples = [entry.strip() for entry in csl]

with open(merged_chipline_sample_list) as mcsl:
    merged_chipline_all_samples = [entry.strip() for entry in mcsl]

In [74]:
# Create a dataframe with the following (positionally labelled) columns:
# 0) Sample name (HiC-Pro output directory name)
# 1) HiC-Pro .allValidPairs file
# 2) HiChIP-Peaks peak file (if it has been processed yet)
# 3) FitHiChIP peak file (if it has been processed yet)
# 4) Matching ChIP-seq peak file (if it exists)

def generate_df(org="Homo"):
    """Collect per-sample peak-file locations for every HiC-Pro result of one organism.

    Reads the module-level names `merged_chipline_all_samples`, `chipline_all_samples`,
    `processed_gse_ids`, `get_gse` and `read_chipseq_files`, and resolves the relative
    'results/...' paths against the current working directory.

    Parameters
    ----------
    org : str, default "Homo"
        Substring of the HiC-Pro result directory names to select (used in the glob
        pattern) and the organism key passed to `get_gse` ("Homo" or "Mus").

    Returns
    -------
    tuple
        (DataFrame with one row per .allValidPairs file and the five columns listed
        above, set of ChIPLine samples whose peak files are missing, set of
        merged-ChIPLine samples whose IDR peak file cannot be opened, set of samples
        without FitHiChIP peaks, set of samples without HiChIP-Peaks peaks).
    """
    base_dir = '/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/'
    peaks_dir = base_dir + 'results/peaks'
    not_processed = "Not_processed_or_not_available"
    not_available = "Not_available"
    q01_suffix = "/MACS2_Ext*/*.macs2_peaks.narrowPeak_Q0.01filt"

    def _can_open(path):
        # Probe readability without leaking the file handle.
        try:
            with open(path):
                return True
        except OSError:
            return False

    def _has_content(path):
        # True only for an existing, non-empty regular file.
        return os.path.isfile(path) and os.path.getsize(path) > 0

    data = []
    chipline_missing = []
    merged_chipline_missing = []
    fithichip_missing = []
    hichip_peaks_missing = []

    hicpro_paths = glob.glob('results/hicpro/*' + org + '*/hic_results/data/*/*.allValidPairs')

    # iterate through each HiC-Pro output directory
    for fn in hicpro_paths:
        sample_name_hichip = fn.split('/')[2]

        # ChIP-seq sample name: same name with the GSE ID translated and the
        # replicate suffix forced to "b1" (last two characters replaced)
        name_parts = sample_name_hichip.strip().split(".")
        name_parts[1] = get_gse(name_parts[1], org)
        sample_name_chipseq = ".".join(name_parts)
        sample_name_chipseq = sample_name_chipseq[:-2] + "b1"
        # prefix shared by all replicates ("...b"); matched with a trailing wildcard below
        sample_name_chipseq_general = sample_name_chipseq[:-2] + "b"

        # hicpro validpairs file
        hicpro_path = base_dir + fn

        chipseq_peak_file = not_processed

        # for in-house HiChIP samples with pre-generated peaks:
        # update hicpro sample names to match chipline sample names for pieQTL/NCM samples
        for release in ("phs001703v3", "phs001703v4"):
            if release in sample_name_hichip:
                cell_type = sample_name_hichip.split('.')[0].split('_1')[0]
                sample_name_chipseq = cell_type + "_merged_donors_hg38." + release + "p1.Homo_Sapiens.H3K27ac.b1"

        merged_chipline_path = peaks_dir + "/merged_chipline/" + sample_name_chipseq

        # samples listed as merged ChIPLine runs: pick the replicate with the most peaks
        if sample_name_chipseq in merged_chipline_all_samples:
            if not _can_open(merged_chipline_path + "/FINAL_IDR_Peaks_FDR0.05.txt"):
                merged_chipline_missing.append(sample_name_chipseq)

            replicates = glob.glob(peaks_dir + "/chipline/" + sample_name_chipseq_general + "*" + q01_suffix)
            if len(replicates) == 0:
                # no replicate peak file on disk: report every listed replicate of this sample
                chipline_missing.extend(
                    listed for listed in chipline_all_samples if sample_name_chipseq_general in listed
                )

            print("find max rep for:", sample_name_hichip)
            max_rep_peaks = -1
            max_rep = ""
            for rep in replicates:
                try:
                    num_peaks = int(read_chipseq_files(rep))
                except OSError:
                    # .../chipline/<replicate dir>/MACS2_Ext*/<file>: record the replicate dir
                    chipline_missing.append(rep.split("/")[-3])
                    continue
                print("rep:", rep.split("/")[-1], num_peaks)
                if num_peaks > max_rep_peaks:
                    max_rep_peaks = num_peaks
                    max_rep = rep
            print("chosen:", max_rep.split("/")[-1])
            print()
            # Only adopt a replicate that was actually found; otherwise keep the
            # sentinel so the checks below never receive an empty path.
            if max_rep:
                chipseq_peak_file = max_rep

        # check whether a folder matching sample_name exists in the chipline folder
        # IMPORTANT: the sample name ends in .b1 here to match with the ChIP-seq loops
        chipline_pattern = peaks_dir + "/chipline/" + sample_name_chipseq + q01_suffix
        if chipseq_peak_file == not_processed:
            chipline_matches = glob.glob(chipline_pattern)
            if chipline_matches:
                chipseq_peak_file = chipline_matches[0]
            elif sample_name_chipseq in chipline_all_samples:
                chipline_missing.append(sample_name_chipseq)

        # check if the matching ChIP-seq sample does not exist
        if any(gse_id in sample_name_chipseq for gse_id in processed_gse_ids) and not os.path.exists(chipseq_peak_file):
            chipseq_peak_file = not_available
            if sample_name_chipseq in chipline_all_samples and sample_name_chipseq not in chipline_missing:
                print(sample_name_chipseq)

        # fix path if assigned chip-seq path does not contain peak calls
        # (a missing file is treated like an empty one)
        ## NOTE: REMOVE FIRST CONDITION FOR ADDITION OF CHIPSEQ PEAKS TO FILE
        if chipseq_peak_file not in (not_processed, not_available) and not _has_content(chipseq_peak_file):
            print("sample:", sample_name_hichip)
            print("old path:", chipseq_peak_file)

            new_path = ''
            if "merged_chipline" in chipseq_peak_file:
                rep1_matches = glob.glob(chipline_pattern)
                if rep1_matches:
                    new_path = rep1_matches[0]
                    if os.path.getsize(new_path) == 0:
                        print("rep 1 was empty")
                        new_path = new_path.replace('b1', 'b2')

            if new_path == '':
                # fall back to the less stringent Q0.05-filtered peak calls
                q05_path = (peaks_dir + '/chipline/{sn}/MACS2_Ext_with_Control/'
                            '{sn}.macs2_peaks.narrowPeak_Q0.05filt').format(sn = sample_name_chipseq)
                if _has_content(q05_path):
                    print("reg chipline 0.05 filt is NOT empty")
                    new_path = q05_path
                else:
                    print("reg chipline 0.05 filt is also empty")
                    new_path = not_available

            # never record a path that does not exist on disk
            if not os.path.exists(new_path):
                new_path = not_available

            print("new path:", new_path)
            if os.path.exists(new_path):
                # the size is only defined for a real file, not for the "Not_available" marker
                print("new path size:", os.path.getsize(new_path))
            print(os.path.exists(new_path))
            print()

            chipseq_peak_file = new_path

        # check whether a folder matching the sample name exists in the fithichip folder
        fithichip_peak_file = "Not_processed"
        fithichip_peak_path = 'results/peaks/fithichip/{sn}/MACS2_ExtSize/out_macs2_peaks.narrowPeak'.format(sn = sample_name_hichip)
        if not _can_open(fithichip_peak_path):
            fithichip_missing.append(sample_name_hichip)
        if os.path.exists(fithichip_peak_path):
            fithichip_peak_file = base_dir + fithichip_peak_path

        # check whether a folder matching the sample name exists in the hichip-peaks folder
        hichippeaks_peak_file = "Not_processed"
        hichippeaks_path = 'results/peaks/hichip-peaks/{sn}/out_peaks.bed'.format(sn = sample_name_hichip)
        if not _can_open(hichippeaks_path):
            hichip_peaks_missing.append(sample_name_hichip)
        if os.path.exists(hichippeaks_path):
            hichippeaks_peak_file = base_dir + hichippeaks_path

        # add all the information for this sample to the 2D array
        data.append([sample_name_hichip, hicpro_path, hichippeaks_peak_file, fithichip_peak_file, chipseq_peak_file])

    return pd.DataFrame(data), set(chipline_missing), set(merged_chipline_missing), set(fithichip_missing), set(hichip_peaks_missing)
    

#### Display the pandas dataframe

In [75]:
# Build the samplesheet frame and the four sets of samples with missing peak files.
# generate_df() prints its replicate-selection log while it runs.
df, chipline_missing, merged_chipline_missing, fithichip_missing, hichip_peaks_missing = generate_df()

find max rep for: HeLa-S3-shcontrol.GSE137849.Homo_Sapiens.CTCF.b1
rep: HeLa-S3-shcontrol.GSE137848.Homo_Sapiens.CTCF.b1.macs2_peaks.narrowPeak_Q0.01filt 111194
rep: HeLa-S3-shcontrol.GSE137848.Homo_Sapiens.CTCF.b2.macs2_peaks.narrowPeak_Q0.01filt 118265
chosen: HeLa-S3-shcontrol.GSE137848.Homo_Sapiens.CTCF.b2.macs2_peaks.narrowPeak_Q0.01filt

find max rep for: THP-1-Vector.GSE149420.Homo_Sapiens.H3K27ac.b1
rep: THP-1-Vector.GSE149420.Homo_Sapiens.H3K27ac.b7.macs2_peaks.narrowPeak_Q0.01filt 55415
rep: THP-1-Vector.GSE149420.Homo_Sapiens.H3K27ac.b4.macs2_peaks.narrowPeak_Q0.01filt 45087
rep: THP-1-Vector.GSE149420.Homo_Sapiens.H3K27ac.b2.macs2_peaks.narrowPeak_Q0.01filt 45259
rep: THP-1-Vector.GSE149420.Homo_Sapiens.H3K27ac.b8.macs2_peaks.narrowPeak_Q0.01filt 56859
rep: THP-1-Vector.GSE149420.Homo_Sapiens.H3K27ac.b3.macs2_peaks.narrowPeak_Q0.01filt 44747
rep: THP-1-Vector.GSE149420.Homo_Sapiens.H3K27ac.b5.macs2_peaks.narrowPeak_Q0.01filt 57917
rep: THP-1-Vector.GSE149420.Homo_Sapiens.H3

rep: T47D-T0.GSE179666.Homo_Sapiens.PGR.b2.macs2_peaks.narrowPeak_Q0.01filt 229541
chosen: T47D-T0.GSE179666.Homo_Sapiens.PGR.b2.macs2_peaks.narrowPeak_Q0.01filt

find max rep for: 293T.GSE128106.Homo_Sapiens.YY1.b1
rep: 293T.GSE128106.Homo_Sapiens.YY1.b2.macs2_peaks.narrowPeak_Q0.01filt 119725
rep: 293T.GSE128106.Homo_Sapiens.YY1.b1.macs2_peaks.narrowPeak_Q0.01filt 81035
chosen: 293T.GSE128106.Homo_Sapiens.YY1.b2.macs2_peaks.narrowPeak_Q0.01filt

find max rep for: THP-1-Vector-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b1
rep: THP-1-Vector-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b3.macs2_peaks.narrowPeak_Q0.01filt 42388
rep: THP-1-Vector-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b1.macs2_peaks.narrowPeak_Q0.01filt 42210
rep: THP-1-Vector-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b2.macs2_peaks.narrowPeak_Q0.01filt 42370
rep: THP-1-Vector-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b4.macs2_peaks.narrowPeak_Q0.01filt 42353
chosen: THP-1-Vector-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b3.macs2_peaks.narrowPeak

rep: HK2.GSE147646.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt 71603
rep: HK2.GSE147646.Homo_Sapiens.H3K27ac.b2.macs2_peaks.narrowPeak_Q0.01filt 91135
chosen: HK2.GSE147646.Homo_Sapiens.H3K27ac.b2.macs2_peaks.narrowPeak_Q0.01filt

find max rep for: THP-1-WT-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b1
rep: THP-1-WT-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b3.macs2_peaks.narrowPeak_Q0.01filt 53359
rep: THP-1-WT-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b4.macs2_peaks.narrowPeak_Q0.01filt 53062
rep: THP-1-WT-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b1.macs2_peaks.narrowPeak_Q0.01filt 52337
rep: THP-1-WT-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b2.macs2_peaks.narrowPeak_Q0.01filt 52651
chosen: THP-1-WT-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b3.macs2_peaks.narrowPeak_Q0.01filt

find max rep for: MDA-MB-231-sgRNA-Ctrl.GSE97585.Homo_Sapiens.H3K27ac.b1
rep: MDA-MB-231-sgRNA-Ctrl.GSE97584.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt 0
rep: MDA-MB-231-sgRNA-Ctrl.GSE97584.Homo_Sapiens.H3K

In [76]:
# Print how many samples ended up in each "missing" set, in the order:
# chipline, merged chipline, FitHiChIP, HiChIP-Peaks.
missing_sets = (chipline_missing, merged_chipline_missing, fithichip_missing, hichip_peaks_missing)
for missing in missing_sets:
    print(len(missing))

0
0
8
338


In [48]:
chipline_missing

set()

In [49]:
merged_chipline_missing

{'F1_CD4_CD8.GSE141847.Mus_Musculus.SMC1A.b1'}

In [185]:
# Write the merged-ChIPLine samples whose IDR peak file could not be opened,
# one sample name per line, in sorted order.
# The frame is built from a sorted list with an explicit column label so that an
# empty set still produces a (zero-row) frame with column 0 and an empty output
# file, instead of a KeyError from sorting by a column that does not exist.
missing_df = pd.DataFrame(sorted(merged_chipline_missing), columns=[0])
missing_df.to_csv("broken.merged_chipline.mm10.txt", header=False, index=False, sep='\t')

In [50]:
# Load a previously written (header-less, tab-separated) mouse samplesheet; it is
# compared against the freshly generated `df` in the cells below.
test_df = pd.read_csv("results/samplesheets/post-hicpro/mouse_updated_0314.peaks_files.samplesheet.without_header.tsv", header = None, sep = "\t")

In [51]:
test_df

Unnamed: 0,0,1,2,3,4,5
0,3134_WT.GSE162617.Mus_Musculus.GR.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,Not_available,0
1,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,Not_available,0
2,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,Not_available,0
3,3T3.GSE192387.Mus_Musculus.BATF.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,Not_available,0
4,3T3.GSE192387.Mus_Musculus.CTCF.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,Not_available,0
5,3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,Not_available,0
6,3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BAT...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,Not_available,0
7,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,0
8,3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,0
9,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip...,203757


In [52]:
# Number of rows in the previous samplesheet whose column 4 (ChIP-seq path) contains "/mnt".
len(test_df.loc[test_df[4].str.contains("/mnt")])

31

In [53]:
# Sort by the sample-name column, which is still labelled with the integer 0 here.
# NOTE: this cell must run before the cell that assigns string column names to `df`;
# after that assignment the label 0 no longer exists.
df = df.sort_values(by=0).reset_index(drop=True)

In [54]:
# Number of rows in the new samplesheet whose column 4 (ChIP-seq path) contains "/mnt".
# Uses the integer column label, so it has to run before the columns are renamed.
len(df.loc[df[4].str.contains("/mnt")])

31

In [55]:
df

Unnamed: 0,0,1,2,3,4
0,3134_WT.GSE162617.Mus_Musculus.GR.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
1,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
2,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
3,3T3.GSE192387.Mus_Musculus.BATF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
4,3T3.GSE192387.Mus_Musculus.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
5,3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
6,3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BAT...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
7,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...
8,3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...
9,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...


In [56]:
# Give the five positional columns descriptive names (same order as the rows built
# in generate_df), re-sort by sample name, and display the result.
df.columns = ["std_sample_name", "validpairs_path", "hichip_peaks_path", "fithichip_path", "chipseq_path"]
df = df.sort_values(by=["std_sample_name"], ascending=True).reset_index(drop=True)
df

Unnamed: 0,std_sample_name,validpairs_path,hichip_peaks_path,fithichip_path,chipseq_path
0,3134_WT.GSE162617.Mus_Musculus.GR.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
1,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
2,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
3,3T3.GSE192387.Mus_Musculus.BATF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
4,3T3.GSE192387.Mus_Musculus.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
5,3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
6,3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BAT...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,Not_available
7,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...
8,3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...
9,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-l...


In [57]:
# Number of samples whose FitHiChIP column holds a path containing "/mnt"
# (rows without FitHiChIP peaks carry the "Not_processed" marker instead).
len(df.loc[df['fithichip_path'].str.contains("/mnt")])

194

In [58]:
# Keep only the rows whose ChIP-seq column holds a path containing "/mnt", drop the
# other path columns, and record the file name (last path component) of each
# ChIP-seq peak file.
df_filt = df.loc[df["chipseq_path"].str.contains("/mnt")].drop(columns = ["validpairs_path","hichip_peaks_path","fithichip_path"])
# Vectorised string ops instead of a row-wise apply: same result for every row, and
# it does not depend on how DataFrame.apply behaves when no row matched.
df_filt['fn'] = df_filt["chipseq_path"].str.strip().str.split("/").str[-1]
fn = list(df_filt['fn'])

In [59]:
# Same reduction for the previous samplesheet: keep rows whose column 4 (ChIP-seq
# path) contains "/mnt", drop the other path columns (1-3), and record the file name
# (last path component) of each ChIP-seq peak file.
test_df_filt = test_df.loc[test_df[4].str.contains("/mnt")].drop(columns = [1,2,3])
# Vectorised string ops instead of a row-wise apply: same result for every row, and
# it does not depend on how DataFrame.apply behaves when no row matched.
test_df_filt['fn'] = test_df_filt[4].str.strip().str.split("/").str[-1]
test_fn = list(test_df_filt['fn'])

In [60]:
# Report every ChIP-seq file name from the previous samplesheet (test_fn) that does
# not occur in the new one (fn), together with the new file name at the same list
# position and which dot-separated field differs between the two.
for position, old_name in enumerate(test_fn):
    if old_name in fn:
        continue

    print("test_fn:", old_name)
    if position >= len(fn):
        # the new list is shorter: there is no positional counterpart to compare with
        print("fn:", None)
        print()
        continue

    new_name = fn[position]
    print("fn:", new_name)

    old_fields = old_name.split(".")
    new_fields = new_name.split(".")
    # One-element slices are compared instead of single items so that a name with
    # fewer dot-separated fields than expected cannot raise IndexError.
    if old_fields[4:5] != new_fields[4:5]:
        print("*******Bio rep mismatch")
    if old_fields[1:2] != new_fields[1:2]:
        print("*******GSE change")
    if old_fields[0] != new_fields[0]:
        print("*******Name change")
    if old_fields[-1] != new_fields[-1]:
        print("*******File change")
    print()

test_fn: 3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1.macs2_peaks.narrowPeak_Q0.01filt
fn: 3T3_Norm.GSE178346.Mus_Musculus.SMC1A.b1.macs2_peaks.narrowPeak_Q0.01filt
*******GSE change

test_fn: 3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1.macs2_peaks.narrowPeak_Q0.01filt
fn: 3T3_TCF1.GSE178346.Mus_Musculus.SMC1A.b1.macs2_peaks.narrowPeak_Q0.01filt
*******GSE change

test_fn: AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b2.macs2_peaks.narrowPeak_Q0.01filt
fn: AML12_shCtrl.GSE141104.Mus_Musculus.H3K9me3.b2.macs2_peaks.narrowPeak_Q0.01filt
*******GSE change

test_fn: AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b2.macs2_peaks.narrowPeak_Q0.01filt
fn: AML12_shCtrl.GSE141104.Mus_Musculus.H3K9me3.b2.macs2_peaks.narrowPeak_Q0.01filt
*******GSE change

test_fn: AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1.macs2_peaks.narrowPeak_Q0.01filt
fn: AML12_shSafb.GSE141104.Mus_Musculus.H3K9me3.b1.macs2_peaks.narrowPeak_Q0.01filt
*******GSE change

test_fn: AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1.macs2_pe

In [99]:
## want 237; 32
# Samples with a ChIP-seq path (value contains '/mnt/'), in row order.
chipseq_sample_names = [
    name
    for name, path in zip(df['std_sample_name'], df['chipseq_path'])
    if '/mnt/' in path
]
num_samples_chipseq = len(chipseq_sample_names)

# Study ID = second dot-separated field of the sample name; keep first-seen order, no repeats.
samples_chipseq = list(dict.fromkeys(name.split('.')[1] for name in chipseq_sample_names))

print("num samples with chipseq:", num_samples_chipseq)
print("num studies with chipseq:", len(samples_chipseq))
print((' \n').join(samples_chipseq))

num samples with chipseq: 243
num studies with chipseq: 33
GSE128106 
GSE156650 
GSE154513 
GSE136090 
GSE165207 
phs001703v3p1 
GSE159985 
GSE117888 
GSE165303 
GSE180194 
GSE131054 
GSE151001 
GSE105028 
GSE179544 
GSE136629 
GSE147646 
GSE137849 
GSE108869 
GSE99519 
GSE166232 
GSE116869 
GSE173843 
GSE157381 
GSE97585 
GSE131651 
GSE115492 
phs001703v4p1 
GSE111537 
GSE116495 
GSE179666 
GSE149420 
GSE157107 
GSE116193


In [40]:
# Print the current binding of matching_chipseq_human, one ID per line
# (the separator leaves a trailing space before each newline).
print((' \n').join(matching_chipseq_human))

GSE136090 
GSE116193 
GSE105028 
GSE128106 
GSE147646 
GSE179544 
GSE108869 
GSE117888 
GSE116869 
GSE116495 
GSE115492 
GSE111537 
GSE131054 
GSE131651 
GSE136629 
GSE137849 
GSE149420 
GSE151001 
GSE154513 
GSE156650 
GSE159985 
GSE157107 
GSE157381 
GSE165207 
GSE173843 
GSE179666 
GSE180194 
GSE97585 
GSE166232 
GSE165303 
phs001703v3p1 
phs001703v4p1


#### Output the dataframe as a tsv file

In [61]:
# Write the samplesheet as a tab-separated file without header row or index column.
without_header_output = output_prefix + '.without_header.tsv'
df.to_csv(without_header_output, sep='\t', index=False, header=False)

In [26]:
def generate_df_merged_validpairs():
    """Collect peak-file locations for the donor-wise merged pieQTL/NCM valid-pairs files.

    Reads the module-level `processed_gse_ids` and resolves the relative 'results/...'
    paths against the current working directory.

    Returns
    -------
    pandas.DataFrame
        One row per .allValidPairs file with the positional columns: sample name,
        valid-pairs path, HiChIP-Peaks peak file, FitHiChIP peak file, ChIP-seq peak
        file. Unavailable files are represented by the markers "Not_processed",
        "Not_processed_or_not_available" or "Not_available".
    """
    base_dir = '/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/'
    peaks_dir = base_dir + 'results/peaks'
    not_processed = "Not_processed_or_not_available"
    not_available = "Not_available"

    def _has_content(path):
        # True only for an existing, non-empty regular file.
        return os.path.isfile(path) and os.path.getsize(path) > 0

    data = []
    hicpro_paths = glob.glob('results/pieqtl_ncm_rep_combined_donorwise/validpairs/*/*.allValidPairs')

    # iterate through each merged valid-pairs directory
    for fn in hicpro_paths:

        sample_name = fn.split('/')[3]
        # drop the last 12 characters of the directory name and end the name with "1"
        sample_name_chipseq = sample_name[:-12] + "1"
        print(sample_name_chipseq)

        # hicpro validpairs file
        hicpro_path = base_dir + fn

        chipseq_peak_file = not_processed

        # for in-house HiChIP samples with pre-generated peaks:
        # update hicpro sample names to match chipline sample names for pieQTL/NCM samples
        for release in ("phs001703v3", "phs001703v4"):
            if release in sample_name:
                cell_type = sample_name.split('.')[0].split('_1')[0]
                sample_name_chipseq = cell_type + "_merged_donors_hg38." + release + "p1.Homo_Sapiens.H3K27ac.b1"

        # check whether a folder matching sample_name exists in the merged_chipline folder
        merged_chipline_path = peaks_dir + "/merged_chipline/" + sample_name_chipseq
        if os.path.exists(merged_chipline_path):
            chipseq_peak_file = merged_chipline_path + "/FINAL_IDR_Peaks_FDR0.05.txt"

        # check whether a folder matching sample_name exists in the chipline folder
        chipline_pattern = (peaks_dir + "/chipline/" + sample_name_chipseq + "/MACS2_Ext*/"
                            + sample_name_chipseq + ".macs2_peaks.narrowPeak_Q0.01filt")
        if chipseq_peak_file == not_processed:
            chipline_matches = glob.glob(chipline_pattern)
            if chipline_matches:
                chipseq_peak_file = chipline_matches[0]

        # check if the matching ChIP-seq sample does not exist
        if any(gse_id in sample_name_chipseq for gse_id in processed_gse_ids) and not os.path.exists(chipseq_peak_file):
            chipseq_peak_file = not_available

        # fix path if assigned chip-seq path does not contain peak calls
        # (a missing file is treated like an empty one)
        ## NOTE: REMOVE FIRST CONDITION FOR ADDITION OF CHIPSEQ PEAKS TO FILE
        if chipseq_peak_file not in (not_processed, not_available) and not _has_content(chipseq_peak_file):
            print("sample:", sample_name)
            print("old path:", chipseq_peak_file)

            new_path = ''
            if "merged_chipline" in chipseq_peak_file:
                # merged peaks unusable: try the individual replicate 1, then replicate 2
                rep1_matches = glob.glob(chipline_pattern)
                if rep1_matches:
                    new_path = rep1_matches[0]
                    if os.path.getsize(new_path) == 0:
                        print("rep 1 was empty")
                        new_path = new_path.replace('b1', 'b2')

            if new_path == '':
                # fall back to the less stringent Q0.05-filtered peak calls
                q05_path = (peaks_dir + '/chipline/{sn}/MACS2_Ext_with_Control/'
                            '{sn}.macs2_peaks.narrowPeak_Q0.05filt').format(sn = sample_name_chipseq)
                if _has_content(q05_path):
                    print("reg chipline 0.05 filt is NOT empty")
                    new_path = q05_path
                else:
                    print("reg chipline 0.05 filt is also empty")
                    new_path = not_available

            # never record a path that does not exist on disk
            if not os.path.exists(new_path):
                new_path = not_available

            print("new path:", new_path)
            if os.path.exists(new_path):
                # the size is only defined for a real file, not for the "Not_available" marker
                print("new path size:", os.path.getsize(new_path))
            print(os.path.exists(new_path))
            print()

            chipseq_peak_file = new_path

        # check whether a folder matching the sample name exists in the fithichip folder
        fithichip_peak_file = "Not_processed"
        fithichip_peak_path = 'results/peaks/fithichip/{sn}/MACS2_ExtSize/out_macs2_peaks.narrowPeak'.format(sn = sample_name)
        if os.path.exists(fithichip_peak_path):
            fithichip_peak_file = base_dir + fithichip_peak_path

        # check whether a folder matching the sample name exists in the hichip-peaks folder
        hichippeaks_peak_file = "Not_processed"
        hichippeaks_path = 'results/peaks/hichip-peaks/{sn}/out_peaks.bed'.format(sn = sample_name)
        if os.path.exists(hichippeaks_path):
            hichippeaks_peak_file = base_dir + hichippeaks_path

        # add all the information for this sample to the 2D array
        data.append([sample_name, hicpro_path, hichippeaks_peak_file, fithichip_peak_file, chipseq_peak_file])

    return pd.DataFrame(data)