In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

sns.set_theme(style="whitegrid")
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling')

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('max_colwidth', 400)

outdir = 'results/peaks/'
ref = 'mm10'
rerun = True

In [2]:
assigned_date = '03_08_24' # MM.DD.YY

# Process ChIP-seq Data

### Helper functions

In [3]:
def read_chipseq_files(file):
    peak_data = {}
    count = 0
    with open(file) as fr:
        for line in fr:
            info = line.strip().split()
            peak_info = info[0:3]
            peak_data[count] = peak_info
            count = count + 1
    return(peak_data)

In [4]:
def get_chipseq_path(samplesheet, sample_name):
    path=''
    with open(samplesheet) as fr:
        for line in fr:
            if sample_name in line:
                info = line.strip().split()
                path = info[2]
    return(path) 

In [5]:
def generate_chipseq_df(outfn='test.tsv', rerun=False):
    
    if rerun == True:
        data = []
        glob_str = 'results/hicpro/*Mus_Musculus*/hic_results'

        for file in glob.glob(glob_str):
            
            # get sample information
            sample_name = file.split('/')[2]
            sample_info = []
            sample_info.append(sample_name)

            # parse MACS2 data
            fn = 'results/samplesheets/post-hicpro/2024.2.15.10.52.peaks_files_chipseq.all_batches.samplesheet.without_header.tsv'
            chip_path = get_chipseq_path(fn, sample_name)
            if os.path.exists(chip_path):
                peak_info = read_chipseq_files(chip_path)
                sample_info.append(len(peak_info))
                peak_sizes = []
                for peak in peak_info.items():
                    peak_sizes.append(int(int(peak[1][2])-int(peak[1][1])))
                sample_info.append(np.mean(peak_sizes))
            else:
                sample_info.append(-1)
                sample_info.append(-1)
            data.append(sample_info)

        # create a dataframe and save to file
        df = pd.DataFrame(data)
        df.to_csv(outfn, sep='\t', index=None)
        
    else:
        # load old data when re-run is not required
        df = pd.read_table(outfn)
        
    return df

In [6]:
# extract the chipseq_std_sample_name
def get_chipseq_std(x):
    if len(x) > 2:
        
        # sometimes a matching chipseq file could not be found, return -1
        if x[-1] == 'Could not find a matching file':
            return(-1)
        
        # else return the chipseq std sample name within the path
        else:
            return(x[-3])
    else:
        return(-1)

### Loading data

In [7]:
# load the hicpro samples
# samples = pd.read_table('results/samplesheets/hicpro/current.hicpro.samplesheet.without_header.tsv', header=None)
# columns = ['std_sample_name', 'gse_id', 'organism', 'bio_rep', 'antibody_target', 'restriction_enzyme', 'sample_name']
# samples.columns = columns

In [8]:
# load mapping between hicpro and chipseq std sample names
fn = 'results/samplesheets/post-hicpro/2024.2.1.10.52.peaks_files_chipseq.batch1.samplesheet.without_header.tsv'
batch1_df = pd.read_table(fn, names=['hicpro_std_sample_name', 'hicpro_path', 'chipseq_path'])

fn = 'results/samplesheets/post-hicpro/2024.2.1.10.52.peaks_files_chipseq.batch2.samplesheet.without_header.tsv'
batch2_df = pd.read_table(fn, names=['hicpro_std_sample_name', 'hicpro_path', 'chipseq_path'])

# concating the batches
hicpro_to_chipseq = pd.concat([batch1_df, batch2_df])

# extracting the chipseq std sample name
hicpro_to_chipseq['chipseq_std_sample_name'] = hicpro_to_chipseq.chipseq_path.str.split('/').apply(get_chipseq_std)

In [9]:
hicpro_to_chipseq.loc[hicpro_to_chipseq['chipseq_std_sample_name'] == -1].shape

(4, 4)

In [10]:
hicpro_to_chipseq.head()

Unnamed: 0,hicpro_std_sample_name,hicpro_path,chipseq_path,chipseq_std_sample_name
0,H9.GSE105028.Homo_Sapiens.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/H9.GSE105028.Homo_Sapiens.CTCF.b1/hic_results/data/H9.GSE105028.Homo_Sapiens.CTCF.b1/H9.GSE105028.Homo_Sapiens.CTCF.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/H9.GSE105028.Homo_Sapiens.CTCF.b1/MACS2_Ext_No_Control/H9.GSE105028.Homo_Sapiens.CTCF.b1.macs2_peaks.narrowPeak_Q0.01filt,H9.GSE105028.Homo_Sapiens.CTCF.b1
1,H9-HS.GSE105028.Homo_Sapiens.CTCF.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1/hic_results/data/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1/MACS2_Ext_No_Control/H9-HS.GSE105028.Homo_Sapiens.CTCF.b1.macs2_peaks.narrowPeak_Q0.01filt,H9-HS.GSE105028.Homo_Sapiens.CTCF.b1
2,H9.GSE105028.Homo_Sapiens.H3K4me1.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/H9.GSE105028.Homo_Sapiens.H3K4me1.b1/hic_results/data/H9.GSE105028.Homo_Sapiens.H3K4me1.b1/H9.GSE105028.Homo_Sapiens.H3K4me1.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/H9.GSE105028.Homo_Sapiens.H3K4me1.b1/MACS2_Ext_No_Control/H9.GSE105028.Homo_Sapiens.H3K4me1.b1.macs2_peaks.narrowPeak_Q0.01filt,H9.GSE105028.Homo_Sapiens.H3K4me1.b1
3,H9-HS.GSE105028.Homo_Sapiens.H3K4me1.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/H9-HS.GSE105028.Homo_Sapiens.H3K4me1.b1/hic_results/data/H9-HS.GSE105028.Homo_Sapiens.H3K4me1.b1/H9-HS.GSE105028.Homo_Sapiens.H3K4me1.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/H9-HS.GSE105028.Homo_Sapiens.H3K4me1.b1/MACS2_Ext_No_Control/H9-HS.GSE105028.Homo_Sapiens.H3K4me1.b1.macs2_peaks.narrowPeak_Q0.01filt,H9-HS.GSE105028.Homo_Sapiens.H3K4me1.b1
4,H9.GSE105028.Homo_Sapiens.KLF4.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/H9.GSE105028.Homo_Sapiens.KLF4.b1/hic_results/data/H9.GSE105028.Homo_Sapiens.KLF4.b1/H9.GSE105028.Homo_Sapiens.KLF4.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/H9.GSE105028.Homo_Sapiens.KLF4.b1/MACS2_Ext_No_Control/H9.GSE105028.Homo_Sapiens.KLF4.b1.macs2_peaks.narrowPeak_Q0.01filt,H9.GSE105028.Homo_Sapiens.KLF4.b1


#### Reviewing the data

In [11]:
print('Unique chipseq peak results: {}'.format(hicpro_to_chipseq.chipseq_std_sample_name.nunique()))

Unique chipseq peak results: 189


In [12]:
# count the number of unique per organim
hicpro_to_chipseq.loc[:, 'organism'] = hicpro_to_chipseq.hicpro_std_sample_name.str.split('.').apply(lambda x: x[2])
org_grps = hicpro_to_chipseq.groupby('organism')
org_grps.apply(lambda df: df.chipseq_std_sample_name.nunique())

organism
GSE212978         2
GSE213385         1
Homo_Sapiens    147
Mus_Musculus     39
dtype: int64

In [13]:
# rogue groups aka, forgot to replace . with -
print('GSE212978')
display(org_grps.get_group('GSE212978'))
print('GSE213385')
display(org_grps.get_group('GSE213385'))

GSE212978


Unnamed: 0,hicpro_std_sample_name,hicpro_path,chipseq_path,chipseq_std_sample_name,organism
42,JN-DSRCT1.shEWSWT1.GSE212978.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/JN-DSRCT1.shEWSWT1.GSE212978.Homo_Sapiens.H3K27ac.b1/hic_results/data/JN-DSRCT1.shEWSWT1.GSE212978.Homo_Sapiens.H3K27ac.b1/JN-DSRCT1.shEWSWT1.GSE212978.Homo_Sapiens.H3K27ac.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/JN-DSRCT1.shEWSWT1.GSE212977.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_No_Control/JN-DSRCT1.shEWSWT1.GSE212977.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt,JN-DSRCT1.shEWSWT1.GSE212977.Homo_Sapiens.H3K27ac.b1,GSE212978
43,JN-DSRCT1.shGFP.GSE212978.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/JN-DSRCT1.shGFP.GSE212978.Homo_Sapiens.H3K27ac.b1/hic_results/data/JN-DSRCT1.shGFP.GSE212978.Homo_Sapiens.H3K27ac.b1/JN-DSRCT1.shGFP.GSE212978.Homo_Sapiens.H3K27ac.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/JN-DSRCT1.shGFP.GSE212977.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_No_Control/JN-DSRCT1.shGFP.GSE212977.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt,JN-DSRCT1.shGFP.GSE212977.Homo_Sapiens.H3K27ac.b1,GSE212978


GSE213385


Unnamed: 0,hicpro_std_sample_name,hicpro_path,chipseq_path,chipseq_std_sample_name,organism
44,L3.6pl.GSE213385.Homo_Sapiens.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/L3.6pl.GSE213385.Homo_Sapiens.H3K27ac.b1/hic_results/data/L3.6pl.GSE213385.Homo_Sapiens.H3K27ac.b1/L3.6pl.GSE213385.Homo_Sapiens.H3K27ac.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/L3.6pl.GSE213378.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_No_Control/L3.6pl.GSE213378.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt,L3.6pl.GSE213378.Homo_Sapiens.H3K27ac.b1,GSE213385
45,L3.6pl.GSE213385.Homo_Sapiens.H3K27ac.b2,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/L3.6pl.GSE213385.Homo_Sapiens.H3K27ac.b2/hic_results/data/L3.6pl.GSE213385.Homo_Sapiens.H3K27ac.b2/L3.6pl.GSE213385.Homo_Sapiens.H3K27ac.b2.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/L3.6pl.GSE213378.Homo_Sapiens.H3K27ac.b1/MACS2_Ext_No_Control/L3.6pl.GSE213378.Homo_Sapiens.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt,L3.6pl.GSE213378.Homo_Sapiens.H3K27ac.b1,GSE213385


#### Generate the main ChIP-seq dataframe 

In [14]:
org = "Mus_musculus"

In [15]:
# generate the chipseq dataframe, update rerun as needed
outfn = os.path.join(outdir, r'{ref}.analysis.peaks.summary.tsv')
chipseq_df = generate_chipseq_df(outfn=outfn, rerun=rerun)

# add column  names and sort
chipseq_df.columns = ["sample_name", "num_peaks_chipseq", "avg_peak_size_chipseq"]
chipseq_df = chipseq_df.sort_values(by=["sample_name"], ascending=True).reset_index(drop=True)

# add chipseq std sample name and remove duplicates
chipseq_df = chipseq_df.merge(hicpro_to_chipseq, left_on='sample_name', right_on='hicpro_std_sample_name')
chipseq_df.drop_duplicates(subset=['chipseq_std_sample_name'], inplace=True)

In [16]:
chipseq_df.head()

Unnamed: 0,sample_name,num_peaks_chipseq,avg_peak_size_chipseq,hicpro_std_sample_name,hicpro_path,chipseq_path,chipseq_std_sample_name,organism
0,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,180135,631.941766,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1/hic_results/data/3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1/3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/3T3_Norm.GSE178346.Mus_Musculus.SMC1A.b1/MACS2_Ext_No_Control/3T3_Norm.GSE178346.Mus_Musculus.SMC1A.b1.macs2_peaks.narrowPeak_Q0.01filt,3T3_Norm.GSE178346.Mus_Musculus.SMC1A.b1,Mus_Musculus
1,3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1,153909,628.176793,3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1/hic_results/data/3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1/3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/3T3_TCF1.GSE178346.Mus_Musculus.SMC1A.b1/MACS2_Ext_No_Control/3T3_TCF1.GSE178346.Mus_Musculus.SMC1A.b1.macs2_peaks.narrowPeak_Q0.01filt,3T3_TCF1.GSE178346.Mus_Musculus.SMC1A.b1,Mus_Musculus
2,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1,144388,820.821516,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1/hic_results/data/AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1/AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/AML12_shCtrl.GSE141104.Mus_Musculus.H3K9me3.b1/MACS2_Ext_No_Control/AML12_shCtrl.GSE141104.Mus_Musculus.H3K9me3.b1.macs2_peaks.narrowPeak_Q0.01filt,AML12_shCtrl.GSE141104.Mus_Musculus.H3K9me3.b1,Mus_Musculus
4,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1,145594,775.635198,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1/hic_results/data/AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1/AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/AML12_shSafb.GSE141104.Mus_Musculus.H3K9me3.b1/MACS2_Ext_No_Control/AML12_shSafb.GSE141104.Mus_Musculus.H3K9me3.b1.macs2_peaks.narrowPeak_Q0.01filt,AML12_shSafb.GSE141104.Mus_Musculus.H3K9me3.b1,Mus_Musculus
6,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1,771880,349.529273,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/hicpro/BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1/hic_results/data/BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1/BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1.allValidPairs,/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/peaks/chipline_v2/BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1/MACS2_Ext_No_Control/BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1.macs2_peaks.narrowPeak_Q0.01filt,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1,Mus_Musculus


In [17]:
chipseq_df.loc[chipseq_df.sample_name == org]

Unnamed: 0,sample_name,num_peaks_chipseq,avg_peak_size_chipseq,hicpro_std_sample_name,hicpro_path,chipseq_path,chipseq_std_sample_name,organism


In [18]:
#has_chipseq = df.loc[df['num_peaks_fithichip'] == 0]
len(chipseq_df.loc[chipseq_df["num_peaks_chipseq"] != -1])
#print(len(has_chipseq))
#has_inferred = has_chipseq.loc[df['num_peaks_fithichip'] != 0]
#low = ((has_inferred.num_peaks_hichip_peaks < 5000) | (has_inferred.num_peaks_fithichip < 5000) | (has_inferred.num_peaks_chipseq < 5000))
#has_inferred[low].drop(columns=['avg_peak_size_hichip_peaks', 'avg_peak_size_fithichip', 'avg_peak_size_chipseq']).reset_index(drop=True)

print('Problem with these samples here:')
display(chipseq_df.loc[chipseq_df["num_peaks_chipseq"] == -1])

Problem with these samples here:


Unnamed: 0,sample_name,num_peaks_chipseq,avg_peak_size_chipseq,hicpro_std_sample_name,hicpro_path,chipseq_path,chipseq_std_sample_name,organism


In [19]:
# save the summary for the loop catalog website/later use
save_df = chipseq_df[['chipseq_std_sample_name', 'num_peaks_chipseq', 'avg_peak_size_chipseq']]

# save in excel format
outfn = f"results/tables/final.all_batches/{ref}.unmerged_peaks.chipseq.{assigned_date}.xlsx"
save_df.to_excel(outfn); os.chmod(outfn, 0o664)

# save in tsv format
outfn = f"results/tables/final.all_batches/{ref}.unmerged_peaks.chipseq.{assigned_date}.tsv"
save_df.to_csv(outfn, sep = "\t", index = None); os.chmod(outfn, 0o664)

In [20]:
save_df

Unnamed: 0,chipseq_std_sample_name,num_peaks_chipseq,avg_peak_size_chipseq
0,3T3_Norm.GSE178346.Mus_Musculus.SMC1A.b1,180135,631.941766
1,3T3_TCF1.GSE178346.Mus_Musculus.SMC1A.b1,153909,628.176793
2,AML12_shCtrl.GSE141104.Mus_Musculus.H3K9me3.b1,144388,820.821516
4,AML12_shSafb.GSE141104.Mus_Musculus.H3K9me3.b1,145594,775.635198
6,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1,771880,349.529273
8,BL6_CD4_CD8.GSE141847.Mus_Musculus.SMC1A.b1,134931,616.007278
10,BMDC-CTCF_cKO-LPS.GSE185880.Mus_Musculus.H3K27ac.b1,2829,504.074938
12,BMDC-CTCF_cKO.GSE185880.Mus_Musculus.H3K27ac.b1,43050,557.478142
14,BMDC-JSH-JSH-LPS.GSE185880.Mus_Musculus.H3K27ac.b1,214756,979.582894
16,BMDC-JSH.GSE185880.Mus_Musculus.H3K27ac.b1,210499,1021.500278


# Load FithiChIP Data

In [21]:
def read_peak_files(file):
    peak_data = {}
    count = 0
    with open(file) as fr:
        for line in fr:
            info = line.strip().split()
            peak_info = info[0:3]
            peak_data[count] = peak_info
            count = count + 1
    return(peak_data)

In [22]:
def generate_fithichip_df(outfn='test.tsv', rerun=False):
    
    if rerun == True:
        data = []
        glob_str = 'results/hicpro/*Mus_Musculus*/hic_results'

        for i, file in enumerate(glob.glob(glob_str)):
            
            # get sample information
            sample_name = file.split('/')[2]
            sample_info = []
            sample_info.append(sample_name)

            ## FitHiChIP peaks
            fithichip_path = 'results/peaks/fithichip/{sn}/MACS2_ExtSize/out_macs2_peaks.narrowPeak'.format(sn = sample_name)
            if os.path.exists(fithichip_path):
                
                # count the number of peaks
                peak_info = read_peak_files(fithichip_path)
                sample_info.append(len(peak_info))
                
                # averaging peak sizes
                peak_sizes = []
                for peak in peak_info.items():
                    peak_sizes.append(int(int(peak[1][2])-int(peak[1][1])))
                sample_info.append(np.mean(peak_sizes))

            else:
                
                # setting defaults/not present values, -1 is used to signify this
                sample_info.append(-1)
                sample_info.append(-1)

            data.append(sample_info)
                
            if i % 100 == 0:
                print('Processed: {}'.format(i))

        # create a dataframe and save to file
        df = pd.DataFrame(data)
        df.to_csv(outfn, sep='\t', index=None)
        
    else:
        # load old data when re-run is not required
        df = pd.read_table(outfn)
        
    return df

### Loading data

#### Generate the main fithichip dataframe 

In [23]:
org = "Mus_Musculus"

In [24]:
# generate the chipseq dataframe, update rerun as needed
outfn = os.path.join(outdir, f'{ref}.analysis.peaks.summary.fithichip.tsv')
fithichip_df = generate_fithichip_df(outfn=outfn, rerun=rerun)

Processed: 0
Processed: 100


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Processed: 200


In [25]:
# add column  names and sort
fithichip_df.columns = ["sample_name", "num_peaks_fithichip", "avg_peak_size_fithichip"]
fithichip_df = fithichip_df.sort_values(by=["sample_name"], ascending=True).reset_index(drop=True)

In [26]:
fithichip_df.head()

Unnamed: 0,sample_name,num_peaks_fithichip,avg_peak_size_fithichip
0,3134_WT.GSE162617.Mus_Musculus.GR.b1,13,179.461538
1,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,1084,319.728782
2,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,961,262.8564
3,3T3.GSE192387.Mus_Musculus.BATF.b1,31330,265.011969
4,3T3.GSE192387.Mus_Musculus.CTCF.b1,84620,245.203758


In [27]:
fithichip_df.shape

(281, 3)

In [28]:
#has_chipseq = df.loc[df['num_peaks_fithichip'] == 0]
len(fithichip_df.loc[fithichip_df["num_peaks_fithichip"] != -1])
#print(len(has_chipseq))
#has_inferred = has_chipseq.loc[df['num_peaks_fithichip'] != 0]
#low = ((has_inferred.num_peaks_hichip_peaks < 5000) | (has_inferred.num_peaks_fithichip < 5000) | (has_inferred.num_peaks_chipseq < 5000))
#has_inferred[low].drop(columns=['avg_peak_size_hichip_peaks', 'avg_peak_size_fithichip', 'avg_peak_size_chipseq']).reset_index(drop=True)

print('Problem with these samples here:')
display(fithichip_df.loc[fithichip_df["num_peaks_fithichip"] == -1])

Problem with these samples here:


Unnamed: 0,sample_name,num_peaks_fithichip,avg_peak_size_fithichip


In [29]:
# save the summary for the loop catalog website/later use
save_df = fithichip_df[['sample_name', 'num_peaks_fithichip', 'avg_peak_size_fithichip']]

# save in excel format
outfn = f"results/tables/final.all_batches/{ref}.unmerged_peaks.fithichip.{assigned_date}.xlsx"
save_df.to_excel(outfn); os.chmod(outfn, 0o664)

# save in tsv format
outfn = f"results/tables/final.all_batches/{ref}.unmerged_peaks.fithichip.{assigned_date}.tsv"
save_df.to_csv(outfn, sep = "\t", index = None); os.chmod(outfn, 0o664)

In [30]:
7 + 7

14