In [None]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# plotting theme used by any seaborn/matplotlib figure
sns.set_theme(style="whitegrid")

# every relative path in this notebook is resolved against the project root
project_dir = '/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling'
#project_dir = '/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling'
os.chdir(project_dir)

# widen the pandas display limits so long tables and paths are not truncated
for option, limit in (
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
    ('max_colwidth', 400),
):
    pd.set_option(option, limit)

# output directory for cached summaries, reference genome label, and the switch
# that decides whether cached summaries are rebuilt (True) or reloaded (False)
outdir = 'results/peaks/'
ref = 'hg38'
rerun = False

In [None]:
# NOTE(review): latest_date is not referenced by any other cell visible in this
# notebook; the samplesheet paths below hard-code their own date prefixes
# ('2024.2.1.10.52' and '2024.2.15.10.52') — confirm which date is intended.
latest_date = '2024.1.16.10.52'

# Process ChIP-seq Data

### Helper functions

In [None]:
def read_chipseq_files(file):
    """Read the coordinate columns of a whitespace-delimited peak file.

    Parameters
    ----------
    file : str
        Path to a BED-like peak file (chrom, start, end, ...).

    Returns
    -------
    dict
        Maps a 0-based record counter to the first three fields of each
        record, kept as strings (e.g. ``{0: ['chr1', '100', '200']}``).

    Notes
    -----
    Blank lines and BED header lines (``#`` comments, ``track`` and
    ``browser`` lines) are skipped. Previously they were stored as records,
    which inflated the peak count and produced entries without a start/end
    that crashed the downstream peak-size calculation.
    """
    peak_data = {}
    with open(file) as fr:
        for line in fr:
            info = line.split()

            # skip blank lines and BED header/comment lines: they are not peaks
            if not info or info[0].startswith('#') or info[0] in ('track', 'browser'):
                continue

            # key records by their running index so keys stay contiguous
            peak_data[len(peak_data)] = info[0:3]
    return peak_data

In [None]:
def get_chipseq_path(samplesheet, sample_name):
    """Look up the ChIP-seq peak path recorded for a sample in a samplesheet.

    Parameters
    ----------
    samplesheet : str
        Path to a header-less, whitespace-delimited samplesheet whose first
        field is the sample name and whose third field is the ChIP-seq path.
    sample_name : str
        Sample name to look up; it must equal the first field exactly.

    Returns
    -------
    str
        The third field of the matching row, or ``''`` when no row matches.
        If several rows match, the last one wins (as before).

    Notes
    -----
    The previous implementation tested ``sample_name in line``, so a name
    that is a prefix of another (``X.b1`` vs ``X.b10``) matched both rows and
    returned the path of whichever came last. Rows with fewer than three
    fields are now ignored instead of raising ``IndexError``.
    """
    path = ''
    with open(samplesheet) as fr:
        for line in fr:
            info = line.split()

            # need the name (field 1) and the ChIP-seq path (field 3)
            if len(info) < 3:
                continue

            # exact match on the sample-name column, not a substring of the line
            if info[0] == sample_name:
                path = info[2]
    return path

In [None]:
def generate_chipseq_df(outfn='test.tsv', rerun=False):
    """Build, or reload from cache, the per-sample ChIP-seq peak summary.

    Parameters
    ----------
    outfn : str
        Tab-separated cache file; written when ``rerun`` is True, read otherwise.
    rerun : bool
        True recomputes the summary from the peak files; anything else loads
        the table previously saved at ``outfn``.

    Returns
    -------
    pandas.DataFrame
        One row per ``results/hicpro/*Homo*/hic_results`` directory with three
        unnamed columns: sample name, number of peaks, mean peak width.
        Both numeric columns are -1 when the sample's ChIP-seq peak file
        does not exist on disk.
    """
    # cached path: reuse the previously saved table
    if not (rerun == True):
        return pd.read_table(outfn)

    # samplesheet that maps each sample to its ChIP-seq peak file
    samplesheet = 'results/samplesheets/post-hicpro/2024.2.15.10.52.peaks_files_chipseq.all_batches.samplesheet.without_header.tsv'

    rows = []
    for hic_dir in glob.glob('results/hicpro/*Homo*/hic_results'):

        # the sample name is the directory directly under results/hicpro/
        sample_name = hic_dir.split('/')[2]
        chip_path = get_chipseq_path(samplesheet, sample_name)

        if os.path.exists(chip_path):
            peaks = read_chipseq_files(chip_path)
            n_peaks = len(peaks)
            # width of each peak = end - start
            widths = [int(int(coords[2]) - int(coords[1])) for coords in peaks.values()]
            row = [sample_name, n_peaks, np.mean(widths)]
        else:
            # -1 flags a sample without a ChIP-seq peak file
            row = [sample_name, -1, -1]
        rows.append(row)

    # persist the freshly computed table so later runs can skip the recompute
    df = pd.DataFrame(rows)
    df.to_csv(outfn, sep='\t', index=None)
    return df

In [None]:
# extract the chipseq_std_sample_name
def get_chipseq_std(x):
    """Extract the ChIP-seq standard sample name from a split peak-file path.

    Parameters
    ----------
    x : list of str
        A ChIP-seq peak path already split on ``'/'``. Missing values (NaN or
        None, as produced by ``Series.str.split`` on an empty cell) are
        accepted.

    Returns
    -------
    str or int
        The third-from-last path component, or -1 when the path is missing,
        has fewer than three components, or ends with the placeholder
        ``'Could not find a matching file'``.
    """
    # missing paths reach here as NaN/None, which have no len(); treat as "not found"
    try:
        n_parts = len(x)
    except TypeError:
        return -1

    # too short to contain <sample>/<dir>/<file>
    if n_parts <= 2:
        return -1

    # sometimes a matching chipseq file could not be found, return -1
    if x[-1] == 'Could not find a matching file':
        return -1

    # otherwise the sample name sits two levels above the peak file
    return x[-3]

### Loading data

In [None]:
# load the hicpro samples
# samples = pd.read_table('results/samplesheets/hicpro/current.hicpro.samplesheet.without_header.tsv', header=None)
# columns = ['std_sample_name', 'gse_id', 'organism', 'bio_rep', 'antibody_target', 'restriction_enzyme', 'sample_name']
# samples.columns = columns

In [None]:
# load the HiC-Pro -> ChIP-seq mapping; the samplesheets have no header row,
# so the three column names are supplied here
mapping_cols = ['hicpro_std_sample_name', 'hicpro_path', 'chipseq_path']

fn = 'results/samplesheets/post-hicpro/2024.2.1.10.52.peaks_files_chipseq.batch1.samplesheet.without_header.tsv'
batch1_df = pd.read_table(fn, names=mapping_cols)

fn = 'results/samplesheets/post-hicpro/2024.2.1.10.52.peaks_files_chipseq.batch2.samplesheet.without_header.tsv'
batch2_df = pd.read_table(fn, names=mapping_cols)

# stack both batches into one mapping table
hicpro_to_chipseq = pd.concat([batch1_df, batch2_df])

# derive the ChIP-seq standard sample name from the path components
# (get_chipseq_std returns -1 when no usable path is recorded)
chipseq_path_parts = hicpro_to_chipseq['chipseq_path'].str.split('/')
hicpro_to_chipseq['chipseq_std_sample_name'] = chipseq_path_parts.apply(get_chipseq_std)

In [None]:
# shape of the subset of mapping rows without a ChIP-seq sample name (-1 sentinel)
unmapped_mask = hicpro_to_chipseq['chipseq_std_sample_name'] == -1
hicpro_to_chipseq.loc[unmapped_mask].shape

In [None]:
# preview the first rows of the HiC-Pro -> ChIP-seq mapping table
hicpro_to_chipseq.head()

#### Reviewing the data

In [None]:
# number of distinct ChIP-seq sample names in the mapping; the -1 sentinel,
# when present, is counted as one of the distinct values
n_unique_chipseq = hicpro_to_chipseq['chipseq_std_sample_name'].nunique()
print('Unique chipseq peak results: {}'.format(n_unique_chipseq))

In [None]:
# count the number of unique ChIP-seq sample names per organism; the organism
# label is taken as the third '.'-separated field of the HiC-Pro sample name
hicpro_to_chipseq.loc[:, 'organism'] = hicpro_to_chipseq.hicpro_std_sample_name.str.split('.').apply(lambda x: x[2])
org_grps = hicpro_to_chipseq.groupby('organism')

# column-wise nunique gives the same counts as applying a lambda to every group,
# without running GroupBy.apply over the whole frame (grouping column included)
org_grps['chipseq_std_sample_name'].nunique()

In [None]:
# rogue groups: sample names where a '.' was not replaced with '-', so the
# third field is a GSE id rather than an organism
for rogue_id in ['GSE212978', 'GSE213385']:
    print(rogue_id)
    # get_group raises KeyError for an absent key; guard so the notebook still
    # runs top to bottom once these sample names have been corrected upstream
    if rogue_id in org_grps.groups:
        display(org_grps.get_group(rogue_id))
    else:
        print('no samples found in this group')

#### Generate the main ChIP-seq dataframe 

In [None]:
# NOTE(review): org is not read by any cell visible here; the glob patterns in
# the generate_* helpers hard-code 'Homo' — confirm whether it should be wired in.
org = "Homo"

In [None]:
# build (rerun=True) or reload (rerun=False) the per-sample ChIP-seq peak summary
outfn = os.path.join(outdir, 'analysis.peaks.summary.tsv')
chipseq_df = generate_chipseq_df(outfn=outfn, rerun=rerun)

# label the three summary columns, then order the rows by sample name
chipseq_df.columns = ["sample_name", "num_peaks_chipseq", "avg_peak_size_chipseq"]
chipseq_df = chipseq_df.sort_values("sample_name").reset_index(drop=True)

# attach the ChIP-seq standard sample name (inner join on the HiC-Pro name)
chipseq_df = chipseq_df.merge(
    hicpro_to_chipseq,
    left_on='sample_name',
    right_on='hicpro_std_sample_name',
)

# keep one row per ChIP-seq sample
# NOTE(review): rows whose chipseq_std_sample_name is the -1 sentinel all share
# that value, so only the first of them survives — confirm this is intended
chipseq_df = chipseq_df.drop_duplicates(subset=['chipseq_std_sample_name'])

In [None]:
# preview the merged, de-duplicated ChIP-seq summary
chipseq_df.head()

In [None]:
# (rows, columns) of the ChIP-seq summary after merge and de-duplication
chipseq_df.shape

In [None]:
# report how many samples have a ChIP-seq peak file (-1 marks a missing file);
# the count is printed explicitly because only a cell's last expression is
# displayed automatically
n_with_peaks = len(chipseq_df.loc[chipseq_df["num_peaks_chipseq"] != -1])
print('Samples with ChIP-seq peaks: {} of {}'.format(n_with_peaks, len(chipseq_df)))

# list the samples whose peak file could not be found
print('Problem with these samples here:')
display(chipseq_df.loc[chipseq_df["num_peaks_chipseq"] == -1])

In [None]:
# keep only the columns saved for the loop catalog website/later use
save_df = chipseq_df[['chipseq_std_sample_name', 'num_peaks_chipseq', 'avg_peak_size_chipseq']]

# Excel copy (written with the DataFrame index), then make it group-writable
outfn = f"results/tables/final.all_batches/{ref}.unmerged_peaks.chipseq.02_26_24.xlsx"
save_df.to_excel(outfn)
os.chmod(outfn, 0o664)

# TSV copy (written without the index), then make it group-writable
outfn = f"results/tables/final.all_batches/{ref}.unmerged_peaks.chipseq.02_26_24.tsv"
save_df.to_csv(outfn, sep="\t", index=None)
os.chmod(outfn, 0o664)

# Load FitHiChIP Data

In [None]:
def read_peak_files(file):
    """Read the first three whitespace-separated fields of every line in a peak file.

    Parameters
    ----------
    file : str
        Path to the peak file.

    Returns
    -------
    dict
        Maps the 0-based line number to a list holding up to three string
        fields (chrom, start, end for a well-formed record). A blank line
        yields an empty list.
    """
    with open(file) as handle:
        # one entry per line, keyed by line position
        return {
            line_no: raw_line.strip().split()[0:3]
            for line_no, raw_line in enumerate(handle)
        }

In [None]:
def generate_fithichip_df(outfn='test.tsv', rerun=False):
    """Build, or reload from cache, the per-sample FitHiChIP peak summary.

    Parameters
    ----------
    outfn : str
        Tab-separated cache file; written when ``rerun`` is True, read otherwise.
    rerun : bool
        True recomputes the summary from the peak files; anything else loads
        the table previously saved at ``outfn``.

    Returns
    -------
    pandas.DataFrame
        One row per ``results/hicpro/*Homo*/hic_results`` directory with three
        unnamed columns: sample name, number of peaks, mean peak width.
        Both numeric columns are -1 when the sample's narrowPeak file does
        not exist on disk.
    """
    # cached path: reuse the previously saved table
    if not (rerun == True):
        return pd.read_table(outfn)

    rows = []
    for idx, hic_dir in enumerate(glob.glob('results/hicpro/*Homo*/hic_results')):

        # the sample name is the directory directly under results/hicpro/
        sample_name = hic_dir.split('/')[2]

        # peak file expected for this sample under results/peaks/fithichip/
        peak_path = 'results/peaks/fithichip/{sn}/MACS2_ExtSize/out_macs2_peaks.narrowPeak'.format(sn = sample_name)

        if os.path.exists(peak_path):
            peaks = read_peak_files(peak_path)
            # width of each peak = end - start
            widths = [int(int(coords[2]) - int(coords[1])) for coords in peaks.values()]
            stats = [len(peaks), np.mean(widths)]
        else:
            # -1 is used to signify that the peak file is not present
            stats = [-1, -1]

        rows.append([sample_name] + stats)

        # lightweight progress report every 100 samples
        if idx % 100 == 0:
            print('Processed: {}'.format(idx))

    # persist the freshly computed table so later runs can skip the recompute
    df = pd.DataFrame(rows)
    df.to_csv(outfn, sep='\t', index=None)
    return df

### Loading data

#### Generate the main fithichip dataframe 

In [None]:
# NOTE(review): org is assigned again here but is not read by any cell visible
# in this notebook; generate_fithichip_df hard-codes 'Homo' in its glob pattern.
org = "Homo"

In [None]:
# generate the FitHiChIP dataframe, update rerun as needed
# (rerun=True recomputes and overwrites outfn; otherwise outfn is loaded)
outfn = os.path.join(outdir, 'analysis.peaks.summary.fithichip.tsv')
fithichip_df = generate_fithichip_df(outfn=outfn, rerun=rerun)

In [None]:
# label the three summary columns, then order the rows by sample name
fithichip_df.columns = ["sample_name", "num_peaks_fithichip", "avg_peak_size_fithichip"]
fithichip_df = fithichip_df.sort_values("sample_name").reset_index(drop=True)

In [None]:
# preview the sorted FitHiChIP peak summary
fithichip_df.head()

In [None]:
# (rows, columns) of the FitHiChIP peak summary
fithichip_df.shape

In [None]:
# report how many samples have a FitHiChIP peak file (-1 marks a missing file);
# the count is printed explicitly because only a cell's last expression is
# displayed automatically
n_with_peaks = len(fithichip_df.loc[fithichip_df["num_peaks_fithichip"] != -1])
print('Samples with FitHiChIP peaks: {} of {}'.format(n_with_peaks, len(fithichip_df)))

# list the samples whose peak file could not be found
print('Problem with these samples here:')
display(fithichip_df.loc[fithichip_df["num_peaks_fithichip"] == -1])

In [None]:
# keep only the columns saved for the loop catalog website/later use
save_df = fithichip_df[['sample_name', 'num_peaks_fithichip', 'avg_peak_size_fithichip']]

# Excel copy (written with the DataFrame index), then make it group-writable
outfn = f"results/tables/final.all_batches/{ref}.unmerged_peaks.fithichip.02_26_24.xlsx"
save_df.to_excel(outfn)
os.chmod(outfn, 0o664)

# TSV copy (written without the index), then make it group-writable
outfn = f"results/tables/final.all_batches/{ref}.unmerged_peaks.fithichip.02_26_24.tsv"
save_df.to_csv(outfn, sep="\t", index=None)
os.chmod(outfn, 0o664)

In [None]:
# NOTE(review): scratch arithmetic that no other cell uses — looks like a
# leftover cell; confirm it can be removed.
7 + 7