In [10]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Check HiC-Pro and HiCCUPS Output Files for Pairs and Loop Information

#### Helper functions for reading the output logs

In [11]:
def read_hiccups_loop_files(file):
    """Collect loop records from a whitespace-delimited HiCCUPS output file.

    Only lines whose very first character is ``'1'`` are kept; every other
    line (headers, comments, blank lines, other prefixes) is ignored.
    NOTE(review): the prefix test also admits rows whose first field is
    e.g. ``10``-``19`` -- confirm the inputs only contain chromosome 1.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    dict
        Maps a running 0-based index (in file order) to the list of the
        first six whitespace-separated fields of each kept line. Fields are
        returned as strings; lines with fewer than six fields yield shorter
        lists.
    """
    with open(file) as handle:
        kept_rows = [
            raw.strip().split()[0:6]
            for raw in handle
            if raw.startswith('1')
        ]
    # Re-key the rows by their position, matching a manual running counter.
    return dict(enumerate(kept_rows))

In [12]:
def read_hicpro_stats_log(log):
    """Read the leading two columns of every non-comment line of a stats log.

    Lines that begin with ``'#'`` are skipped. Every other line (including a
    blank one, which produces an empty list) is split on whitespace and its
    first two fields are kept.

    Parameters
    ----------
    log : str
        Path of the log file to read.

    Returns
    -------
    dict
        Maps a running 0-based index (in file order) to a list holding at
        most two string fields from the corresponding line.
    """
    with open(log) as handle:
        kept_rows = [
            raw.strip().split()[0:2]
            for raw in handle
            if not raw.startswith('#')
        ]
    # Re-key the rows by their position, matching a manual running counter.
    return dict(enumerate(kept_rows))

#### Generate a data frame with loop and pair information

In [29]:
# Build one summary row per sample directory found under the loop-calling
# results: sample name, two pair counts taken from the HiC-Pro stats files,
# and the number of loop records in each of the three HiCCUPS resolution files.

# One directory per sample; the trailing '/' makes glob match directories only.
tpl = '/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/loops/hiccups_hicpro_chr1/{sample_name}/'
glob_str = tpl.format(sample_name='*')

# Common prefix of the per-sample HiC-Pro stats files; a suffix such as
# ".mpairstat" is appended to it.
hicpro_stats_tpl = "/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/results/hicpro/{sample_name}/hic_results/stats/{sample_name}/{sample_name}"

loop_columns = [
    "Sample Name",
    "Total Pairs Processed",
    "Valid Interaction Pairs",
    "Number of 5kb Loops (Chr1 Only)",
    "Number of 10kb Loops (Chr1 Only)",
    "Number of 25kb Loops (Chr1 Only)",
]


def _first_stat_value(stats_path):
    """Return the second field of the first non-comment line of a stats file.

    The value is returned as the string read from the file. A missing file
    yields the string "None" (the placeholder used throughout this table).
    """
    if not os.path.exists(stats_path):
        return "None"
    return read_hicpro_stats_log(stats_path)[0][1]


def _count_loops(bedpe_path):
    """Return the number of loop records kept from a HiCCUPS file.

    A missing file yields the string "None" rather than 0, so "no output
    file" stays distinguishable from "file present but no loops".
    """
    if not os.path.exists(bedpe_path):
        return "None"
    return len(read_hiccups_loop_files(bedpe_path))


loop_data = []
for fn in glob.glob(glob_str):
    # Take the sample name from the last path component instead of a fixed
    # positional index, so it stays correct if the root path changes depth.
    sample_name = os.path.basename(os.path.normpath(fn))
    stats_prefix = hicpro_stats_tpl.format(sample_name=sample_name)
    loop_data.append([
        sample_name,
        _first_stat_value(stats_prefix + ".mpairstat"),
        _first_stat_value(stats_prefix + "_allValidPairs.mergestat"),
        _count_loops(os.path.join(fn, "postprocessed_pixels_5000.bedpe")),
        _count_loops(os.path.join(fn, "postprocessed_pixels_10000.bedpe")),
        _count_loops(os.path.join(fn, "postprocessed_pixels_25000.bedpe")),
    ])

# Passing the column names to the constructor (rather than assigning
# `.columns` afterwards) keeps this cell working when no sample directory
# matched and `loop_data` is empty.
loop_df = pd.DataFrame(loop_data, columns=loop_columns)
loop_df = loop_df.sort_values(by=["Sample Name"]).reset_index(drop=True)
# Number the rows from 1 for display.
loop_df.index += 1

In [30]:
# Temporarily lift pandas' row/column display limits so the whole summary
# table is rendered; the options revert when the `with` block exits.
with pd.option_context('display.max_columns', None, 'display.max_rows', None):
    display(loop_df)

Unnamed: 0,Sample Name,Total Pairs Processed,Valid Interaction Pairs,Number of 5kb Loops (Chr1 Only),Number of 10kb Loops (Chr1 Only),Number of 25kb Loops (Chr1 Only)
1,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,45904302,14233750,,,
2,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,92276625,23170231,,,
3,293T.GSE128106.Homo_Sapiens.YY1.b1,247695887,24584547,,,0.0
4,AoSMC.GSE178598.Homo_Sapiens.H3K27ac.b1,337307006,191374595,295.0,481.0,509.0
5,BC1.GSE136090.Homo_Sapiens.H3K27ac.b1,39569105,12948113,1.0,2.0,3.0
6,BC3.GSE136090.Homo_Sapiens.H3K27ac.b1,38225847,9976222,,0.0,1.0
7,BCBL1.GSE136090.Homo_Sapiens.H3K27ac.b1,50329964,17716612,2.0,8.0,28.0
8,EBNA2_Neg_Ramos.GSE179755.Homo_Sapiens.H3K27ac.b1,69443381,30727420,7.0,19.0,60.0
9,EBNA2_Plus_Ramos.GSE179755.Homo_Sapiens.H3K27a...,73912411,34665842,13.0,32.0,91.0
10,GM.GSE101498.Homo_Sapiens.H3K27ac.b1,332279257,104375182,441.0,527.0,414.0
