In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
# Most recent entry of the processing dates listed in the local tracker module.
latest_date = tracker.post_hicpro_processing_dates[-1]

# All relative paths used below are resolved against this project directory.
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

# Widen the pandas display limits so the log tables are not cut off too early.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
    ('max_colwidth', 400),
):
    pd.set_option(option_name, option_value)

In [2]:
# Display the processing date selected above (last entry of tracker.post_hicpro_processing_dates).
latest_date

'2022.07.12.09.12'

In [3]:
# Choose input/output locations depending on how this code is executed.
# When sys.argv[0] contains 'ipykernel_launcher.py' (the notebook context), the paths
# are derived from latest_date; otherwise they are read from the command line as
#   <script> <input_fn> <output_fn>
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/post-hicpro/{}.post-hicpro.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.post-check.fithichip-loop-calling'.format(latest_date)
    # Keep both output names defined in both modes so later code can use either one
    # without hitting a NameError.
    output_fn = output_prefix
else:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]
    # NOTE(review): it is not visible here whether sys.argv[2] is a prefix or a full
    # file name; it is aliased so that output_prefix exists in script mode too.
    output_prefix = output_fn

## Helper Functions

In [4]:
def read_out_log(log):
    """Summarise one loop-calling stdout log.

    The job id and the index are parsed from the file name, which is expected to
    look like ``<anything>.o<job_id>-<index>`` (for example
    ``run_fithichip_loopcalling_S25.o5101446-2``). The file content is then scanned
    for a handful of marker strings.

    Parameters
    ----------
    log : str
        Path to the stdout log file.

    Returns
    -------
    list
        ``[index, sample_name, job_id, peaks_found, no_interact, f_status, status]``
        where ``index`` and ``job_id`` are strings taken from the file name,
        ``sample_name`` is the second whitespace-separated token of the last line
        containing ``sample_name`` (``'check'`` when no such token was found), and
        the remaining entries are 0/1 flags telling whether the corresponding
        marker line was seen.
    """
    status = 0
    f_status = 0
    sample_name = 'check'
    peaks_found = 0
    no_interact = 0

    # extract job_id and index from "<name>.o<job_id>-<index>"
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                fields = info.split()
                # A line that mentions sample_name without a value must not crash
                # the whole scan; keep the previous/default name in that case.
                if len(fields) > 1:
                    sample_name = fields[1]
            if 'hichip-peaks peaks found and will be used to call loops' in info:
                peaks_found = 1
            if 'FitHiChIP pipeline is completely executed - congratulations !!!' in info:
                f_status = 1
            if 'SORRY !!!!!!!! FitHiChIP could not find any statistically significant interactions' in info:
                no_interact = 1
            if 'Ended: fithichip loop calling' in info:
                status = 1

    return [index, sample_name, job_id, peaks_found, no_interact, f_status, status]

In [5]:
def read_error_log(log):
    """Collect the lines of a stderr log that are not known, harmless noise.

    A line is discarded when it starts with a tab or a newline, or when it
    contains any of the fragments listed in ``ignorable_fragments``. Every other
    line is kept verbatim (including its line ending).

    Parameters
    ----------
    log : str
        Path to the stderr log file.

    Returns
    -------
    tuple
        ``(status, eline, has_errors, mem)``: ``status`` is 1 when no line was
        kept and 0 otherwise, ``eline`` is the list of kept lines,
        ``has_errors`` is 1 when ``eline`` is non-empty and 0 otherwise, and
        ``mem`` is 1 when at least one kept line contains ``'memory'``.
    """
    # Substrings that mark a line as ignorable.
    ignorable_fragments = (
        'cat: /var/spool/torque/aux',
        'Loading required package:',
        'Attaching package:',
        'The following objects are masked',
        'The following object is masked',
        'spar-finding: non-finite value inf; using BIG value',
        'Warning message',
        'cluster',
        'par',
        'IQR',
        'anyDuplicated',
        'dirname',
        'grepl',
        'order',
        'rbind',
        'union',
        'expand.grid',
        'shift',
        'first, second',
        'containing missing values (geom_bar).',
        'containing missing values (geom_path).',
        'containing missing values (geom_point).',
        'NaNs produced',
        'In data.frame(group = paste(',
    )

    kept_lines = []
    saw_memory = 0

    with open(log, errors='ignore') as handle:
        for raw in handle:
            # continuation lines (tab-indented) and empty lines carry no message
            if raw.startswith(('\t', '\n')):
                continue
            if any(fragment in raw for fragment in ignorable_fragments):
                continue
            kept_lines.append(raw)
            if 'memory' in raw:
                saw_memory = 1

    # Both flags are derived from whether anything survived the filter.
    if kept_lines:
        clean_flag, error_flag = 0, 1
    else:
        clean_flag, error_flag = 1, 0

    return (clean_flag, kept_lines, error_flag, saw_memory)

## Check Logs

In [6]:
# Read the tab-separated samplesheet and attach a 1-based row number to every sample
# (1-based because the index is used with sed, which counts lines from 1).
df = pd.read_table(input_fn).assign(sample_index=lambda frame: frame.index + 1)
print('number of samples:', df.shape[0])

number of samples: 338


In [7]:
# Build one table row per stdout log, paired with its matching stderr log.
#
# Job-id prefix selecting which submission's logs are summarised. Prefixes noted for
# earlier runs: 506307, 508200, 5085036 (fixing mem errors), 50850 37-42 (hichip-peaks),
# 51014 (fithichip peaks).
JOB_ID_PREFIX = '51014'
LOG_COLUMNS = ['sample_index', 'std_sample_name', 'job_id', 'peaks_found', 'no_interact',
               'f_status', 'out_status', 'error_status', 'errors', 'has_errors', 'mem', 'log']


def _log_key(path):
    """Return ``(run label, index)`` parsed from a log file name.

    For ``run_fithichip_loopcalling_S25.o5101446-2`` this is ``('S25', '2')``; a
    stdout log and its stderr log share the same key.
    """
    name = os.path.basename(path)
    run_label = name.split('_')[3].split('.')[0]
    array_index = name.split('-')[1]
    return run_label, array_index


# Index the stderr logs once instead of re-globbing the directory for every stdout log.
error_logs_by_key = {}
for err_log in glob.glob('results/loops/logs/run_fithichip_loopcalling*.e{}*-*'.format(JOB_ID_PREFIX)):
    error_logs_by_key.setdefault(_log_key(err_log), []).append(err_log)

output_logs = glob.glob('results/loops/logs/run_fithichip_loopcalling*.o{}*-*'.format(JOB_ID_PREFIX))
log_data = []
for out_log in output_logs:
    index, sample_name, job_id, peaks_found, no_interact, f_status, status = read_out_log(out_log)

    error_logs = error_logs_by_key.get(_log_key(out_log), [])
    if len(error_logs) > 1:
        # ambiguous pairing: report it and fall back to the first match
        print("Length more than 1")
        print(error_logs)

    if error_logs:
        error_status, eline, has_errors, mem = read_error_log(error_logs[0])
    else:
        # No stderr log for this job: keep the row, but mark the error fields as missing
        # rather than failing on an empty list.
        print('no error log found for', out_log)
        error_status, eline, has_errors, mem = None, [], None, None

    log_data.append([index, sample_name, job_id, peaks_found, no_interact, f_status, status,
                     error_status, eline, has_errors, mem, os.path.basename(out_log)])

# Passing the column names to the constructor also works when no logs were found.
log_df = pd.DataFrame(log_data, columns=LOG_COLUMNS)
log_df = log_df.drop('sample_index', axis=1)
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True)

In [9]:
# Order the log table by sample name and renumber the rows from 0, then display it.
logs_by_sample = log_df.sort_values(by="std_sample_name")
log_df = logs_by_sample.reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,job_id,peaks_found,no_interact,f_status,out_status,error_status,errors,has_errors,mem,log
0,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,5101446,0,1,0,1,1,[],0,0,run_fithichip_loopcalling_S25.o5101446-2
1,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,5101419,0,1,0,1,1,[],0,0,run_fithichip_loopcalling_L10.o5101419-2
2,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,5101463,0,1,0,1,1,[],0,0,run_fithichip_loopcalling_S10.o5101463-2
3,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,5101411,0,1,0,1,1,[],0,0,run_fithichip_loopcalling_L5.o5101411-2
4,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,5101473,0,1,0,1,1,[],0,0,run_fithichip_loopcalling_S5.o5101473-2
...,...,...,...,...,...,...,...,...,...,...,...
2017,lgs301315.GSE116193.Homo_Sapiens.H3K27ac.b1,5101435,0,0,1,1,1,[],0,0,run_fithichip_loopcalling_L25.o5101435-199
2018,lgs301315.GSE116193.Homo_Sapiens.H3K27ac.b1,5101446,0,0,1,1,1,[],0,0,run_fithichip_loopcalling_S25.o5101446-199
2019,lgs301315.GSE116193.Homo_Sapiens.H3K27ac.b1,5101411,0,0,1,1,1,[],0,0,run_fithichip_loopcalling_L5.o5101411-199
2020,lgs301315.GSE116193.Homo_Sapiens.H3K27ac.b1,5101419,0,0,1,1,1,[],0,0,run_fithichip_loopcalling_L10.o5101419-199


## Identifying problem samples

In [10]:
# A log counts as a problem when its stderr scan set the mem flag.
problems = log_df['mem'].eq(1)
problems_df = log_df[problems]
print('number of problem samples:', problems_df.shape[0])

number of problem samples: 4


In [11]:
# Show the problem logs with the status columns side by side.
problem_view_columns = [
    'std_sample_name',
    'job_id',
    'peaks_found',
    'f_status',
    'out_status',
    'no_interact',
    'error_status',
    'errors',
    'log',
]
problems_df[problem_view_columns]

Unnamed: 0,std_sample_name,job_id,peaks_found,f_status,out_status,no_interact,error_status,errors,log
332,DND41-DMSO.GSE173871.Homo_Sapiens.SMC1A.b1,5101411,0,0,0,0,0,"[cat: /sys/fs/cgroup/memory/torque/5101411[64].herman.hpc.lji.org/memory.limit_in_bytes: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[64].herman.hpc.lji.org/memory.max_usage_in_bytes: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[64].herman.hpc.lji.org/memory.failcnt: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[64].h...",run_fithichip_loopcalling_L5.o5101411-64
626,HCC1599-GSI.GSE116872.Homo_Sapiens.SMC1A.b1,5101411,0,0,0,0,0,"[cat: /sys/fs/cgroup/memory/torque/5101411[115].herman.hpc.lji.org/memory.limit_in_bytes: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[115].herman.hpc.lji.org/memory.max_usage_in_bytes: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[115].herman.hpc.lji.org/memory.failcnt: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[11...",run_fithichip_loopcalling_L5.o5101411-115
748,HaCaT_Unstimulated.GSE151193.Homo_Sapiens.H3K27ac.b1,5101411,0,0,0,0,0,"[cat: /sys/fs/cgroup/memory/torque/5101411[108].herman.hpc.lji.org/memory.limit_in_bytes: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[108].herman.hpc.lji.org/memory.max_usage_in_bytes: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[108].herman.hpc.lji.org/memory.failcnt: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[10...",run_fithichip_loopcalling_L5.o5101411-108
1873,hESC-WT-D0.GSE120294.Homo_Sapiens.SMC1A.b1,5101411,0,0,0,0,0,"[cat: /sys/fs/cgroup/memory/torque/5101411[143].herman.hpc.lji.org/memory.limit_in_bytes: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[143].herman.hpc.lji.org/memory.max_usage_in_bytes: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[143].herman.hpc.lji.org/memory.failcnt: No such file or directory\n, cat: /sys/fs/cgroup/memory/torque/5101411[14...",run_fithichip_loopcalling_L5.o5101411-143


In [35]:
# Sort the problem logs by sample name, renumber the rows and write them to a spreadsheet.
problems_sorted = problems_df.sort_values(by='std_sample_name')
problems_df = problems_sorted.reset_index(drop=True)
problems_df.to_excel("results/loops/problems.xlsx", sheet_name='sample_summary')