In [2]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
latest_date = tracker.processing_dates[-1]
os.chdir('/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('max_colwidth', 400)

In [3]:
latest_date

'2022.09.22.15.30'

In [4]:
# setting input and output with jupyter notebook in context 
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/hicpro/current.hicpro.samplesheet.without_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.post-check.fithichip-loop-calling'.format(latest_date)
else:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]

## Helper Functions

In [5]:
def read_out_log(log):
    
    status = 0
    f_status = 0
    sample_name = 'check'
    peaks_found = 0
    no_interact = 0
    date = ''

    # extract job_id and index
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                sample_name = info.split()[1]
            if 'chip-seq peaks found and will be used to call loops' in info:
                peaks_found = 1
            if 'FitHiChIP pipeline is completely executed - congratulations !!!' in info:
                f_status = 1
            if 'SORRY !!!!!!!! FitHiChIP could not find any statistically significant interactions' in info:
                no_interact = 1
            if 'Ended: fithichip loop calling' in info:
                status = 1
                

    return([index, sample_name, job_id, peaks_found, no_interact, f_status, status])

In [6]:
def read_error_log(log):
    
    status = 1
    eline = []
    has_errors = 0
    mem = 0
    
    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'cat: /var/spool/torque/aux' in info:
                continue
            if 'Loading required package:' in info:
                continue
            if 'Attaching package:' in info:
                continue
            if 'The following objects are masked' in info:
                continue
            if line.startswith('\t'):
                continue
            if 'The following object is masked' in info:
                continue
            if 'spar-finding: non-finite value inf; using BIG value' in info:
                continue
            if 'Warning message' in info:
                continue
            if line.startswith('\n'):
                continue
            if 'cluster' in line:
                continue
            if 'par' in line:
                continue
            if 'IQR' in line or 'anyDuplicated' in line or 'dirname' in line or 'grepl' in line or 'order' in line or 'rbind' in line or 'union' in line or 'expand.grid' in line or 'shift' in line or 'first, second' in line:
                continue
            if 'containing missing values (geom_bar).' in line:
                continue
            if 'containing missing values (geom_path).' in line:
                continue
            if 'containing missing values (geom_point).' in line:
                continue
            if 'NaNs produced' in line:
                continue
            if 'In data.frame(group = paste(' in line:
                continue
            status = 0
            eline.append(line)
            if 'memory' in line:
                mem = 1
        if len(eline) != 0:
            has_errors = 1
            
    return(status, eline, has_errors, mem)

## Check Logs

In [7]:
# load the data
df = pd.read_table(input_fn, skiprows=0)
df['sample_index'] = df.index
print('number of samples:', len(df))

number of samples: 485


In [8]:
df

Unnamed: 0,293T.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,Homo_Sapiens,1,YY1,HindIII,293T,sample_index
0,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,Homo_Sapiens,1,YY1,HindIII,293T-PDS,0
1,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,Homo_Sapiens,1,YY1,HindIII,293T-TMPYP4,1
2,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,Homo_Sapiens,1,CTCF,MboI,A673_SA1m1,2
3,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,Homo_Sapiens,2,CTCF,MboI,A673_SA1m1,3
4,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,Homo_Sapiens,1,CTCF,MboI,A673_SA2m1,4
5,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,Homo_Sapiens,2,CTCF,MboI,A673_SA2m1,5
6,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,Homo_Sapiens,3,CTCF,MboI,A673_SA2m1,6
7,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b4,GSE133227,Homo_Sapiens,4,CTCF,MboI,A673_SA2m1,7
8,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b5,GSE133227,Homo_Sapiens,5,CTCF,MboI,A673_SA2m1,8
9,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b1,GSE133227,Homo_Sapiens,1,H3K27ac,MboI,A673_SA2m1,9


In [52]:
# create a dataframe of log information, 506307, 508200, 5085036 (fixing mem errors), 50850 37-42 (hichip-peaks), 51014 (fithichip peaks)
output_logs = glob.glob('ref_genome/results/loops/logs/run_fithichip_loopcalling*.o614*-*')
log_data = []   
for out_log in output_logs: 
    print(out_log)
    sample_index = out_log.split('-')[1]
    index, sample_name, job_id, peaks_found, no_interact, f_status, status = read_out_log(out_log)
    error_logs = [log for log in glob.glob('ref_genome/results/loops/logs/run_fithichip_loopcalling*.e614*-*') if log.split('_')[5].split('.')[1][1:] == out_log.split('/')[4].split('_')[4].split('.')[1][1:] and log.split('-')[1] == sample_index]
    print(error_logs)
    error_log = error_logs[0]
    if len(error_logs) != 1:
        print("Length more than 1")
        print(error_logs)
    error_status, eline, has_errors, mem = read_error_log(error_log)
    log_data.append([index, sample_name, job_id, peaks_found, no_interact, f_status, status, error_status, eline, has_errors, mem, os.path.basename(out_log)])
log_df = pd.DataFrame(log_data)
log_df.columns = ['sample_index', 'std_sample_name', 'job_id', 'peaks_found', 'no_interact', 'f_status', 'out_status', 'error_status', 'errors', 'has_errors', 'mem', 'log']
log_df.drop('sample_index', inplace=True, axis=1)
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True)

ref_genome/results/loops/logs/run_fithichip_loopcalling_S10_t2t.o6146030-208
['ref_genome/results/loops/logs/run_fithichip_loopcalling_S10_t2t.e6146030-208']
ref_genome/results/loops/logs/run_fithichip_loopcalling_S5_t2t.o6146029-209
['ref_genome/results/loops/logs/run_fithichip_loopcalling_S5_t2t.e6146029-209']


In [53]:
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True).reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,job_id,peaks_found,no_interact,f_status,out_status,error_status,errors,has_errors,mem,log
0,MB157-pInd20EBF1-10dox.GSE173843.Homo_Sapiens.SMC1A.b1,6146030,1,0,1,1,0,"[Error in (function (file = if (onefile) ""Rplots.pdf"" else ""Rplot%03d.pdf"", : \n, cannot open file 'Rplots.pdf'\n, Calls: <Anonymous> -> print.ggplot -> grid.newpage -> <Anonymous>\n, Execution halted\n]",1,0,run_fithichip_loopcalling_S10_t2t.o6146030-208
1,MB157-pInd20TCF1-10dox.GSE173843.Homo_Sapiens.SMC1A.b1,6146029,1,0,1,1,0,"[Error in (function (file = if (onefile) ""Rplots.pdf"" else ""Rplot%03d.pdf"", : \n, cannot open file 'Rplots.pdf'\n, Calls: <Anonymous> -> print.ggplot -> grid.newpage -> <Anonymous>\n, Execution halted\n]",1,0,run_fithichip_loopcalling_S5_t2t.o6146029-209


## Identifying problem samples

In [47]:
problems = (log_df.out_status == 0)
problems_df = log_df.loc[problems,:]
print('number of problem samples:', len(problems_df))

number of problem samples: 3


In [48]:
problems_df.loc[:, ['std_sample_name', 'job_id', 'peaks_found', 'f_status', 'out_status', 'no_interact', 'error_status', 'errors', 'log']]

Unnamed: 0,std_sample_name,job_id,peaks_found,f_status,out_status,no_interact,error_status,errors,log
11,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.b1,6127362,1,0,0,0,0,"[Error in (function (file = if (onefile) ""Rplots.pdf"" else ""Rplot%03d.pdf"", : \n, cannot open file 'Rplots.pdf'\n, Calls: <Anonymous> -> print.ggplot -> grid.newpage -> <Anonymous>\n, Execution halted\n]",run_fithichip_loopcalling_S25.o6127362-70
50,NOD_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.b1,6127358,1,0,0,0,1,[],run_fithichip_loopcalling_L10.o6127358-79
55,NOD_CD4_CD8.GSE141847.Mus_Musculus.SMC1A.b1,6127358,1,0,0,0,1,[],run_fithichip_loopcalling_L10.o6127358-77


In [35]:
problems_df = problems_df.sort_values(by='std_sample_name').reset_index(drop=True)
problems_df.to_excel("results/loops/problems.xlsx", sheet_name='sample_summary')