In [None]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
import config
# Work from the loop catalog directory so the relative paths used below resolve.
os.chdir(config.LOOP_CATALOG_DIR)

# Last entry of the tracker's processing_dates sequence.
latest_date = tracker.processing_dates[-1]

# Raise pandas display limits so long/wide frames and long cell text are shown.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
    ('max_colwidth', 400),
):
    pd.set_option(option_name, option_value)

In [None]:
# Manual override: replaces the value taken from tracker.processing_dates[-1]
# in the first cell with a fixed string.
latest_date='2024.1.16.10.52'

In [None]:
# Resolve input/output locations: fixed defaults when running inside a Jupyter
# kernel, command-line arguments when running as a script.
# Both branches define `output_prefix` and `output_fn`, so later code can use
# either name regardless of how the notebook was launched.
if 'ipykernel_launcher.py' in sys.argv[0]:
    # The samplesheet path has no '{}' placeholder, so it is used verbatim.
    input_fn = 'results/samplesheets/hicpro/current.hicpro.samplesheet.without_header.tsv'
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.post-check.fithichip-loop-calling'.format(latest_date)
    # NOTE(review): aliasing the prefix as the file name is an assumption — confirm intended meaning.
    output_fn = output_prefix
else:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]
    # NOTE(review): aliasing the file name as the prefix is an assumption — confirm intended meaning.
    output_prefix = output_fn

## Helper Functions

In [None]:
# Collect every summary.txt matching the meme/fimo results pattern
# (one wildcard directory level per match).
paths = glob.glob("results/biorep_merged/results/motif_analysis/meme/fimo/*/summarize_results/summary.txt")

In [None]:
# Number of summary files found.
len(paths)

In [None]:
# For each summary file: print its path when it is at most one byte in size,
# then print how many lines it contains.
for p in paths:
    nearly_empty = os.path.getsize(p) <= 1
    if nearly_empty:
        print(p)
    with open(p) as motifs:
        count = sum(1 for _ in motifs)
    print(count)

In [None]:
def read_out_log(log):
    """Scan one job ``.out`` log and summarise what it reports.

    Parameters
    ----------
    log : str
        Path to the log file. The job id is taken from its base name: the text
        between the first ``-`` and the following ``.`` (``job-123.out`` ->
        ``'123'``). A base name without ``-`` yields an empty job id.

    Returns
    -------
    list
        ``[sample_name, job_id, config, peak_type, peaks_found, no_interact,
        f_status, status]`` where

        * ``sample_name`` is the second whitespace-separated token of the last
          line containing ``sample_name`` (``'check'`` if none was usable),
        * ``config`` is the fourth ``_``-separated field of the
          ``Started: run_fithichip_loopcalling`` line (``''`` if absent),
        * ``peak_type`` is ``'fithichip'`` for peak mode ``2``, ``'chipseq'``
          for mode ``3``, otherwise ``''``,
        * the remaining entries are 0/1 flags set when the corresponding
          marker text occurs anywhere in the log.

    Lines that contain a marker but lack the expected value (for example a
    truncated log) are ignored instead of raising ``IndexError``.
    """
    status = 0
    f_status = 0
    sample_name = 'check'
    peaks_found = 0
    no_interact = 0
    run_config = ''  # named run_config so it does not shadow an imported `config` module
    peak_type = ''

    # extract job_id from the file name
    name_parts = os.path.basename(log).split('-')
    job_id = name_parts[1].split('.')[0] if len(name_parts) > 1 else ''

    # errors='ignore' drops undecodable bytes rather than aborting the scan
    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                fields = info.split()
                if len(fields) > 1:
                    sample_name = fields[1]
            if 'peaks found and will be used to call loops' in info:
                peaks_found = 1
            if 'FitHiChIP pipeline is completely executed - congratulations !!!' in info:
                f_status = 1
            if 'SORRY !!!!!!!! FitHiChIP could not find any statistically significant interactions' in info:
                no_interact = 1
            if 'Ended: fithichip loop calling' in info:
                status = 1
            if 'Started: run_fithichip_loopcalling' in info:
                pieces = info.split("_")
                if len(pieces) > 3:
                    run_config = pieces[3]
            if 'Selected Peak Mode' in info:
                mode_parts = info.split(": ")
                mode = mode_parts[1] if len(mode_parts) > 1 else ''
                if mode == "2":
                    peak_type = "fithichip"
                elif mode == "3":
                    peak_type = "chipseq"

    return [sample_name, job_id, run_config, peak_type, peaks_found, no_interact, f_status, status]

In [None]:
def read_error_log(log):
    """Scan one job ``.error`` log and keep the lines not recognised as noise.

    A line is discarded when it starts with a tab or a newline, or when it
    contains any of the known-noise substrings listed below. Every other line
    is kept verbatim.

    Parameters
    ----------
    log : str
        Path to the error log file.

    Returns
    -------
    tuple
        ``(status, eline, has_errors, mem)``:
        ``eline`` is the list of kept lines; ``status`` is 1 when nothing was
        kept and 0 otherwise; ``has_errors`` is 1 when something was kept and
        0 otherwise; ``mem`` is 1 when any kept line contains ``'memory'``.
    """
    # Substrings tested against the whitespace-stripped line.
    noise_in_stripped = (
        'cat: /var/spool/torque/aux',
        'Loading required package:',
        'Attaching package:',
        'The following objects are masked',
        'The following object is masked',
        'spar-finding: non-finite value inf; using BIG value',
        'Warning message',
    )
    # Substrings tested against the raw line.
    noise_in_raw = (
        'cluster',
        'par',
        'IQR',
        'anyDuplicated',
        'dirname',
        'grepl',
        'order',
        'rbind',
        'union',
        'expand.grid',
        'shift',
        'first, second',
        'containing missing values (geom_bar).',
        'containing missing values (geom_path).',
        'containing missing values (geom_point).',
        'NaNs produced',
        'In data.frame(group = paste(',
        'aesthetic for lines was deprecated in ggplot2 3.4.0.',
        'Please use `linewidth` instead.',
        'Removed 2 rows containing missing values (`geom_bar()`).',
    )

    eline = []
    mem = 0

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            # tab-indented continuation lines and blank lines
            if line.startswith(('\t', '\n')):
                continue
            if any(token in info for token in noise_in_stripped):
                continue
            if any(token in line for token in noise_in_raw):
                continue
            eline.append(line)
            if 'memory' in line:
                mem = 1

    # Both flags follow directly from whether any line survived the filters.
    has_errors = 1 if eline else 0
    status = 0 if eline else 1

    return (status, eline, has_errors, mem)

## Check Logs

In [None]:
# Read the tab-separated samplesheet without a header row and store each row's
# index label in a `sample_index` column.
df = (
    pd.read_table(input_fn, skiprows=0, header=None)
    .assign(sample_index=lambda frame: frame.index)
)
print('number of samples:', len(df))

In [None]:
# Display the loaded samplesheet.
df

In [None]:
# Build one summary row per job by pairing each job's .out log with its .error log.
# Numbers noted by the original author (presumably earlier job-id prefixes — confirm):
# 506307, 508200, 5085036 (fixing mem errors), 50850 37-42 (hichip-peaks), 51014 (fithichip peaks)
output_logs = glob.glob('results/biorep_merged/results/loops/logs/job-7145*.out')
error_logs = glob.glob('results/biorep_merged/results/loops/logs/job-7145*.error')

log_columns = ['std_sample_name', 'job_id', 'config', 'peak_type', 'peaks_found', 'no_interact', 'f_status', 'out_status', 'error_status', 'errors', 'has_errors', 'mem', 'log']

# Index the error logs once by the id embedded in the file name
# ('job-<id>.error'); setdefault keeps the first file seen for an id.
error_log_by_index = {}
for error_path in error_logs:
    error_key = os.path.basename(error_path).split('-')[1].split('.')[0]
    error_log_by_index.setdefault(error_key, error_path)

log_data = []
for out_log in output_logs:
    sample_index = os.path.basename(out_log).split('-')[1].split('.')[0]
    # `loop_config` (not `config`) so the imported `config` module is not overwritten.
    sample_name, job_id, loop_config, peak_type, peaks_found, no_interact, f_status, status = read_out_log(out_log)
    error_log = error_log_by_index.get(sample_index)
    if error_log is None:
        # No matching .error file: flag the job as a problem instead of raising IndexError.
        error_status, eline, has_errors, mem = 0, ['missing error log for job {}\n'.format(sample_index)], 1, 0
    else:
        error_status, eline, has_errors, mem = read_error_log(error_log)
    log_data.append([sample_name, job_id, loop_config, peak_type, peaks_found, no_interact, f_status, status, error_status, eline, has_errors, mem, os.path.basename(out_log)])

# Passing `columns` to the constructor also works when no logs matched (empty frame).
log_df = pd.DataFrame(log_data, columns=log_columns)
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True)

In [None]:
# Order rows by std_sample_name, renumber the index from 0, then display the table.
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True).reset_index(drop=True)
log_df

## Identifying problem samples

In [None]:
# A row is a problem when out_status is 0, error_status is 0, or has_errors is 1.
incomplete_run = log_df['out_status'].eq(0)
flagged_errors = log_df['error_status'].eq(0) | log_df['has_errors'].eq(1)
problems = incomplete_run | flagged_errors
problems_df = log_df[problems]
print('number of problem samples:', len(problems_df))

In [None]:
# Show the problem rows restricted to the columns useful for triage.
problem_view_columns = [
    'std_sample_name', 'job_id', 'config', 'peak_type', 'peaks_found',
    'f_status', 'out_status', 'no_interact', 'error_status', 'errors', 'log',
]
problems_df[problem_view_columns]

In [None]:
# Same triage columns, limited to the rows whose config is "S5".
s5_rows = problems_df['config'] == "S5"
problems_df.loc[
    s5_rows,
    ['std_sample_name', 'job_id', 'config', 'peak_type', 'peaks_found', 'f_status', 'out_status', 'no_interact', 'error_status', 'errors', 'log'],
]

In [None]:
# Order the problem rows by std_sample_name and renumber the index from 0.
problems_df = problems_df.sort_values(by='std_sample_name').reset_index(drop=True)
# Write the table to an Excel workbook; the path is relative, so the file lands
# in the current working directory.
problems_df.to_excel("problems_biorepmerged.xlsx", sheet_name='sample_summary')