In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
import config
os.chdir(config.LOOP_CATALOG_DIR)
latest_date = tracker.processing_dates[-1]
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('max_colwidth', 400)

In [2]:
latest_date='2024.1.16.10.52'

In [3]:
# setting input and output with jupyter notebook in context 
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/hicpro/current.hicpro.samplesheet.without_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.post-check.fithichip-loop-calling'.format(latest_date)
else:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]

## Helper Functions

In [12]:
paths = glob.glob("results/biorep_merged/results/motif_analysis/meme/fimo/*/summarize_results/summary.txt")

In [13]:
len(paths)

52

In [14]:
for p in paths:
    if os.path.getsize(p) <= 1:
        print(p)
    with open(p) as motifs:
        count = 0
        for line in motifs:
            count +=1
        print(count)

7834
6802
26930
6817
20554
22222
12603
16933
12839
34403
13689
8444
6899
5735
7876
8422
5344
10111
19763
10561
7068
14880
17983
31992
10914
11879
12934
25915
3624
13342
4153
9444
12619
8737
20306
30589
6853
21419
5791
7271
6529
21445
12961
27439
7488
16855
11022
590
30558
28637
31269
7813


In [33]:
def read_out_log(log):
    
    status = 0
    f_status = 0
    sample_name = 'check'
    peaks_found = 0
    no_interact = 0
    date = ''
    config = ''
    peak_type = ''

     # extract job_id and index
    meta = os.path.basename(log)
    job_id = meta.split('-')[1].split('.')[0]
    #job_id = meta[0].replace('o', '')
    #index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                sample_name = info.split()[1]
            if 'peaks found and will be used to call loops' in info:
                peaks_found = 1
            if 'FitHiChIP pipeline is completely executed - congratulations !!!' in info:
                f_status = 1
            if 'SORRY !!!!!!!! FitHiChIP could not find any statistically significant interactions' in info:
                no_interact = 1
            if 'Ended: fithichip loop calling' in info:
                status = 1
            if 'Started: run_fithichip_loopcalling' in info:
                config = info.split("_")[3]
            if 'Selected Peak Mode' in info:
                mode = info.strip().split(": ")[1]
                if mode == "2":
                    peak_type = "fithichip"
                elif mode == "3":
                    peak_type = "chipseq"
                

    return([sample_name, job_id, config, peak_type, peaks_found, no_interact, f_status, status])

In [34]:
def read_error_log(log):
    
    status = 1
    eline = []
    has_errors = 0
    mem = 0
    
    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'cat: /var/spool/torque/aux' in info:
                continue
            if 'Loading required package:' in info:
                continue
            if 'Attaching package:' in info:
                continue
            if 'The following objects are masked' in info:
                continue
            if line.startswith('\t'):
                continue
            if 'The following object is masked' in info:
                continue
            if 'spar-finding: non-finite value inf; using BIG value' in info:
                continue
            if 'Warning message' in info:
                continue
            if line.startswith('\n'):
                continue
            if 'cluster' in line:
                continue
            if 'par' in line:
                continue
            if 'IQR' in line or 'anyDuplicated' in line or 'dirname' in line or 'grepl' in line or 'order' in line or 'rbind' in line or 'union' in line or 'expand.grid' in line or 'shift' in line or 'first, second' in line:
                continue
            if 'containing missing values (geom_bar).' in line:
                continue
            if 'containing missing values (geom_path).' in line:
                continue
            if 'containing missing values (geom_point).' in line:
                continue
            if 'NaNs produced' in line:
                continue
            if 'In data.frame(group = paste(' in line:
                continue
            if 'aesthetic for lines was deprecated in ggplot2 3.4.0.' in line:
                continue
            if 'Please use `linewidth` instead.' in line:
                continue
            if 'Removed 2 rows containing missing values (`geom_bar()`).' in line:
                continue
            status = 0
            eline.append(line)
            if 'memory' in line:
                mem = 1
        if len(eline) != 0:
            has_errors = 1
            
    return(status, eline, has_errors, mem)

## Check Logs

In [6]:
# load the data
df = pd.read_table(input_fn, skiprows=0, header = None)
df['sample_index'] = df.index
print('number of samples:', len(df))

number of samples: 478


In [7]:
df

Unnamed: 0,0,1,2,3,4,5,6,sample_index
0,C666-1.GSE123645.Homo_Sapiens.H3K27ac.b1,GSE123645,Homo_Sapiens,1,H3K27ac,MboI,C666-1,0
1,C666-1.GSE123645.Homo_Sapiens.H3K27ac.b2,GSE123645,Homo_Sapiens,2,H3K27ac,MboI,C666-1,1
2,C17.GSE123645.Homo_Sapiens.H3K27ac.b1,GSE123645,Homo_Sapiens,1,H3K27ac,MboI,C17,2
3,C17.GSE123645.Homo_Sapiens.H3K27ac.b2,GSE123645,Homo_Sapiens,2,H3K27ac,MboI,C17,3
4,LCL-EBNA3A-HT-On.GSE128952.Homo_Sapiens.H3K27ac.b1,GSE128952,Homo_Sapiens,1,H3K27ac,MboI,LCL-EBNA3A-HT-On,4
5,LCL-EBNA3A-HT-On.GSE128952.Homo_Sapiens.H3K27ac.b2,GSE128952,Homo_Sapiens,2,H3K27ac,MboI,LCL-EBNA3A-HT-On,5
6,LCL-EBNA3A-HT-Off.GSE128952.Homo_Sapiens.H3K27ac.b1,GSE128952,Homo_Sapiens,1,H3K27ac,MboI,LCL-EBNA3A-HT-Off,6
7,LCL-EBNA3A-HT-Off.GSE128952.Homo_Sapiens.H3K27ac.b2,GSE128952,Homo_Sapiens,2,H3K27ac,MboI,LCL-EBNA3A-HT-Off,7
8,B-Lymphocyte-EBV-Day0.GSE128952.Homo_Sapiens.RNA-Pol-II.b1,GSE128952,Homo_Sapiens,1,RNA-Pol-II,MboI,B-Lymphocyte-EBV-Day0,8
9,B-Lymphocyte-EBV-Day0.GSE128952.Homo_Sapiens.RNA-Pol-II.b2,GSE128952,Homo_Sapiens,2,RNA-Pol-II,MboI,B-Lymphocyte-EBV-Day0,9


In [45]:
# create a dataframe of log information, 506307, 508200, 5085036 (fixing mem errors), 50850 37-42 (hichip-peaks), 51014 (fithichip peaks)
output_logs = glob.glob('results/biorep_merged/results/loops/logs/job-7145*.out')
error_logs = glob.glob('results/biorep_merged/results/loops/logs/job-7145*.error')
log_data = []   
for out_log in output_logs: 
    sample_index = out_log.split('-')[1].split('.')[0]
    sample_name, job_id, config, peak_type, peaks_found, no_interact, f_status, status = read_out_log(out_log)
    error_log = [log for log in error_logs if log.split('-')[1].split('.')[0] == sample_index][0]
    error_status, eline, has_errors, mem = read_error_log(error_log)
    log_data.append([sample_name, job_id, config, peak_type, peaks_found, no_interact, f_status, status, error_status, eline, has_errors, mem, os.path.basename(out_log)])
log_df = pd.DataFrame(log_data)
log_df.columns = ['std_sample_name', 'job_id', 'config', 'peak_type', 'peaks_found', 'no_interact', 'f_status', 'out_status', 'error_status', 'errors', 'has_errors', 'mem', 'log']
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True)

In [46]:
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True).reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,job_id,config,peak_type,peaks_found,no_interact,f_status,out_status,error_status,errors,has_errors,mem,log
0,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.biorep_merged,7145914,L25,chipseq,1,0,1,1,1,[],0,0,job-7145914.out
1,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.biorep_merged,7145858,L5,chipseq,1,0,1,1,1,[],0,0,job-7145858.out
2,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3.biorep_merged,7145886,L10,chipseq,1,0,1,1,1,[],0,0,job-7145886.out
3,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.biorep_merged,7145859,L5,chipseq,1,0,1,1,1,[],0,0,job-7145859.out
4,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.biorep_merged,7145915,L25,chipseq,1,0,1,1,1,[],0,0,job-7145915.out
5,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3.biorep_merged,7145887,L10,chipseq,1,0,1,1,1,[],0,0,job-7145887.out
6,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.biorep_merged,7145946,L10,chipseq,1,0,1,1,1,[],0,0,job-7145946.out
7,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.biorep_merged,7145944,L25,chipseq,1,0,1,1,1,[],0,0,job-7145944.out
8,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.biorep_merged,7145854,S5,chipseq,1,0,0,0,1,[],0,0,job-7145854.out
9,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.biorep_merged,7145942,L5,chipseq,1,0,0,0,1,[],0,0,job-7145942.out


## Identifying problem samples

In [47]:
problems = (log_df.out_status == 0) | (log_df.error_status == 0) | (log_df.has_errors == 1)
problems_df = log_df.loc[problems,:]
print('number of problem samples:', len(problems_df))

number of problem samples: 4


In [48]:
problems_df.loc[:, ['std_sample_name', 'job_id', 'config', 'peak_type', 'peaks_found', 'f_status', 'out_status', 'no_interact', 'error_status', 'errors', 'log']]

Unnamed: 0,std_sample_name,job_id,config,peak_type,peaks_found,f_status,out_status,no_interact,error_status,errors,log
8,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.biorep_merged,7145854,S5,chipseq,1,0,0,0,1,[],job-7145854.out
9,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.biorep_merged,7145942,L5,chipseq,1,0,0,0,1,[],job-7145942.out
10,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.biorep_merged,7145855,S10,chipseq,1,0,0,0,1,[],job-7145855.out
77,Jurkat-1D7.GSE230313.Homo_Sapiens.H3K27ac.biorep_merged,7145851,L5,fithichip,1,0,0,0,1,[],job-7145851.out


In [29]:
problems_df.loc[:, ['std_sample_name', 'job_id', 'config', 'peak_type', 'peaks_found', 'f_status', 'out_status', 'no_interact', 'error_status', 'errors', 'log']].loc[problems_df['config'] == "S5"]

Unnamed: 0,std_sample_name,job_id,config,peak_type,peaks_found,f_status,out_status,no_interact,error_status,errors,log
41,B-Lymphocyte-EBV-Day0.GSE128952.Homo_Sapiens.RNA-Pol-II.biorep_merged,7126211,S5,fithichip,1,0,1,1,0,"[Error: no function to return from, jumping to top level\n, Execution halted\n, awk: fatal: cannot open file `/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/biorep_merged/results/loops/fithichip/B-Lymphocyte-EBV-Day0.GSE128952.Homo_Sapiens.RNA-Pol-II.biorep_merged_fithichip.peaks/S5//FitHiChIP_Peak2ALL_b5000_L20000_U2000000/P2PBckgr_1/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-...",job-7126211.out
48,B-Lymphocyte-EBV-Day28.GSE128952.Homo_Sapiens.RNA-Pol-II.biorep_merged,7126212,S5,fithichip,1,0,1,1,0,"[Error: no function to return from, jumping to top level\n, Execution halted\n, awk: fatal: cannot open file `/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/biorep_merged/results/loops/fithichip/B-Lymphocyte-EBV-Day28.GSE128952.Homo_Sapiens.RNA-Pol-II.biorep_merged_fithichip.peaks/S5//FitHiChIP_Peak2ALL_b5000_L20000_U2000000/P2PBckgr_1/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP...",job-7126212.out
71,BL6_CD4_CD8.GSE141847.Mus_Musculus.H3K27ac.biorep_merged,7132768,S5,chipseq,1,1,1,0,0,"[/mnt/BioAdHoc/Groups/vd-ay/kfetter/packages/fithichip/FitHiChIP/FitHiChIP_HiCPro.sh: line 1130: 1929048 Killed $RScriptExec ./src/InteractionHicPro.r $InpBinIntervalFile $InpMatrixFile $Interaction_Initial_File\n, slurmstepd: error: Detected 1 oom-kill event(s) in StepId=7132768.batch. Some of your processes may have been killed by the cgroup out-of-memory handler.\n]",job-7132768.out
83,BMDC-CTCF_cKO-LPS.GSE185883.Mus_Musculus.H3K27ac.biorep_merged,7126216,S5,fithichip,1,1,1,0,0,"[3: Removed 32 rows containing missing values (`geom_point()`). \n, There were 50 or more warnings (use warnings() to see the first 50)\n]",job-7126216.out
92,BMDC-CTCF_cKO.GSE185883.Mus_Musculus.H3K27ac.biorep_merged,7126217,S5,fithichip,1,1,1,0,0,"[There were 50 or more warnings (use warnings() to see the first 50)\n, Error in hist.default(histdata$dist, breaks = seq(min_dist, max_dist, : \n, some 'x' not counted; maybe 'breaks' do not span range of 'x'\n, Calls: hist -> hist.default\n, Execution halted\n]",job-7126217.out
128,BMDC-WT-LPS.GSE185883.Mus_Musculus.H3K27ac.biorep_merged,7126221,S5,fithichip,1,1,1,0,0,"[2: Removed 67 rows containing missing values (`geom_point()`). \n, There were 50 or more warnings (use warnings() to see the first 50)\n]",job-7126221.out
133,BMDC-WT-LPS_p65.GSE185883.Mus_Musculus.p65.biorep_merged,7126222,S5,fithichip,1,1,1,0,0,"[3: Removed 394 rows containing missing values (`geom_point()`). \n, 4: Removed 372 rows containing missing values (`geom_line()`). \n, There were 50 or more warnings (use warnings() to see the first 50)\n]",job-7126222.out
141,BMDC-WT.GSE185883.Mus_Musculus.H3K27ac.biorep_merged,7126223,S5,fithichip,1,1,1,0,0,[2: Removed 18 rows containing missing values (`geom_point()`). \n],job-7126223.out
181,C666-1.GSE123645.Homo_Sapiens.H3K27ac.biorep_merged,7126227,S5,fithichip,1,0,1,1,0,"[3: Removed 20 rows containing missing values (`geom_point()`). \n, 4: Removed 20 rows containing missing values (`geom_line()`). \n]",job-7126227.out
224,Cardiomyocyte-E18.5.GSE178673.Mus_Musculus.H3K27ac.biorep_merged,7126232,S5,fithichip,1,0,1,1,0,"[Error: no function to return from, jumping to top level\n, Execution halted\n, awk: fatal: cannot open file `/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/biorep_merged/results/loops/fithichip/Cardiomyocyte-E18.5.GSE178673.Mus_Musculus.H3K27ac.biorep_merged_fithichip.peaks/S5//FitHiChIP_Peak2ALL_b5000_L20000_U2000000/P2PBckgr_1/Coverage_Bias/FitHiC_BiasCorr/FitHiChIP-S5.in...",job-7126232.out


In [39]:
problems_df = problems_df.sort_values(by='std_sample_name').reset_index(drop=True)
problems_df.to_excel("problems_biorepmerged.xlsx", sheet_name='sample_summary')