In [25]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
latest_date = tracker.post_hicpro_processing_dates[-1]
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('max_colwidth', 400)

In [2]:
latest_date

'2022.07.05.14.26'

In [3]:
# setting input and output with jupyter notebook in context 
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/post-hicpro/{}.post-hicpro.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.post-check.fithichip-peaks'.format(latest_date)
else:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]

## Helper Functions

In [4]:
def read_out_log(log):
    
    status = 0
    sample_name = 'check'
    read_length = -1
    date = ''

    # extract job_id and index
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                sample_name = info.split()[1]
            elif 'Using read length:' in info:
                read_length = info.split()[3]
            elif 'Ended: fithichip peak calling' in info:
                status = 1

    return([index, sample_name, read_length, job_id, status])

In [5]:
def read_error_log(log):
    
    status = 0
    
    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'Done!' in info:
                status = 1
    
    return(status)

In [6]:
def read_fithichip_peak_files(file):
    peak_data = {}
    count = 0
    with open(file) as fr:
        for line in fr:
            if line.startswith('c'):
                info = line.strip().split()
                peak_info = info[0:4]
                peak_data[count] = peak_info
                count = count + 1
    return(peak_data)

## Check Logs

In [7]:
# load the data
df = pd.read_table(input_fn)
df['sample_index'] = df.index + 1 # add 1 since we'll be using 1-based indexing with sed

In [8]:
df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,HCT116-AuxinNeg.GSE179544.Homo_Sapiens.RNA-Pol-II.b1,GSE179544,Homo_Sapiens,1,RNA-Pol-II,DpnII,HCT116-AuxinNeg,1
1,HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol-II.b1,GSE179544,Homo_Sapiens,1,RNA-Pol-II,DpnII,HCT116-AuxinPos,2
2,IMR90-Proliferating.GSE100856.Homo_Sapiens.CTCF.b1,GSE100856,Homo_Sapiens,1,CTCF,HindIII,IMR90-Proliferating,3
3,IMR90-Senescent.GSE100856.Homo_Sapiens.CTCF.b1,GSE100856,Homo_Sapiens,1,CTCF,HindIII,IMR90-Senescent,4
4,HUVEC-Proliferating.GSE100856.Homo_Sapiens.CTCF.b1,GSE100856,Homo_Sapiens,1,CTCF,HindIII,HUVEC-Proliferating,5
5,HUVEC-Senescent.GSE100856.Homo_Sapiens.CTCF.b1,GSE100856,Homo_Sapiens,1,CTCF,HindIII,HUVEC-Senescent,6
6,Hela-S3-Control.GSE108869.Homo_Sapiens.CTCF.b1,GSE108869,Homo_Sapiens,1,CTCF,MboI,Hela-S3-Control,7
7,Hela-S3-Control.GSE108869.Homo_Sapiens.CTCF.b2,GSE108869,Homo_Sapiens,2,CTCF,MboI,Hela-S3-Control,8
8,OCI-AML3-WT.GSE111537.Homo_Sapiens.CTCF.b1,GSE111537,Homo_Sapiens,1,CTCF,MboI,OCI-AML3-WT,9
9,OCI-AML3-WT.GSE111537.Homo_Sapiens.CTCF.b2,GSE111537,Homo_Sapiens,2,CTCF,MboI,OCI-AML3-WT,10


In [9]:
print('number of samples:', len(df))

number of samples: 181


In [10]:
# create a dataframe of log information
output_logs = glob.glob('results/peaks/logs/run_fithichip_peakcalling.o*-*')
log_data = []   
for out_log in output_logs:
    if int(out_log.split('/')[3].split('.')[1].split('-')[0].replace('o','')) > 4974167:
        sample_index = out_log.split('-')[1]
        index, sample_name, read_length, job_id, out_status = read_out_log(out_log)
        error_log = [log for log in glob.glob('results/peaks/logs/run_fithichip_peakcalling.e*-*') if log.split('-')[1] == sample_index][0]
        error_status = read_error_log(error_log)
        log_data.append([index, sample_name, read_length, job_id, out_status, error_status, os.path.basename(out_log)])
log_df = pd.DataFrame(log_data)
log_df.columns = ['sample_index', 'std_sample_name', 'read_length', 'job_id', 'out_status', 'error_status', 'log']
log_df.drop('sample_index', inplace=True, axis=1)

In [11]:
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True).reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,read_length,job_id,out_status,error_status,log
0,ARK-1.GSE137936.Homo_Sapiens.H3K27ac.b1,75,5041439,1,1,run_fithichip_peakcalling.o5041439-75
1,Aortic-VIC.GSE154513.Homo_Sapiens.H3K27ac.b1,101,5041439,1,1,run_fithichip_peakcalling.o5041439-105
2,CAR-T-CD19-D10.GSE168881.Homo_Sapiens.H3K27ac.b1,76,5041439,1,1,run_fithichip_peakcalling.o5041439-130
3,CAR-T-CD19-D10.GSE168881.Homo_Sapiens.H3K27ac.b2,76,5041439,1,1,run_fithichip_peakcalling.o5041439-131
4,CAR-T-HA-D10.GSE168881.Homo_Sapiens.H3K27ac.b1,76,5041439,1,1,run_fithichip_peakcalling.o5041439-132
5,CAR-T-HA-D10.GSE168881.Homo_Sapiens.H3K27ac.b2,76,5041439,1,1,run_fithichip_peakcalling.o5041439-133
6,COLO320-DM.GSE159985.Homo_Sapiens.H3K27ac.b1,76,5041439,1,1,run_fithichip_peakcalling.o5041439-124
7,CORL279.GSE151001.Homo_Sapiens.H3K27ac.b1,75,5041439,1,1,run_fithichip_peakcalling.o5041439-84
8,CORL88.GSE151001.Homo_Sapiens.H3K27ac.b1,75,5041439,1,1,run_fithichip_peakcalling.o5041439-85
9,CUTLL1.GSE115896.Homo_Sapiens.H3K27ac.b1,51,5041439,1,1,run_fithichip_peakcalling.o5041439-17


## Identifying problem samples

In [12]:
problems = (log_df.out_status != 1) | (log_df.error_status != 1) | (log_df.read_length == -1)
problems_df = log_df.loc[problems,:]
print('nubmer of problem samples:', len(problems_df))

nubmer of problem samples: 8


In [15]:
problems_df.loc[:, ['std_sample_name', 'job_id', 'out_status', 'out_status', 'read_length', 'log']]

Unnamed: 0,std_sample_name,job_id,out_status,out_status.1,read_length,log
148,T47D-T0.GSE179666.Homo_Sapiens.PR.b1,5041439,0,0,-1,run_fithichip_peakcalling.o5041439-147
149,T47D-T30.GSE179666.Homo_Sapiens.PR.b1,5041439,0,0,-1,run_fithichip_peakcalling.o5041439-146
159,VCaP-AU-4h.GSE171591.Homo_Sapiens.CTCF.b1,5041439,0,0,-1,run_fithichip_peakcalling.o5041439-139
160,VCaP-AU-4h.GSE171591.Homo_Sapiens.H3K27ac.b1,5041439,0,0,-1,run_fithichip_peakcalling.o5041439-137
161,VCaP-AU-4h.GSE171591.Homo_Sapiens.H3K4me4.b1,5041439,0,0,-1,run_fithichip_peakcalling.o5041439-135
162,VCaP-DMSO-4h.GSE171591.Homo_Sapiens.CTCF.b1,5041439,0,0,-1,run_fithichip_peakcalling.o5041439-138
163,VCaP-DMSO-4h.GSE171591.Homo_Sapiens.H3K27ac.b1,5041439,0,0,-1,run_fithichip_peakcalling.o5041439-136
164,VCaP-DMSO-4h.GSE171591.Homo_Sapiens.H3K4me3.b1,5041439,0,0,-1,run_fithichip_peakcalling.o5041439-134


In [17]:
header_output = '{}.with_header.tsv'.format(output_prefix)
problems_df.to_csv(header_output, header=True, index=False, sep='\t')