In [17]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
# Last entry of the project-local tracker's list of post-HiC-Pro processing dates;
# it is used below to build the samplesheet file names.
latest_date = tracker.post_hicpro_processing_dates[-1]
# All relative paths used later in the notebook are resolved against this directory.
# NOTE(review): absolute, user-specific path; the notebook will not run elsewhere as-is.
os.chdir('/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/')
# Raise the pandas display limits so long tables and long sample names are not truncated.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('max_colwidth', 400)

In [18]:
# show which processing date the file names below will be built from
latest_date

'2022.07.12.09.12'

In [19]:
# Choose the input samplesheet and the output location depending on how the
# code is being run:
#   * inside a Jupyter kernel (sys.argv[0] is ipykernel_launcher.py) the paths
#     are derived from `latest_date`;
#   * as a script, they are taken from the first two command-line arguments.
# `output_prefix` is bound in both modes so code that follows can use one name
# regardless of the mode.
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/post-hicpro/{}.post-hicpro.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/post-hicpro/{}.post-hicpro.post-check.fithichip-peaks'.format(latest_date)
else:
    input_fn = sys.argv[1]
    output_prefix = sys.argv[2]
    # Previous script-mode name, kept so anything still reading `output_fn` keeps working.
    output_fn = output_prefix

## Helper Functions

In [20]:
def read_out_log(log):
    """Summarise one HiCCUPS stdout log.

    The job id and task index are parsed from the file name, which is expected
    to look like ``<prefix>.o<job_id>-<index>`` (e.g. ``run_hiccups.o5941947-52``).
    The file is then scanned line by line for the sample name and for the
    completion message.

    Parameters
    ----------
    log : str
        Path to the stdout log file.

    Returns
    -------
    list
        ``[index, sample_name, job_id, status]`` where

        * ``index`` and ``job_id`` are strings taken from the file name,
        * ``sample_name`` is the second whitespace-separated token of the last
          line containing ``sample_name`` (``'check'`` if no such line carries
          a value),
        * ``status`` is 1 if a line containing ``HiCCUPS complete`` was seen,
          otherwise 0.

    Raises
    ------
    IndexError
        If the file name does not contain the ``.`` / ``-`` separators
        described above.
    """
    status = 0
    sample_name = 'check'  # placeholder that flags logs without a sample name

    # extract job_id and index from "<prefix>.o<job_id>-<index>"
    meta = os.path.basename(log).split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    # errors='ignore' drops undecodable bytes instead of aborting the scan
    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                fields = info.split()
                # a line that mentions sample_name without a value keeps the
                # previous name rather than raising IndexError
                if len(fields) > 1:
                    sample_name = fields[1]
            elif 'HiCCUPS complete' in info:
                status = 1

    return [index, sample_name, job_id, status]

In [21]:
def read_error_log(log):
    """Classify a HiCCUPS stderr log by the known failure messages it contains.

    Every line is checked against each known message in turn; the label of the
    most recent match is kept, so a later match overrides an earlier one.

    Parameters
    ----------
    log : str
        Path to the stderr log file.

    Returns
    -------
    str
        ``'null dataset'``, ``'too sparse for many loops'``, or ``''`` when
        neither message occurs in the file.
    """
    # (message to look for, label to report), checked in this order on every line
    known_failures = (
        ('Could not read hic file: null', 'null dataset'),
        ('Hi-C map is too sparse', 'too sparse for many loops'),
    )

    outcome = ''
    with open(log, errors='ignore') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            for message, label in known_failures:
                if message in text:
                    outcome = label
    return outcome

## Check Logs

In [22]:
# Load the headerless HiC-Pro samplesheet and label its columns.
columns = [
    'std_sample_name',
    'gse_id',
    'organism',
    'bio_rep',
    'antibody_target',
    'restriction_enzyme',
    'sample_name',
]
df = pd.read_table('results/samplesheets/hicpro/current.hicpro.samplesheet.without_header.tsv', header=None)
df.columns = columns
# sed counts lines from 1, so the 0-based row index is shifted by one
df = df.assign(sample_index=df.index + 1)

In [23]:
# display the labelled samplesheet together with its 1-based sample_index
df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,293T.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,Homo_Sapiens,1,YY1,HindIII,293T,1
1,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,Homo_Sapiens,1,YY1,HindIII,293T-PDS,2
2,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,GSE128106,Homo_Sapiens,1,YY1,HindIII,293T-TMPYP4,3
3,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,Homo_Sapiens,1,CTCF,MboI,A673_SA1m1,4
4,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,Homo_Sapiens,2,CTCF,MboI,A673_SA1m1,5
5,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,Homo_Sapiens,1,CTCF,MboI,A673_SA2m1,6
6,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,Homo_Sapiens,2,CTCF,MboI,A673_SA2m1,7
7,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,Homo_Sapiens,3,CTCF,MboI,A673_SA2m1,8
8,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b4,GSE133227,Homo_Sapiens,4,CTCF,MboI,A673_SA2m1,9
9,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b5,GSE133227,Homo_Sapiens,5,CTCF,MboI,A673_SA2m1,10


In [24]:
# one row per sample, so the row count is the number of samples
print('number of samples:', len(df))

number of samples: 486


In [30]:
# create a dataframe of log information
output_logs = glob.glob('results/loops/logs/run_hiccups.o5941947*-*')

# List the error logs once and key them by the "<job_id>-<index>" text that
# follows "hiccups.e", rather than re-running the glob for every stdout log.
# setdefault keeps the first path seen for a key.
error_logs = {}
for err_log in glob.glob('results/loops/logs/run_hiccups.e5941947*-*'):
    error_logs.setdefault(err_log.split('hiccups.e')[1], err_log)

log_data = []
for out_log in output_logs:
    # read_out_log parses the index and job id from the file's base name, so
    # the lookup key does not depend on '-' characters elsewhere in the path
    index, sample_name, job_id, out_status = read_out_log(out_log)
    error_log = error_logs.get('-'.join([str(job_id), index]))
    if error_log is None:
        # no matching stderr file: record that explicitly instead of raising
        error_status = 'missing error log'
    else:
        error_status = read_error_log(error_log)
    log_data.append([index, sample_name, job_id, out_status, error_status, os.path.basename(out_log)])

# Passing the column names to the constructor also works when no logs matched
# (an empty frame with the expected columns).
log_df = pd.DataFrame(
    log_data,
    columns=['sample_index', 'std_sample_name', 'job_id', 'out_status', 'error_status', 'log'],
)
log_df = log_df.drop(columns='sample_index')
# All runs of a sample are kept (no de-duplication), ordered by sample name then job id.
log_df = log_df.sort_values(by=["std_sample_name", "job_id"], ascending=True)

In [31]:
# Order the rows by sample name, breaking ties by job id, and renumber the index.
# Both keys are listed because sort_values' default algorithm is not stable:
# sorting on the name alone does not guarantee any particular job_id order
# among rows that share a sample name.
log_df = log_df.sort_values(by=["std_sample_name", "job_id"], ascending=True).reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,job_id,out_status,error_status,log
0,CD34+-Cord-Blood.GSE165207.Homo_Sapiens.H3K27ac.b1,5941947,1,,run_hiccups.o5941947-52
1,cbCD34+-HSPC-SJALL068279-D1.GSE165207.Homo_Sapiens.H3K27ac.b1,5941947,1,too sparse for many loops,run_hiccups.o5941947-47
2,cbCD34+-HSPC-SJAUL068292-D1.GSE165207.Homo_Sapiens.H3K27ac.b1,5941947,1,too sparse for many loops,run_hiccups.o5941947-48
3,cbCD34+-HSPC-SJMPAL011911-D1.GSE165207.Homo_Sapiens.H3K27ac.b1,5941947,1,too sparse for many loops,run_hiccups.o5941947-49
4,cbCD34+-HSPC-SJMPAL011914-D1.GSE165207.Homo_Sapiens.H3K27ac.b1,5941947,1,too sparse for many loops,run_hiccups.o5941947-50
5,cbCD34+-HSPC-SJTALL005006-D2.GSE165207.Homo_Sapiens.H3K27ac.b1,5941947,1,too sparse for many loops,run_hiccups.o5941947-51


## Identifying problem samples

In [32]:
# A run is a problem when its stdout log never reported completion,
# i.e. out_status is still 0.
problems = (log_df.out_status == 0)
problems_df = log_df.loc[problems, :]
print('number of problem samples:', len(problems_df))

nubmer of problem samples: 0


In [33]:
# display the problem runs with the log columns in a fixed order
problems_df.loc[:, ['std_sample_name', 'job_id', 'out_status', 'error_status', 'log']]

Unnamed: 0,std_sample_name,job_id,out_status,error_status,log


In [29]:
# Translate the problem samples back into 1-based line numbers of the
# samplesheet and print them comma-separated.
# NOTE(review): this is a dated, absolute samplesheet path, not the "current"
# samplesheet loaded earlier in the notebook; confirm it is the intended file.
s = "/mnt/bioadhoc-temp/Groups/vd-ay/kfetter/hichip-db-loop-calling/results/samplesheets/hicpro/2022.10.20.09.39.hicpro.samplesheet.without_header.tsv"

# Compare against the sample-name column only; testing membership in
# problems_df.values would also match job ids, statuses and log file names.
problem_samples = set(problems_df['std_sample_name'])

samples_to_run = []
# a separate handle name keeps `s` bound to the path, so the cell can be re-run
with open(s) as samplesheet:
    # enumerate from 1 so `index` is the line number within the samplesheet
    for index, line in enumerate(samplesheet, start=1):
        info = line.strip().split()
        # blank lines still advance the line number but cannot name a sample
        if info and info[0] in problem_samples:
            samples_to_run.append(str(index))
print(','.join(samples_to_run))
print("length:", len(samples_to_run))

47,48,49,50,51,52
length: 6


In [61]:
# Take the first column of every problem row and print the values separated by spaces.
samples = [entry[0] for entry in problems_df.values]
print(' '.join(samples))

DND41-DMSO.GSE173871.Homo_Sapiens.SMC1A.b1 DND41-Resistant.GSE173871.Homo_Sapiens.SMC1A.b1 HCC1599-GSI.GSE116872.Homo_Sapiens.SMC1A.b1 HCC1599-WT.GSE116872.Homo_Sapiens.SMC1A.b1 HaCaT_Stimulated.GSE151193.Homo_Sapiens.H3K27ac.b1 MB157-GSI.GSE116869.Homo_Sapiens.SMC1A.b1 MB157-WO.GSE116869.Homo_Sapiens.SMC1A.b1 Rec1-GSI.GSE116875.Homo_Sapiens.SMC1A.b1 Rec1-WT.GSE116875.Homo_Sapiens.SMC1A.b1 STAG2-KO.GSE116495.Homo_Sapiens.SMC1A.b1 TC71_WT.GSE133227.Homo_Sapiens.CTCF.b1 THP-1-del-cIDR-Batch1.GSE149420.Homo_Sapiens.H3K4me3.b1 THP-1-del-cIDR.GSE149420.Homo_Sapiens.H3K27ac.b1 Th17.GSE101498.Homo_Sapiens.H3K27ac.b1
