In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
# Work from the project directory so the relative 'results/...' and
# 'workflow/...' paths used throughout this notebook resolve.
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/')
# Let pandas render up to 1000 rows / columns before truncating a preview.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [2]:
# Choose the input sample sheet and the output prefix: fixed defaults when
# running inside a Jupyter kernel, command-line arguments when run as a script.
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/fastq/2022.03.30.fastq.samplesheet.with_header.tsv'   
    output_prefix = 'results/samplesheets/fastq/2022.03.30.fastq.post-check.split_fastqs'
else:
    input_fn = sys.argv[1]
    # The output cells build their paths from `output_prefix`, so the script
    # branch has to define that same name (it used to set `output_fn`, which
    # left `output_prefix` undefined outside the notebook).
    output_prefix = sys.argv[2]

## Helper Functions

In [3]:
def get_log(x, mode='index'):
    """
    Find the download log file(s) that belong to one sample.

    Parameters
    ----------
    x : the value to look up; its meaning depends on `mode`.
    mode : 'index' (default) or 'path'.
        'index' - `x` is substituted directly into the log glob pattern.
        'path'  - `x` is a '/'-separated path; component 3 is taken as the
                  sample name and the leading part of component 4 (before the
                  first '.' and '_') as the SRR id. The matching row is then
                  looked up in the module-level `df` and its row label is
                  substituted into the glob pattern.

    Returns
    -------
    list of matching log paths, or the string 'Try again.' for any other mode.

    NOTE(review): 'path' mode reads the global `df` through the column labels
    0 and 3 - confirm these exist on the frame in use when this is called.
    """
    pattern = 'results/fastqs/raw/logs/download_srr_fastqs.*-{}'

    # unknown modes are reported with a message instead of an exception
    if mode not in ('index', 'path'):
        return 'Try again.'

    if mode == 'index':
        return glob.glob(pattern.format(x))

    # mode == 'path': recover sample name and SRR id from the path pieces
    parts = x.split('/')
    sample = parts[3]
    run = parts[4].split('.')[0].split('_')[0]

    matches = df.loc[(df[0] == sample) & (df[3] == run)]
    row_label = matches.iloc[0].name
    return glob.glob(pattern.format(row_label))

def list_to_str(l, sep=' '):
    """Return the items of `l`, each converted with str(), joined by `sep`."""
    return sep.join(map(str, l))

In [4]:
def read_log(log):
    """
    Read one job log and extract its status information.

    The job id and the index are taken from the file name, which is split as
    ``<prefix>.o<job_id>-<index>``. The file body is scanned line by line:
    the second whitespace-separated token of a line containing 'sample_name'
    (or, otherwise, 'srr_id') is recorded, and a line containing
    'Ended: split_fastqs' marks the job as finished.

    Parameters
    ----------
    log : str
        Path to the log file.

    Returns
    -------
    list
        [index, sample_name, srr_id, job_id, status] where `index` and
        `job_id` are strings from the file name, `sample_name` / `srr_id`
        default to 'check' when never found, and `status` is 1 if the end
        message was seen, else 0.
    """

    status = 0
    sample_name = 'check'
    srr_id = 'check'

    # extract job_id and index
    meta = os.path.basename(log).split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                fields = info.split()
                # a keyword with no value after it keeps the previous value
                # instead of raising IndexError
                if len(fields) > 1:
                    sample_name = fields[1]
            elif 'srr_id' in info:
                fields = info.split()
                if len(fields) > 1:
                    srr_id = fields[1]
            elif 'Ended: split_fastqs' in info:
                status = 1

    return [index, sample_name, srr_id, job_id, status]

In [5]:
def long_to_short_index_list(serial_list):
    """
    Collapse a sequence of integers into compact range strings.

    Consecutive values (each exactly one more than the previous) are merged
    into 'first-last'; a value with no consecutive neighbour is emitted on its
    own, e.g. [8, 52, 53, 62] -> ['8', '52-53', '62'].

    Parameters
    ----------
    serial_list : sequence of int
        Values in the order they should be scanned (callers pass a sorted list).

    Returns
    -------
    list of str
        One entry per run. An empty input gives an empty list.
    """
    values = list(serial_list)

    # nothing to collapse (previously an IndexError)
    if not values:
        return []

    def _format_run(first, last):
        # a run of one value is written without the dash
        if first == last:
            return '{}'.format(first)
        return '{}-{}'.format(first, last)

    serial_list_short = []
    run_start = prev_num = values[0]

    for next_num in values[1:]:
        # a gap closes the current run and opens a new one
        if (prev_num + 1) != next_num:
            serial_list_short.append(_format_run(run_start, prev_num))
            run_start = next_num
        prev_num = next_num

    # flush the last run; this also covers the single-element input, which
    # used to fail because the loop body never ran
    serial_list_short.append(_format_run(run_start, prev_num))

    return serial_list_short

## Check Logs

In [6]:
# read the tab-delimited sample sheet and add a row number that starts at 1
df = pd.read_table(input_fn)
df = df.assign(sample_index=df.index + 1)

In [7]:
# create a dataframe of log information
logs = glob.glob('results/fastqs/parallel/logs/split_reads.o*-*')
log_data = []
for log in logs:
    index, sample_name, srr_id, job_id, status = read_log(log)
    log_data.append([index, sample_name, srr_id, job_id, status, log])

# Naming the columns in the constructor keeps the expected schema even when
# no log files were found (assigning `.columns` afterwards fails on an empty,
# zero-column frame).
log_columns = ['sample_index', 'std_sample_name', 'srr_id', 'job_id', 'end_status', 'log']
log_df = pd.DataFrame(log_data, columns=log_columns)

# The log-derived index is not kept; the sample sheet already carries its own
# `sample_index` column.
log_df = log_df.drop('sample_index', axis=1)

In [8]:
# preview the first parsed log records
log_df.head()

Unnamed: 0,std_sample_name,srr_id,job_id,end_status,log
0,lgs102943.GSE116193.Homo_Sapiens.CTCF.b1,SRR7417517,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...
1,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2,SRR5831482,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...
2,lgs102943.GSE116193.Homo_Sapiens.H3K27ac.b1,SRR7417511,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...
3,HAEC.GSE178598.Homo_Sapiens.H3K27ac.b1,SRR14872079,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...
4,AoSMC.GSE178598.Homo_Sapiens.H3K27ac.b1,SRR14872076,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...


In [9]:
# Left-join the parsed log fields onto the sample sheet. The extra 'merge'
# column records whether a row found a matching log.
df = pd.merge(
    df,
    log_df,
    on=['std_sample_name', 'srr_id'],
    how='left',
    indicator='merge',
)
# samples without any log count as not finished (0); store the flag as int
df['end_status'] = df['end_status'].fillna(0).astype(int)

In [10]:
# get the latest job id for a given sample + srr id combo
# job_id is parsed from the log file name as a string, so it is ranked by its
# numeric value here; a plain string sort would put '999' ahead of '1000'.
# Rows without a job id (NaN) sort last.
df = (
    df.assign(_job_id_num=pd.to_numeric(df['job_id'], errors='coerce'))
      .sort_values('_job_id_num', ascending=False)
      .drop_duplicates(['std_sample_name', 'srr_id'])
      .drop(columns='_job_id_num')
)

In [11]:
# (rows, columns) after keeping one row per sample + SRR id pair
df.shape

(80, 15)

In [12]:
# preview the merged, de-duplicated sample sheet
df.head()

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index,job_id,end_status,log,merge
0,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705031,SRR5831479,Mus_Musculus,1,1,H3K27ac,MboI,mES_25m_cells,1,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...,both
59,JSC.GSE136090.Homo_Sapiens.H3K27ac.b1,GSE136090,GSM4040944,SRR10008153,Homo_Sapiens,1,1,H3K27ac,MboI,JSC,60,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...,both
57,BC1.GSE136090.Homo_Sapiens.H3K27ac.b1,GSE136090,GSM4040943,SRR10008151,Homo_Sapiens,1,1,H3K27ac,MboI,BC1,58,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...,both
56,BCBL1.GSE136090.Homo_Sapiens.H3K27ac.b1,GSE136090,GSM4040942,SRR10008150,Homo_Sapiens,1,2,H3K27ac,MboI,BCBL1,57,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...,both
55,BCBL1.GSE136090.Homo_Sapiens.H3K27ac.b1,GSE136090,GSM4040942,SRR10008149,Homo_Sapiens,1,1,H3K27ac,MboI,BCBL1,56,4919273,1,results/fastqs/parallel/logs/split_reads.o4919...,both


In [13]:
# Save the checked sample sheet twice as tab-separated text: once with the
# header row and once without it.
output_template = '{}.{}.tsv'
header_output = output_template.format(output_prefix, 'with_header')
without_header_output = output_template.format(output_prefix, 'without_header')

df.to_csv(header_output, sep='\t', index=False, header=True)
df.to_csv(without_header_output, sep='\t', index=False, header=False)

In [14]:
# path of the with-header sheet written in the previous cell
header_output

'results/samplesheets/fastq/2022.03.30.fastq.post-check.split_fastqs.with_header.tsv'

## Identifying problem samples

In [15]:
# every row whose end_status is anything other than 1 is a problem sample
problems = df['end_status'].ne(1)
problems_df = df.loc[problems]
problems_df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index,job_id,end_status,log,merge
7,mES_100k_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705038,SRR5831486,Mus_Musculus,2,1,H3K27ac,MboI,mES_100k_cells,8,,0,,left_only
51,HK2.GSE147646.Homo_Sapiens.H3K27ac.b1,GSE147646,GSM4437223,SRR11434878,Homo_Sapiens,1,1,H3K27ac,MboI,HK2,52,,0,,left_only
52,HK2.GSE147646.Homo_Sapiens.H3K27ac.b1,GSE147646,GSM4437224,SRR11434879,Homo_Sapiens,1,2,H3K27ac,MboI,HK2,53,,0,,left_only
61,HAVIC.GSE154512.Homo_Sapiens.H3K27ac.b1,GSE154512,GSM4672364,SRR12231664,Homo_Sapiens,1,1,H3K27ac,MboI,HAVIC,62,,0,,left_only


In [16]:
# log paths of the problem rows (NaN where the merge found no log)
problems_df.log.tolist()

[nan, nan, nan, nan]

## Rerun problematic samples

In [17]:
# collapse the sorted sample indexes of the problem rows into ranges/short format
problem_indexes = problems_df['sample_index'].tolist()
problem_indexes.sort()
rerun_indexes = long_to_short_index_list(problem_indexes)

In [18]:
# compact index ranges that still need to be run
rerun_indexes

['8', '52-53', '62']

In [19]:
# build the qsub array command: the index ranges are comma-joined and passed to -t
qsub_template = 'qsub -t {} workflow/scripts/hicpro/split_fastqs.qarray.qsh'
final_rerun = qsub_template.format(','.join(rerun_indexes))

In [20]:
# show the assembled qsub command
print(final_rerun)

qsub -t 8,52-53,62 workflow/scripts/hicpro/split_fastqs.qarray.qsh
