In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
import config
# run from the catalog root so the relative 'results/...' paths used later resolve
os.chdir(config.LOOP_CATALOG_DIR)

# last entry of the tracker's list of processing dates
latest_date = tracker.processing_dates[-1]

# raise pandas display limits (rows, columns, cell width) for wide tables
for _option, _limit in (('display.max_rows', 1000),
                        ('display.max_columns', 1000),
                        ('max_colwidth', 400)):
    pd.set_option(_option, _limit)

In [2]:
# show the processing date used to build the notebook-mode input/output paths
latest_date

'2022.07.05.11.57'

In [3]:
# Choose the input samplesheet and the output prefix.
# Inside a Jupyter kernel (sys.argv[0] is ipykernel_launcher.py) both are
# derived from latest_date; otherwise they are read from the command line.
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/fastq/{}.fastq.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/fastq/{}.fastq.post-check.download_srr_fastqs'.format(latest_date)
else:
    input_fn = sys.argv[1]
    # The save step builds its file names from output_prefix, so the script
    # branch must define it too (it previously only set output_fn, which made
    # script mode fail with a NameError when saving).
    output_prefix = sys.argv[2]
    output_fn = output_prefix  # old name kept as an alias for backward compatibility

## Helper Functions

In [4]:
def list_to_str(l, sep=' '):
    """Return the items of `l`, each converted with str(), joined by `sep`."""
    return sep.join(map(str, l))

def read_log(log):

    """
    Read a download log and extract its status information.

    The job id and array index come from the file name, which is expected
    to look like ``<name>.o<job_id>-<index>``. The remaining fields come
    from the log text:

    - sample_name / srr_id: second whitespace-separated token of a line
      containing that key ('check' is kept as a placeholder when the key
      is absent or has no value after it).
    - status: 1 when an "Ended:" line is present, otherwise 0.
    - download: 1 when a "reads written" line is present, otherwise 0.

    Parameters
    ----------
    log : str
        Path to the log file.

    Returns
    -------
    list
        [index, sample_name, srr_id, job_id, status, download], where
        index and job_id are strings taken from the file name.
    """

    status = 0
    # Start at 0 so the flag really reflects the "reads written" message;
    # it used to start at 1, which made the check below a no-op.
    # NOTE(review): if some log flavours never print "reads written",
    # those samples will now be reported as problems - confirm this is wanted.
    download = 0
    sample_name = 'check'
    srr_id = 'check'

    # extract job_id and index
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                fields = info.split()
                # a key with no value keeps the placeholder instead of raising
                if len(fields) > 1:
                    sample_name = fields[1]
            elif 'srr_id' in info:
                fields = info.split()
                if len(fields) > 1:
                    srr_id = fields[1]
            elif 'Ended:' in info:
                status = 1
            elif 'reads written' in info:
                download = 1

    return([index, sample_name, srr_id, job_id, status, download])

def long_to_short_index_list(serial_list):
    """
    Collapse a sorted list of integers into compact range strings.

    Runs of consecutive values become 'start-end' and isolated values
    become 'value', e.g. [1, 2, 3, 5] -> ['1-3', '5'].

    Parameters
    ----------
    serial_list : list of int
        Indexes in ascending order.

    Returns
    -------
    list of str
        One entry per run; an empty input gives an empty list.
    """

    def _format_run(first, last):
        # a run of length one is written as a single number
        if first == last:
            return '{}'.format(first)
        return '{}-{}'.format(first, last)

    # nothing to collapse (previously raised IndexError)
    if len(serial_list) == 0:
        return([])

    serial_list_short = []
    small_num = serial_list[0]
    prev_num = small_num

    for next_num in serial_list[1:]:
        # a gap closes the current run and starts a new one
        if (prev_num + 1) != next_num:
            serial_list_short.append(_format_run(small_num, prev_num))
            small_num = next_num
        prev_num = next_num

    # flush the final run; this also covers a single-element input, which
    # previously failed because the loop variable was never assigned
    serial_list_short.append(_format_run(small_num, prev_num))

    return(serial_list_short)

## Check Logs

In [5]:
# read the samplesheet and tag each row with a 1-based position,
# because sed addresses lines starting from 1
df = pd.read_table(input_fn).assign(sample_index=lambda sheet: sheet.index + 1)

In [6]:
# display the loaded samplesheet together with the added sample_index column
df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,GSM2705041,SRR5831489,Homo_Sapiens,1,1,H3K27ac,MboI,GM12878,1
1,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,GSM2705042,SRR5831490,Homo_Sapiens,2,1,H3K27ac,MboI,GM12878,2
2,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902791,SRR9590180,Homo_Sapiens,1,1,CTCF,MboI,A673_SA1m1,3
3,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902792,SRR9590181,Homo_Sapiens,2,1,CTCF,MboI,A673_SA1m1,4
4,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902793,SRR9590182,Homo_Sapiens,1,1,CTCF,MboI,A673_SA2m1,5
5,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902794,SRR9590183,Homo_Sapiens,2,1,CTCF,MboI,A673_SA2m1,6
6,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,GSM3902795,SRR9590184,Homo_Sapiens,3,1,CTCF,MboI,A673_SA2m1,7
7,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b4,GSE133227,GSM3902796,SRR9590185,Homo_Sapiens,4,1,CTCF,MboI,A673_SA2m1,8
8,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b5,GSE133227,GSM3902797,SRR9590186,Homo_Sapiens,5,1,CTCF,MboI,A673_SA2m1,9
9,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b1,GSE133227,GSM3902798,SRR9590187,Homo_Sapiens,1,1,H3K27ac,MboI,A673_SA2m1,10


In [7]:
# create a dataframe of log information, one row per log file
# (sorted so that the row order is reproducible between runs)
logs = sorted(glob.glob('results/fastqs/raw/logs/*download_srr_fastqs.o*-*'))
log_data = []
for log in logs:
    index, sample_name, srr_id, job_id, status, download = read_log(log)
    log_data.append([index, sample_name, srr_id, job_id, status, download, log])

# Passing the column names to the constructor keeps this working when no log
# matches the pattern; assigning .columns on an empty frame raises ValueError.
log_columns = ['sample_index', 'std_sample_name', 'srr_id', 'job_id', 'end_status', 'read_written_status', 'log']
log_df = pd.DataFrame(log_data, columns=log_columns)

# the samplesheet carries its own sample_index, so drop the one parsed from the log name
log_df = log_df.drop(columns='sample_index')

In [12]:
# Keep only the log of the most recent job for each sample + SRR pair.
# job_id is parsed from the file name as a string, so it is ordered through a
# numeric helper column; a plain string sort would rank '999' above '1000'.
# Ids that cannot be parsed sort first, so they never win over a real job id.
sorted_log_df = (
    log_df
    .assign(_job_order=pd.to_numeric(log_df['job_id'], errors='coerce'))
    .sort_values(by=['std_sample_name', '_job_order'], na_position='first')
    .drop(columns='_job_order')
)
rdup_log_df = sorted_log_df.drop_duplicates(subset=['std_sample_name', 'srr_id'], keep='last')
log_df = rdup_log_df
log_df

Unnamed: 0,std_sample_name,srr_id,job_id,end_status,read_written_status,log
34,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492723,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-76
69,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492726,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-79
114,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492727,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-80
121,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492724,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-77
29,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492725,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-78
42,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492728,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-81
10,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492729,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-82
24,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492732,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-85
54,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492733,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-86
80,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492730,5033885,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-83


In [9]:
# attach the log summary to every samplesheet row; rows without a matching
# log get status 0 in both flags, and both flags are stored as integers
df = (
    df.merge(log_df, how='left', on=['std_sample_name', 'srr_id'], indicator='merge')
    .assign(
        end_status=lambda merged: merged['end_status'].fillna(0).astype(int),
        read_written_status=lambda merged: merged['read_written_status'].fillna(0).astype(int),
    )
)
df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index,job_id,end_status,read_written_status,log,merge
0,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,GSM2705041,SRR5831489,Homo_Sapiens,1,1,H3K27ac,MboI,GM12878,1,,0,0,,left_only
1,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,GSM2705042,SRR5831490,Homo_Sapiens,2,1,H3K27ac,MboI,GM12878,2,,0,0,,left_only
2,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902791,SRR9590180,Homo_Sapiens,1,1,CTCF,MboI,A673_SA1m1,3,5033793.0,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033793-1,both
3,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902792,SRR9590181,Homo_Sapiens,2,1,CTCF,MboI,A673_SA1m1,4,5033793.0,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033793-2,both
4,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902793,SRR9590182,Homo_Sapiens,1,1,CTCF,MboI,A673_SA2m1,5,5033793.0,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033793-3,both
5,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902794,SRR9590183,Homo_Sapiens,2,1,CTCF,MboI,A673_SA2m1,6,5033793.0,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033793-4,both
6,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,GSM3902795,SRR9590184,Homo_Sapiens,3,1,CTCF,MboI,A673_SA2m1,7,5033885.0,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-5,both
7,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b4,GSE133227,GSM3902796,SRR9590185,Homo_Sapiens,4,1,CTCF,MboI,A673_SA2m1,8,5033885.0,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-6,both
8,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b5,GSE133227,GSM3902797,SRR9590186,Homo_Sapiens,5,1,CTCF,MboI,A673_SA2m1,9,5033885.0,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-7,both
9,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b1,GSE133227,GSM3902798,SRR9590187,Homo_Sapiens,1,1,H3K27ac,MboI,A673_SA2m1,10,5033885.0,1,1,results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o5033885-8,both


In [10]:
# keep one row per sample + SRR pair, preferring the highest job id, then
# restore sample_index order, which is needed for correctly running qsub
df = (
    df.sort_values('job_id', ascending=False)
    .drop_duplicates(['std_sample_name', 'srr_id'])
    .sort_values('sample_index')
)

## Check the presence of SRR FASTQ files

In [11]:
# expected location of the second-read FASTQ for each sample + SRR pair
download_tpl = 'results/fastqs/raw/{}/{}_2.fastq.gz'

# Look the two fields up by column name: positional access on a row Series
# (x[0], x[3]) depends on column order and is deprecated in recent pandas.
# Building the list directly also works when the frame has no rows.
df['download_output'] = [
    download_tpl.format(sample, srr)
    for sample, srr in zip(df['std_sample_name'], df['srr_id'])
]

# 1 when the expected FASTQ exists on disk, 0 otherwise
df['download_present'] = df['download_output'].map(os.path.exists).astype(int)

In [12]:
# output column order: sample identifiers, then job/status flags,
# then experiment details, then file paths
reorder_cols = [
    'std_sample_name', 'gse_id', 'gsm_id', 'srr_id',
    'organism', 'bio_rep', 'tech_rep',
    'sample_index', 'job_id',
    'end_status', 'read_written_status', 'download_present',
    'antibody_target', 'restriction_enzyme', 'sample_name',
    'log', 'download_output',
]

In [13]:
# keep only the columns listed in reorder_cols, in that order
df = df.loc[:, reorder_cols]

In [14]:
# prefix the log-derived columns with 'download_' in the saved tables
download_column_names = {
    'end_status': 'download_end_status',
    'read_written_status': 'download_read_written_status',
    'log': 'download_log',
    'job_id': 'download_job_id',
}
save_df = df.rename(columns=download_column_names)

# the same table is written twice: once with and once without a header row
header_output = '{}.with_header.tsv'.format(output_prefix)
without_header_output = '{}.without_header.tsv'.format(output_prefix)

save_df.to_csv(header_output, sep='\t', index=False, header=True)
save_df.to_csv(without_header_output, sep='\t', index=False, header=False)

## Identifying problem samples

In [15]:
# a sample is a problem when any of its three checks is not 1
problems = df[['end_status', 'read_written_status', 'download_present']].ne(1).any(axis=1)
problems_df = df.loc[problems, :]
problems_df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,sample_index,job_id,end_status,read_written_status,download_present,antibody_target,restriction_enzyme,sample_name,log,download_output
0,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,GSM2705041,SRR5831489,Homo_Sapiens,1,1,1,,0,0,1,H3K27ac,MboI,GM12878,,results/fastqs/raw/GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1/SRR5831489_2.fastq.gz
1,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,GSM2705042,SRR5831490,Homo_Sapiens,2,1,2,,0,0,1,H3K27ac,MboI,GM12878,,results/fastqs/raw/GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2/SRR5831490_2.fastq.gz
37,HK2.GSE147646.Homo_Sapiens.H3K27ac.b1,GSE147646,GSM4437223,SRR11434878,Homo_Sapiens,1,1,38,,0,0,1,H3K27ac,MboI,HK2,,results/fastqs/raw/HK2.GSE147646.Homo_Sapiens.H3K27ac.b1/SRR11434878_2.fastq.gz
38,HK2.GSE147646.Homo_Sapiens.H3K27ac.b1,GSE147646,GSM4437224,SRR11434879,Homo_Sapiens,1,2,39,,0,0,1,H3K27ac,MboI,HK2,,results/fastqs/raw/HK2.GSE147646.Homo_Sapiens.H3K27ac.b1/SRR11434879_2.fastq.gz
55,Hela-S3-OE.GSE108869.Homo_Sapiens.CTCF.b1,GSE108869,GSM2974087,SRR6657728,Homo_Sapiens,1,1,56,,0,0,1,CTCF,MboI,Hela-S3-OE,,results/fastqs/raw/Hela-S3-OE.GSE108869.Homo_Sapiens.CTCF.b1/SRR6657728_2.fastq.gz
56,Hela-S3-OE.GSE108869.Homo_Sapiens.CTCF.b2,GSE108869,GSM2974088,SRR6657729,Homo_Sapiens,2,1,57,,0,0,1,CTCF,MboI,Hela-S3-OE,,results/fastqs/raw/Hela-S3-OE.GSE108869.Homo_Sapiens.CTCF.b2/SRR6657729_2.fastq.gz
57,GM12878.GSE115524.Homo_Sapiens.CTCF.b1,GSE115524,GSM3424974,SRR7990656,Homo_Sapiens,1,1,58,,0,0,1,CTCF,MboI,GM12878,,results/fastqs/raw/GM12878.GSE115524.Homo_Sapiens.CTCF.b1/SRR7990656_2.fastq.gz
58,GM12878.GSE115524.Homo_Sapiens.CTCF.b2,GSE115524,GSM3424975,SRR7990657,Homo_Sapiens,2,1,59,,0,0,1,CTCF,MboI,GM12878,,results/fastqs/raw/GM12878.GSE115524.Homo_Sapiens.CTCF.b2/SRR7990657_2.fastq.gz
63,HEC-1-B-WT.GSE148958.Homo_Sapiens.V5.b1,GSE148958,GSM5533482,SRR15569144,Homo_Sapiens,1,1,64,,0,0,1,V5,MboI,HEC-1-B-WT,,results/fastqs/raw/HEC-1-B-WT.GSE148958.Homo_Sapiens.V5.b1/SRR15569144_2.fastq.gz
88,A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,GSM4735769,SRR12492723,Homo_Sapiens,1,1,89,,0,0,1,CTCF,MboI,A673-siCT-Dh1-72h-D347-D362,,results/fastqs/raw/A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492723_2.fastq.gz


In [16]:
# narrow view: each problem sample next to the path of its log
problems_df.loc[:, ['std_sample_name', 'log']]

Unnamed: 0,std_sample_name,log
0,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1,
1,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2,
37,HK2.GSE147646.Homo_Sapiens.H3K27ac.b1,
38,HK2.GSE147646.Homo_Sapiens.H3K27ac.b1,
55,Hela-S3-OE.GSE108869.Homo_Sapiens.CTCF.b1,
56,Hela-S3-OE.GSE108869.Homo_Sapiens.CTCF.b2,
57,GM12878.GSE115524.Homo_Sapiens.CTCF.b1,
58,GM12878.GSE115524.Homo_Sapiens.CTCF.b2,
63,HEC-1-B-WT.GSE148958.Homo_Sapiens.V5.b1,
88,A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,


In [46]:
# (rows, columns) of the problem subset
problems_df.shape

(9, 17)

In [47]:
# (rows, columns) of the full table, for comparison with the problem subset
df.shape

(262, 17)

## Rerun problematic samples

In [48]:
# collapse the sample indexes of the problem rows into short range strings
problem_indexes = problems_df.sample_index.tolist()
problem_indexes.sort()
rerun_indexes = long_to_short_index_list(problem_indexes)

In [49]:
# number of index ranges (not individual samples) that will be passed to qsub
len(rerun_indexes)

6

In [40]:
# build the qsub command that resubmits only the problem indexes
rerun_spec = ','.join(rerun_indexes)
final_rerun = 'qsub -t {}%4 workflow/scripts/fastq/download_srr_fastqs.qarray.qsh'.format(rerun_spec)

In [41]:
# print the command as plain text so it can be copied into a shell
print(final_rerun)

qsub -t 20,177-178,192,226,237-239,245%4 workflow/scripts/fastq/download_srr_fastqs.qarray.qsh


<p style="color: red"><b>Re-running in progress: 2022.04.02 - 16:30</b></p>

In [42]:
# Command to open every available problem log in vim tabs.
# Samples that never produced a log have a missing (NaN) path, which
# str.join rejects with a TypeError, so those entries are dropped first.
'vim -p {}'.format(' '.join(problems_df.log.dropna().tolist()))

'vim -p results/fastqs/raw/logs/download_srr_fastqs.o5028246-20 results/fastqs/raw/logs/download_srr_fastqs.o5028246-177 results/fastqs/raw/logs/download_srr_fastqs.o5028246-178 results/fastqs/raw/logs/download_srr_fastqs.o5028246-192 results/fastqs/raw/logs/download_srr_fastqs.o5028246-226 results/fastqs/raw/logs/download_srr_fastqs.o5028246-237 results/fastqs/raw/logs/download_srr_fastqs.o5028246-238 results/fastqs/raw/logs/download_srr_fastqs.o5028246-239 results/fastqs/raw/logs/download_srr_fastqs.o5028246-245'