In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
# Last entry of the processing dates listed in the project-local `tracker` module.
latest_date = tracker.processing_dates[-1]

# Every relative path used below is resolved against this working directory.
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

# Raise the pandas display limits so long tables and long cell values are not truncated.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 1000),
    ('max_colwidth', 400),
):
    pd.set_option(option_name, option_value)

In [2]:
# Resolve the input samplesheet and the output prefix.
# Inside a Jupyter kernel sys.argv[0] contains 'ipykernel_launcher.py', so the
# paths are derived from `latest_date`; otherwise they come from the command line.
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/fastq/{}.fastq.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/fastq/{}.fastq.post-check.download_srr_fastqs'.format(latest_date)
else:
    if len(sys.argv) < 3:
        sys.exit('usage: <script> <input samplesheet tsv> <output prefix>')
    input_fn = sys.argv[1]
    # The save cell at the end of the notebook builds its file names from
    # `output_prefix`, so it has to be defined in script mode as well.
    output_prefix = sys.argv[2]
    # Previous name of the second argument, kept so existing references still work.
    output_fn = output_prefix

## Helper Functions

In [3]:
def list_to_str(l, sep=' '):
    """Join the items of ``l`` into a single string separated by ``sep``.

    Each item is converted with ``str`` first, so non-string items are accepted.
    """
    return sep.join(map(str, l))

def read_output_log(log):
    """Parse the stdout log of one download task.

    Parameters
    ----------
    log : str
        Path to a log file whose basename looks like ``<name>.o<job_id>-<index>``.

    Returns
    -------
    list
        ``[index, sample_name, srr_id, job_id, link_r1, link_r2, link_check,
        download_output]``. Text fields that are not found in the log keep the
        placeholder ``'check'``. ``link_check`` is 1 when the file names of both
        links are ``<srr_id>_1.fastq.gz`` / ``<srr_id>_2.fastq.gz``, else 0.
        ``download_output`` is 1 when the
        ``'Ended: ebi_urls_download_srr_fastqs'`` marker is present, else 0.
    """
    download_output = 0
    sample_name = 'check'
    srr_id = 'check'
    link_r1 = 'check'
    link_r2 = 'check'
    link_check = 0

    # extract job_id and index from the "<name>.o<job_id>-<index>" basename
    meta = os.path.basename(log).split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            tokens = info.split()
            # A keyword line that carries no value leaves the placeholder in
            # place instead of raising IndexError.
            if 'sample_name' in info:
                if len(tokens) > 1:
                    sample_name = tokens[1]
            elif 'srr_id' in info:
                if len(tokens) > 1:
                    srr_id = tokens[1]
            elif 'download link R1' in info:
                if len(tokens) > 3:
                    link_r1 = tokens[3]
            elif 'download link R2' in info:
                if len(tokens) > 3:
                    link_r2 = tokens[3]
            elif 'Ended: ebi_urls_download_srr_fastqs' in info:
                download_output = 1

    # Compare the final URL component rather than a fixed position: a missing
    # link (placeholder 'check') or a URL with a different directory depth must
    # yield link_check == 0 / a correct comparison instead of an IndexError.
    r1_file = link_r1.rsplit('/', 1)[-1]
    r2_file = link_r2.rsplit('/', 1)[-1]
    if r1_file == srr_id + '_1.fastq.gz' and r2_file == srr_id + '_2.fastq.gz':
        link_check = 1

    return [index, sample_name, srr_id, job_id, link_r1, link_r2, link_check, download_output]

def read_error_log(log):
    """Parse the stderr log of one download task.

    Parameters
    ----------
    log : str
        Path to the error log file.

    Returns
    -------
    list
        ``[error, download_error]``. ``error`` is the last line whose first
        token is ``'curl:'`` (kept verbatim, including its newline), or ``''``
        when there is none. ``download_error`` is ``'1'`` when exactly two
        completed-transfer lines were seen, ``'1/2'`` when exactly one was
        seen, and ``'0'`` otherwise.
    """
    complete_count = 0
    download_error = '0'
    error = ''

    with open(log, errors='ignore') as fr:
        for line in fr:
            tokens = line.split()
            if not tokens:
                continue
            # A completed transfer is a line starting with '100' whose 2nd and
            # 4th fields agree (presumably total vs. received size in curl's
            # progress meter -- confirm). The length guard keeps truncated
            # lines from raising IndexError.
            if tokens[0] == '100' and len(tokens) > 3 and tokens[1] == tokens[3]:
                complete_count += 1
            elif tokens[0] == 'curl:':
                error = line

    if complete_count == 2:
        download_error = '1'
    elif complete_count == 1:
        download_error = '1/2'

    return [error, download_error]

def long_to_short_index_list(serial_list):
    """Collapse a sequence of integers into strings describing consecutive runs.

    A run of values that each increase by exactly 1 becomes ``'<first>-<last>'``;
    an isolated value becomes ``'<value>'``. The input order is kept as given.

    Example: ``[1, 2, 3, 5, 7, 8]`` -> ``['1-3', '5', '7-8']``.

    Parameters
    ----------
    serial_list : sequence of int
        Values to compress. May be empty or hold a single element.

    Returns
    -------
    list of str
        One entry per run; ``[]`` for empty input.
    """
    # Empty input used to fail on serial_list[0].
    if len(serial_list) == 0:
        return []

    def _format_run(first, last):
        # A run of length one is written without the dash.
        if first == last:
            return '{}'.format(first)
        return '{}-{}'.format(first, last)

    serial_list_short = []
    run_start = run_end = serial_list[0]
    for value in serial_list[1:]:
        if value != run_end + 1:
            # The run is broken: emit it and start a new one at `value`.
            serial_list_short.append(_format_run(run_start, run_end))
            run_start = value
        run_end = value

    # Emit the final run. This also covers single-element input, which used to
    # fail because the loop body never ran.
    serial_list_short.append(_format_run(run_start, run_end))

    return serial_list_short

## Check Logs

In [4]:
# Read the samplesheet and attach a 1-based row number next to the 0-based
# pandas index (sed, used downstream, addresses lines starting at 1).
df = pd.read_table(input_fn).assign(sample_index=lambda sheet: sheet.index + 1)

In [5]:
# display the loaded samplesheet (one row per SRR to be downloaded)
df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902791,SRR9590180,Homo_Sapiens,1,1,CTCF,MboI,A673_SA1m1,1
1,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902792,SRR9590181,Homo_Sapiens,2,1,CTCF,MboI,A673_SA1m1,2
2,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902793,SRR9590182,Homo_Sapiens,1,1,CTCF,MboI,A673_SA2m1,3
3,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902794,SRR9590183,Homo_Sapiens,2,1,CTCF,MboI,A673_SA2m1,4
4,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,GSM3902795,SRR9590184,Homo_Sapiens,3,1,CTCF,MboI,A673_SA2m1,5
5,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b4,GSE133227,GSM3902796,SRR9590185,Homo_Sapiens,4,1,CTCF,MboI,A673_SA2m1,6
6,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b5,GSE133227,GSM3902797,SRR9590186,Homo_Sapiens,5,1,CTCF,MboI,A673_SA2m1,7
7,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b1,GSE133227,GSM3902798,SRR9590187,Homo_Sapiens,1,1,H3K27ac,MboI,A673_SA2m1,8
8,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b2,GSE133227,GSM3902799,SRR9590188,Homo_Sapiens,2,1,H3K27ac,MboI,A673_SA2m1,9
9,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b3,GSE133227,GSM3902800,SRR9590189,Homo_Sapiens,3,1,H3K27ac,MboI,A673_SA2m1,10


In [6]:
# Each SRR is counted as two FASTQ files.
n_srr = len(df)
print('number of SRRs to be downloaded:', n_srr)
print('number of fastq files expected:', n_srr * 2)

number of SRRs to be downloaded: 103
number of fastq files expected: 206


In [7]:
# display one row per distinct std_sample_name (the first occurrence is kept)
df.drop_duplicates(subset=['std_sample_name']).reset_index(drop=True)

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902791,SRR9590180,Homo_Sapiens,1,1,CTCF,MboI,A673_SA1m1,1
1,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902792,SRR9590181,Homo_Sapiens,2,1,CTCF,MboI,A673_SA1m1,2
2,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902793,SRR9590182,Homo_Sapiens,1,1,CTCF,MboI,A673_SA2m1,3
3,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,GSM3902794,SRR9590183,Homo_Sapiens,2,1,CTCF,MboI,A673_SA2m1,4
4,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,GSM3902795,SRR9590184,Homo_Sapiens,3,1,CTCF,MboI,A673_SA2m1,5
5,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b4,GSE133227,GSM3902796,SRR9590185,Homo_Sapiens,4,1,CTCF,MboI,A673_SA2m1,6
6,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b5,GSE133227,GSM3902797,SRR9590186,Homo_Sapiens,5,1,CTCF,MboI,A673_SA2m1,7
7,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b1,GSE133227,GSM3902798,SRR9590187,Homo_Sapiens,1,1,H3K27ac,MboI,A673_SA2m1,8
8,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b2,GSE133227,GSM3902799,SRR9590188,Homo_Sapiens,2,1,H3K27ac,MboI,A673_SA2m1,9
9,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b3,GSE133227,GSM3902800,SRR9590189,Homo_Sapiens,3,1,H3K27ac,MboI,A673_SA2m1,10


In [8]:
# Count the distinct std_sample_name values in the samplesheet.
unique_samples = df.drop_duplicates(subset=['std_sample_name'])
print('number of uniqe samples/bioreps for planned download:', len(unique_samples))

number of uniqe samples/bioreps for planned download: 76


In [9]:
# create a dataframe of log information
output_logs = glob.glob('results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.o*-*')
error_logs = glob.glob('results/fastqs/raw/logs/ebi_urls_download_srr_fastqs.e*-*')

# Index the error logs once by their "<job_id>-<index>" suffix instead of
# re-globbing the directory for every output log. Keying on the job id as well
# as the index keeps logs of different job submissions from being mixed up.
error_log_by_task = {
    os.path.basename(path).rsplit('.', 1)[-1][1:]: path
    for path in error_logs
}

log_data = []
for out_log in output_logs:
    index, sample_name, srr_id, job_id, link_r1, link_r2, link_check, download_out = read_output_log(out_log)

    # "<name>.o<job_id>-<index>" -> "<job_id>-<index>"
    task_key = os.path.basename(out_log).rsplit('.', 1)[-1][1:]
    error_log = error_log_by_task.get(task_key)
    if error_log is None:
        # No matching error log: record it as a failed download so the sample
        # shows up among the problem samples instead of aborting the notebook.
        error, download_error = 'error log not found', '0'
    else:
        error, download_error = read_error_log(error_log)

    log_data.append([index, sample_name, srr_id, job_id, link_r1, link_r2, link_check, download_out, os.path.basename(out_log), error, download_error])

# Passing the column names to the constructor also works when no logs were found.
log_columns = ['sample_index', 'std_sample_name', 'srr_id', 'job_id', 'link_r1', 'link_r2', 'link_check', 'download_status_out', 'log', 'error', 'download_status_error']
log_df = pd.DataFrame(log_data, columns=log_columns)

# 'sample_index' is left out of the selection, which drops it from the table.
reorder = ['std_sample_name', 'srr_id', 'job_id', 'link_r1', 'link_r2', 'link_check', 'download_status_out', 'download_status_error','error', 'log']
log_df = log_df[reorder]

In [10]:
# Order the log table by sample name, renumber the rows, then display it.
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True)
log_df = log_df.reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,srr_id,job_id,link_r1,link_r2,link_check,download_status_out,download_status_error,error,log
0,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492726,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/026/SRR12492726/SRR12492726_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/026/SRR12492726/SRR12492726_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-79
1,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492727,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/027/SRR12492727/SRR12492727_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/027/SRR12492727/SRR12492727_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-80
2,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492723,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/023/SRR12492723/SRR12492723_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/023/SRR12492723/SRR12492723_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-76
3,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492724,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/024/SRR12492724/SRR12492724_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/024/SRR12492724/SRR12492724_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-77
4,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492728,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/028/SRR12492728/SRR12492728_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/028/SRR12492728/SRR12492728_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-81
5,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492725,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/025/SRR12492725/SRR12492725_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/025/SRR12492725/SRR12492725_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-78
6,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492730,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/030/SRR12492730/SRR12492730_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/030/SRR12492730/SRR12492730_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-83
7,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492733,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/033/SRR12492733/SRR12492733_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/033/SRR12492733/SRR12492733_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-86
8,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492729,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/029/SRR12492729/SRR12492729_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/029/SRR12492729/SRR12492729_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-82
9,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492732,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/032/SRR12492732/SRR12492732_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/032/SRR12492732/SRR12492732_2.fastq.gz,1,1,1,,ebi_urls_download_srr_fastqs.o5033885-85


## Check the presence of SRR FASTQ files

In [11]:
# get the download path
download_r1_tpl = 'results/fastqs/raw/{}/{}_1.fastq.gz'
download_r2_tpl = 'results/fastqs/raw/{}/{}_2.fastq.gz'

# Read the two fields by column name rather than by position in the row:
# positional integer access on a label-indexed row is deprecated in recent
# pandas and silently depends on the current column order.
sample_srr_pairs = list(zip(log_df['std_sample_name'], log_df['srr_id']))
log_df['download_output_r1'] = [download_r1_tpl.format(sample, srr) for sample, srr in sample_srr_pairs]
log_df['download_output_r2'] = [download_r2_tpl.format(sample, srr) for sample, srr in sample_srr_pairs]

# find whether the download SRR FASTQ is present (1 = file exists, 0 = missing)
log_df['download_present_r1'] = log_df['download_output_r1'].apply(os.path.exists).astype(int)
log_df['download_present_r2'] = log_df['download_output_r2'].apply(os.path.exists).astype(int)

In [12]:
# Column order for the final log table: identifiers, links and link check,
# statuses read from the logs, on-disk presence flags, then log name and paths.
reorder_cols = (
    ['std_sample_name', 'srr_id', 'job_id']
    + ['link_r1', 'link_r2', 'link_check']
    + ['download_status_out', 'download_status_error', 'error']
    + ['download_present_r1', 'download_present_r2']
    + ['log', 'download_output_r1', 'download_output_r2']
)

In [13]:
# apply the column order defined in `reorder_cols`, then display the table
log_df = log_df[reorder_cols]
log_df

Unnamed: 0,std_sample_name,srr_id,job_id,link_r1,link_r2,link_check,download_status_out,download_status_error,error,download_present_r1,download_present_r2,log,download_output_r1,download_output_r2
0,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492726,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/026/SRR12492726/SRR12492726_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/026/SRR12492726/SRR12492726_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-79,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492726_1.fastq.gz,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492726_2.fastq.gz
1,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492727,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/027/SRR12492727/SRR12492727_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/027/SRR12492727/SRR12492727_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-80,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492727_1.fastq.gz,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492727_2.fastq.gz
2,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492723,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/023/SRR12492723/SRR12492723_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/023/SRR12492723/SRR12492723_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-76,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492723_1.fastq.gz,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492723_2.fastq.gz
3,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492724,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/024/SRR12492724/SRR12492724_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/024/SRR12492724/SRR12492724_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-77,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492724_1.fastq.gz,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492724_2.fastq.gz
4,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492728,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/028/SRR12492728/SRR12492728_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/028/SRR12492728/SRR12492728_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-81,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2/SRR12492728_1.fastq.gz,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2/SRR12492728_2.fastq.gz
5,A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492725,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/025/SRR12492725/SRR12492725_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/025/SRR12492725/SRR12492725_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-78,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2/SRR12492725_1.fastq.gz,results/fastqs/raw/A673-siCT-Dh1-72h.GSE156650.Homo_Sapiens.CTCF.b2/SRR12492725_2.fastq.gz
6,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492730,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/030/SRR12492730/SRR12492730_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/030/SRR12492730/SRR12492730_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-83,results/fastqs/raw/A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492730_1.fastq.gz,results/fastqs/raw/A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492730_2.fastq.gz
7,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492733,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/033/SRR12492733/SRR12492733_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/033/SRR12492733/SRR12492733_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-86,results/fastqs/raw/A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492733_1.fastq.gz,results/fastqs/raw/A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492733_2.fastq.gz
8,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492729,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/029/SRR12492729/SRR12492729_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/029/SRR12492729/SRR12492729_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-82,results/fastqs/raw/A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492729_1.fastq.gz,results/fastqs/raw/A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492729_2.fastq.gz
9,A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492732,5033885,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/032/SRR12492732/SRR12492732_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR124/032/SRR12492732/SRR12492732_2.fastq.gz,1,1,1,,1,1,ebi_urls_download_srr_fastqs.o5033885-85,results/fastqs/raw/A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492732_1.fastq.gz,results/fastqs/raw/A673-siSA2-Dh6-72h.GSE156650.Homo_Sapiens.CTCF.b1/SRR12492732_2.fastq.gz


## Identifying problem samples

In [14]:
# A row is a problem as soon as any single check fails.
bad_link = log_df.link_check != 1
no_end_marker = log_df.download_status_out != 1
incomplete_transfer = log_df.download_status_error != '1'
missing_r1 = log_df.download_present_r1 != 1
missing_r2 = log_df.download_present_r2 != 1
has_error_message = log_df.error != ''

problems = bad_link | no_end_marker | incomplete_transfer | missing_r1 | missing_r2 | has_error_message
problems_df = log_df.loc[problems, :]
print('nubmer of problem samples:', len(problems_df))

nubmer of problem samples: 11


In [15]:
# display the check columns of the problem samples (the link and path columns are left out)
problems_df.loc[:, ['std_sample_name', 'link_check', 'download_status_out', 'download_status_error', 'error', 'download_present_r1', 'download_present_r2', 'log']]

Unnamed: 0,std_sample_name,link_check,download_status_out,download_status_error,error,download_present_r1,download_present_r2,log
43,CD34+-Cord-Blood.GSE165207.Homo_Sapiens.H3K27ac.b1,1,0,1/2,curl: (56) Recv failure: Connection timed out\n,1,1,ebi_urls_download_srr_fastqs.o5033885-103
53,HCC15.GSE166232.Homo_Sapiens.H3K27ac.b1,1,0,1/2,curl: (56) Recv failure: Connection timed out\n,1,1,ebi_urls_download_srr_fastqs.o5033885-42
58,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,1,0,0,curl: (56) Recv failure: Connection timed out\n,1,0,ebi_urls_download_srr_fastqs.o5033885-46
59,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,1,0,0,curl: (56) Recv failure: Connection timed out\n,1,0,ebi_urls_download_srr_fastqs.o5033885-47
60,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,1,0,0,curl: (56) Recv failure: Connection timed out\n,1,0,ebi_urls_download_srr_fastqs.o5033885-44
61,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,1,0,0,curl: (18) transfer closed with 481312266 bytes remaining to read\n,1,0,ebi_urls_download_srr_fastqs.o5033885-45
66,RERFLCAI.GSE166232.Homo_Sapiens.H3K27ac.b1,1,0,0,curl: (56) Recv failure: Connection timed out\n,1,0,ebi_urls_download_srr_fastqs.o5033885-43
69,SKMES1_sg_e1_1.GSE166232.Homo_Sapiens.H3K27ac.b1,1,0,0,curl: (18) transfer closed with 1716561560 bytes remaining to read\n,1,0,ebi_urls_download_srr_fastqs.o5033885-50
75,TC71_WT.GSE133227.Homo_Sapiens.CTCF.b1,1,0,1/2,curl: (18) transfer closed with 4023685358 bytes remaining to read\n,1,1,ebi_urls_download_srr_fastqs.o5033885-32
77,TC71_WT.GSE133227.Homo_Sapiens.H3K27ac.b1,1,0,1/2,curl: (18) transfer closed with 5405929190 bytes remaining to read\n,1,1,ebi_urls_download_srr_fastqs.o5033885-34


In [None]:
# Write the samplesheet twice, with and without a header row, as tab-separated files.
# NOTE(review): `df` is the input samplesheet; none of the rename keys below appear
# among the columns displayed for it earlier, so this rename looks like a no-op and
# the checks collected in `log_df` / `problems_df` are not written out -- confirm
# that saving `df` unchanged is the intent.
save_df = df.rename(columns={'end_status': 'download_end_status', 
                   'read_written_status': 'download_read_written_status',
                   'log': 'download_log', 
                   'job_id': 'download_job_id'})

# same table with a header line
header_output = '{}.with_header.tsv'.format(output_prefix)
save_df.to_csv(header_output, header=True, index=False, sep='\t')

# same table without a header line
without_header_output = '{}.without_header.tsv'.format(output_prefix)
save_df.to_csv(without_header_output, header=False, index=False, sep='\t')