In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
# Most recent entry of the processing dates recorded by the tracker module.
latest_date = tracker.processing_dates[-1]

# The relative 'results/...' paths used below are resolved against this directory.
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

# Raise the pandas display limits so the tables below are not truncated.
for option_name, option_value in [('display.max_rows', 1000),
                                  ('display.max_columns', 1000),
                                  ('max_colwidth', 400)]:
    pd.set_option(option_name, option_value)

In [2]:
# Display the processing date whose download logs are checked in this run.
latest_date

'2022.06.29.20.41'

In [3]:
# Resolve the input samplesheet and the output prefix.
# Inside a Jupyter kernel sys.argv[0] contains 'ipykernel_launcher.py', so the
# paths are derived from latest_date; otherwise they come from the command line.
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/fastq/{}.fastq.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/fastq/{}.fastq.post-check.download_srr_fastqs'.format(latest_date)
else:
    input_fn = sys.argv[1]
    # The final cell builds its file name from output_prefix, so the second
    # command-line argument is treated as that prefix. Previously only
    # output_fn was set here and the script ended in a NameError.
    output_prefix = sys.argv[2]
    output_fn = output_prefix  # kept so existing references to output_fn still work

## Helper Functions

In [4]:
def read_output_log(log):
    """Summarise one stdout log of the fastq download job.

    The base name of ``log`` is split on '.' and the second field is split on
    '-': its first part (with every 'o' removed) is the job id, its second
    part is the index.

    Parameters
    ----------
    log : str
        Path to the stdout log file.

    Returns
    -------
    list
        ``[index, sample_name, srr_id, job_id, download_output]``.
        ``sample_name`` and ``srr_id`` are the second whitespace-separated
        token of the last line mentioning them ('check' when no such line
        exists). ``download_output`` is 1 when a line containing
        'Ended: fasterq_download_srr_fastqs' was seen, otherwise 0.
    """
    # second dot-separated field of the file name: "o<job_id>-<index>"
    task_fields = os.path.basename(log).split('.')[1].split('-')
    job_id, index = task_fields[0].replace('o', ''), task_fields[1]

    # placeholders that flag a log in which the value was never reported
    sample_name = 'check'
    srr_id = 'check'
    download_output = 0

    with open(log, errors='ignore') as handle:
        for raw in handle:
            text = raw.strip()
            if 'sample_name' in text:
                sample_name = text.split()[1]
                continue
            if 'srr_id' in text:
                srr_id = text.split()[1]
                continue
            if 'Ended: fasterq_download_srr_fastqs' in text:
                download_output = 1

    return [index, sample_name, srr_id, job_id, download_output]

def read_error_log(log):
    """Summarise one stderr log of the fastq download job.

    Lines that start with a four-digit year followed by '-' are treated as
    timestamped messages and are only searched for the text
    'was downloaded successfully'. Every other line is treated as a possible
    ``<label> : <count>`` statistic, where the count may contain thousands
    separators (presumably the fasterq-dump summary -- confirm against a log).

    Lines without a ':' or with a non-numeric value are skipped, so blank
    lines and free-text messages no longer abort the parse.

    Parameters
    ----------
    log : str
        Path to the stderr log file.

    Returns
    -------
    list
        ``[download_error, spots_read, reads_read, reads_written, reads_check]``.
        ``download_error`` is 1 when the success message was found. The three
        counts are -1 when they were not reported. ``reads_check`` is 1 when
        both read counts equal twice the spot count.
    """
    download_error = 0
    spots_read = -1
    reads_read = -1
    reads_written = -1
    reads_check = 0

    with open(log, errors='ignore') as fr:
        for line in fr:
            # Timestamped line: accept any year, not only 2022.
            year = line.split('-')[0]
            if len(year) == 4 and year.isdigit():
                if 'was downloaded successfully' in line:
                    download_error = 1
                continue

            # Statistic lines look like "spots read : 1,234"; anything
            # without a colon cannot carry a value.
            if ':' not in line:
                continue
            try:
                value = int(line.split(':')[1].strip().replace(',', ''))
            except ValueError:
                continue

            if 'spots read' in line:
                spots_read = value
            if 'reads read' in line:
                reads_read = value
            if 'reads written' in line:
                reads_written = value

            # Paired-end expectation: two reads read and written per spot.
            if reads_read == 2 * spots_read and reads_written == 2 * spots_read:
                reads_check = 1

    return [download_error, spots_read, reads_read, reads_written, reads_check]

## Check Logs

In [5]:
# load the data
df = pd.read_table(input_fn)
df['sample_index'] = df.index + 1 # add 1 since we'll be using 1-based indexing with sed

In [6]:
# Display the loaded samplesheet, including the added sample_index column.
df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,TC71_WT.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902827,SRR9590216,Homo_Sapiens,1,1,CTCF,MboI,TC71_WT,1
1,TC71_WT.GSE133227.Homo_Sapiens.H3K27ac.b1,GSE133227,GSM3902829,SRR9590218,Homo_Sapiens,1,1,H3K27ac,MboI,TC71_WT,2
2,TC71_WT.GSE133227.Homo_Sapiens.H3K27ac.b2,GSE133227,GSM3902830,SRR9590219,Homo_Sapiens,2,1,H3K27ac,MboI,TC71_WT,3
3,HCC15.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066592,SRR13637944,Homo_Sapiens,1,1,H3K27ac,MboI,HCC15,4
4,RERFLCAI.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066593,SRR13637945,Homo_Sapiens,1,1,H3K27ac,MboI,RERFLCAI,5
5,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066594,SRR13637946,Homo_Sapiens,1,1,H3K27ac,MboI,KYSE140_DMSO2hrs,6
6,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066595,SRR13637947,Homo_Sapiens,1,2,H3K27ac,MboI,KYSE140_DMSO2hrs,7
7,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066596,SRR13637948,Homo_Sapiens,1,1,H3K27ac,MboI,KYSE140_ARV2hrs,8
8,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066597,SRR13637949,Homo_Sapiens,1,2,H3K27ac,MboI,KYSE140_ARV2hrs,9
9,SKMES1_sg_e1_1.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066600,SRR13637952,Homo_Sapiens,1,1,H3K27ac,MboI,SKMES1_sg_e1_1,10


In [7]:
# One samplesheet row per SRR; two fastq files (R1 and R2) are expected per SRR.
n_srrs = len(df)
print('number of SRRs to be downloaded:', n_srrs)
print('number of fastq files expected:', n_srrs * 2)

number of SRRs to be downloaded: 19
number of fastq files expected: 38


In [8]:
# Keep the first row of every std_sample_name and renumber the result.
is_repeat = df.duplicated(subset=['std_sample_name'])
df.loc[~is_repeat].reset_index(drop=True)

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,TC71_WT.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,GSM3902827,SRR9590216,Homo_Sapiens,1,1,CTCF,MboI,TC71_WT,1
1,TC71_WT.GSE133227.Homo_Sapiens.H3K27ac.b1,GSE133227,GSM3902829,SRR9590218,Homo_Sapiens,1,1,H3K27ac,MboI,TC71_WT,2
2,TC71_WT.GSE133227.Homo_Sapiens.H3K27ac.b2,GSE133227,GSM3902830,SRR9590219,Homo_Sapiens,2,1,H3K27ac,MboI,TC71_WT,3
3,HCC15.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066592,SRR13637944,Homo_Sapiens,1,1,H3K27ac,MboI,HCC15,4
4,RERFLCAI.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066593,SRR13637945,Homo_Sapiens,1,1,H3K27ac,MboI,RERFLCAI,5
5,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066594,SRR13637946,Homo_Sapiens,1,1,H3K27ac,MboI,KYSE140_DMSO2hrs,6
6,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066596,SRR13637948,Homo_Sapiens,1,1,H3K27ac,MboI,KYSE140_ARV2hrs,8
7,SKMES1_sg_e1_1.GSE166232.Homo_Sapiens.H3K27ac.b1,GSE166232,GSM5066600,SRR13637952,Homo_Sapiens,1,1,H3K27ac,MboI,SKMES1_sg_e1_1,10
8,GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b1,GSE115524,GSM3424974,SRR7990656,Homo_Sapiens,1,1,CTCF,MboI,GM12878-HiChIP,12
9,GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b2,GSE115524,GSM3424975,SRR7990657,Homo_Sapiens,2,1,CTCF,MboI,GM12878-HiChIP,13


In [9]:
# Count the rows that remain after collapsing repeated std_sample_name values.
unique_samples = df.drop_duplicates(subset=['std_sample_name'])
print('number of uniqe samples/bioreps for planned download:', len(unique_samples))

number of uniqe samples/bioreps for planned download: 15


In [10]:
# create a dataframe of log information
output_logs = glob.glob('results/fastqs/raw/logs/fasterq_download_srr_fastqs.o*-*')
error_logs = glob.glob('results/fastqs/raw/logs/fasterq_download_srr_fastqs.e*-*')
log_data = []   
for out_log in output_logs:
    sample_index = out_log.split('-')[1]
    index, sample_name, srr_id, job_id, download_output = read_output_log(out_log)
    error_log = [log for log in glob.glob('results/fastqs/raw/logs/fasterq_download_srr_fastqs.e*-*') if log.split('-')[1] == sample_index][0]
    download_error, spots_read, reads_read, reads_written, reads_check = read_error_log(error_log)
    log_data.append([index, sample_name, srr_id, job_id, download_output, download_error, spots_read, reads_read, reads_written, reads_check, os.path.basename(out_log)])
log_df = pd.DataFrame(log_data)
log_df.columns = ['sample_index', 'std_sample_name', 'srr_id', 'job_id', 'download_status_out', 'download_status_error', 'spots_read', 'reads_read', 'reads_written', 'reads_check', 'log']
log_df.drop('sample_index', inplace=True, axis=1)

In [11]:
# Order the log summary alphabetically by sample name and renumber the rows.
log_df = log_df.sort_values(by='std_sample_name').reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,srr_id,job_id,download_status_out,download_status_error,spots_read,reads_read,reads_written,reads_check,log
0,CD34+-Cord-Blood.GSE165207.Homo_Sapiens.H3K27ac.b1,SRR13492051,5034275,1,1,400231692,800463384,800463384,1,fasterq_download_srr_fastqs.o5034275-103
1,GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b1,SRR7990656,5034309,1,1,274443458,548886916,548886916,1,fasterq_download_srr_fastqs.o5034309-12
2,GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b2,SRR7990657,5034309,1,1,259088017,518176034,518176034,1,fasterq_download_srr_fastqs.o5034309-13
3,GM12878.GSE80820.Homo_Sapiens.SMC1A.b2,SRR3467178,5034309,0,0,-1,-1,-1,0,fasterq_download_srr_fastqs.o5034309-19
4,GM12878.GSE80820.Homo_Sapiens.SMC1A.b2,SRR3467177,5034309,0,0,-1,-1,-1,0,fasterq_download_srr_fastqs.o5034309-18
5,HCC15.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637944,5034309,1,1,55434080,110868160,110868160,1,fasterq_download_srr_fastqs.o5034309-4
6,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637948,5034309,1,1,87427470,174854940,174854940,1,fasterq_download_srr_fastqs.o5034309-8
7,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637949,5034309,1,1,87715470,175430940,175430940,1,fasterq_download_srr_fastqs.o5034309-9
8,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637946,5034309,1,1,95550864,191101728,191101728,1,fasterq_download_srr_fastqs.o5034309-6
9,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637947,5034309,1,1,88799457,177598914,177598914,1,fasterq_download_srr_fastqs.o5034309-7


## Check the presence of SRR FASTQ files

In [12]:
# get the download path
download_r1_tpl = 'results/fastqs/raw/{}/{}_1.fastq.gz'
download_r2_tpl = 'results/fastqs/raw/{}/{}_2.fastq.gz'
log_df.loc[:, 'download_output_r1'] = log_df.apply(lambda x: download_r1_tpl.format(x[0], x[1]), axis=1)
log_df.loc[:, 'download_output_r2'] = log_df.apply(lambda x: download_r2_tpl.format(x[0], x[1]), axis=1)

# find whether the download SRR FASTQ is present
log_df.loc[:, 'download_present_r1'] = log_df.loc[:, 'download_output_r1'].apply(os.path.exists).astype(int)
log_df.loc[:, 'download_present_r2'] = log_df.loc[:, 'download_output_r2'].apply(os.path.exists).astype(int)

In [13]:
# Column order for the report: identifiers, log-derived status flags, read
# counts, file-presence flags, then the log name and the expected fastq paths.
reorder_cols = (
    ['std_sample_name', 'srr_id', 'job_id']
    + ['download_status_out', 'download_status_error']
    + ['spots_read', 'reads_read', 'reads_written', 'reads_check']
    + ['download_present_r1', 'download_present_r2']
    + ['log', 'download_output_r1', 'download_output_r2']
)

In [14]:
# Arrange the columns in the reporting order and show the result.
log_df = log_df.loc[:, reorder_cols]
log_df

Unnamed: 0,std_sample_name,srr_id,job_id,download_status_out,download_status_error,spots_read,reads_read,reads_written,reads_check,download_present_r1,download_present_r2,log,download_output_r1,download_output_r2
0,CD34+-Cord-Blood.GSE165207.Homo_Sapiens.H3K27ac.b1,SRR13492051,5034275,1,1,400231692,800463384,800463384,1,1,1,fasterq_download_srr_fastqs.o5034275-103,results/fastqs/raw/CD34+-Cord-Blood.GSE165207.Homo_Sapiens.H3K27ac.b1/SRR13492051_1.fastq.gz,results/fastqs/raw/CD34+-Cord-Blood.GSE165207.Homo_Sapiens.H3K27ac.b1/SRR13492051_2.fastq.gz
1,GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b1,SRR7990656,5034309,1,1,274443458,548886916,548886916,1,1,1,fasterq_download_srr_fastqs.o5034309-12,results/fastqs/raw/GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b1/SRR7990656_1.fastq.gz,results/fastqs/raw/GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b1/SRR7990656_2.fastq.gz
2,GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b2,SRR7990657,5034309,1,1,259088017,518176034,518176034,1,1,1,fasterq_download_srr_fastqs.o5034309-13,results/fastqs/raw/GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b2/SRR7990657_1.fastq.gz,results/fastqs/raw/GM12878-HiChIP.GSE115524.Homo_Sapiens.CTCF.b2/SRR7990657_2.fastq.gz
3,GM12878.GSE80820.Homo_Sapiens.SMC1A.b2,SRR3467178,5034309,0,0,-1,-1,-1,0,1,1,fasterq_download_srr_fastqs.o5034309-19,results/fastqs/raw/GM12878.GSE80820.Homo_Sapiens.SMC1A.b2/SRR3467178_1.fastq.gz,results/fastqs/raw/GM12878.GSE80820.Homo_Sapiens.SMC1A.b2/SRR3467178_2.fastq.gz
4,GM12878.GSE80820.Homo_Sapiens.SMC1A.b2,SRR3467177,5034309,0,0,-1,-1,-1,0,1,1,fasterq_download_srr_fastqs.o5034309-18,results/fastqs/raw/GM12878.GSE80820.Homo_Sapiens.SMC1A.b2/SRR3467177_1.fastq.gz,results/fastqs/raw/GM12878.GSE80820.Homo_Sapiens.SMC1A.b2/SRR3467177_2.fastq.gz
5,HCC15.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637944,5034309,1,1,55434080,110868160,110868160,1,1,1,fasterq_download_srr_fastqs.o5034309-4,results/fastqs/raw/HCC15.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637944_1.fastq.gz,results/fastqs/raw/HCC15.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637944_2.fastq.gz
6,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637948,5034309,1,1,87427470,174854940,174854940,1,1,1,fasterq_download_srr_fastqs.o5034309-8,results/fastqs/raw/KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637948_1.fastq.gz,results/fastqs/raw/KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637948_2.fastq.gz
7,KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637949,5034309,1,1,87715470,175430940,175430940,1,1,1,fasterq_download_srr_fastqs.o5034309-9,results/fastqs/raw/KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637949_1.fastq.gz,results/fastqs/raw/KYSE140_ARV2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637949_2.fastq.gz
8,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637946,5034309,1,1,95550864,191101728,191101728,1,1,1,fasterq_download_srr_fastqs.o5034309-6,results/fastqs/raw/KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637946_1.fastq.gz,results/fastqs/raw/KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637946_2.fastq.gz
9,KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1,SRR13637947,5034309,1,1,88799457,177598914,177598914,1,1,1,fasterq_download_srr_fastqs.o5034309-7,results/fastqs/raw/KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637947_1.fastq.gz,results/fastqs/raw/KYSE140_DMSO2hrs.GSE166232.Homo_Sapiens.H3K27ac.b1/SRR13637947_2.fastq.gz


## Identifying problem samples

In [15]:
# A sample is a problem when any of these flags is not exactly 1.
check_cols = ['download_status_out',
              'download_status_error',
              'reads_check',
              'download_present_r1',
              'download_present_r2']
problems = (log_df[check_cols] != 1).any(axis=1)
problems_df = log_df.loc[problems, :]
print('nubmer of problem samples:', len(problems_df))

nubmer of problem samples: 4


In [16]:
# Show only the status flags and the log name of the problem samples.
problem_summary_cols = ['std_sample_name',
                        'download_status_out',
                        'download_status_error',
                        'reads_check',
                        'download_present_r1',
                        'download_present_r2',
                        'log']
problems_df[problem_summary_cols]

Unnamed: 0,std_sample_name,download_status_out,download_status_error,reads_check,download_present_r1,download_present_r2,log
3,GM12878.GSE80820.Homo_Sapiens.SMC1A.b2,0,0,0,1,1,fasterq_download_srr_fastqs.o5034309-19
4,GM12878.GSE80820.Homo_Sapiens.SMC1A.b2,0,0,0,1,1,fasterq_download_srr_fastqs.o5034309-18
10,PAEC-siRNA-Ctrl.GSE152900.Homo_Sapiens.H3K27ac.b1,0,0,0,0,0,fasterq_download_srr_fastqs.o5034309-14
11,PAEC-siRNA-Ctrl.GSE152900.Homo_Sapiens.H3K27ac.b2,0,0,0,1,1,fasterq_download_srr_fastqs.o5034309-15


In [17]:
# Write the problem samples as a tab-separated file with a header row.
header_output = f'{output_prefix}.with_header.tsv'
problems_df.to_csv(header_output, sep='\t', index=False, header=True)