In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
# last entry of the tracker's processing dates; used below to build file names
latest_date = tracker.processing_dates[-1]

# work from the project root so the relative 'results/...' paths resolve
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

# widen the pandas display limits so long sample names and wide tables are not truncated
for option_name, option_value in (('display.max_rows', 1000),
                                  ('display.max_columns', 1000),
                                  ('max_colwidth', 400)):
    pd.set_option(option_name, option_value)

In [2]:
# display the processing date that the input/output file names below are built from
latest_date

'2022.08.18.08.42'

In [3]:
# Choose the input samplesheet and the output prefix.
# - inside a Jupyter kernel (argv[0] is ipykernel_launcher.py): derive both from latest_date
# - as a script: <script> <input samplesheet> <output prefix>
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/fastq/{}.fastq.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/fastq/{}.fastq.post-check.download_srr_fastqs'.format(latest_date)
else:
    input_fn = sys.argv[1]
    # the final cell writes '<output_prefix>.with_header.tsv', so the second
    # argument must be bound to output_prefix (it was previously only bound to
    # output_fn, which made script mode fail with a NameError at write time)
    output_prefix = sys.argv[2]
    output_fn = output_prefix  # previous script-mode name, kept as an alias

## Helper Functions

In [4]:
def read_output_log(log):
    """Parse one stdout log of the fasterq download job.

    The log file name is expected to look like ``<name>.o<job_id>-<index>``;
    the job id and index are taken from the file name, everything else from
    the log text.

    Parameters
    ----------
    log : str
        Path to the stdout log.

    Returns
    -------
    list
        ``[index, sample_name, srr_id, job_id, download_output]`` where
        ``sample_name`` / ``srr_id`` are the second whitespace-separated token
        of the last line mentioning them (``'check'`` if never seen) and
        ``download_output`` is 1 once the
        ``'Ended: fasterq_download_srr_fastqs'`` marker has been seen, else 0.
    """
    # file name -> ['o<job_id>', '<index>']
    name_parts = os.path.basename(log).split('.')[1].split('-')
    job_id, index = name_parts[0].replace('o', ''), name_parts[1]

    parsed = {'sample_name': 'check', 'srr_id': 'check'}
    finished = 0

    with open(log, errors='ignore') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            # the checks are mutually exclusive per line, in this priority order
            if 'sample_name' in text:
                parsed['sample_name'] = text.split()[1]
                continue
            if 'srr_id' in text:
                parsed['srr_id'] = text.split()[1]
                continue
            if 'Ended: fasterq_download_srr_fastqs' in text:
                finished = 1

    return [index, parsed['sample_name'], parsed['srr_id'], job_id, finished]

def read_error_log(log):
    """Parse one stderr log of the fasterq download job.

    Two kinds of lines are recognised:

    * timestamped lines, i.e. lines whose text before the first ``-`` is a
      four-digit year (``2022-08-18T...``); these are only searched for the
      ``'was downloaded successfully'`` message;
    * ``<label> : <count>`` lines, from which the ``spots read``,
      ``reads read`` and ``reads written`` counts are taken (thousands
      separators are removed).

    Any other line (blank lines, text without a colon, labels with a
    non-numeric value) is ignored instead of raising.

    Parameters
    ----------
    log : str
        Path to the stderr log.

    Returns
    -------
    list
        ``[download_error, spots_read, reads_read, reads_written, reads_check]``.
        ``download_error`` is 1 when the success message was found (despite
        its name, 1 means success) and 0 otherwise. The three counts are -1
        when not found. ``reads_check`` is 1 once both ``reads_read`` and
        ``reads_written`` equal ``2 * spots_read`` (paired-end expectation),
        else 0.
    """
    download_error = 0
    spots_read = -1
    reads_read = -1
    reads_written = -1
    reads_check = 0

    with open(log, errors='ignore') as fr:
        for line in fr:
            # timestamped line: accept any four-digit year, not only 2022
            year = line.split('-', 1)[0]
            if len(year) == 4 and year.isdigit():
                if 'was downloaded successfully' in line:
                    download_error = 1
                continue

            fields = line.split(':')
            if len(fields) < 2:
                # blank or free-text line: nothing to parse
                continue

            value = fields[1].strip().replace(',', '')
            if not value.isdigit():
                # not a count (e.g. an error message containing a colon)
                continue
            count = int(value)

            if 'spots read' in line:
                spots_read = count
            if 'reads read' in line:
                reads_read = count
            if 'reads written' in line:
                reads_written = count
            if reads_read == 2 * spots_read and reads_written == 2 * spots_read:
                reads_check = 1

    return [download_error, spots_read, reads_read, reads_written, reads_check]

## Check Logs

In [5]:
# read the samplesheet and number its rows starting at 1,
# because the rows are later addressed with sed, which counts lines from 1
df = pd.read_table(input_fn)
df = df.assign(sample_index=df.index + 1)

In [6]:
# show the samplesheet together with the added sample_index column
df

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,GSE162617,GSM4955433,SRR13192949,Mus_Musculus,1,1,GR,MboI,3134_siCTRL_1hr_Dex,1
1,mES_25m.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705032,SRR5831480,Mus_Musculus,1,2,H3K27ac,MboI,mES_25m,2
2,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,GSE162617,GSM4955434,SRR13192950,Mus_Musculus,1,1,GR,MboI,3134_siNIPBL_1hr_Dex,3
3,mES_25m.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705034,SRR5831482,Mus_Musculus,2,2,H3K27ac,MboI,mES_25m,4
4,3134_WT.GSE162617.Mus_Musculus.GR.b1,GSE162617,GSM4955432,SRR13192948,Mus_Musculus,1,1,GR,MboI,3134_WT,5
5,3T3.GSE192387.Mus_Musculus.BATF.b1,GSE192387,GSM5746145,SRR17296607,Mus_Musculus,1,1,BATF,MboI,3T3,6
6,3T3.GSE192387.Mus_Musculus.CTCF.b1,GSE192387,GSM5746148,SRR17296610,Mus_Musculus,1,1,CTCF,MboI,3T3,7
7,3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1,GSE192387,GSM5746146,SRR17296608,Mus_Musculus,1,1,BATF,MboI,3T3_Irf4,8
8,3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BATF.b1,GSE192387,GSM5746147,SRR17296609,Mus_Musculus,1,1,BATF,MboI,3T3_Irf4_Runx3_Tbet,9
9,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,GSE178344,GSM5388160,SRR14850834,Mus_Musculus,1,1,SMC1A,MboI,3T3_Norm,10


In [7]:
# one samplesheet row per SRR; two FASTQ files are expected for each of them
num_srrs = len(df)
print('number of SRRs to be downloaded:', num_srrs)
print('number of fastq files expected:', num_srrs * 2)

number of SRRs to be downloaded: 238
number of fastq files expected: 476


In [8]:
# samplesheet reduced to one row per std_sample_name (the first occurrence is kept)
df.drop_duplicates(subset=['std_sample_name']).reset_index(drop=True)

Unnamed: 0,std_sample_name,gse_id,gsm_id,srr_id,organism,bio_rep,tech_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,GSE162617,GSM4955433,SRR13192949,Mus_Musculus,1,1,GR,MboI,3134_siCTRL_1hr_Dex,1
1,mES_25m.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,GSM2705032,SRR5831480,Mus_Musculus,1,2,H3K27ac,MboI,mES_25m,2
2,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,GSE162617,GSM4955434,SRR13192950,Mus_Musculus,1,1,GR,MboI,3134_siNIPBL_1hr_Dex,3
3,mES_25m.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,GSM2705034,SRR5831482,Mus_Musculus,2,2,H3K27ac,MboI,mES_25m,4
4,3134_WT.GSE162617.Mus_Musculus.GR.b1,GSE162617,GSM4955432,SRR13192948,Mus_Musculus,1,1,GR,MboI,3134_WT,5
5,3T3.GSE192387.Mus_Musculus.BATF.b1,GSE192387,GSM5746145,SRR17296607,Mus_Musculus,1,1,BATF,MboI,3T3,6
6,3T3.GSE192387.Mus_Musculus.CTCF.b1,GSE192387,GSM5746148,SRR17296610,Mus_Musculus,1,1,CTCF,MboI,3T3,7
7,3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1,GSE192387,GSM5746146,SRR17296608,Mus_Musculus,1,1,BATF,MboI,3T3_Irf4,8
8,3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BATF.b1,GSE192387,GSM5746147,SRR17296609,Mus_Musculus,1,1,BATF,MboI,3T3_Irf4_Runx3_Tbet,9
9,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,GSE178344,GSM5388160,SRR14850834,Mus_Musculus,1,1,SMC1A,MboI,3T3_Norm,10


In [9]:
# count distinct std_sample_name values, i.e. rows left after de-duplication
unique_samples_df = df.drop_duplicates(subset=['std_sample_name']).reset_index(drop=True)
print('number of uniqe samples/bioreps for planned download:', len(unique_samples_df))

number of uniqe samples/bioreps for planned download: 196


In [10]:
# create a dataframe of log information: one row per stdout log, joined with
# the stderr log that carries the same index after the '-' in its path
output_logs = glob.glob('results/fastqs/raw/logs/fasterq_download_srr_fastqs.o5139204-*')
error_logs = glob.glob('results/fastqs/raw/logs/fasterq_download_srr_fastqs.e5139204-*')

# index the stderr logs once instead of re-globbing the directory for every
# stdout log; setdefault keeps the first match, as the previous lookup did
error_log_by_index = {}
for err_log in error_logs:
    error_log_by_index.setdefault(err_log.split('-')[1], err_log)

log_columns = ['sample_index', 'std_sample_name', 'srr_id', 'job_id', 'download_status_out',
               'download_status_error', 'spots_read', 'reads_read', 'reads_written', 'reads_check', 'log']

log_data = []
for out_log in output_logs:
    sample_index = out_log.split('-')[1]
    index, sample_name, srr_id, job_id, download_output = read_output_log(out_log)

    error_log = error_log_by_index.get(sample_index)
    if error_log is None:
        # no stderr log for this index: record the "nothing found" values
        # (same as read_error_log's initial state) so the sample is flagged
        # as a problem below instead of aborting the whole check
        download_error, spots_read, reads_read, reads_written, reads_check = 0, -1, -1, -1, 0
    else:
        download_error, spots_read, reads_read, reads_written, reads_check = read_error_log(error_log)

    log_data.append([index, sample_name, srr_id, job_id, download_output, download_error,
                     spots_read, reads_read, reads_written, reads_check, os.path.basename(out_log)])

# naming the columns in the constructor also works when no logs were found;
# the index parsed from the file name is only needed for the join above
log_df = pd.DataFrame(log_data, columns=log_columns).drop(columns='sample_index')

In [11]:
# order the rows by sample name (ascending) and renumber the index from 0
log_df = log_df.sort_values('std_sample_name').reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,srr_id,job_id,download_status_out,download_status_error,spots_read,reads_read,reads_written,reads_check,log
0,3134_WT.GSE162617.Mus_Musculus.GR.b1,SRR13192948,5139204,1,1,60977251,121954502,121954502,1,fasterq_download_srr_fastqs.o5139204-156
1,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,SRR13192949,5139204,1,1,95173343,190346686,190346686,1,fasterq_download_srr_fastqs.o5139204-157
2,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,SRR13192950,5139204,1,1,61528277,123056554,123056554,1,fasterq_download_srr_fastqs.o5139204-158
3,3T3.GSE192387.Mus_Musculus.BATF.b1,SRR17296607,5139204,1,1,254054774,508109548,508109548,1,fasterq_download_srr_fastqs.o5139204-183
4,3T3.GSE192387.Mus_Musculus.CTCF.b1,SRR17296610,5139204,1,1,318884253,637768506,637768506,1,fasterq_download_srr_fastqs.o5139204-186
5,3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1,SRR17296608,5139204,1,1,227232360,454464720,454464720,1,fasterq_download_srr_fastqs.o5139204-184
6,3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BATF.b1,SRR17296609,5139204,1,1,226938707,453877414,453877414,1,fasterq_download_srr_fastqs.o5139204-185
7,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,SRR14850834,5139204,1,1,524441382,1048882764,1048882764,1,fasterq_download_srr_fastqs.o5139204-165
8,3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1,SRR14850835,5139204,1,1,552100726,1104201452,1104201452,1,fasterq_download_srr_fastqs.o5139204-166
9,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3,SRR10545051,5139204,1,1,123557860,247115720,247115720,1,fasterq_download_srr_fastqs.o5139204-97


## Check the presence of SRR FASTQ files

In [12]:
# get the download path: results/fastqs/raw/<std_sample_name>/<srr_id>_<mate>.fastq.gz
download_r1_tpl = 'results/fastqs/raw/{}/{}_1.fastq.gz'
download_r2_tpl = 'results/fastqs/raw/{}/{}_2.fastq.gz'

# select the two fields by column name; positional access (x[0], x[1]) on a
# string-labelled row is deprecated in pandas and silently depends on the
# current column order
name_srr_pairs = list(zip(log_df['std_sample_name'], log_df['srr_id']))
log_df['download_output_r1'] = [download_r1_tpl.format(name, srr) for name, srr in name_srr_pairs]
log_df['download_output_r2'] = [download_r2_tpl.format(name, srr) for name, srr in name_srr_pairs]

# find whether the download SRR FASTQ is present (1) or missing (0)
log_df['download_present_r1'] = log_df['download_output_r1'].apply(os.path.exists).astype(int)
log_df['download_present_r2'] = log_df['download_output_r2'].apply(os.path.exists).astype(int)

In [13]:
# column order for the report, grouped by meaning
reorder_cols = (
    # identifiers
    ['std_sample_name', 'srr_id', 'job_id']
    # status flags taken from the stdout / stderr logs
    + ['download_status_out', 'download_status_error']
    # read counts and their consistency flag
    + ['spots_read', 'reads_read', 'reads_written', 'reads_check']
    # whether the R1 / R2 files exist on disk
    + ['download_present_r1', 'download_present_r2']
    # log file name and expected file paths
    + ['log', 'download_output_r1', 'download_output_r2']
)

In [14]:
# keep exactly the columns listed in reorder_cols, in that order
log_df = log_df.loc[:, reorder_cols]
log_df

Unnamed: 0,std_sample_name,srr_id,job_id,download_status_out,download_status_error,spots_read,reads_read,reads_written,reads_check,download_present_r1,download_present_r2,log,download_output_r1,download_output_r2
0,3134_WT.GSE162617.Mus_Musculus.GR.b1,SRR13192948,5139204,1,1,60977251,121954502,121954502,1,1,1,fasterq_download_srr_fastqs.o5139204-156,results/fastqs/raw/3134_WT.GSE162617.Mus_Musculus.GR.b1/SRR13192948_1.fastq.gz,results/fastqs/raw/3134_WT.GSE162617.Mus_Musculus.GR.b1/SRR13192948_2.fastq.gz
1,3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,SRR13192949,5139204,1,1,95173343,190346686,190346686,1,1,1,fasterq_download_srr_fastqs.o5139204-157,results/fastqs/raw/3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1/SRR13192949_1.fastq.gz,results/fastqs/raw/3134_siCTRL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1/SRR13192949_2.fastq.gz
2,3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1,SRR13192950,5139204,1,1,61528277,123056554,123056554,1,1,1,fasterq_download_srr_fastqs.o5139204-158,results/fastqs/raw/3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1/SRR13192950_1.fastq.gz,results/fastqs/raw/3134_siNIPBL_1hr_Dex.GSE162617.Mus_Musculus.GR.b1/SRR13192950_2.fastq.gz
3,3T3.GSE192387.Mus_Musculus.BATF.b1,SRR17296607,5139204,1,1,254054774,508109548,508109548,1,1,1,fasterq_download_srr_fastqs.o5139204-183,results/fastqs/raw/3T3.GSE192387.Mus_Musculus.BATF.b1/SRR17296607_1.fastq.gz,results/fastqs/raw/3T3.GSE192387.Mus_Musculus.BATF.b1/SRR17296607_2.fastq.gz
4,3T3.GSE192387.Mus_Musculus.CTCF.b1,SRR17296610,5139204,1,1,318884253,637768506,637768506,1,1,1,fasterq_download_srr_fastqs.o5139204-186,results/fastqs/raw/3T3.GSE192387.Mus_Musculus.CTCF.b1/SRR17296610_1.fastq.gz,results/fastqs/raw/3T3.GSE192387.Mus_Musculus.CTCF.b1/SRR17296610_2.fastq.gz
5,3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1,SRR17296608,5139204,1,1,227232360,454464720,454464720,1,1,1,fasterq_download_srr_fastqs.o5139204-184,results/fastqs/raw/3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1/SRR17296608_1.fastq.gz,results/fastqs/raw/3T3_Irf4.GSE192387.Mus_Musculus.BATF.b1/SRR17296608_2.fastq.gz
6,3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BATF.b1,SRR17296609,5139204,1,1,226938707,453877414,453877414,1,1,1,fasterq_download_srr_fastqs.o5139204-185,results/fastqs/raw/3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BATF.b1/SRR17296609_1.fastq.gz,results/fastqs/raw/3T3_Irf4_Runx3_Tbet.GSE192387.Mus_Musculus.BATF.b1/SRR17296609_2.fastq.gz
7,3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1,SRR14850834,5139204,1,1,524441382,1048882764,1048882764,1,1,1,fasterq_download_srr_fastqs.o5139204-165,results/fastqs/raw/3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1/SRR14850834_1.fastq.gz,results/fastqs/raw/3T3_Norm.GSE178344.Mus_Musculus.SMC1A.b1/SRR14850834_2.fastq.gz
8,3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1,SRR14850835,5139204,1,1,552100726,1104201452,1104201452,1,1,1,fasterq_download_srr_fastqs.o5139204-166,results/fastqs/raw/3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1/SRR14850835_1.fastq.gz,results/fastqs/raw/3T3_TCF1.GSE178344.Mus_Musculus.SMC1A.b1/SRR14850835_2.fastq.gz
9,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3,SRR10545051,5139204,1,1,123557860,247115720,247115720,1,0,0,fasterq_download_srr_fastqs.o5139204-97,results/fastqs/raw/AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3/SRR10545051_1.fastq.gz,results/fastqs/raw/AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3/SRR10545051_2.fastq.gz


## Identifying problem samples

In [15]:
# a sample is a problem as soon as any one of these flags is not 1
flag_cols = ['download_status_out', 'download_status_error', 'reads_check',
             'download_present_r1', 'download_present_r2']
problems = log_df[flag_cols[0]] != 1
for flag_col in flag_cols[1:]:
    problems = problems | (log_df[flag_col] != 1)

problems_df = log_df[problems]
print('nubmer of problem samples:', len(problems_df))

nubmer of problem samples: 8


In [16]:
# show only the flags (plus sample and log name) for the problem samples
problems_df[[
    'std_sample_name',
    'download_status_out',
    'download_status_error',
    'reads_check',
    'download_present_r1',
    'download_present_r2',
    'log',
]]

Unnamed: 0,std_sample_name,download_status_out,download_status_error,reads_check,download_present_r1,download_present_r2,log
9,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3,1,1,1,0,0,fasterq_download_srr_fastqs.o5139204-97
10,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3,1,1,1,0,0,fasterq_download_srr_fastqs.o5139204-94
11,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3,1,1,1,0,0,fasterq_download_srr_fastqs.o5139204-95
12,AML12_shCtrl.GSE141113.Mus_Musculus.H3K9me3,1,1,1,0,0,fasterq_download_srr_fastqs.o5139204-96
13,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3,1,1,1,0,0,fasterq_download_srr_fastqs.o5139204-99
14,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3,1,1,1,0,0,fasterq_download_srr_fastqs.o5139204-100
15,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3,1,1,1,0,0,fasterq_download_srr_fastqs.o5139204-98
16,AML12_shSafb.GSE141113.Mus_Musculus.H3K9me3,1,1,1,0,0,fasterq_download_srr_fastqs.o5139204-101


In [17]:
# write the problem samples as a tab-separated table with a header row and no index column
header_output = '{}.with_header.tsv'.format(output_prefix)
problems_df.to_csv(
    header_output,
    sep='\t',
    index=False,
    header=True,
)

In [4]:
def read_output_log(log):
    """Parse one stdout log of the FastQC report job.

    The job id and index come from the file name, which is expected to look
    like ``<name>.o<job_id>-<index>``; the rest is read from the log text.

    Parameters
    ----------
    log : str
        Path to the stdout log.

    Returns
    -------
    list
        ``[index, sample_name, srr_id, job_id, r1, r2]``. ``sample_name`` and
        ``srr_id`` are the second whitespace-separated token of the last line
        mentioning them (``'check'`` if never seen). ``r1`` / ``r2`` are 1
        when an ``'Analysis complete for <srr_id>_1.fastq.gz'`` /
        ``'..._2.fastq.gz'`` line was seen, else 0.
    """
    # file name -> ['o<job_id>', '<index>']
    tokens = os.path.basename(log).split('.')[1].split('-')
    job_id = tokens[0].replace('o', '')
    index = tokens[1]

    sample_name = srr_id = 'check'
    r1 = r2 = 0
    done_prefix = 'Analysis complete for '

    with open(log, errors='ignore') as handle:
        for text in map(str.strip, handle):
            # the completion messages are matched against the srr_id read so
            # far, so the srr_id line has to precede them in the log
            if 'sample_name' in text:
                sample_name = text.split()[1]
            elif 'srr_id' in text:
                srr_id = text.split()[1]
            elif done_prefix + srr_id + '_1.fastq.gz' in text:
                r1 = 1
            elif done_prefix + srr_id + '_2.fastq.gz' in text:
                r2 = 1

    return [index, sample_name, srr_id, job_id, r1, r2]

In [5]:
# create a dataframe of log information: one row per FastQC stdout log
output_logs = glob.glob('results/qc/fastqc/logs/fastqc_report.o*-*')
log_columns = ['sample_index', 'std_sample_name', 'srr_id', 'job_id', 'r1', 'r2', 'log']

log_data = []
for out_log in output_logs:
    index, sample_name, srr_id, job_id, r1, r2 = read_output_log(out_log)
    log_data.append([index, sample_name, srr_id, job_id, r1, r2, os.path.basename(out_log)])

# naming the columns in the constructor also works when the glob matched
# nothing (assigning .columns afterwards raises a length-mismatch ValueError
# on an empty frame); the parsed index is not used further, so it is dropped
log_df = pd.DataFrame(log_data, columns=log_columns).drop(columns='sample_index')

In [8]:
# a log is fine only when both the R1 and the R2 completion message were seen
both_complete = (log_df.r1 == 1) & (log_df.r2 == 1)
problems = ~both_complete
problems_df = log_df[problems]
problems_df

Unnamed: 0,std_sample_name,srr_id,job_id,r1,r2,log
415,IMR90-Senescent.GSE100856.Homo_Sapiens.CTCF.b1,SRR5808479,5042843,1,0,fastqc_report.o5042843-166
