In [3]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
latest_date = tracker.processing_dates[-1]
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('max_colwidth', 400)

In [2]:
latest_date

'2022.07.08.11.51'

In [3]:
# setting input and output with jupyter notebook in context 
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/hicpro/{}.hicpro.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/hicpro/{}.hicpro.post-check.split_fastqs'.format(latest_date)
else:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]

## Helper Functions

In [5]:
def read_log(log):
    
    status = 0
    sample_name = 'check'
    gse_id = 'check'
    date = ''

    # extract job_id and index
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                sample_name = info.split()[1]
            elif 'geo_id' in info:
                gse_id = info.split()[1]
            elif 'Ended: split_fastqs' in info:
                status = 1

    return([index, sample_name, gse_id, job_id, status])

## Check Logs

In [6]:
# load the data
df = pd.read_table(input_fn)
df['sample_index'] = df.index + 1 # add 1 since we'll be using 1-based indexing with sed

In [7]:
df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,GM12878,1
1,GM12878.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,GM12878,2
2,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,Homo_Sapiens,1,CTCF,MboI,A673_SA1m1,3
3,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,Homo_Sapiens,2,CTCF,MboI,A673_SA1m1,4
4,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b1,GSE133227,Homo_Sapiens,1,CTCF,MboI,A673_SA2m1,5
5,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b2,GSE133227,Homo_Sapiens,2,CTCF,MboI,A673_SA2m1,6
6,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b3,GSE133227,Homo_Sapiens,3,CTCF,MboI,A673_SA2m1,7
7,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b4,GSE133227,Homo_Sapiens,4,CTCF,MboI,A673_SA2m1,8
8,A673_SA2m1.GSE133227.Homo_Sapiens.CTCF.b5,GSE133227,Homo_Sapiens,5,CTCF,MboI,A673_SA2m1,9
9,A673_SA2m1.GSE133227.Homo_Sapiens.H3K27ac.b1,GSE133227,Homo_Sapiens,1,H3K27ac,MboI,A673_SA2m1,10


In [8]:
print('number of samples visited:', len(df))

number of samples visited: 94


In [9]:
# create a dataframe of log information
output_logs = glob.glob('results/fastqs/parallel/logs/split_fastqs.o*-*')
log_data = []   
for out_log in output_logs:
    sample_index = out_log.split('-')[1]
    index, sample_name, gse_id, job_id, status = read_log(out_log)
    log_data.append([index, sample_name, gse_id, job_id, status, os.path.basename(out_log)])
log_df = pd.DataFrame(log_data)
log_df.columns = ['sample_index', 'std_sample_name', 'gse_id', 'job_id', 'split_status', 'log']
log_df.drop('sample_index', inplace=True, axis=1)

In [10]:
log_df = log_df.sort_values(by=["std_sample_name"], ascending=True).reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,gse_id,job_id,split_status,log
0,A673-siCT-Dh1-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-1
1,A673-siCT-Dh1-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-2
2,A673-siCT-Dh1-72h-D362.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-3
3,A673-siCT-Dh1-72h-D362.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-4
4,A673-siSA2-Dh6-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-5
5,A673-siSA2-Dh6-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-6
6,A673-siSA2-Dh6-72h-D362.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-7
7,A673-siSA2-Dh6-72h-D362.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-8
8,A673-siSA2-Dh8-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-9
9,A673-siSA2-Dh8-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-10


## Identifying problem samples

In [11]:
problems = (log_df.split_status != 1)
problems_df = log_df.loc[problems,:]
print('nubmer of problem samples:', len(problems_df))

nubmer of problem samples: 0


In [12]:
problems_df.loc[:, ['std_sample_name', 'gse_id', 'job_id', 'split_status', 'log']]

Unnamed: 0,std_sample_name,gse_id,job_id,split_status,log


In [33]:
samples = glob.glob('results/fastqs/parallel/*Homo_Sapiens*/')

for sample in samples:
    os.chdir(sample)
    srr_files = glob.glob('*_SRR*')
    srr_1_files = [round(os.path.getsize(srr_file)/1024,-5) for srr_file in srr_files if '_1' in srr_file]
    srr_2_files = [round(os.path.getsize(srr_file)/1024,-5) for srr_file in srr_files if '_2' in srr_file]
    srr_R1_files = [os.path.getsize(srr_file) for srr_file in srr_files if '_R1' in srr_file]
    srr_R2_files = [os.path.getsize(srr_file) for srr_file in srr_files if '_R2' in srr_file]
    
    if len(srr_1_files) != len(srr_R1_files) or len(srr_2_files) != len(srr_R2_files):
        print("this sample has linking issues:", sample)
    if len(srr_1_files) != len(srr_2_files):
        print("this sample has splitting issues:", sample)
    not_same_size = 0
    for srr in srr_1_files:
        if srr not in srr_2_files:
            not_same_size = 1
    if not_same_size == 1:
        print("this sample's R1/R2 not same size:", sample)
        
    os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

this sample's R1/R2 not same size: results/fastqs/parallel/KBM7-MED14-dTAG_2h_dTAG7.GSE139466.Homo_Sapiens.H3K27ac.b2/
this sample's R1/R2 not same size: results/fastqs/parallel/293T-PDS.GSE128106.Homo_Sapiens.YY1.b1/
this sample's R1/R2 not same size: results/fastqs/parallel/293T.GSE128106.Homo_Sapiens.YY1.b1/
this sample's R1/R2 not same size: results/fastqs/parallel/KBM7-MED14-dTAG_2h_dTAG7.GSE139466.Homo_Sapiens.H3K27ac.b1/
this sample's R1/R2 not same size: results/fastqs/parallel/IMR90-Senescent.GSE100856.Homo_Sapiens.CTCF.b1/
this sample's R1/R2 not same size: results/fastqs/parallel/KBM7-MED14-dTAG_2h_DMSO.GSE139466.Homo_Sapiens.H3K27ac.b2/
this sample's R1/R2 not same size: results/fastqs/parallel/IMR90-Proliferating.GSE100856.Homo_Sapiens.CTCF.b1/
this sample's R1/R2 not same size: results/fastqs/parallel/293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1/
this sample's R1/R2 not same size: results/fastqs/parallel/KBM7-MED14-dTAG_2h_DMSO.GSE139466.Homo_Sapiens.H3K27ac.b1/


In [17]:
header_output = '{}.with_header.tsv'.format(output_prefix)
problems_df.to_csv(header_output, header=True, index=False, sep='\t')

## Check for Truncated, Unsplit FastQ Files

#### helper function

In [13]:
def read_rlog(log):
    
    status = 0
    sample_name = 'check'
    srr_id = 'check'
    r1_reads = -1
    r2_reads = -1
    date = ''

    # extract job_id and index
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                sample_name = info.split()[1]
            elif 'srr_id' in info:
                srr_id = info.split()[1]
            elif 'r1 reads' in info:
                r1_reads = int(info.split()[2])
            elif 'r2 reads' in info:
                r2_reads = int(info.split()[2])
            elif 'Ended: check_missing_reads' in info:
                status = 1

    return([index, sample_name, srr_id, r1_reads, r2_reads, job_id, status])

#### create dataframe

In [14]:
# create a dataframe of log information
routput_logs = glob.glob('results/qc/check_missing_reads.o*-*')
rlog_data = []   
for rout_log in routput_logs:
    sample_index = rout_log.split('-')[1]
    index, sample_name, srr_id, r1_reads, r2_reads, job_id, status = read_rlog(rout_log)
    rlog_data.append([index, sample_name, srr_id, r1_reads, r2_reads, job_id, status, os.path.basename(rout_log)])
rlog_df = pd.DataFrame(rlog_data)
rlog_df.columns = ['sample_index', 'std_sample_name', 'srr_id', 'r1_reads', 'r2_reads', 'job_id', 'status', 'log']
rlog_df.drop('sample_index', inplace=True, axis=1)

In [15]:
rlog_df = rlog_df.sort_values(by=["std_sample_name"], ascending=True).reset_index(drop=True)
rlog_df

Unnamed: 0,std_sample_name,srr_id,r1_reads,r2_reads,job_id,status,log
0,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,SRR8707617,101827760,101827760,5046551,1,check_missing_reads.o5046551-160
1,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,SRR8707618,81789448,81789448,5046551,1,check_missing_reads.o5046551-161
2,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,SRR8707620,186243704,186243704,5046551,1,check_missing_reads.o5046551-163
3,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,SRR8707619,182862796,182862796,5046551,1,check_missing_reads.o5046551-162
4,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707614,468342572,468342572,5046551,1,check_missing_reads.o5046551-157
5,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707616,37826524,37826524,5046551,1,check_missing_reads.o5046551-159
6,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707613,440812452,440812452,5046551,1,check_missing_reads.o5046551-156
7,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707615,43802000,43802000,5046551,1,check_missing_reads.o5046551-158
8,A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492727,296922644,296922644,5046551,1,check_missing_reads.o5046551-367
9,A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492723,28934500,28934500,5046551,1,check_missing_reads.o5046551-363


#### identify problem samples

In [17]:
rproblems = (rlog_df.r1_reads == -1) | (rlog_df.r2_reads == -1) | (rlog_df.r1_reads != rlog_df.r2_reads) | (rlog_df.status != 1)
rproblems_df = rlog_df.loc[rproblems,:]
print('nubmer of problem samples:', len(rproblems_df))

nubmer of problem samples: 1


In [18]:
rproblems_df.loc[:, ['std_sample_name', 'srr_id', 'job_id', 'r1_reads', 'r2_reads', 'status', 'log']]

Unnamed: 0,std_sample_name,srr_id,job_id,r1_reads,r2_reads,status,log
190,IMR90-Senescent.GSE100856.Homo_Sapiens.CTCF.b1,SRR5808479,5046551,-1,-1,0,check_missing_reads.o5046551-166


## Check for Unequal R1 and R2 read lengths in Unsplit FastQ Files

#### helper function

In [3]:
def read_llog(log):
    
    status = 0
    sample_name = 'check'
    srr_id = 'check'
    r1_length = -1
    r2_length = -1
    date = ''

    # extract job_id and index
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                sample_name = info.split()[1]
            elif 'srr_id' in info:
                srr_id = info.split()[1]
            elif 'r1 length' in info:
                r1_length = int(info.split()[2])
            elif 'r2 length' in info:
                r2_length = int(info.split()[2])
            elif 'Ended: check_r1_r2_lengths' in info:
                status = 1

    return([index, sample_name, srr_id, r1_length, r2_length, job_id, status])

#### create dataframe

In [4]:
# create a dataframe of log information
loutput_logs = glob.glob('results/qc/check_r1_r2_lengths.o*-*')
llog_data = []   
for lout_log in loutput_logs:
    sample_index = lout_log.split('-')[1]
    index, sample_name, srr_id, r1_length, r2_length, job_id, status = read_llog(lout_log)
    llog_data.append([index, sample_name, srr_id, r1_length, r2_length, job_id, status, os.path.basename(lout_log)])
llog_df = pd.DataFrame(llog_data)
llog_df.columns = ['sample_index', 'std_sample_name', 'srr_id', 'r1_length', 'r2_length', 'job_id', 'status', 'log']
llog_df.drop('sample_index', inplace=True, axis=1)

In [5]:
llog_df = llog_df.sort_values(by=["std_sample_name"], ascending=True).reset_index(drop=True)
llog_df

Unnamed: 0,std_sample_name,srr_id,r1_length,r2_length,job_id,status,log
0,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,SRR8707618,75,75,5047350,1,check_r1_r2_lengths.o5047350-161
1,293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,SRR8707617,75,75,5047350,1,check_r1_r2_lengths.o5047350-160
2,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,SRR8707619,75,75,5047350,1,check_r1_r2_lengths.o5047350-162
3,293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,SRR8707620,75,75,5047350,1,check_r1_r2_lengths.o5047350-163
4,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707616,75,75,5047350,1,check_r1_r2_lengths.o5047350-159
5,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707613,75,75,5047350,1,check_r1_r2_lengths.o5047350-156
6,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707614,75,75,5047350,1,check_r1_r2_lengths.o5047350-157
7,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707615,74,75,5047350,1,check_r1_r2_lengths.o5047350-158
8,A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492726,101,101,5047350,1,check_r1_r2_lengths.o5047350-366
9,A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492727,101,101,5047350,1,check_r1_r2_lengths.o5047350-367


#### identify problem samples

In [6]:
lproblems = (llog_df.r1_length == -1) | (llog_df.r2_length == -1) | (llog_df.r1_length != llog_df.r2_length) | (llog_df.status != 1)
lproblems_df = llog_df.loc[lproblems,:]
print('nubmer of problem samples:', len(lproblems_df))

nubmer of problem samples: 37


In [7]:
lproblems_df.loc[:, ['std_sample_name', 'srr_id', 'job_id', 'r1_length', 'r2_length', 'status', 'log']]

Unnamed: 0,std_sample_name,srr_id,job_id,r1_length,r2_length,status,log
7,293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707615,5047350,74,75,1,check_r1_r2_lengths.o5047350-158
24,A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,SRR9590180,5047350,74,76,1,check_r1_r2_lengths.o5047350-38
37,A673_SA2r.GSE133227.Homo_Sapiens.H3K27ac.b1,SRR9590193,5047350,75,76,1,check_r1_r2_lengths.o5047350-51
46,A673_WT.GSE133227.Homo_Sapiens.CTCF.b5,SRR9590202,5047350,75,76,1,check_r1_r2_lengths.o5047350-60
79,DND41-DMSO.GSE173871.Homo_Sapiens.SMC1A.b2,SRR14423663,5047350,35,38,1,check_r1_r2_lengths.o5047350-427
81,DND41-Resistant.GSE173871.Homo_Sapiens.SMC1A.b2,SRR14423665,5047350,38,37,1,check_r1_r2_lengths.o5047350-429
97,GM12878.GSE80820.Homo_Sapiens.SMC1A.b1,SRR3467175,5047350,76,75,1,check_r1_r2_lengths.o5047350-455
98,GM12878.GSE80820.Homo_Sapiens.SMC1A.b1,SRR3467176,5047350,74,75,1,check_r1_r2_lengths.o5047350-456
154,HCC1599-WT.GSE116872.Homo_Sapiens.SMC1A.b1,SRR7505611,5047350,36,38,1,check_r1_r2_lengths.o5047350-193
173,HUVEC-Proliferating.GSE100856.Homo_Sapiens.CTCF.b1,SRR6496514,5047350,75,76,1,check_r1_r2_lengths.o5047350-167


### Look at FastQC Reports for Read Lengths

In [18]:
def read_lengths(sample):
    
    os.chdir(sample)
    data_logs = glob.glob('*_fastqc.html')
    pairs = []
    
    for log in data_logs:
        srr_id = ('_').join(log.split('_')[0:2])
        length = -1

        with open(log, errors='ignore') as fr:
            for line in fr:
                info = line.strip()
                if 'Sequence length' in info:
                    length = info.split('Sequence length</td><td>')[1].split('</td><td>')[0].split('</td>')[0]
    
        pairs.append(srr_id)
        pairs.append(length)
    
    os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling')
    return(pairs)

In [44]:
# create a dataframe of log information
samples = glob.glob('results/qc/fastqc/*Homo_Sapiens*/')
length_data = {}
for sample in samples:
    length_data[sample.split('/')[3]] = read_lengths(sample)

length_df = pd.DataFrame.from_dict(length_data, orient='index')
length_df.fillna('').sort_index()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
293T-PDS.GSE128106.Homo_Sapiens.YY1.b1,SRR8707617_2,35-76,SRR8707618_2,35-76,SRR8707618_1,35-76,SRR8707617_1,35-76,,,,,,,,,,,,,,,,
293T-TMPYP4.GSE128106.Homo_Sapiens.YY1.b1,SRR8707620_2,35-76,SRR8707619_2,35-76,SRR8707619_1,35-76,SRR8707620_1,35-76,,,,,,,,,,,,,,,,
293T.GSE128106.Homo_Sapiens.YY1.b1,SRR8707616_2,35-76,SRR8707613_2,35-76,SRR8707614_2,35-76,SRR8707615_1,35-76,SRR8707615_2,35-76,SRR8707614_1,35-76,SRR8707613_1,35-76,SRR8707616_1,35-76,,,,,,,,
A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492724_2,101,SRR12492727_1,101,SRR12492726_2,101,SRR12492723_2,101,SRR12492723_1,101,SRR12492726_1,101,SRR12492727_2,101,SRR12492724_1,101,,,,,,,,
A673-siCT-Dh1-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492728_1,101,SRR12492725_1,101,SRR12492725_2,101,SRR12492728_2,101,,,,,,,,,,,,,,,,
A673-siSA2-Dh6-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492730_2,101,SRR12492729_2,101,SRR12492733_1,101,SRR12492732_2,101,SRR12492732_1,101,SRR12492733_2,101,SRR12492729_1,101,SRR12492730_1,101,,,,,,,,
A673-siSA2-Dh6-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492731_1,101,SRR12492734_1,101,SRR12492734_2,101,SRR12492731_2,101,,,,,,,,,,,,,,,,
A673-siSA2-Dh8-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b1,SRR12492721_2,101,SRR12492735_2,101,SRR12492735_1,101,SRR12492721_1,101,,,,,,,,,,,,,,,,
A673-siSA2-Dh8-72h-D347-D362.GSE156650.Homo_Sapiens.CTCF.b2,SRR12492722_1,101,SRR12492720_1,101,SRR12492720_2,101,SRR12492722_2,101,,,,,,,,,,,,,,,,
A673_SA1m1.GSE133227.Homo_Sapiens.CTCF.b1,SRR9590180_2,35-76,SRR9590180_1,35-76,,,,,,,,,,,,,,,,,,,,
