In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
# use the most recent processing date recorded by the tracker module
latest_date = tracker.processing_dates[-1]

# relative paths used in the rest of the notebook are resolved against this directory
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/kfetter/hichip-db-loop-calling/')

# widen the pandas display limits so large tables and long names are not truncated
pd.set_option(
    'display.max_rows', 1000,
    'display.max_columns', 1000,
    'max_colwidth', 400,
)

In [2]:
# show the processing date; it is used to build the default input/output paths
latest_date

'2022.07.04.10.56'

In [3]:
# Resolve the input samplesheet and the output prefix.
#
# - Inside a Jupyter kernel (sys.argv[0] is ipykernel_launcher.py) the paths
#   are derived from the latest processing date.
# - When run as a script, the first argument is the input samplesheet and the
#   second argument is the output prefix ('.with_header.tsv' is appended to it
#   when the results are written).
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/hicpro/{}.hicpro.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/hicpro/{}.hicpro.post-check.split_fastqs'.format(latest_date)
else:
    input_fn = sys.argv[1]
    # The final cell reads `output_prefix`; previously only `output_fn` was set
    # in this branch, so running as a script failed with a NameError.
    output_prefix = sys.argv[2]
    output_fn = output_prefix  # kept so any code still reading `output_fn` keeps working

## Helper Functions

In [4]:
def read_log(log):
    """Parse one split_fastqs log and report whether the job finished.

    The job id and task index are taken from the log's file name, which is
    expected to look like ``<name>.o<job_id>-<index>`` (for example
    ``split_fastqs.o5038311-1``). The file body is then scanned line by line:

    * a line containing ``sample_name`` sets the sample name to its second
      whitespace-separated token,
    * a line containing ``geo_id`` sets the GSE id to its second token,
    * a line containing ``Ended: split_fastqs`` marks the job as finished.

    Key lines that carry no value (fewer than two tokens) are ignored instead
    of raising ``IndexError``.

    Parameters
    ----------
    log : str
        Path to the log file. Undecodable bytes in the file are ignored.

    Returns
    -------
    list
        ``[index, sample_name, gse_id, job_id, status]``. ``index`` and
        ``job_id`` are strings, ``status`` is 1 when the end marker was found
        and 0 otherwise, and ``sample_name`` / ``gse_id`` stay ``'check'``
        when the log never reports them.
    """
    status = 0
    sample_name = 'check'
    gse_id = 'check'

    # extract job_id and index from "<name>.o<job_id>-<index>"
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                fields = info.split()
                if len(fields) > 1:  # skip a key that has no value
                    sample_name = fields[1]
            elif 'geo_id' in info:
                fields = info.split()
                if len(fields) > 1:  # skip a key that has no value
                    gse_id = fields[1]
            elif 'Ended: split_fastqs' in info:
                status = 1

    return [index, sample_name, gse_id, job_id, status]

## Check Logs

In [5]:
# read the tab-separated samplesheet
df = pd.read_table(input_fn)

# number the rows starting at 1, because sed addresses lines with 1-based indexing
df = df.assign(sample_index=df.index + 1)

In [6]:
# display the loaded samplesheet, including the added sample_index column
df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index
0,A673-siCT-Dh1-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,Homo_Sapiens,1,CTCF,MboI,A673-siCT-Dh1-72h-D347,1
1,A673-siCT-Dh1-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,Homo_Sapiens,2,CTCF,MboI,A673-siCT-Dh1-72h-D347,2
2,A673-siCT-Dh1-72h-D362.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,Homo_Sapiens,1,CTCF,MboI,A673-siCT-Dh1-72h-D362,3
3,A673-siCT-Dh1-72h-D362.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,Homo_Sapiens,2,CTCF,MboI,A673-siCT-Dh1-72h-D362,4
4,A673-siSA2-Dh6-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,Homo_Sapiens,1,CTCF,MboI,A673-siSA2-Dh6-72h-D347,5
5,A673-siSA2-Dh6-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,Homo_Sapiens,2,CTCF,MboI,A673-siSA2-Dh6-72h-D347,6
6,A673-siSA2-Dh6-72h-D362.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,Homo_Sapiens,1,CTCF,MboI,A673-siSA2-Dh6-72h-D362,7
7,A673-siSA2-Dh6-72h-D362.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,Homo_Sapiens,2,CTCF,MboI,A673-siSA2-Dh6-72h-D362,8
8,A673-siSA2-Dh8-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,Homo_Sapiens,1,CTCF,MboI,A673-siSA2-Dh8-72h-D347,9
9,A673-siSA2-Dh8-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,Homo_Sapiens,2,CTCF,MboI,A673-siSA2-Dh8-72h-D347,10


In [7]:
# one samplesheet row per sample, so the row count is the number of samples
print('number of samples visited:', df.shape[0])

number of samples visited: 12


In [8]:
# create a dataframe of log information, one row per split_fastqs log file
output_logs = glob.glob('results/fastqs/parallel/logs/split_fastqs.o*-*')
log_data = []
for out_log in output_logs:
    index, sample_name, gse_id, job_id, status = read_log(out_log)
    log_data.append([index, sample_name, gse_id, job_id, status, os.path.basename(out_log)])

# Naming the columns in the constructor (instead of assigning `.columns`
# afterwards) also works when no log files were found: the result is then an
# empty frame with the expected columns rather than a length-mismatch error.
log_df = pd.DataFrame(
    log_data,
    columns=['sample_index', 'std_sample_name', 'gse_id', 'job_id', 'split_status', 'log'],
)

# the task index is already part of the log file name, so the column is dropped
log_df = log_df.drop(columns='sample_index')

In [9]:
# order the logs by sample name and renumber the rows from 0
log_df = log_df.sort_values('std_sample_name')
log_df = log_df.reset_index(drop=True)
log_df

Unnamed: 0,std_sample_name,gse_id,job_id,split_status,log
0,A673-siCT-Dh1-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-1
1,A673-siCT-Dh1-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-2
2,A673-siCT-Dh1-72h-D362.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-3
3,A673-siCT-Dh1-72h-D362.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-4
4,A673-siSA2-Dh6-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-5
5,A673-siSA2-Dh6-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-6
6,A673-siSA2-Dh6-72h-D362.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-7
7,A673-siSA2-Dh6-72h-D362.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-8
8,A673-siSA2-Dh8-72h-D347.GSE156650.Homo_Sapiens.CTCF.b1,GSE156650,5038311,1,split_fastqs.o5038311-9
9,A673-siSA2-Dh8-72h-D347.GSE156650.Homo_Sapiens.CTCF.b2,GSE156650,5038311,1,split_fastqs.o5038311-10


## Identifying problem samples

In [10]:
# a log counts as a problem unless read_log saw the end marker (split_status == 1)
problems = log_df['split_status'].ne(1)
problems_df = log_df[problems]
# NOTE(review): 'nubmer' is a typo for 'number'; left as-is so the message matches the recorded output
print('nubmer of problem samples:', problems_df.shape[0])

nubmer of problem samples: 0


In [11]:
# display the problem samples with the columns in a fixed order
problems_df[['std_sample_name', 'gse_id', 'job_id', 'split_status', 'log']]

Unnamed: 0,std_sample_name,gse_id,job_id,split_status,log


In [12]:
# Cross-check the fastq files present in every Homo_Sapiens sample directory.
samples = glob.glob('results/fastqs/parallel/*Homo_Sapiens*/')

for sample in samples:
    # List the sample's SRR files through a joined path instead of changing
    # into the directory: the working directory is never modified, so an error
    # mid-loop cannot leave the kernel in the wrong place, and no absolute
    # project path is needed to get back. glob.escape keeps special characters
    # in the directory name from being read as wildcards.
    srr_paths = glob.glob(os.path.join(glob.escape(sample), '*_SRR*'))

    # match on bare file names so the directory part cannot affect the tests below
    srr_files = [os.path.basename(srr_path) for srr_path in srr_paths]

    srr_1_files = [srr_file for srr_file in srr_files if '_1' in srr_file]
    srr_2_files = [srr_file for srr_file in srr_files if '_2' in srr_file]
    srr_R1_files = [srr_file for srr_file in srr_files if '_R1' in srr_file]
    srr_R2_files = [srr_file for srr_file in srr_files if '_R2' in srr_file]

    # every "_1"/"_2" file should have an "_R1"/"_R2" counterpart
    # (presumably the _R1/_R2 files are links to the _1/_2 files - TODO confirm)
    if len(srr_1_files) != len(srr_R1_files) or len(srr_2_files) != len(srr_R2_files):
        print("this sample has linking issues:", sample)

    # the two mates should have the same number of files
    if len(srr_1_files) != len(srr_2_files):
        print("this sample has splitting issues:", sample)

this sample has splitting issues: results/fastqs/parallel/HK2.GSE147646.Homo_Sapiens.H3K27ac.b1/


In [17]:
# write the problem samples as a tab-separated table with a header row and no index column
header_output = '{prefix}.with_header.tsv'.format(prefix=output_prefix)
problems_df.to_csv(
    header_output,
    sep='\t',
    index=False,
    header=True,
)