In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
# Last entry of the project-local tracker's processing dates; it is substituted
# into the dated samplesheet/tracker file names used below.
latest_date = tracker.processing_dates[-1]

# Every relative path in this notebook is resolved against this directory.
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/')

# Raise both pandas display limits so the tracker tables are not truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

In [3]:
# Select inputs and outputs depending on how this code is being executed:
# inside a Jupyter kernel sys.argv[0] is the ipykernel launcher, otherwise the
# file is assumed to run as a script with positional arguments.
if 'ipykernel_launcher.py' not in sys.argv[0]:
    # NOTE(review): this branch only defines input_fn/output_fn, while the
    # cells below read download_fastqs_tracker_fn, hicpro_samplesheet and
    # output_prefix. Script mode will therefore stop with a NameError at the
    # first samplesheet read -- confirm the intended CLI arguments.
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]
else:
    # tracker written by the download_srr_fastqs check for the same date
    download_fastqs_tracker_fn = 'results/samplesheets/fastq/{}.fastq.post-check.download_srr_fastqs.with_header.tsv'.format(latest_date)
    # main HiC-Pro samplesheet for the same date
    hicpro_samplesheet = 'results/samplesheets/hicpro/{}.hicpro.samplesheet.with_header.tsv'.format(latest_date)
    # prefix shared by the two tracker files written further down
    output_prefix = 'results/samplesheets/fastq/{}.fastq.post-check.split_fastqs'.format(latest_date)

## Helper Functions

In [4]:
def get_log(x, mode='index'):
    """
    Find download_srr_fastqs log files for one sample.

    mode='index': `x` is used directly as the suffix after the final '-' in
    the log file name.
    mode='path': `x` is a '/'-separated path; element 3 is taken as the sample
    name and the part of element 4 before the first '.' and '_' as the SRR id.
    The row label of the first matching row in the module-level `df`
    (columns 0 and 3) is then used as the suffix.
    NOTE(review): `df` is not defined in the visible part of this notebook --
    confirm where it comes from before using mode='path'.

    Returns the list of matching paths from glob, or the string 'Try again.'
    for any other mode.
    """
    log_pattern = 'results/fastqs/raw/logs/download_srr_fastqs.*-{}'

    if mode == 'index':
        return glob.glob(log_pattern.format(x))

    if mode == 'path':
        path_parts = x.split('/')
        sample_name = path_parts[3]
        srr_id = path_parts[4].split('.')[0].split('_')[0]
        matching_rows = df.loc[(df[0] == sample_name) & (df[3] == srr_id)]
        index = matching_rows.iloc[0].name
        return glob.glob(log_pattern.format(index))

    return 'Try again.'

def list_to_str(l, sep=' '):
    """Return the str() of every item in `l`, joined with `sep`."""
    return sep.join(map(str, l))

In [5]:
def read_log(log):
    """
    Parse one split_fastqs log file into a status record.

    The job id and the index are taken from the file name, which is expected
    to have a base name of the form "<name>.o<job_id>-<index>". The sample
    name and GSE id are taken from the second whitespace-separated token of
    lines containing 'sample_name' and 'geo_id' (the last such line wins).
    The status is 1 only when a line containing 'Ended: split_fastqs' is
    present, otherwise 0.

    Parameters
    ----------
    log : str
        Path to the log file.

    Returns
    -------
    list
        [index, sample_name, gse_id, job_id, status]; index and job_id are
        strings, status is an int. sample_name and gse_id stay at the
        placeholder 'check' when the log does not provide them.
    """

    # placeholders reported when the log never provides the field
    status = 0
    sample_name = 'check'
    gse_id = 'check'

    # extract job_id and index from the file name
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            fields = info.split()
            if 'sample_name' in info:
                # a keyword line without a value is skipped instead of
                # raising IndexError
                if len(fields) > 1:
                    sample_name = fields[1]
            elif 'geo_id' in info:
                if len(fields) > 1:
                    gse_id = fields[1]
            elif 'Ended: split_fastqs' in info:
                status = 1

    return [index, sample_name, gse_id, job_id, status]

In [6]:
def long_to_short_index_list(serial_list):
    """
    Collapse a sequence of integers into range strings.

    Consecutive values (each exactly one more than the previous) are merged
    into 'start-end'; a value with no consecutive neighbour is emitted on its
    own, e.g. [1, 2, 4, 6, 7] -> ['1-2', '4', '6-7'].

    Parameters
    ----------
    serial_list : sequence of int
        Values in the order they should be scanned; callers are expected to
        pass them sorted.

    Returns
    -------
    list of str
        The compressed ranges; an empty input gives an empty list.
    """

    # nothing to compress (previously raised IndexError)
    if len(serial_list) == 0:
        return []

    def _format_range(first, last):
        # a run of length one is written without the dash
        if first == last:
            return '{}'.format(first)
        return '{}-{}'.format(first, last)

    serial_list_short = []
    run_start = serial_list[0]
    prev_num = serial_list[0]

    for next_num in serial_list[1:]:
        # a gap closes the current run and starts a new one
        if (prev_num + 1) != next_num:
            serial_list_short.append(_format_range(run_start, prev_num))
            run_start = next_num
        prev_num = next_num

    # close the final run; this also covers single-element input, which
    # previously failed because the loop body never ran
    serial_list_short.append(_format_range(run_start, prev_num))

    return serial_list_short

## Check Dependency Jobs

#### load main samplesheet

In [7]:
# Read the samplesheet and number its rows from 1 in file order; sample_index
# is later turned into the task ranges of the qsub command.
hicpro_df = pd.read_table(hicpro_samplesheet).assign(
    sample_index=lambda sheet: sheet.index + 1
)

#### load the tracker for download_fastqs

In [8]:
download_fastqs_tracker = pd.read_table(download_fastqs_tracker_fn)
grps = download_fastqs_tracker.groupby(['std_sample_name', 'gse_id'])

# A sample is ready (1) only when every one of its tracker rows has
# download_end_status summing to the row count, i.e. all downloads ended OK;
# otherwise it is flagged 0.
download_fastqs_tracker_ready = (
    grps['download_end_status']
    .apply(lambda statuses: int(sum(statuses) == len(statuses)))
    .to_frame()
    .reset_index()
)

In [9]:
# Attach the per-sample download readiness flag to the samplesheet. This is an
# inner join, so samplesheet rows without a tracker entry are dropped.
hicpro_df = pd.merge(
    hicpro_df,
    download_fastqs_tracker_ready,
    on=['std_sample_name', 'gse_id'],
)

#### load and add the log information 

In [10]:
# Collect one record per split_fastqs log file found on disk.
logs = glob.glob('results/fastqs/parallel/logs/split_fastqs.o*-*')
log_data = []
for log in logs:
    index, sample_name, gse_id, job_id, status = read_log(log)
    log_data.append([index, sample_name, gse_id, job_id, status, log])

# Name the columns at construction time: with no log files the frame is then
# empty but still has the expected columns, instead of failing with a length
# mismatch when the column names are assigned afterwards.
log_df = pd.DataFrame(
    log_data,
    columns=['sample_index', 'std_sample_name', 'gse_id', 'job_id', 'end_status', 'log'],
)

# The index parsed from the file name is not carried forward; the samplesheet
# supplies its own sample_index column.
log_df = log_df.drop(columns='sample_index')

In [11]:
# inspect the records parsed from the split_fastqs logs
log_df

Unnamed: 0,std_sample_name,gse_id,job_id,end_status,log
0,JHUEM-14.GSE137936.Homo_Sapiens.H3K27ac.b1,GSE137936,5027383,1,results/fastqs/parallel/logs/split_fastqs.o502...
1,check,check,5027383,0,results/fastqs/parallel/logs/split_fastqs.o502...
2,check,check,5027383,0,results/fastqs/parallel/logs/split_fastqs.o502...
3,GM12878.GSE115524.Homo_Sapiens.CTCF.b1,GSE115524,5027383,0,results/fastqs/parallel/logs/split_fastqs.o502...
4,PAEC-Control.GSE152900.Homo_Sapiens.H3K27ac.b2,GSE152900,5027383,1,results/fastqs/parallel/logs/split_fastqs.o502...
5,Primary-Foreskin-Keratinocytes-Day0.GSE158642....,GSE158642,5028244,1,results/fastqs/parallel/logs/split_fastqs.o502...
6,OCI-AML3-DeltaSTAG2.GSE111537.Homo_Sapiens.CTC...,GSE111537,5027383,1,results/fastqs/parallel/logs/split_fastqs.o502...
7,SUCCS1-siEA.GSE180194.Homo_Sapiens.H3K27ac.b1,GSE180194,5028244,1,results/fastqs/parallel/logs/split_fastqs.o502...
8,OCI-AML3-DeltaSTAG2.GSE111537.Homo_Sapiens.CTC...,GSE111537,5027291,0,results/fastqs/parallel/logs/split_fastqs.o502...
9,NSD2-Low-TKO.GSE131651.Homo_Sapiens.CTCF.b2,GSE131651,5027291,1,results/fastqs/parallel/logs/split_fastqs.o502...


In [12]:
# Left-join the log records onto the samplesheet so samples without any log
# are kept; the 'merge' column records which side each row came from.
hicpro_df = hicpro_df.merge(
    log_df,
    on=['std_sample_name', 'gse_id'],
    how='left',
    indicator='merge',
)

# Samples with no log have a missing end_status; count them as not finished.
hicpro_df['end_status'] = hicpro_df['end_status'].fillna(0).astype(int)

In [13]:
# Keep only the row from the latest job for each sample (std_sample_name +
# gse_id). job_id values are parsed from log file names as strings, so they
# are ranked through a temporary numeric copy; sorting the strings directly is
# lexicographic and picks the wrong job once ids differ in digit count.
# Rows without a job id (no log) sort last.
hicpro_df = (
    hicpro_df
    .assign(_job_id_num=pd.to_numeric(hicpro_df['job_id'], errors='coerce'))
    .sort_values('_job_id_num', ascending=False)
    .drop_duplicates(['std_sample_name', 'gse_id'])
    .drop(columns='_job_id_num')
    # sort back into the sample index values, needed for correctly running qsub
    .sort_values('sample_index')
)

In [14]:
# inspect the samplesheet after adding download and split_fastqs status
hicpro_df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index,download_end_status,job_id,end_status,log,merge
0,HAVIC.GSE154512.Homo_Sapiens.H3K27ac.b1,GSE154512,Homo_Sapiens,1,H3K27ac,MboI,HAVIC,1,1,4932620.0,1,results/fastqs/parallel/logs/split_fastqs.o493...,both
1,HCT116-AuxinNeg.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,Homo_Sapiens,1,RNA-Pol-II,DpnII,HCT116-AuxinNeg,2,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
2,HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,Homo_Sapiens,1,RNA-Pol-II,DpnII,HCT116-AuxinPos,3,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
3,IMR90-Proliferating.GSE100856.Homo_Sapiens.CTC...,GSE100856,Homo_Sapiens,1,CTCF,HindIII,IMR90-Proliferating,4,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
4,IMR90-Senescent.GSE100856.Homo_Sapiens.CTCF.b1,GSE100856,Homo_Sapiens,1,CTCF,HindIII,IMR90-Senescent,5,1,5028244.0,1,results/fastqs/parallel/logs/split_fastqs.o502...,both
6,HUVEC-Proliferating.GSE100856.Homo_Sapiens.CTC...,GSE100856,Homo_Sapiens,1,CTCF,HindIII,HUVEC-Proliferating,6,1,5028244.0,1,results/fastqs/parallel/logs/split_fastqs.o502...,both
9,HUVEC-Senescent.GSE100856.Homo_Sapiens.CTCF.b1,GSE100856,Homo_Sapiens,1,CTCF,HindIII,HUVEC-Senescent,7,1,5028244.0,1,results/fastqs/parallel/logs/split_fastqs.o502...,both
10,Hela-S3-Control.GSE108869.Homo_Sapiens.CTCF.b1,GSE108869,Homo_Sapiens,1,CTCF,MboI,Hela-S3-Control,8,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
11,Hela-S3-Control.GSE108869.Homo_Sapiens.CTCF.b2,GSE108869,Homo_Sapiens,2,CTCF,MboI,Hela-S3-Control,9,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
12,Hela-S3-OE.GSE108869.Homo_Sapiens.CTCF.b1,GSE108869,Homo_Sapiens,1,CTCF,MboI,Hela-S3-OE,10,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both


#### save a tracker file

In [15]:
# Output paths: the same table is written once with and once without a header.
header_output = '{}.with_header.tsv'.format(output_prefix)
without_header_output = '{}.without_header.tsv'.format(output_prefix)

# Prefix the log-derived columns with the step name before saving.
save_df = hicpro_df.rename(
    columns={
        'job_id': 'split_fastqs_job_id',
        'end_status': 'split_fastqs_end_status',
        'log': 'split_fastqs_log',
    }
)

save_df.to_csv(header_output, sep='\t', index=False, header=True)
save_df.to_csv(without_header_output, sep='\t', index=False, header=False)

## Identifying problem samples

In [17]:
# Any sample whose split_fastqs end status is not 1 is treated as a problem.
problems = hicpro_df['end_status'].ne(1)
problems_df = hicpro_df[problems]
problems_df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index,download_end_status,job_id,end_status,log,merge
1,HCT116-AuxinNeg.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,Homo_Sapiens,1,RNA-Pol-II,DpnII,HCT116-AuxinNeg,2,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
2,HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,Homo_Sapiens,1,RNA-Pol-II,DpnII,HCT116-AuxinPos,3,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
3,IMR90-Proliferating.GSE100856.Homo_Sapiens.CTC...,GSE100856,Homo_Sapiens,1,CTCF,HindIII,IMR90-Proliferating,4,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
10,Hela-S3-Control.GSE108869.Homo_Sapiens.CTCF.b1,GSE108869,Homo_Sapiens,1,CTCF,MboI,Hela-S3-Control,8,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
11,Hela-S3-Control.GSE108869.Homo_Sapiens.CTCF.b2,GSE108869,Homo_Sapiens,2,CTCF,MboI,Hela-S3-Control,9,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
12,Hela-S3-OE.GSE108869.Homo_Sapiens.CTCF.b1,GSE108869,Homo_Sapiens,1,CTCF,MboI,Hela-S3-OE,10,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
27,Nalm6.GSE115492.Homo_Sapiens.H3K27ac.b1,GSE115492,Homo_Sapiens,1,H3K27ac,MboI,Nalm6,18,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
31,GM12878.GSE115524.Homo_Sapiens.CTCF.b2,GSE115524,Homo_Sapiens,2,CTCF,MboI,GM12878,20,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
40,MB157-WT.GSE116869.Homo_Sapiens.SMC1A.b1,GSE116869,Homo_Sapiens,1,SMC1A,MboI,MB157-WT,26,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
41,MB157-GSI.GSE116869.Homo_Sapiens.SMC1A.b1,GSE116869,Homo_Sapiens,1,SMC1A,MboI,MB157-GSI,27,0,5027291.0,0,results/fastqs/parallel/logs/split_fastqs.o502...,both


## Rerun problematic samples that are ready

In [18]:
# Problem samples whose fastq downloads all finished can be rerun now. The mask
# is built from problems_df itself, so it matches the frame it filters and
# does not rely on index alignment with hicpro_df.
rerun_ready = problems_df.loc[problems_df.download_end_status == 1]

In [19]:
# inspect the problem samples that are ready to be rerun
rerun_ready

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index,download_end_status,job_id,end_status,log,merge
150,HEC-1-B-WT.GSE148958.Homo_Sapiens.V5.b1,GSE148958,Homo_Sapiens,1,V5,MboI,HEC-1-B-WT,88,1,5028244,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
155,HEC-1-B-eRNA-LNA.GSE148958.Homo_Sapiens.V5.b1,GSE148958,Homo_Sapiens,1,V5,MboI,HEC-1-B-eRNA-LNA,89,1,5028244,0,results/fastqs/parallel/logs/split_fastqs.o502...,both
158,HEC-1-B-SPT6_LNA.GSE148958.Homo_Sapiens.V5.b1,GSE148958,Homo_Sapiens,1,V5,MboI,HEC-1-B-SPT6_LNA,90,1,5028244,0,results/fastqs/parallel/logs/split_fastqs.o502...,both


In [22]:
# Compress the sorted sample indexes of the rerun-ready samples into short
# range strings (e.g. 88, 89, 90 becomes '88-90').
rerun_indexes = long_to_short_index_list(
    sorted(rerun_ready['sample_index'].tolist())
)

In [24]:
# inspect the compressed index ranges
rerun_indexes

['88-90']

In [25]:
# Build the qsub array-job command: the comma-joined index ranges go after -t.
final_rerun = 'qsub -t {}%4 workflow/scripts/hicpro/split_fastqs.qarray.qsh'.format(
    ','.join(rerun_indexes)
)

In [26]:
# print (rather than display) so the command appears without quotes
print(final_rerun)

qsub -t 88-90%4 workflow/scripts/hicpro/split_fastqs.qarray.qsh
