In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
import tracker
latest_date = tracker.processing_dates[-1]
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [2]:
# setting input and output with jupyter notebook in context 
if 'ipykernel_launcher.py' in sys.argv[0]:
    download_fastqs_tracker_fn = 'results/samplesheets/fastq/{}.fastq.post-check.download_srr_fastqs.with_header.tsv'.format(latest_date) 
    hicpro_samplesheet = 'results/samplesheets/hicpro/{}.hicpro.samplesheet.with_header.tsv'.format(latest_date)
    output_prefix = 'results/samplesheets/fastq/{}.fastq.post-check.split_fastqs'.format(latest_date)
else:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]

## Helper Functions

In [3]:
def get_log(x, mode='index'):
    
    if mode == 'index':
        logs = glob.glob('results/fastqs/raw/logs/download_srr_fastqs.*-{}'.format(x))
        
    elif mode == 'path':
        info = x.split('/')
        sample_name = info[3]
        srr_id  = info[4].split('.')[0].split('_')[0]
        index = df.loc[(df[0] == sample_name) & (df[3] == srr_id)].iloc[0].name
        logs = glob.glob('results/fastqs/raw/logs/download_srr_fastqs.*-{}'.format(index))
        
    else:
        return('Try again.')
    
    return(logs)

def list_to_str(l, sep=' '):
    l = [str(x) for x in l]
    return(sep.join(l))

In [4]:
def read_log(log):
    
    """
    Read the log and extract status information including
    the sample_name, gse_id, status (as indicated by the end message),
    and the download status (as indicate by the "reads written" message).
    """
    
    status = 0
    download = 1
    sample_name = 'check'
    gse_id = 'check'
    date = ''

    # extract job_id and index
    meta = os.path.basename(log)
    meta = meta.split('.')[1].split('-')
    job_id = meta[0].replace('o', '')
    index = meta[1]

    with open(log, errors='ignore') as fr:
        for line in fr:
            info = line.strip()
            if 'sample_name' in info:
                sample_name = info.split()[1]
            elif 'geo_id' in info:
                gse_id = info.split()[1]
            elif 'Ended: split_fastqs' in info:
                status = 1

    return([index, sample_name, gse_id, job_id, status])

In [5]:
def long_to_short_index_list(serial_list):
    
    small_num = serial_list[0]
    serial_list_short = []
    for i in range(1, len(serial_list)):

        prev_num = serial_list[i -1]
        next_num = serial_list[i]

        if (prev_num + 1) != next_num:

            if small_num == prev_num:
                curr_range = '{}'.format(small_num)
                serial_list_short.append(curr_range)
                small_num = next_num

            else:
                curr_range = '{}-{}'.format(small_num, prev_num)
                serial_list_short.append(curr_range)
                small_num = next_num

    if small_num == next_num:
        curr_range = '{}'.format(small_num,)
        serial_list_short.append(curr_range)
    else:
        curr_range = '{}-{}'.format(small_num, next_num)
        serial_list_short.append(curr_range)
        
    return(serial_list_short)

## Check Dependency Jobs

#### load main samplesheet

In [6]:
hicpro_df = pd.read_table(hicpro_samplesheet)
hicpro_df['sample_index'] = hicpro_df.index + 1

#### load the tracker for download_fastqs

In [7]:
download_fastqs_tracker = pd.read_table(download_fastqs_tracker_fn)
grps = download_fastqs_tracker.groupby(['std_sample_name', 'gse_id'])

# checking that all srr fastq's have been downloaded for this sample
download_fastqs_tracker_ready = grps.download_end_status.apply(lambda x: int(len(x) == sum(x)))
download_fastqs_tracker_ready = download_fastqs_tracker_ready.to_frame().reset_index()

In [8]:
# merge the samplesheet with the tracker
hicpro_df = hicpro_df.merge(download_fastqs_tracker_ready, on=['std_sample_name', 'gse_id'])

#### load and add the log information 

In [9]:
logs = glob.glob('results/fastqs/parallel/logs/split_fastqs.o*-*')
log_data = []
for log in logs:
    index, sample_name, gse_id, job_id, status = read_log(log)
    log_data.append([index, sample_name, gse_id, job_id, status, log])
log_df = pd.DataFrame(log_data)
log_df.columns = ['sample_index', 'std_sample_name', 'gse_id', 'job_id', 'end_status', 'log']
log_df.drop('sample_index', inplace=True, axis=1)

In [10]:
log_df

Unnamed: 0,std_sample_name,gse_id,job_id,end_status,log
0,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,4919273,1,results/fastqs/parallel/logs/split_fastqs.o491...
1,GM.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,4919273,1,results/fastqs/parallel/logs/split_fastqs.o491...
2,EBNA2_Plus_Ramos.GSE179755.Homo_Sapiens.H3K27a...,GSE179755,4919273,1,results/fastqs/parallel/logs/split_fastqs.o491...
3,H9.GSE105028.Homo_Sapiens.KLF4.b1,GSE105028,4932620,1,results/fastqs/parallel/logs/split_fastqs.o493...
4,BCBL1.GSE136090.Homo_Sapiens.H3K27ac.b1,GSE136090,4919273,1,results/fastqs/parallel/logs/split_fastqs.o491...
5,Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,4919273,1,results/fastqs/parallel/logs/split_fastqs.o491...
6,lgs101645.GSE116193.Homo_Sapiens.CTCF.b1,GSE116193,4919273,1,results/fastqs/parallel/logs/split_fastqs.o491...
7,mES_100k_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,4919273,1,results/fastqs/parallel/logs/split_fastqs.o491...
8,Crypt-Hnf4DKO.GSE148691.Mus_Musculus.H3K4me3.b1,GSE148691,4932620,1,results/fastqs/parallel/logs/split_fastqs.o493...
9,HaCaT_Unstimulated.GSE151193.Homo_Sapiens.H3K2...,GSE151193,4919273,1,results/fastqs/parallel/logs/split_fastqs.o491...


In [11]:
# merge the samplesheet with the log information
hicpro_df = hicpro_df.merge(log_df, how='left', on=['std_sample_name', 'gse_id'], indicator='merge')
hicpro_df.loc[hicpro_df.end_status.isna(), 'end_status'] = 0
hicpro_df.end_status = hicpro_df.end_status.astype(int)

In [12]:
# get the latest job id for a give sample + srr id combo
hicpro_df = hicpro_df.sort_values('job_id', ascending=False).drop_duplicates(['std_sample_name', 'gse_id'])

# sort back into the sample index values, needed for correctly running qsub
hicpro_df.sort_values('sample_index', inplace=True)

In [13]:
hicpro_df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index,download_end_status,job_id,end_status,log,merge
1,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,Mus_Musculus,1,H3K27ac,MboI,mES_25m_cells,1,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both
2,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,Mus_Musculus,2,H3K27ac,MboI,mES_25m_cells,2,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both
4,mES_500k_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,Mus_Musculus,1,H3K27ac,MboI,mES_500k_cells,3,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both
5,mES_500k_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,Mus_Musculus,2,H3K27ac,MboI,mES_500k_cells,4,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both
6,mES_100k_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,Mus_Musculus,1,H3K27ac,MboI,mES_100k_cells,5,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both
7,mES_100k_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,Mus_Musculus,2,H3K27ac,MboI,mES_100k_cells,6,1,4932620.0,1,results/fastqs/parallel/logs/split_fastqs.o493...,both
8,mES_50k_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,Mus_Musculus,1,H3K27ac,MboI,mES_50k_cells,7,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both
9,mES_50k_cells.GSE101498.Mus_Musculus.H3K27ac.b2,GSE101498,Mus_Musculus,2,H3K27ac,MboI,mES_50k_cells,8,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both
10,GM.GSE101498.Homo_Sapiens.H3K27ac.b1,GSE101498,Homo_Sapiens,1,H3K27ac,MboI,GM,9,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both
11,GM.GSE101498.Homo_Sapiens.H3K27ac.b2,GSE101498,Homo_Sapiens,2,H3K27ac,MboI,GM,10,1,4919273.0,1,results/fastqs/parallel/logs/split_fastqs.o491...,both


#### save a tracker file

In [14]:
save_df = hicpro_df.rename(columns={'job_id': 'split_fastqs_job_id', 
                             'end_status': 'split_fastqs_end_status',
                              'log': 'split_fastqs_log'})

header_output = '{}.with_header.tsv'.format(output_prefix)
save_df.to_csv(header_output, header=True, index=False, sep='\t')

without_header_output = '{}.without_header.tsv'.format(output_prefix)
save_df.to_csv(without_header_output, header=False, index=False, sep='\t')

In [15]:
header_output

'results/samplesheets/fastq/2022.04.09.16.57.fastq.post-check.split_fastqs.with_header.tsv'

## Identifying problem samples

In [16]:
problems = (hicpro_df.end_status != 1)
problems_df = hicpro_df.loc[problems,:]
problems_df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index,download_end_status,job_id,end_status,log,merge
98,HCT116-AuxinNeg.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,Homo_Sapiens,1,RNA-Pol-II,DpnII,HCT116-AuxinNeg,76,0,,0,,left_only
99,HCT116-AuxinPos.GSE179544.Homo_Sapiens.RNA-Pol...,GSE179544,Homo_Sapiens,1,RNA-Pol-II,DpnII,HCT116-AuxinPos,77,0,,0,,left_only


## Rerun problematic samples that are ready

In [17]:
rerun_ready = problems_df.loc[(hicpro_df.download_end_status == 1)]

In [18]:
rerun_ready

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index,download_end_status,job_id,end_status,log,merge


In [19]:
' '.join(rerun_ready.std_sample_name.tolist())

''

In [20]:
rerun_ready.shape

(0, 13)

In [21]:
rerun_ready

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,sample_index,download_end_status,job_id,end_status,log,merge


In [22]:
# convert not run serial IDs to ranges/short format
rerun_indexes = sorted(rerun_ready.sample_index.tolist())
rerun_indexes = long_to_short_index_list(rerun_indexes)

IndexError: list index out of range

In [None]:
rerun_indexes

In [None]:
rerun_indexes

In [None]:
# generate a string version of not_run
final_rerun = ','.join(rerun_indexes)
final_rerun = 'qsub -t {} workflow/scripts/hicpro/split_fastqs.qarray.qsh'.format(final_rerun)

In [None]:
print(final_rerun)