In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from IPython.display import display
import time 
os.chdir('/mnt/BioAdHoc/Groups/vd-ay/hichip-db-loop-calling/')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [2]:
# setting input and output with jupyter notebook in context 
if 'ipykernel_launcher.py' in sys.argv[0]:
    input_fn = 'results/samplesheets/hicpro/2022.03.30.hicpro.samplesheet.with_header.tsv'   
    output_fn = 'results/samplesheets/fastq/2022.03.30.fastq.post-check.run_hicpro'
    split_fastqs = 'results/samplesheets/fastq/2022.03.30.fastq.post-check.split_fastqs.with_header.tsv'
else:
    input_fn = sys.argv[1]
    output_fn = sys.argv[2]

## Helper Functions

In [3]:
def get_log(x, mode='index'):
    
    if mode == 'index':
        logs = glob.glob('results/fastqs/raw/logs/download_srr_fastqs.*-{}'.format(x))
        
    elif mode == 'path':
        info = x.split('/')
        sample_name = info[3]
        srr_id  = info[4].split('.')[0].split('_')[0]
        index = df.loc[(df[0] == sample_name) & (df[3] == srr_id)].iloc[0].name
        logs = glob.glob('results/fastqs/raw/logs/download_srr_fastqs.*-{}'.format(index))
        
    else:
        return('Try again.')
    
    return(logs)

def list_to_str(l, sep=' '):
    l = [str(x) for x in l]
    return(sep.join(l))

In [4]:
def read_hicpro_step2_log(log):
    
    """
    Read the log and extract status information including
    the sample_name, srr_id, status (as indicated by the end message),
    and the download status (as indicate by the "reads written" message).
    """
        
    # initializing the statuses to 0
    hicpro_status = 0

    if os.path.exists(log):
        with open(log, errors='ignore') as fr:
            for line in fr:
                info = line.strip()
                if 'Run ICE Normalization ...' in info:
                    hicpro_status = 1
    return(hicpro_status)

In [13]:
def long_to_short_index_list(serial_list):
    
    if len(serial_list) == 0:
        serial_list_short = []
        
    elif len(serial_list) == 1:
        serial_list_short = [str(serial_list[0])]
        
    else:
        small_num = serial_list[0]
        serial_list_short = []
        for i in range(1, len(serial_list)):

            prev_num = serial_list[i -1]
            next_num = serial_list[i]

            if (prev_num + 1) != next_num:

                if small_num == prev_num:
                    curr_range = '{}'.format(small_num)
                    serial_list_short.append(curr_range)
                    small_num = next_num

                else:
                    curr_range = '{}-{}'.format(small_num, prev_num)
                    serial_list_short.append(curr_range)
                    small_num = next_num

        if small_num == next_num:
            curr_range = '{}'.format(small_num,)
            serial_list_short.append(curr_range)
        else:
            curr_range = '{}-{}'.format(small_num, next_num)
            serial_list_short.append(curr_range)

    return(serial_list_short)

## Check HiCPro Logs

In [6]:
# load the data
df = pd.read_table(input_fn)
df['sample_index'] = df.index + 1

In [7]:
log_data = []
for i, sr in df.iterrows():
    
    s1_log = glob.glob('results/hicpro/{}/HiCpro_s1_.e*'.format(sr.std_sample_name))
    s2_log = glob.glob('results/hicpro/{}/HiCpro_s2_.e*'.format(sr.std_sample_name))
    
    # check if s1 steps have started
    if len(s1_log) > 0:
        hicpro_start_status = 1
    else:
        hicpro_start_status = 0
    
    # check if s2 steps have ended 
    if len(s2_log) == 1:
        s2_log = s2_log[0]
        hicpro_end_status = read_hicpro_step2_log(s2_log)
    else:
        hicpro_end_status = 0 
        
    log_data.append([sr.std_sample_name, hicpro_start_status, hicpro_end_status])

log_df = pd.DataFrame(log_data)
log_df.columns = ['std_sample_name', 'hicpro_start_status', 'hicpro_ended']    

In [8]:
# merge the log information
df = df.merge(log_df, how='left', on=['std_sample_name'], indicator='merge')

In [9]:
df.head()

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,ready_for_hicpro,sample_index,hicpro_start_status,hicpro_ended,merge
0,mES_25m_cells.GSE101498.Mus_Musculus.H3K27ac.b1,GSE101498,Mus_Musculus,1,H3K27ac,MboI,mES_25m_cells,0,1,1,1,both
1,HCT116.GSE173699.Homo_Sapiens.H3K27ac.b1,GSE173699,Homo_Sapiens,1,H3K27ac,MboI,HCT116,1,2,1,1,both
2,BC1.GSE136090.Homo_Sapiens.H3K27ac.b1,GSE136090,Homo_Sapiens,1,H3K27ac,MboI,BC1,1,3,1,1,both
3,BCBL1.GSE136090.Homo_Sapiens.H3K27ac.b1,GSE136090,Homo_Sapiens,1,H3K27ac,MboI,BCBL1,1,4,1,1,both
4,BC3.GSE136090.Homo_Sapiens.H3K27ac.b1,GSE136090,Homo_Sapiens,1,H3K27ac,MboI,BC3,1,5,1,1,both


## Identifying problem samples

In [10]:
problems = (df.ready_for_hicpro == 1) & (df.hicpro_start_status == 0)
problems_df = df.loc[problems,:]
problems_df

Unnamed: 0,std_sample_name,gse_id,organism,bio_rep,antibody_target,restriction_enzyme,sample_name,ready_for_hicpro,sample_index,hicpro_start_status,hicpro_ended,merge
5,HK2.GSE147646.Homo_Sapiens.H3K27ac.b1,GSE147646,Homo_Sapiens,1,H3K27ac,MboI,HK2,1,6,0,0,both


## Re-run problematic/ready samples

In [14]:
# convert not run serial IDs to ranges/short format
rerun_indexes = sorted(problems_df.sample_index.tolist())
rerun_indexes_sort = long_to_short_index_list(rerun_indexes)

In [15]:
rerun_indexes_sort

[]

In [16]:
# generate a string version of not_run
final_rerun = ','.join(rerun_indexes_sort)
final_rerun = 'qsub -t {} workflow/scripts/hicpro/run_hicpro.qarray.sh'.format(final_rerun)

In [17]:
print(final_rerun)

qsub -t  workflow/scripts/hicpro/run_hicpro.qarray.sh


In [18]:
bash_indexes = ' '.join([str(x) for x in rerun_indexes])
bash_rerun = 'job_ids="{}"\n'.format(bash_indexes)
bash_rerun += 'for i in $job_ids; do bash workflow/scripts/hicpro/run_hicpro.qarray.sh $i; done'

In [19]:
print(bash_rerun)

job_ids=""
for i in $job_ids; do bash workflow/scripts/hicpro/run_hicpro.qarray.sh $i; done


In [20]:
problem_samples = problems_df.std_sample_name.values.tolist()
problem_samples = ' '.join(problem_samples)
problem_samples

''

## Notes on problems

**Updates 2022.04.03**
 - HK2.GSE147646.Homo_Sapiens.H3K27ac.b1 - splitting didn't work correctly. Missing 05_*
 - HARA.GSE147854.Homo_Sapiens.H3K27ac.b1 - now rerunning, fixed dependency as well
 - lgs102943.GSE116193.Homo_Sapiens.H3K27ac.b1 - now rerunning, had to fix dependency

Putting the whole list for 2022.04.03

- HK2.GSE147646.Homo_Sapiens.H3K27ac.b1
- HARA.GSE147854.Homo_Sapiens.H3K27ac.b1
- lgs102943.GSE116193.Homo_Sapiens.H3K27ac.b1
- lgs301315.GSE116193.Homo_Sapiens.CTCF.b1
- lgs102943.GSE116193.Homo_Sapiens.CTCF.b1
- lgs301315.GSE116193.Homo_Sapiens.H3K27ac.b1
- lgs102580.GSE116193.Homo_Sapiens.H3K27ac.b1
- lgs101645.GSE116193.Homo_Sapiens.H3K27ac.b1
- EBNA2_Plus_Ramos.GSE179755.Homo_Sapiens.H3K27ac.b1
- Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b1
- mES_50k_cells.GSE101498.Mus_Musculus.H3K27ac.b1
- mES_500k_cells.GSE101498.Mus_Musculus.H3K27ac.b1
- Naive_Tcells.GSE101498.Homo_Sapiens.H3K27ac.b2
- Treg.GSE101498.Homo_Sapiens.H3K27ac.b1
- lgs301283.GSE116193.Homo_Sapiens.CTCF.b1