In [3]:
from core.utils import Tibanna
from dcicutils import ff_utils
from core.utils import run_workflow
from datetime import datetime
from core.wfr import *

env = 'data'
tibanna = Tibanna(env=env)

ff = ff_utils.fdn_connection(key={"default" : tibanna.ff_keys})
exclude_miseq = True

# auth = ff_utils.get_authentication_with_server({},env)
# print auth

In [4]:
import time

# for a given experiment set and some parameters like instrument
# print set of files and their partA hic workflow status
# if there are one that are running report the number of running cases
# if there are file pairs that don't have a corresponding part A, report them separately

out_n = "This is an output file of the Hi-C processing pipeline"
int_n = "This is an intermediate file in the HiC processing pipeline"

def step_settings(seq, my_organism, lab):
    genome = ""
    mapper = {'human':'GRCh38','mouse':'GRCm38'}
    genome = mapper.get(my_organism)
    
    wf_dict =[{
        'wf_name': 'bwa-mem',
        'wf_uuid': '3feedadc-50f9-4bb4-919b-09a8b731d0cc',
        'parameters': {"nThreads": 16},
        'custom_pf_fields': {
            'out_bam': {
                'genome_assembly': genome,
                'file_type': 'intermediate file',
                'description': int_n,
                'contributing_labs': [lab,]}
        }},
        {
        'wf_name': 'hi-c-processing-bam',
        'wf_uuid': '023bfb3e-9a8b-42b9-a9d4-216079526f68',
        'parameters': {"nthreads_merge": 16, "nthreads_parse_sort": 16},
        'custom_pf_fields': {
            'annotated_bam': {
                'genome_assembly': genome,
                'file_type': 'alignment',
                'description': out_n,
                'contributing_labs': [lab,]},
            'filtered_pairs': {
                'genome_assembly': genome,
                'file_type': 'contact list-replicate',
                'description': out_n,
                'contributing_labs': [lab,]}
        }},
        {
        'wf_name': 'hi-c-processing-pairs',
        'wf_uuid': 'c9e0e6f7-b0ed-4a42-9466-cadc2dd84df0',
        'parameters': {"nthreads": 1, "maxmem": "32g"},
        'custom_pf_fields': {
            'cooler_normvector': {
                'genome_assembly': genome,
                'file_type': 'juicebox norm vector',
                'description': out_n,
                'contributing_labs': [lab,]},
            'hic': {
                'genome_assembly': genome,
                'file_type': 'contact matrix',
                'description': out_n,
                'contributing_labs': [lab,]},
            'mcool': {
                'genome_assembly': genome,
                'file_type': 'contact matrix',
                'description': out_n,
                'contributing_labs': [lab,]},
            'merged_pairs': {
                'genome_assembly': genome,
                'file_type': 'contact list-combined',
                'description': out_n,
                'contributing_labs': [lab,]}
        }}]
    
    return wf_dict[seq]
  
    
# url for hic exps
exp_types = ['in%20situ%20Hi-C', 'dilution%20Hi-C']
set_url = '/search/?'+'&'.join(['experiments_in_set.experiment_type='+i for i in exp_types])+ \
          '&type=ExperimentSetReplicate&limit=all'
run_sets = ff_utils.search_metadata(set_url , ff_env=env)

add_pc = False
add_rel = False
add_wfr = True

counter = 0
completed = 0
completed_acc = []
all_sets = len(run_sets)
print(str(all_sets)+' total number of sets')

run_sets = [i for i in run_sets if "HiC_Pipeline_0.2.5"  not in i.get('completed_processes', [])]

print(str(all_sets-len(run_sets))+ ' sets completed')


for a_set in run_sets: 
    counter += 1
    print
    fastqpairs, organism, enzyme, bwa_ref, chrsize_ref, enz_ref, f_size = find_pairs(a_set, exclude_miseq, env, tibanna)
    # skip some cases
    if not bwa_ref or not chrsize_ref:
        print counter, a_set['accession'], organism, enzyme, 'skipping set with not chrsize/bwa index'
        continue
    if not enz_ref:
        print counter, a_set['accession'], 'skipping not ready NZ', organism, enzyme
        continue
    if f_size < 15:
        print counter, a_set['accession'], 'skipping small file size', str(f_size) 
        continue
    print counter, a_set['accession']
    print enzyme, organism
    part3 = 'done'
    list_release = []
    set_pairs = []        
    # cycle through the experiments
    for exp in fastqpairs.keys():
        if not fastqpairs.get(exp):
            print(exp, 'does not have any fastq pairs')
            continue
        # Check Part 1 and See if all are okay
        exp_bams = []
        part1 = 'done'
        part2 = 'done'
        for pair in fastqpairs[exp]:
            #############
            bam1 = get_wfr_out(pair[0], 'bwa-mem 0.2.5', 'bam', env)
            bam2 = get_wfr_out(pair[1], 'bwa-mem 0.2.5', 'bam', env)
            # if run is not successful
            if bam1.startswith('no') or not bam1 or bam1 != bam2:
                part1 = 'not ready'
                if add_wfr:
                    if not bwa_index:
                        print 'not yet usable', organism
                        continue
                    inp_f = {'fastq1':pair[0], 'fastq2':pair[1], 'bwa_index':bwa_ref}
                    name_tag = pair[0].split('/')[2]+'_'+pair[1].split('/')[2]
                    run_missing_wfr(step_settings(0, organism), inp_f, name_tag, env, tibanna)
            elif bam1 == 'running':
                part1 = 'still running'
                print('part1 still running')
            # if successful
            else:
                exp_bams.append(bam1)
                list_release.append(bam1)
        # stop progress to part2 
        if part1 is not 'done':
            print exp, 'has missing Part1 runs'
            part2 = 'not ready'
            part3 = 'not ready'
            continue
        print exp, 'part1 complete'
        #check if part 2 is run already, it not start the run
        exp_com_bam = []
        exp_pairs = []
        for bam in exp_bams:
            com_bam = get_wfr_out(bam, 'hi-c-processing-bam 0.2.5', 'bam', env)
            pairs = get_wfr_out(bam, 'hi-c-processing-bam 0.2.5', 'pairs', env)
            # try to run if missing
            if pairs.startswith('no') or not pairs:
                part2 = 'not ready'
                part3 = 'not ready'
                
            elif pairs == 'running':
                print(bam)
                part2 = 'still running'
                part3 = 'not ready'
                
            else:
                exp_com_bam.append(com_bam)
                exp_pairs.append(pairs)
                
        # if still running, skip to next experiment
        if part2 == 'still running':
            print('part2 still running')
            continue
        
        # make sure all bams went through the same wfr and produces same file
        if part2 != 'done' or len(list(set(exp_com_bam))) != 1 or len(list(set(exp_pairs))) !=1:
            print exp, 'Part2 did not complete'
            part3 = 'not ready' 
        
            if add_wfr:
                if not chrsize_ref:
                    print 'not yet usable', organism
                    continue
                # make sure no duplicates
                inp_f = {'input_bams':exp_bams, 'chromsize':chrsize_ref}           
                run_missing_wfr(step_settings(1, organism), inp_f, exp, env, tibanna)   
            continue
            
        # add bam and pairs to exp proc file
        list_release.extend([exp_com_bam[0],exp_pairs[0]])
        if add_pc:
            add_processed_files(exp, [exp_com_bam[0],exp_pairs[0]], env)
        
        print exp, 'part2 complete'
        set_pairs.append(exp_pairs[0])
    
    if part3 != 'done':
        print 'Part3 not ready'
        continue
    
    if not set_pairs:
        print 'no pairs can be produced from this set'
        continue
        
    merged_pairs = []
    for set_pair in set_pairs:
        merged_pair = get_wfr_out(set_pair, 'hi-c-processing-pairs 0.2.5', 'pairs', env)
        hic = get_wfr_out(set_pair, 'hi-c-processing-pairs 0.2.5', 'hic', env)
        mcool = get_wfr_out(set_pair, 'hi-c-processing-pairs 0.2.5', 'mcool', env)
        normvec = get_wfr_out(set_pair, 'hi-c-processing-pairs 0.2.5', 'normvector_juicerformat', env)
        if merged_pair.startswith('no') or not merged_pair:
            part3 = 'not ready'
            break
        elif merged_pair == 'running':
            part3 = 'still running'
            break
        else:
            merged_pairs.append(merged_pair)
    
    
    # if part3 is still running report it, and skip the rest of the script
    if part3 == 'still running':
        print 'part3', part3
        continue        
                
    if part3 != 'done' or len(list(set(merged_pairs))) != 1:
        print a_set['accession'], 'is missing Part3'
        
        # if it is not run, and add_wfr is true, go for it, then skip the rest of the script
        if add_wfr:
            if not chrsize_ref:
                print 'not yet usable', organism
                continue

            if not enz_ref:
                print 'restriction enzyme not ready for', organism, enzyme
                continue
            inp_f = {'input_pairs':set_pairs, 'chromsizes':chrsize_ref, 'restriction_file': enz_ref} 
            run_missing_wfr(step_settings(2, organism), inp_f, a_set['accession'], env, tibanna)
        continue
    #####
    #add competed flag to experiment
    if add_pc and add_rel:
        ff_utils.patch_metadata({"completed_processes":["HiC_Pipeline_0.2.5"]}, obj_id=a_set['accession'] , ff_env=env)
    
    # add processed files to set
    list_release.extend([merged_pair, hic, mcool, normvec])
    if add_pc:
        add_processed_files(a_set['accession'], [merged_pair, hic, mcool, normvec], env)
    
    #release files and wfrs
    if add_rel:
        release_files(a_set['accession'], list(set(list_release)), env)
    
    completed += 1
    completed_acc.append(a_set['accession'])
    print a_set['accession'], 'part3 complete'

    
print completed
print completed_acc

95 total number of sets
66 sets completed

1 4DNES1QUXG92
MboI mouse
4DNEX4J4Z9WE part1 complete
4DNEX4J4Z9WE part2 complete
4DNEXRI9JYW6 part1 complete
4DNEXRI9JYW6 part2 complete
4DNES1QUXG92 is missing Part3


TypeError: step_settings() takes exactly 3 arguments (2 given)

In [None]:
95 total number of sets
66 sets completed

1 4DNES1QUXG92
MboI mouse
4DNEX4J4Z9WE part1 complete
4DNEX4J4Z9WE part2 complete
4DNEXRI9JYW6 part1 complete
4DNEXRI9JYW6 part2 complete
4DNES1QUXG92 is missing Part3

2 4DNESWDLDMGN
MboI mouse
4DNEXA7ZMMK3 part1 complete
4DNEXA7ZMMK3 part2 complete
4DNEXVGS8FQQ part1 complete
4DNEXVGS8FQQ part2 complete
4DNESWDLDMGN is missing Part3

3 4DNESQMM4EBN
MboI mouse
4DNEXU87YSSY part1 complete
4DNEXU87YSSY part2 complete
4DNEXLWC9JX7 part1 complete
4DNEXLWC9JX7 part2 complete
4DNESQMM4EBN is missing Part3

4 4DNESKKSKG7Y
MboI mouse
4DNEXB65W9GS part1 complete
4DNEXB65W9GS part2 complete
4DNEXNT6XXHX has missing Part1 runs
Part3 not ready

(u'/files-fastq/4DNFI4O4CPDH/', 'does not have a pair')
(u'/files-fastq/4DNFICUHBSN5/', 'does not have a pair')
(u'/files-fastq/4DNFI6MXCEPW/', 'does not have a pair')
(u'/files-fastq/4DNFIEWPBMCR/', 'does not have a pair')
(u'/files-fastq/4DNFIHUVBRKH/', 'does not have a pair')
(u'/files-fastq/4DNFIPIVCJCP/', 'does not have a pair')
(u'/files-fastq/4DNFIJEUX4GL/', 'does not have a pair')
(u'/files-fastq/4DNFIOGYWJTW/', 'does not have a pair')
(u'/files-fastq/4DNFIIMOII76/', 'does not have a pair')
(u'/files-fastq/4DNFI4J8WZZU/', 'does not have a pair')
(u'/files-fastq/4DNFI4HMA9X6/', 'does not have a pair')
(u'/files-fastq/4DNFIJ5ILLUG/', 'does not have a pair')
5 4DNESQEND1X5 skipping small file size 0

(u'/files-fastq/4DNFIA461Y3W/', 'does not have a pair')
(u'/files-fastq/4DNFIPKJQVNX/', 'does not have a pair')
(u'/files-fastq/4DNFIX3EAAM5/', 'does not have a pair')
(u'/files-fastq/4DNFIENL1TT3/', 'does not have a pair')
(u'/files-fastq/4DNFIEMD37BR/', 'does not have a pair')
(u'/files-fastq/4DNFI1H7FBSD/', 'does not have a pair')
(u'/files-fastq/4DNFIQHRGDWB/', 'does not have a pair')
(u'/files-fastq/4DNFIWUHSWJZ/', 'does not have a pair')
(u'/files-fastq/4DNFIXGR9HO7/', 'does not have a pair')
(u'/files-fastq/4DNFIAUFS4OS/', 'does not have a pair')
(u'/files-fastq/4DNFIO2LPCU7/', 'does not have a pair')
(u'/files-fastq/4DNFIHQCPQIU/', 'does not have a pair')
(u'/files-fastq/4DNFIFEJNUNM/', 'does not have a pair')
(u'/files-fastq/4DNFIGWRLEZK/', 'does not have a pair')
(u'/files-fastq/4DNFILGWLYRC/', 'does not have a pair')
(u'/files-fastq/4DNFIS4FORWQ/', 'does not have a pair')
(u'/files-fastq/4DNFIG1WOWVE/', 'does not have a pair')
(u'/files-fastq/4DNFICXZS411/', 'does not have a pair')
6 4DNESKHB7GPW skipping small file size 1

7 4DNESH4UTRNL
DpnII mouse
4DNEX4KRGMAQ part1 complete
/files-processed/4DNFIZ32E4IH/
/files-processed/4DNFIC2GWTU7/
/files-processed/4DNFIV1M1SN8/
/files-processed/4DNFIEMLVQQK/
/files-processed/4DNFI537T8V6/
part2 still running
4DNEXOHPSJTN part1 complete
4DNEXOHPSJTN part2 complete
Part3 not ready

8 4DNESNYBDSLY
DpnII mouse
4DNEXDSNPZOU part1 complete
4DNEXDSNPZOU part2 complete
4DNEXH1YN2XB part1 complete
4DNEXH1YN2XB part2 complete
4DNESNYBDSLY part3 complete

9 4DNES54YB6TQ
DpnII mouse
4DNEXMM7MN7V part1 complete
4DNEXMM7MN7V part2 complete
4DNEXBSGTVWJ part1 complete
4DNEXBSGTVWJ part2 complete
4DNES54YB6TQ part3 complete

10 4DNES4DGHDMX
NcoI human
4DNEXX13N3PT part1 complete
4DNEXX13N3PT part2 complete
4DNES4DGHDMX is missing Part3

11 4DNES6V4HVDE skipping small file size 14

12 4DNESEYETNMX skipping small file size 0

13 4DNESX29AMHF skipping small file size 0

14 4DNESAZ12B8V skipping small file size 0

15 4DNESE9NXACG skipping small file size 0

16 4DNES4JNDDVX skipping small file size 0

17 4DNESDP9ECMN skipping small file size 0

18 4DNES21VTCK2 skipping small file size 0

19 4DNESJNPEKZD skipping small file size 0

20 4DNESETP2XUO skipping small file size 0

21 4DNESCI5VSPV skipping small file size 0

22 4DNES3RHDBBR skipping small file size 1

23 4DNESZ2PVZWR skipping small file size 0

24 4DNESSCS4D46 skipping small file size 0

25 4DNESBJ1KYYH skipping small file size 9

26 4DNESGW9DHH3 skipping not ready NZ human MspI

27 4DNESXTSP7H7 skipping not ready NZ human NcoI_MspI_BspHI

28 4DNES6YMW2WC skipping not ready NZ mouse HindIII

29 4DNESNZZR2VD
HindIII human
4DNEXVHF97YT part1 complete
4DNEXVHF97YT part2 complete
4DNEX17F64JD part1 complete
4DNEX17F64JD part2 complete
(u'4DNEX5SKVSJX', 'does not have any fastq pairs')
4DNEXJ78M1F7 part1 complete
4DNEXJ78M1F7 part2 complete
4DNESNZZR2VD is missing Part3
2
[u'4DNESNYBDSLY', u'4DNES54YB6TQ']

​