In [27]:
from dcicutils import ff_utils
from functions.wfr import *
from functions.wfr_settings import *
from functions.notebook_functions import *

# Target environment name and the access key used for every portal call below.
my_env = 'data'
my_auth = get_key('koray_data')

# Each recipe entry is [experiment types (URL-encoded), part-3 workflow name].
# Different experiment types use a different workflow for the last step (3).
# Later code inspects the workflow name for the substrings 'nore' and 'nonorm'
# to decide which inputs and checks apply (presumably "no restriction enzyme"
# and "no normalization" -- confirm against the workflow definitions).
recipe = [
    [
        ['in%20situ%20Hi-C', 'dilution%20Hi-C'],
        'hi-c-processing-pairs',
    ],
    [
        ['micro-C', 'DNase%20Hi-C'],
        'hi-c-processing-pairs-nore',
    ],
    [
        ['capture%20Hi-C', 'PLAC-seq'],
        'hi-c-processing-pairs-nonorm',
    ],
    [
        ['CHIA-pet', 'TrAC-loop'],
        'hi-c-processing-pairs-nore-nonorm',
    ],
]

In [33]:
## TODO make sure set_url is compatible with the set_url sets

# Pick which recipe entry (experiment types + part-3 workflow) to run on.
recipe_no = 0
exp_type, step3 = recipe[recipe_no]

# Switches for the operations that change data; each one guards a separate
# kind of write in the processing loop.
add_pc = False    # add processed files to 'other processed files'
add_tag = False   # add the completed process tag if done with all steps
add_wfr = False   # start missing wfrs

# Build the search query: one experiment_type filter per type in the recipe,
# restricted to replicate sets that are released (publicly or to project).
type_filters = '&'.join('experiments_in_set.experiment_type=' + name for name in exp_type)
set_url = ''.join([
    '/search/?',
    type_filters,
    '&type=ExperimentSetReplicate&limit=all',
    '&status=released&status=released%20to%20project',
])

# Alternative queries kept for reference:
#set_url = '/search/?award.project=4DN&lab.display_title=John+Lis%2C+CORNELL&status=in+review+by+lab&type=ExperimentSetReplicate'
#set_url = '/search/?lab.display_title=Todd+Waldman%2C+GEORGETOWN&type=ExperimentSetReplicate'

run_sets = ff_utils.search_metadata(set_url, key=my_auth)

# Running totals used by the processing loop.
counter = 0
completed = 0
completed_acc = []

# Drop the sets that already carry the completed-pipeline tag and report
# how many were found versus how many are already done.
all_sets = len(run_sets)
run_sets = [
    found_set for found_set in run_sets
    if "HiC_Pipeline_0.2.5" not in found_set.get('completed_processes', [])
]
print('{} total number of sets'.format(all_sets),
      '{} sets completed'.format(all_sets - len(run_sets)))

# Walk every remaining set through the three pipeline parts and report status:
#   part 1: 'bwa-mem' on each fastq pair                    -> bam
#   part 2: 'hi-c-processing-bam' on all bams of an exp     -> bam + pairs
#   part 3: <step3> on all pairs files of the set           -> pairs, hic, mcool
# Nothing is written unless the add_wfr / add_pc / add_tag switches are set.
for a_set in run_sets:
    attributions = None
    print()
    counter += 1
    fastqpairs, organism, enzyme, bwa_ref, chrsize_ref, enz_ref, f_size, lab = find_pairs(a_set, my_env)
    set_acc = a_set['accession']

    # skip based on these conditions
    if not bwa_ref or not chrsize_ref:
        print(counter, set_acc, organism, enzyme, 'skipping set with no chrsize/bwa index')
        continue
    # the file-size threshold only applies to workflows without 'nonorm'
    if 'nonorm' not in step3 and f_size < 4:
        print(counter, set_acc, 'skipping small file size', str(f_size))
        continue
    # workflows without 'nore' need the restriction enzyme reference file
    if 'nore' not in step3 and not enz_ref:
        print(counter, set_acc, 'skipping not ready NZ', organism, enzyme)
        continue
    print(counter, set_acc, enzyme, organism, f_size)

    part3 = 'done'
    set_pairs = []
    # cycle through the experiments, skip the ones without usable files
    for exp, exp_pairs in fastqpairs.items():
        if not exp_pairs:
            print(exp, 'does not have any fastq pairs')
            continue

        # ---- Part 1: every fastq pair needs a completed bwa-mem run ----
        exp_bams = []
        part1 = 'done'
        part2 = 'done'

        for pair in exp_pairs:
            # attribution is looked up once per set, from the first fastq seen
            if not attributions:
                attributions = get_attribution(ff_utils.get_metadata(pair[0], key=my_auth))

            step1_result = get_wfr_out(pair[0], 'bwa-mem 0.2.5', my_auth)

            if step1_result['status'] == 'complete':
                exp_bams.append(step1_result['bam'])
            elif step1_result['status'] == 'running':
                part1 = 'not done'
                print('part1 still running')
            else:
                # run is missing or was not successful
                part1 = 'not done'
                if add_wfr:
                    # RUN PART 1
                    inp_f = {'fastq1': pair[0], 'fastq2': pair[1], 'bwa_index': bwa_ref}
                    name_tag = pair[0].split('/')[2] + '_' + pair[1].split('/')[2]
                    run_missing_wfr(step_settings('bwa-mem', organism, attributions), inp_f, name_tag, my_auth, my_env)

        # stop progress to part2
        # (compare by value: `is not` on a string literal tests identity)
        if part1 != 'done':
            print(exp, 'has missing Part1 runs')
            part2 = 'not ready'
            part3 = 'not ready'
            continue
        print(exp, 'part1 complete')

        # ---- Part 2: all bams of the experiment must share one step2 run ----
        all_step2s = []
        for bam in exp_bams:
            step2_result = get_wfr_out(bam, 'hi-c-processing-bam 0.2.5', my_auth)
            all_step2s.append((step2_result['status'], step2_result.get('bam')))
        if len(set(all_step2s)) != 1:
            print('inconsistent step2 run for input bams')
            # this run will be repeated if add_wfr
            step2_result['status'] = 'inconsistent run'

        # check if part 2 is run already, if not start the run
        if step2_result['status'] == 'complete':
            set_pairs.append(step2_result['pairs'])
            if add_pc:
                add_preliminary_processed_files(exp, [step2_result['bam'], step2_result['pairs']], my_auth)
            print(exp, 'part2 complete')
        elif step2_result['status'] == 'running':
            part2 = 'not done'
            part3 = 'not ready'
            print(exp, 'part2 still running')
        else:
            # run is missing, failed, or inconsistent
            part2 = 'not done'
            part3 = 'not ready'
            print(exp, 'is missing Part2')
            if add_wfr:
                # RUN PART 2
                inp_f = {'input_bams': exp_bams, 'chromsize': chrsize_ref}
                run_missing_wfr(step_settings('hi-c-processing-bam', organism, attributions), inp_f, exp, my_auth, my_env)

    # ---- Part 3: only when every experiment finished part 2 ----
    if part3 != 'done':
        print('Part3 not ready')
        continue
    if not set_pairs:
        print('no pairs can be produced from this set')
        continue

    # make sure all input pairs went through the same step3 run
    all_step3s = []
    for a_pair in set_pairs:
        step3_result = get_wfr_out(a_pair, step3 + " 0.2.5", my_auth)
        all_step3s.append((step3_result['status'], step3_result.get('mcool')))
    if len(set(all_step3s)) != 1:
        print('inconsistent step3 run for input pairs')
        # this run will be repeated if add_wfr
        step3_result['status'] = 'inconsistent run'

    # check if part 3 is run already, if not start the run
    if step3_result['status'] == 'complete':
        completed += 1
        completed_acc.append(set_acc)
        # add completed flag to the set
        if add_tag:
            # Keep any tags already on the set instead of overwriting the
            # whole list with this single tag (assumes the patch replaces
            # the field value -- confirm against the portal behaviour).
            existing_tags = a_set.get('completed_processes', [])
            ff_utils.patch_metadata({"completed_processes": existing_tags + ["HiC_Pipeline_0.2.5"]},
                                    obj_id=set_acc, key=my_auth)
        # add processed files to set
        if add_pc:
            add_preliminary_processed_files(set_acc,
                                            [step3_result['pairs'],
                                             step3_result['hic'],
                                             step3_result['mcool']],
                                            my_auth)
        print(set_acc, 'part3 complete')
    elif step3_result['status'] == 'running':
        print('part3 still running')
    else:
        # run is missing, failed, or inconsistent
        print(set_acc, 'is missing Part3')
        if add_wfr:
            # RUN PART 3
            inp_f = {'input_pairs': set_pairs, 'chromsizes': chrsize_ref}
            # Same rule as the enzyme check at the top of the loop: only the
            # workflows without 'nore' take a restriction file. For the
            # current recipe table this matches recipe_no 0 and 2.
            if 'nore' not in step3:
                inp_f['restriction_file'] = enz_ref
            run_missing_wfr(step_settings(step3, organism, attributions), inp_f, set_acc, my_auth, my_env)

# Summary: number of sets with a complete part 3, and their accessions.
print(completed)
print(completed_acc)

103 total number of sets 72 sets completed

1 4DNESRA7OFS4 HindIII human 42
4DNEXZ1ECEC1 part1 complete
4DNEXZ1ECEC1 part2 complete
4DNEXT83K6AM part1 complete
4DNEXT83K6AM part2 complete
4DNESRA7OFS4 is missing Part3



KeyboardInterrupt: 