In [2]:
from dcicutils import ff_utils
from functions.wfr import *
from functions.wfr_settings import *
from functions.notebook_functions import *

# tibanna = Tibanna(env=env)
my_env = 'data'
my_auth = get_key('koray_data')

# different types of exps use different steps at the last step(3).
recipe = [
    [['in%20situ%20Hi-C', 'dilution%20Hi-C'], 'hi-c-processing-pairs'],
    [['micro-C',          'DNase%20Hi-C'],    'hi-c-processing-pairs-nore'],
    [['capture%20Hi-C',   'PLAC-seq'],        'hi-c-processing-pairs-nonorm'],
    [['CHIA-pet',         'TrAC-loop'],       'hi-c-processing-pairs-nore-nonorm'],
    [['TCC'], 'hi-c-processing-pairs'],
    
]

# To Do assign core 8 and more memory ("instance_type": "c4.4xlarge",) 

In [3]:
## TODO make sure set_url is compatible with the set_url sets

#Choose the recipe element to run the pipeline on
recipe_no = 4
exp_type, step3 = recipe[recipe_no]

#Choose the type of operations you want
add_pc = False    #add processed files to 'other processed files
add_tag = False   #add the completed process tag if done with all steps
add_wfr = False   #start missing wfrs


set_url = '/search/?'+ \
          '&'.join(['experiments_in_set.experiment_type='+i for i in exp_type])+ \
          '&type=ExperimentSetReplicate&limit=all' + \
          '&status=pre-release&status=released&status=released%20to%20project'
print(set_url)
#set_url = '/search/?award.project=ENCODE&experimentset_type=replicate&type=ExperimentSetReplicate'

#set_url = '/search/?award.project=4DN&experiments_in_set.biosample.biosource.individual.organism.name=fruit-fly&experimentset_type=replicate&type=ExperimentSetReplicate'
#print set_url

all_sets = ff_utils.search_metadata(set_url , key=my_auth)
counter = 0
completed = 0
completed_acc = []

uncompleted_sets = [i for i in all_sets if "HiC_Pipeline_0.2.5"  not in i.get('completed_processes', [])]
run_sets = [i for i in uncompleted_sets if "HiC_Pipeline_0.2.5-skipped-small-set"  not in i.get('completed_processes', [])]

print(len(all_sets), 'total number of sets')
print(len(all_sets)-len(uncompleted_sets), 'sets completed')
print(len(uncompleted_sets)-len(run_sets), 'sets skipped for small size')
print(len(run_sets), 'ready for processing')

for a_set in run_sets: 
    attributions = None
    print()
    counter += 1
        
    fastqpairs, organism, enzyme, bwa_ref, chrsize_ref, enz_ref, f_size, lab = find_pairs(a_set, my_env)
    # skip based on these conditions
    if not bwa_ref or not chrsize_ref:
        print(counter, a_set['accession'], organism, enzyme, 'skipping set with no chrsize/bwa index')
        continue
    if 'nonorm' not in step3:
        if f_size < 4:
            print(counter, a_set['accession'], 'skipping small file size', str(f_size))
            continue
    if 'nore' not in step3:
        if not enz_ref:
            print(counter, a_set['accession'], 'skipping not ready NZ', organism, enzyme)
            continue
    print(counter, a_set['accession'],enzyme, organism,f_size)
    part3 = 'done'
    set_pairs = []        
    # cycle through the experiments, skip the ones without usable files
    for exp in fastqpairs.keys():
        if not fastqpairs.get(exp):
            print(exp, 'does not have any fastq pairs')
            continue
        # Check Part 1 and See if all are okay
        exp_bams = []
        part1 = 'done'
        part2 = 'done'
        
        for pair in fastqpairs[exp]:
            #############
            if not attributions:
                attributions = get_attribution(ff_utils.get_metadata(pair[0], key = my_auth))
            step1_result = get_wfr_out(pair[0], 'bwa-mem 0.2.5', my_auth)
 
            # if successful
            if step1_result['status'] == 'complete':
                exp_bams.append(step1_result['bam'])
                continue
            # if still running
            elif step1_result['status'] == 'running':
                part1 = 'not done'
                print('part1 still running')
                continue
            # if run is not successful
            else:
                part1 = 'not done'
                if add_wfr:
                    # RUN PART 1
                    inp_f = {'fastq1':pair[0], 'fastq2':pair[1], 'bwa_index':bwa_ref}
                    name_tag = pair[0].split('/')[2]+'_'+pair[1].split('/')[2]
                    run_missing_wfr(step_settings('bwa-mem', organism, attributions), inp_f, name_tag, my_auth, my_env)
        # stop progress to part2 
        if part1 is not 'done':
            print(exp, 'has missing Part1 runs')
            part2 = 'not ready'
            part3 = 'not ready'
            continue
        print(exp, 'part1 complete')
           
        #make sure all input bams went through same last step2
        all_step2s = []
        for bam in exp_bams:
            step2_result = get_wfr_out(bam, 'hi-c-processing-bam 0.2.5', my_auth)
            all_step2s.append((step2_result['status'],step2_result.get('bam')))
        if len(list(set(all_step2s))) != 1:
            print('inconsistent step2 run for input bams')
            # this run will be repeated if add_wfr
            step2_result['status'] = 'inconsistent run'
            
        #check if part 2 is run already, it not start the run
        # if successful
        if step2_result['status'] == 'complete':
            set_pairs.append(step2_result['pairs'])
            if add_pc:
                add_preliminary_processed_files(exp, [step2_result['bam'],step2_result['pairs']], my_auth)
            print(exp, 'part2 complete')
            continue
        # if still running
        elif step2_result['status'] == 'running':
            part2 = 'not done'
            part3 = 'not ready'
            print(exp, 'part2 still running')
            continue
        # if run is not successful
        else:
            part2 = 'not done'
            part3 = 'not ready'
            print(exp, 'is missing Part2')
            if add_wfr:
                # RUN PART 2
                inp_f = {'input_bams':exp_bams, 'chromsize':chrsize_ref}           
                run_missing_wfr(step_settings('hi-c-processing-bam', organism, attributions), inp_f, exp, my_auth, my_env) 

                
    if part3 is not 'done':
        print('Part3 not ready')
        continue
    if not set_pairs:
        print('no pairs can be produced from this set')
        continue

    #make sure all input bams went through same last step3
    all_step3s = []
    for a_pair in set_pairs:
        step3_result = get_wfr_out(a_pair, step3 + " 0.2.5", my_auth)
        all_step3s.append((step3_result['status'], step3_result.get('mcool')))
    if len(list(set(all_step3s))) != 1:
        print('inconsistent step3 run for input pairs')
        # this run will be repeated if add_wfr
        step3_result['status'] = 'inconsistent run'
    #check if part 3 is run already, it not start the run
    # if successful
    if step3_result['status'] == 'complete':
        completed += 1
        completed_acc.append(a_set['accession'])
        #add competed flag to experiment
        if add_tag:
            ff_utils.patch_metadata({"completed_processes":["HiC_Pipeline_0.2.5"]}, obj_id=a_set['accession'] , key=my_auth)
        # add processed files to set
        if add_pc:
            add_preliminary_processed_files(a_set['accession'], 
                                            [step3_result['pairs'],
                                             step3_result['hic'],
                                             step3_result['mcool']], 
                                            my_auth)
        print(a_set['accession'], 'part3 complete')
    # if still running
    elif step3_result['status'] == 'running':
        print('part3 still running')
        continue
    # if run is not successful
    else:
        print(a_set['accession'], 'is missing Part3')
        if add_wfr:
            # RUN PART 3
            inp_f = {'input_pairs':set_pairs, 'chromsizes':chrsize_ref}
            if recipe_no in [0,2,4]:
                inp_f['restriction_file'] = enz_ref
            run_missing_wfr(step_settings(step3, organism, attributions), inp_f, a_set['accession'], my_auth, my_env)

print(completed)
print(completed_acc)

/search/?experiments_in_set.experiment_type=TCC&type=ExperimentSetReplicate&limit=all&status=pre-release&status=released&status=released%20to%20project
3 total number of sets
0 sets completed
0 sets skipped for small size
3 ready for processing

1 4DNESIZMF58U HindIII mouse 21
4DNEXQAO3GNK part1 complete
4DNEXQAO3GNK part2 complete
4DNEXACCWMEN part1 complete
4DNEXACCWMEN part2 complete
part3 still running

2 4DNESLUJ3INP HindIII mouse 18
4DNEX1Y5NYNB part1 complete
4DNEX1Y5NYNB part2 complete
4DNEXGWOOI8I part1 complete
4DNEXGWOOI8I part2 complete
4DNESLUJ3INP part3 complete

3 4DNESAOU4AAQ HindIII mouse 20
4DNEXN25ESPT part1 complete
4DNEXN25ESPT part2 complete
4DNEXQ3PS6HH part1 complete
4DNEXQ3PS6HH part2 complete
4DNESAOU4AAQ is missing Part3
1
['4DNESLUJ3INP']



3 4DNESH4UTRNL DpnII mouse 800  -  4DNEX4KRGMAQ is missing Part2 - Fails at runtaskawsem
https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_pony:hi-c-processing-bam_4DNEX4KRGMAQ9b484528-5651-4d72-a271-a9b37a42ab05



In [18]:
# Move files from opc to pc
from dcicutils import ff_utils
from functions.notebook_functions import *
from functions.wfr import *

recipe_no = 1
exp_type, step3 = recipe[recipe_no]
action = False
move_title = 'HiC Processing Pipeline - Preliminary Files'

set_url = '/search/?'+ \
          '&'.join(['experiments_in_set.experiment_type='+i for i in exp_type])+ \
          '&type=ExperimentSetReplicate&limit=all' + \
          '&status=released&status=released%20to%20project'

print(set_url)
set_url = '/search/?award.project=4DN&experimentset_type=replicate&lab.display_title=Chuck+Murry%2C+UW&status=pre-release&type=ExperimentSetReplicate'

# exp
# set_url = '/search/?'+ \
#           '&'.join(['experiment_type='+i for i in exp_type])+ \
#           '&type=Experiment&limit=all' + \
#           '&status=released&status=released%20to%20project'

all_sets = ff_utils.search_metadata(set_url , key=my_auth)

ready_sets_1 = [i for i in all_sets if "HiC_Pipeline_0.2.5" in i.get('completed_processes', [])]
print(len(ready_sets_1))
ready_sets_2 = []
for a_set in ready_sets_1:
    if a_set.get('other_processed_files'):
        print('a')
        print(a_set['accession'])
        if move_title in [i['title'] for i in a_set['other_processed_files']]:
            print('b')
            if a_set.get('processed_files'):
                print('c')
                print('WARN' ,a_set['accession'], 'has items in processed files, skipping ')
                continue
            else:
                ready_sets_2.append(a_set)
print(len(ready_sets_2), 'items are ready')

/search/?experiments_in_set.experiment_type=micro-C&experiments_in_set.experiment_type=DNase%20Hi-C&type=ExperimentSetReplicate&limit=all&status=released&status=released%20to%20project
4
a
4DNESIQ6IPCO
b
a
4DNESVW165TO
b
a
4DNES8BLXVP5
b
a
4DNESLOL2OR2
b
4 items are ready


In [20]:
# move other processed files to processed files field
action = True
def move_opc_to_pc(resp, move_title, con_key):
    opc = resp.get('other_processed_files')
    pc = resp.get('processed_files')
    # if processed_files field already has values, exit
    if pc:
        if opc:
            print('There are files in processed_files field, expected empty', resp['accession'])
            return False
        else:
            print('it is possible that move already happened, no opc but pc', resp['accession'])
    # see if there are other_processed_files to move
    if opc:
        titles = [i['title'] for i in opc]
        if move_title in titles:
            print(resp['accession'], 'files will move')
            move_item = [i for i in opc if i['title'] == move_title]
            assert len(move_item) == 1
            assert move_item[0]['type'] == 'preliminary'
            new_pc = move_item[0]['files']
            new_opc = [i for i in opc if i['title'] != move_title]
            # Time to patch
            patch_data = {}
            add_on = ""
            #if there is something left in opc, patch it, if not delete field
            if new_opc:
                patch_data['other_processed_files'] = opc
            else:
                add_on = 'delete_fields=other_processed_files'
            # patch with processed files
            patch_data['processed_files'] = new_pc
            if action:
                ff_utils.patch_metadata(patch_data, resp['uuid'], key = con_key, add_on = add_on)
                # update status of pc to status of set or exp
                release_files(resp['uuid'], new_pc, con_key)
            return True
        else:
            return False

        
set_w_apf = 0
exp_w_apf = 0
counter = 0
move_title = 'HiC Processing Pipeline - Preliminary Files'

print(len(ready_sets_2), 'experiment sets in scope')
for a_set in ready_sets_2:
    set_resp = ff_utils.get_metadata(a_set['uuid'],key=my_auth, add_on='frame=raw')
    counter += 1
    print(counter, set_resp['accession'])
    exps = set_resp['experiments_in_set']
    res =  move_opc_to_pc(set_resp, move_title, my_auth)
    if res:
        set_w_apf += 1
        print(set_resp['accession'], 'moved to pc')
  
    for exp in exps:
        exp_resp = ff_utils.get_metadata(exp, key=my_auth, add_on='frame=raw')
        res_e =  move_opc_to_pc(exp_resp,move_title,my_auth)
        if res_e:
            exp_w_apf += 1
            print(exp_resp['accession'], 'moved to pc')
    print()

print(set_w_apf)
print(exp_w_apf)

4 experiment sets in scope
1 4DNESIQ6IPCO
4DNESIQ6IPCO files will move
4DNESIQ6IPCO moved to pc
4DNEXXARC33I files will move
4DNEXXARC33I moved to pc
4DNEXQ83RPBN files will move
4DNEXQ83RPBN moved to pc

2 4DNESVW165TO
4DNESVW165TO files will move
4DNESVW165TO moved to pc
4DNEX4K24I1K files will move
4DNEX4K24I1K moved to pc
4DNEX31BLR1Y files will move
4DNEX31BLR1Y moved to pc

3 4DNES8BLXVP5
4DNES8BLXVP5 files will move
4DNES8BLXVP5 moved to pc
4DNEXGNEOZPK files will move
4DNEXGNEOZPK moved to pc
4DNEXX4FXLWH files will move
4DNEXX4FXLWH moved to pc

4 4DNESLOL2OR2
4DNESLOL2OR2 files will move
4DNESLOL2OR2 moved to pc
4DNEXBMMAFHR files will move
4DNEXBMMAFHR moved to pc
4DNEX6WMRN9E files will move
4DNEX6WMRN9E moved to pc

4
8
