Permalink
Please sign in to comment.
Browse files
Added a script that generates subsets of scatter intervals from a mas…
…ter list
- Loading branch information...
Showing
with
6,093 additions
and 0 deletions.
- +190 −0 scripts/utilities/intervals/create_scatter_intervals.py
- +714 −0 scripts/utilities/intervals/create_scatter_intervals_demo/demo.interval_list
- +59 −0 scripts/utilities/intervals/create_scatter_intervals_demo/demo_stdout.txt
- +100 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/10_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/11_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/12_of_50/scattered.interval_list
- +109 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/13_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/14_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/15_of_50/scattered.interval_list
- +94 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/16_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/17_of_50/scattered.interval_list
- +91 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/18_of_50/scattered.interval_list
- +118 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/19_of_50/scattered.interval_list
- +99 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/1_of_50/scattered.interval_list
- +115 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/20_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/21_of_50/scattered.interval_list
- +96 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/22_of_50/scattered.interval_list
- +134 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/23_of_50/scattered.interval_list
- +137 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/24_of_50/scattered.interval_list
- +95 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/25_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/26_of_50/scattered.interval_list
- +92 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/27_of_50/scattered.interval_list
- +141 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/28_of_50/scattered.interval_list
- +95 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/29_of_50/scattered.interval_list
- +90 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/2_of_50/scattered.interval_list
- +150 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/30_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/31_of_50/scattered.interval_list
- +96 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/32_of_50/scattered.interval_list
- +93 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/33_of_50/scattered.interval_list
- +92 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/34_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/35_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/36_of_50/scattered.interval_list
- +95 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/37_of_50/scattered.interval_list
- +94 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/38_of_50/scattered.interval_list
- +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/39_of_50/scattered.interval_list
- +129 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/3_of_50/scattered.interval_list
- +97 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/40_of_50/scattered.interval_list
- +119 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/41_of_50/scattered.interval_list
- +122 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/42_of_50/scattered.interval_list
- +94 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/43_of_50/scattered.interval_list
- +92 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/44_of_50/scattered.interval_list
- +96 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/45_of_50/scattered.interval_list
- +144 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/46_of_50/scattered.interval_list
- +104 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/47_of_50/scattered.interval_list
- +96 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/48_of_50/scattered.interval_list
- +93 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/49_of_50/scattered.interval_list
- +91 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/4_of_50/scattered.interval_list
- +94 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/50_of_50/scattered.interval_list
- +97 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/5_of_50/scattered.interval_list
- +109 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/6_of_50/scattered.interval_list
- +96 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/7_of_50/scattered.interval_list
- +90 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/8_of_50/scattered.interval_list
- +97 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/9_of_50/scattered.interval_list
- +54 −0 scripts/utilities/intervals/create_scatter_intervals_demo/scattered_intervals.json
| @@ -0,0 +1,190 @@ | ||
| +# usr/bin/python | ||
| + | ||
| +###################################################################################### | ||
| +# This script creates interval subset lists from a master list for scattering N-ways | ||
| +# | ||
| +# Usage: | ||
| +# python create_scatter_intervals.py \ | ||
| +# demo.interval_list 50 4 output_intervals "demo intervals scattered 50-ways" | ||
| +# | ||
| +###################################################################################### | ||
| + | ||
| +# INPUT REQUIREMENTS | ||
| +# | ||
| +# The script assumes that the master interval list is formatted according to | ||
| +# Picard conventions as defined below: | ||
| +# | ||
| +# Picard-style interval files have a SAM-like header that includes a sequence | ||
| +# dictionary. The intervals are given in the form of: | ||
| +# | ||
| +# <chr> <start> <stop> + <target_name> | ||
| +# | ||
| +# with fields separated by tabs, and the coordinates are 1-based (first position | ||
| +# in the genome is position 1, not position 0). | ||
| +# | ||
| +# Example: | ||
| +# | ||
| +# @HD VN:1.0 SO:coordinate | ||
| +# @SQ SN:1 LN:249250621 AS:GRCh37 [UR:. M5:. SP:.] | ||
| +# @SQ SN:2 LN:243199373 AS:GRCh37 [UR:. M5:. SP:.] | ||
| +# 1 30366 30503 + target_1 | ||
| +# 1 69089 70010 + target_2 | ||
| +# 1 367657 368599 + target_3 | ||
| + | ||
| +# LOGIC | ||
| +# | ||
| +# The script attempts to partition the intervals in the master list into N subsets | ||
| +# of consecutive intervals (set by "desired_N"), balanced so that the subsets all | ||
| +# add up to roughly the same amount of genomic territory. Based on the desired N and | ||
| +# the "wiggle_factor" value, the script defines a maximum length of territory allowed | ||
| +# per subset. It then iterates through all intervals, creating subsets and adding | ||
| +# intervals until the max size is exceeded and a new subset is warranted. | ||
| +# | ||
| +# Note that depending on the master intervals, the wiggle factor may need to be tweaked | ||
| +# in order to achieve exactly N subsets. | ||
| + | ||
| +# OUTPUT | ||
| +# | ||
| +# The script outputs a set of Picard-style interval files in the requested directory | ||
| +# as well as a JSON stub file that can be used as base list for a WDL's inputs JSON. | ||
| + | ||
| + | ||
| +import os | ||
| +import sys | ||
| + | ||
| +# CLI arguments | ||
| +master_list_file = sys.argv[1] | ||
| +desired_N = int(sys.argv[2]) | ||
| +wiggle_factor = int(sys.argv[3]) | ||
| +dir_name = sys.argv[4] | ||
| +comment = "@CO\t"+sys.argv[5]+"\n" | ||
| + | ||
| +# Read in the master list file contents: | ||
| +with open(master_list_file, "r") as master_list: | ||
| + | ||
| + header_lines = [] | ||
| + intervals_list = [] | ||
| + longest_interval = 0 | ||
| + | ||
| + for line in master_list: | ||
| + # store the header lines (starting with @) to serve as output stub | ||
| + if line.startswith("@"): | ||
| + header_lines.append(line) | ||
| + else: | ||
| + line_split = line.split("\t") | ||
| + length = int(line_split[2])-int(line_split[1]) | ||
| + intervals_list.append((line, length)) | ||
| + | ||
| + # keep track of what is the longest interval | ||
| + if length > longest_interval: | ||
| + longest_interval = length | ||
| + | ||
| +print "Number of intervals: "+str(len(intervals_list)) | ||
| +print "Longest interval was: "+str(longest_interval) | ||
| + | ||
| +# Determine what is the total territory covered by intervals | ||
| +total_length = 0 | ||
| +for interval in intervals_list: | ||
| + total_length = total_length + interval[1] | ||
| + | ||
| +print "Total length of covered territory: "+str(total_length) | ||
| + | ||
| +# Determine what should be the theoretical maximum territory per subset | ||
| +# based on the desired N | ||
| +max_length_per_subset = total_length / desired_N | ||
| + | ||
| +print "Theoretical max subset length: "+str(max_length_per_subset) | ||
| + | ||
| +# Distribute intervals to separate files | ||
| + | ||
| +interval_count = 0 | ||
| +batch_count = 0 | ||
| +current_batch = [] | ||
| +current_length = 0 | ||
| +length_so_far = 0 | ||
| +batches_list = [] | ||
| + | ||
| +print "Processing..." | ||
| + | ||
| +def dump_batch(msg): | ||
| + | ||
| + global batch_count | ||
| + global current_batch | ||
| + global current_length | ||
| + global length_so_far | ||
| + global interval_count | ||
| + global batches_list | ||
| + | ||
| + # increment appropriate counters | ||
| + batch_count +=1 | ||
| + length_so_far = length_so_far + current_length | ||
| + # report batch stats | ||
| + print "\t"+str(batch_count)+". \tBatch of "+str(len(current_batch))+"\t| "+str(current_length)+" \t|"+msg+" \t| "+str(interval_count)+" \t| So far "+str(length_so_far)+" \t| Remains "+str(total_length-length_so_far) | ||
| + # store batch | ||
| + batches_list.append(current_batch) | ||
| + # reset everything | ||
| + current_batch = [] | ||
| + current_length = 0 | ||
| + | ||
| +for interval in intervals_list: | ||
| + | ||
| + interval_count +=1 | ||
| + #print interval_count | ||
| + | ||
| + # Is this new interval above the length limit by itself? | ||
| + if interval[1] > max_length_per_subset: | ||
| + dump_batch("close-out") | ||
| + current_batch.append(interval) | ||
| + current_length = current_length + interval[1] | ||
| + dump_batch("godzilla") | ||
| + | ||
| + # Is this new interval putting us above the length limit when added to the batch? | ||
| + elif current_length + interval[1] > max_length_per_subset+max_length_per_subset/wiggle_factor: | ||
| + dump_batch("normal") | ||
| + current_batch.append(interval) | ||
| + current_length = current_length + interval[1] | ||
| + | ||
| + else: | ||
| + current_batch.append(interval) | ||
| + current_length = current_length + interval[1] | ||
| + | ||
| +dump_batch("finalize") | ||
| + | ||
| +print "Done.\nGrouped intervals into "+str(len(batches_list))+" batches." | ||
| + | ||
| +# Write batches to files and compose a JSON stub | ||
| +counter = 0 | ||
| +json_stub = ["{", "\t\"workflow.scattered_calling_intervals\": ["] | ||
| +os.mkdir(dir_name) | ||
| +for batch in batches_list: | ||
| + counter +=1 | ||
| + path = dir_name+"/"+str(counter)+"_of_"+str(len(batches_list)) | ||
| + os.mkdir(path) | ||
| + with open(path+"/scattered.interval_list", "w") as intervals_file: | ||
| + # Write out the header copied from the original | ||
| + for line in header_lines: | ||
| + intervals_file.write("%s" % line) | ||
| + # Add a comment to the header | ||
| + intervals_file.write("%s" % comment) | ||
| + # Write out the intervals | ||
| + for interval in batch: | ||
| + intervals_file.write("%s" % interval[0]) | ||
| + | ||
| + # add the json line | ||
| + json_stub.append("\t\t\"gs://bucket/dir/"+path+"/scattered.interval_list\",") | ||
| +json_stub.append("\t]") | ||
| +json_stub.append("}") | ||
| + | ||
| +print "Wrote "+str(counter)+" interval files to \""+dir_name+"/n_of_N/scattered.interval_list\"" | ||
| + | ||
| +# Write out the json stub | ||
| +with open("scattered_intervals.json", "w") as json_file: | ||
| + for line in json_stub: | ||
| + json_file.write("%s\n" % line) | ||
| + | ||
| +print "Wrote a JSON stub to \"scattered_intervals.json\"" | ||
| + | ||
| + | ||
| + | ||
| + | ||
| + |
Oops, something went wrong.
0 comments on commit
b3eb27d