Permalink
Browse files

Added a script that generates subsets of scatter intervals from a mas…

…ter list
  • Loading branch information...
1 parent 2cdd412 commit b3eb27d6af237211831ba3257f9ce41363d90160 @vdauwera vdauwera committed Jan 7, 2017
Showing with 6,093 additions and 0 deletions.
  1. +190 −0 scripts/utilities/intervals/create_scatter_intervals.py
  2. +714 −0 scripts/utilities/intervals/create_scatter_intervals_demo/demo.interval_list
  3. +59 −0 scripts/utilities/intervals/create_scatter_intervals_demo/demo_stdout.txt
  4. +100 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/10_of_50/scattered.interval_list
  5. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/11_of_50/scattered.interval_list
  6. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/12_of_50/scattered.interval_list
  7. +109 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/13_of_50/scattered.interval_list
  8. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/14_of_50/scattered.interval_list
  9. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/15_of_50/scattered.interval_list
  10. +94 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/16_of_50/scattered.interval_list
  11. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/17_of_50/scattered.interval_list
  12. +91 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/18_of_50/scattered.interval_list
  13. +118 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/19_of_50/scattered.interval_list
  14. +99 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/1_of_50/scattered.interval_list
  15. +115 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/20_of_50/scattered.interval_list
  16. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/21_of_50/scattered.interval_list
  17. +96 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/22_of_50/scattered.interval_list
  18. +134 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/23_of_50/scattered.interval_list
  19. +137 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/24_of_50/scattered.interval_list
  20. +95 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/25_of_50/scattered.interval_list
  21. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/26_of_50/scattered.interval_list
  22. +92 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/27_of_50/scattered.interval_list
  23. +141 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/28_of_50/scattered.interval_list
  24. +95 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/29_of_50/scattered.interval_list
  25. +90 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/2_of_50/scattered.interval_list
  26. +150 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/30_of_50/scattered.interval_list
  27. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/31_of_50/scattered.interval_list
  28. +96 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/32_of_50/scattered.interval_list
  29. +93 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/33_of_50/scattered.interval_list
  30. +92 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/34_of_50/scattered.interval_list
  31. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/35_of_50/scattered.interval_list
  32. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/36_of_50/scattered.interval_list
  33. +95 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/37_of_50/scattered.interval_list
  34. +94 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/38_of_50/scattered.interval_list
  35. +90 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/39_of_50/scattered.interval_list
  36. +129 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/3_of_50/scattered.interval_list
  37. +97 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/40_of_50/scattered.interval_list
  38. +119 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/41_of_50/scattered.interval_list
  39. +122 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/42_of_50/scattered.interval_list
  40. +94 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/43_of_50/scattered.interval_list
  41. +92 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/44_of_50/scattered.interval_list
  42. +96 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/45_of_50/scattered.interval_list
  43. +144 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/46_of_50/scattered.interval_list
  44. +104 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/47_of_50/scattered.interval_list
  45. +96 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/48_of_50/scattered.interval_list
  46. +93 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/49_of_50/scattered.interval_list
  47. +91 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/4_of_50/scattered.interval_list
  48. +94 −0 ...ilities/intervals/create_scatter_intervals_demo/output_intervals/50_of_50/scattered.interval_list
  49. +97 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/5_of_50/scattered.interval_list
  50. +109 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/6_of_50/scattered.interval_list
  51. +96 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/7_of_50/scattered.interval_list
  52. +90 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/8_of_50/scattered.interval_list
  53. +97 −0 ...tilities/intervals/create_scatter_intervals_demo/output_intervals/9_of_50/scattered.interval_list
  54. +54 −0 scripts/utilities/intervals/create_scatter_intervals_demo/scattered_intervals.json
@@ -0,0 +1,190 @@
+# usr/bin/python
+
+######################################################################################
+# This script creates interval subset lists from a master list for scattering N-ways
+#
+# Usage:
+# python create_scatter_intervals.py \
+# demo.interval_list 50 4 output_intervals "demo intervals scattered 50-ways"
+#
+######################################################################################
+
+# INPUT REQUIREMENTS
+#
+# The script assumes that the master interval list is formatted according to
+# Picard conventions as defined below:
+#
+# Picard-style interval files have a SAM-like header that includes a sequence
+# dictionary. The intervals are given in the form of:
+#
+# <chr> <start> <stop> + <target_name>
+#
+# with fields separated by tabs, and the coordinates are 1-based (first position
+# in the genome is position 1, not position 0).
+#
+# Example:
+#
+# @HD VN:1.0 SO:coordinate
+# @SQ SN:1 LN:249250621 AS:GRCh37 [UR:. M5:. SP:.]
+# @SQ SN:2 LN:243199373 AS:GRCh37 [UR:. M5:. SP:.]
+# 1 30366 30503 + target_1
+# 1 69089 70010 + target_2
+# 1 367657 368599 + target_3
+
+# LOGIC
+#
+# The script attempts to partition the intervals in the master list into N subsets
+# of consecutive intervals (set by "desired_N"), balanced so that the subsets all
+# add up to roughly the same amount of genomic territory. Based on the desired N and
+# the "wiggle_factor" value, the script defines a maximum length of territory allowed
+# per subset. It then iterates through all intervals, creating subsets and adding
+# intervals until the max size is exceeded and a new subset is warranted.
+#
+# Note that depending on the master intervals, the wiggle factor may need to be tweaked
+# in order to achieve exactly N subsets.
+
+# OUTPUT
+#
+# The script outputs a set of Picard-style interval files in the requested directory
+# as well as a JSON stub file that can be used as base list for a WDL's inputs JSON.
+
+
+import os
+import sys
+
+# CLI arguments
+master_list_file = sys.argv[1]
+desired_N = int(sys.argv[2])
+wiggle_factor = int(sys.argv[3])
+dir_name = sys.argv[4]
+comment = "@CO\t"+sys.argv[5]+"\n"
+
+# Read in the master list file contents:
+with open(master_list_file, "r") as master_list:
+
+ header_lines = []
+ intervals_list = []
+ longest_interval = 0
+
+ for line in master_list:
+ # store the header lines (starting with @) to serve as output stub
+ if line.startswith("@"):
+ header_lines.append(line)
+ else:
+ line_split = line.split("\t")
+ length = int(line_split[2])-int(line_split[1])
+ intervals_list.append((line, length))
+
+ # keep track of what is the longest interval
+ if length > longest_interval:
+ longest_interval = length
+
+print "Number of intervals: "+str(len(intervals_list))
+print "Longest interval was: "+str(longest_interval)
+
+# Determine what is the total territory covered by intervals
+total_length = 0
+for interval in intervals_list:
+ total_length = total_length + interval[1]
+
+print "Total length of covered territory: "+str(total_length)
+
+# Determine what should be the theoretical maximum territory per subset
+# based on the desired N
+max_length_per_subset = total_length / desired_N
+
+print "Theoretical max subset length: "+str(max_length_per_subset)
+
+# Distribute intervals to separate files
+
+interval_count = 0
+batch_count = 0
+current_batch = []
+current_length = 0
+length_so_far = 0
+batches_list = []
+
+print "Processing..."
+
+def dump_batch(msg):
+
+ global batch_count
+ global current_batch
+ global current_length
+ global length_so_far
+ global interval_count
+ global batches_list
+
+ # increment appropriate counters
+ batch_count +=1
+ length_so_far = length_so_far + current_length
+ # report batch stats
+ print "\t"+str(batch_count)+". \tBatch of "+str(len(current_batch))+"\t| "+str(current_length)+" \t|"+msg+" \t| "+str(interval_count)+" \t| So far "+str(length_so_far)+" \t| Remains "+str(total_length-length_so_far)
+ # store batch
+ batches_list.append(current_batch)
+ # reset everything
+ current_batch = []
+ current_length = 0
+
+for interval in intervals_list:
+
+ interval_count +=1
+ #print interval_count
+
+ # Is this new interval above the length limit by itself?
+ if interval[1] > max_length_per_subset:
+ dump_batch("close-out")
+ current_batch.append(interval)
+ current_length = current_length + interval[1]
+ dump_batch("godzilla")
+
+ # Is this new interval putting us above the length limit when added to the batch?
+ elif current_length + interval[1] > max_length_per_subset+max_length_per_subset/wiggle_factor:
+ dump_batch("normal")
+ current_batch.append(interval)
+ current_length = current_length + interval[1]
+
+ else:
+ current_batch.append(interval)
+ current_length = current_length + interval[1]
+
+dump_batch("finalize")
+
+print "Done.\nGrouped intervals into "+str(len(batches_list))+" batches."
+
+# Write batches to files and compose a JSON stub
+counter = 0
+json_stub = ["{", "\t\"workflow.scattered_calling_intervals\": ["]
+os.mkdir(dir_name)
+for batch in batches_list:
+ counter +=1
+ path = dir_name+"/"+str(counter)+"_of_"+str(len(batches_list))
+ os.mkdir(path)
+ with open(path+"/scattered.interval_list", "w") as intervals_file:
+ # Write out the header copied from the original
+ for line in header_lines:
+ intervals_file.write("%s" % line)
+ # Add a comment to the header
+ intervals_file.write("%s" % comment)
+ # Write out the intervals
+ for interval in batch:
+ intervals_file.write("%s" % interval[0])
+
+ # add the json line
+ json_stub.append("\t\t\"gs://bucket/dir/"+path+"/scattered.interval_list\",")
+json_stub.append("\t]")
+json_stub.append("}")
+
+print "Wrote "+str(counter)+" interval files to \""+dir_name+"/n_of_N/scattered.interval_list\""
+
+# Write out the json stub
+with open("scattered_intervals.json", "w") as json_file:
+ for line in json_stub:
+ json_file.write("%s\n" % line)
+
+print "Wrote a JSON stub to \"scattered_intervals.json\""
+
+
+
+
+
Oops, something went wrong.

0 comments on commit b3eb27d

Please sign in to comment.