Permalink
b3eb27d Jan 15, 2017
191 lines (151 sloc) 6.27 KB
# usr/bin/python
######################################################################################
# This script creates interval subset lists from a master list for scattering N-ways
#
# Usage:
# python create_scatter_intervals.py \
# demo.interval_list 50 4 output_intervals "demo intervals scattered 50-ways"
#
######################################################################################
# INPUT REQUIREMENTS
#
# The script assumes that the master interval list is formatted according to
# Picard conventions as defined below:
#
# Picard-style interval files have a SAM-like header that includes a sequence
# dictionary. The intervals are given in the form of:
#
# <chr> <start> <stop> + <target_name>
#
# with fields separated by tabs, and the coordinates are 1-based (first position
# in the genome is position 1, not position 0).
#
# Example:
#
# @HD VN:1.0 SO:coordinate
# @SQ SN:1 LN:249250621 AS:GRCh37 [UR:. M5:. SP:.]
# @SQ SN:2 LN:243199373 AS:GRCh37 [UR:. M5:. SP:.]
# 1 30366 30503 + target_1
# 1 69089 70010 + target_2
# 1 367657 368599 + target_3
# LOGIC
#
# The script attempts to partition the intervals in the master list into N subsets
# of consecutive intervals (set by "desired_N"), balanced so that the subsets all
# add up to roughly the same amount of genomic territory. Based on the desired N and
# the "wiggle_factor" value, the script defines a maximum length of territory allowed
# per subset. It then iterates through all intervals, creating subsets and adding
# intervals until the max size is exceeded and a new subset is warranted.
#
# Note that depending on the master intervals, the wiggle factor may need to be tweaked
# in order to achieve exactly N subsets.
# OUTPUT
#
# The script outputs a set of Picard-style interval files in the requested directory
# as well as a JSON stub file that can be used as base list for a WDL's inputs JSON.
import os
import sys
# CLI arguments
master_list_file = sys.argv[1]
desired_N = int(sys.argv[2])
wiggle_factor = int(sys.argv[3])
dir_name = sys.argv[4]
comment = "@CO\t"+sys.argv[5]+"\n"
# Read in the master list file contents:
with open(master_list_file, "r") as master_list:
header_lines = []
intervals_list = []
longest_interval = 0
for line in master_list:
# store the header lines (starting with @) to serve as output stub
if line.startswith("@"):
header_lines.append(line)
else:
line_split = line.split("\t")
length = int(line_split[2])-int(line_split[1])
intervals_list.append((line, length))
# keep track of what is the longest interval
if length > longest_interval:
longest_interval = length
print "Number of intervals: "+str(len(intervals_list))
print "Longest interval was: "+str(longest_interval)
# Determine what is the total territory covered by intervals
total_length = 0
for interval in intervals_list:
total_length = total_length + interval[1]
print "Total length of covered territory: "+str(total_length)
# Determine what should be the theoretical maximum territory per subset
# based on the desired N
max_length_per_subset = total_length / desired_N
print "Theoretical max subset length: "+str(max_length_per_subset)
# Distribute intervals to separate files
interval_count = 0
batch_count = 0
current_batch = []
current_length = 0
length_so_far = 0
batches_list = []
print "Processing..."
def dump_batch(msg):
global batch_count
global current_batch
global current_length
global length_so_far
global interval_count
global batches_list
# increment appropriate counters
batch_count +=1
length_so_far = length_so_far + current_length
# report batch stats
print "\t"+str(batch_count)+". \tBatch of "+str(len(current_batch))+"\t| "+str(current_length)+" \t|"+msg+" \t| "+str(interval_count)+" \t| So far "+str(length_so_far)+" \t| Remains "+str(total_length-length_so_far)
# store batch
batches_list.append(current_batch)
# reset everything
current_batch = []
current_length = 0
for interval in intervals_list:
interval_count +=1
#print interval_count
# Is this new interval above the length limit by itself?
if interval[1] > max_length_per_subset:
dump_batch("close-out")
current_batch.append(interval)
current_length = current_length + interval[1]
dump_batch("godzilla")
# Is this new interval putting us above the length limit when added to the batch?
elif current_length + interval[1] > max_length_per_subset+max_length_per_subset/wiggle_factor:
dump_batch("normal")
current_batch.append(interval)
current_length = current_length + interval[1]
else:
current_batch.append(interval)
current_length = current_length + interval[1]
dump_batch("finalize")
print "Done.\nGrouped intervals into "+str(len(batches_list))+" batches."
# Write batches to files and compose a JSON stub
counter = 0
json_stub = ["{", "\t\"workflow.scattered_calling_intervals\": ["]
os.mkdir(dir_name)
for batch in batches_list:
counter +=1
path = dir_name+"/"+str(counter)+"_of_"+str(len(batches_list))
os.mkdir(path)
with open(path+"/scattered.interval_list", "w") as intervals_file:
# Write out the header copied from the original
for line in header_lines:
intervals_file.write("%s" % line)
# Add a comment to the header
intervals_file.write("%s" % comment)
# Write out the intervals
for interval in batch:
intervals_file.write("%s" % interval[0])
# add the json line
json_stub.append("\t\t\"gs://bucket/dir/"+path+"/scattered.interval_list\",")
json_stub.append("\t]")
json_stub.append("}")
print "Wrote "+str(counter)+" interval files to \""+dir_name+"/n_of_N/scattered.interval_list\""
# Write out the json stub
with open("scattered_intervals.json", "w") as json_file:
for line in json_stub:
json_file.write("%s\n" % line)
print "Wrote a JSON stub to \"scattered_intervals.json\""