Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADAM-848] TwoBitFile now support nBlocks and maskBlocks #850

Merged
merged 1 commit into from Oct 6, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -0,0 +1,211 @@
/**
* Licensed to Big Data Genomics (BDG) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The BDG licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.models
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, add the license header. (You can do this with ./scripts/format-source).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My favorite script :) Haven't run that guy in a while


/**
* The evaluation of a regionJoin takes place with respect to a complete partition on the total space
* of the genome. NonoverlappingRegions is a class to compute the value of that partition, and to allow
* us to assign one or more elements of that partition to a new ReferenceRegion (see the 'regionsFor' method).
*
* NonoverlappingRegions takes, as input, and 'input-set' of regions. These are arbitrary ReferenceRegions,
* which may be overlapping, identical, disjoint, etc. The input-set of regions _must_ all be located on
* the same reference chromosome (i.e. must all have the same refName); the generalization to reference
* regions from multiple chromosomes is in MultiContigNonoverlappingRegions, below.
*
* NonoverlappingRegions produces, internally, a 'nonoverlapping-set' of regions. This is basically
* the set of _distinct unions_ of the input-set regions.
*
* @param regions The input-set of regions.
*/
class NonoverlappingRegions(regions: Iterable[ReferenceRegion]) extends Serializable {

assert(regions != null, "regions parameter cannot be null")

// The regions Seq needs to be of non-zero size, since otherwise we have to add special-case
// checks to all the methods below to make sure that 'endpoints' isn't empty. Also, it shouldn't
// make any sense to have a set of non-overlapping regions for ... no regions.
// Also, without this check, we can't tell which chromosome this NonoverlappingRegions object is for.
assert(regions.size > 0, "regions list must be non-empty")
assert(regions.head != null, "regions must have at least one non-null entry")

val referenceName: String = regions.head.referenceName

// invariant: all the values in the 'regions' list have the same referenceId
assert(regions.forall(_.referenceName == referenceName))

// We represent the distinct unions, the 'nonoverlapping-set' of regions, as a set of endpoints,
// so that we can do reasonably-fast binary searching on them to determine the slice of nonoverlapping-set
// regions that are overlapped by a new, query region (see findOverlappingRegions, below).
val endpoints: Array[Long] =
mergeRegions(regions.toSeq.sortBy(r => r.start)).flatMap(r => Seq(r.start, r.end)).distinct.sorted.toArray

private def updateListWithRegion(list: List[ReferenceRegion], newRegion: ReferenceRegion): List[ReferenceRegion] = {
list match {
case head :: tail =>

// using overlaps || isAdjacent is an important feature! it means that
// we can use the "alternating" optimization, described below, which reduces
// the number of regions returned as keys (by a factor of 2) and therefore
// the number of partitions that need to be examined during a regionJoin.
if (head.overlaps(newRegion) || head.isAdjacent(newRegion)) {
head.hull(newRegion) :: tail
} else {
newRegion :: list
}
case _ => List(newRegion)
}
}

def mergeRegions(regs: Seq[(ReferenceRegion)]): List[ReferenceRegion] =
regs.aggregate(List[ReferenceRegion]())(
(lst: List[ReferenceRegion], p: (ReferenceRegion)) => updateListWithRegion(lst, p),
(a, b) => a ++ b)

def binaryPointSearch(pos: Long, lessThan: Boolean): Int = {
var i = 0
var j = endpoints.size - 1

while (j - i > 1) {
val ij2 = (i + j) / 2
val mid = endpoints(ij2)
if (mid < pos) {
i = ij2
} else {
j = ij2
}
}

if (lessThan) i else j
}

def findOverlappingRegions(query: ReferenceRegion): Seq[ReferenceRegion] = {

assert(query != null, "query region was null")
assert(endpoints != null, "endpoints field was null")

if (query.end <= endpoints.head || query.start >= endpoints.last) {
Seq()
} else {
val firsti = binaryPointSearch(query.start, lessThan = true)
val lasti = binaryPointSearch(query.end, lessThan = false)

// Slice is an inclusive start, exclusive end operation
val firstRegionIsHit = firsti % 2 == 0

val startSlice = endpoints.slice(firsti, lasti)
val endSlice = endpoints.slice(firsti + 1, lasti + 1)

/*
* The use of NonoverlappingRegions.alternating is an important optimization --
* basically, because we used "overlaps || isAdjacent" as the predicate for
* when to join two regions in the input-set into a single nonoverlapping-region, above,
* then we know that the set of nonoverlapping-regions defined by the points in 'endpoints'
* are "alternating." In other words, we know that each nonoverlapping-region
* defined by an overlapping set of input-regions (in the constructor) is followed by
* a nonoverlapping-region which had _no_ input-set regions within it, and vice-versa.
*
* And _this_ is important because it means that, here, we only need to return
* _half_ the regions we otherwise would have -- because this is being used for a
* regionJoin, we don't need to return the 'empty' regions since we know a priori
* that these don't correspond to any region the input-set, and therefore
* will never result in any pairs in the ultimate join result.
*/
NonoverlappingRegions.alternating(startSlice.zip(endSlice).map {
case (start, end) =>
ReferenceRegion(referenceName, start, end)
}.toSeq, firstRegionIsHit)
}
}

/**
* Given a "regionable" value (corresponds to a ReferencRegion through an implicit ReferenceMapping),
* return the set of nonoverlapping-regions to be used as partitions for the input value in a
* region-join. Basically, return the set of any non-empty nonoverlapping-regions that overlap the
* region corresponding to this input.
*
* @param regionable The input, which corresponds to a region
* @tparam U The type of the input
* @return An Iterable[ReferenceRegion], where each element of the Iterable is a nonoverlapping-region
* defined by 1 or more input-set regions.
*/
def regionsFor[U](regionable: (ReferenceRegion, U)): Iterable[ReferenceRegion] =
findOverlappingRegions(regionable._1)

/**
* A quick filter, to find out if we even need to examine a particular input value for keying by
* nonoverlapping-regions. Basically, reject the input value if its corresponding region is
* completely outside the hull of all the input-set regions.
*
* @param regionable The input value
* @tparam U
* @return a boolean -- the input value should only participate in the regionJoin if the return value
* here is 'true'.
*/
def hasRegionsFor[U](regionable: (ReferenceRegion, U)): Boolean = {
!(regionable._1.end <= endpoints.head || regionable._1.start >= endpoints.last)
}

override def toString: String =
"%s:%d-%d (%s)".format(referenceName, endpoints.head, endpoints.last, endpoints.mkString(","))
}

object NonoverlappingRegions {
def apply[T](values: Seq[(ReferenceRegion, T)]) =
new NonoverlappingRegions(values.map(_._1))

def alternating[T](seq: Seq[T], includeFirst: Boolean): Seq[T] = {
val inds = if (includeFirst) { 0 until seq.size } else { 1 until seq.size + 1 }
seq.zip(inds).filter(p => p._2 % 2 == 0).map(_._1)
}
}

/**
* Creates a multi-reference-region collection of NonoverlappingRegions -- see
* the scaladocs to NonoverlappingRegions.
*
* @param regions A Seq of ReferencRegions, pre-partitioned by their
* referenceNames. So, for a given pair (x, regs) in
* this Seq, all regions R in regs must satisfy
* R.referenceName == x. Furthermore, all the x's must
* be valid reference names with respect to the sequence
* dictionary.
*/
class MultiContigNonoverlappingRegions(regions: Seq[(String, Iterable[ReferenceRegion])]) extends Serializable {
assert(regions != null,
"Regions was set to null")

val regionMap: Map[String, NonoverlappingRegions] =
Map(regions.map(r => (r._1, new NonoverlappingRegions(r._2))): _*)

def regionsFor[U](regionable: (ReferenceRegion, U)): Iterable[ReferenceRegion] =
regionMap.get(regionable._1.referenceName).fold(Iterable[ReferenceRegion]())(_.regionsFor(regionable))

def filter[U](value: (ReferenceRegion, U)): Boolean =
regionMap.get(value._1.referenceName).fold(false)(_.hasRegionsFor(value))
}

object MultiContigNonoverlappingRegions {
def apply[T](values: Seq[(ReferenceRegion, T)]): MultiContigNonoverlappingRegions = {
new MultiContigNonoverlappingRegions(
values.map(kv => (kv._1.referenceName, kv._1))
.groupBy(t => t._1)
.map(t => (t._1, t._2.map(k => k._2)))
.toSeq)
}
}