Skip to content

Commit

Permalink
feat: speed up 2bit file extract
Browse files: browse the repository at this point in the history
On a single node with 20 workers, BaseRecalibration time is reduced from 5.8 min to 1.7 min.
  • Loading branch information
Blaok authored and fnothaft committed Mar 14, 2017
1 parent 07c1982 commit 1eed8e8
Showing 1 changed file with 30 additions and 19 deletions.
49 changes: 30 additions & 19 deletions adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala
Expand Up @@ -119,28 +119,32 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
val offset = record.dnaOffset
val sb = StringBuilder.newBuilder

// define predicate for N blocks
val isNBlock = if (record.nBlocks.forall(!_.hasRegionsFor(region -> None))) {
// our region has no overlap with an N block, so the predicate is trivial
pos: Long => false
val nBlocks: Array[Long] = if (record.nBlocks.isEmpty) {
Array(-1L)
} else {
// our region does have some kind of overlap with N blocks, so we need to check each position
pos: Long => record.nBlocks.get.findOverlappingRegions(ReferencePosition(region.referenceName, pos)).nonEmpty
record.nBlocks.get.endpoints ++ Array(-1L)
}

// define predicate for mask blocks
val isMaskBlock = if (record.maskBlocks.forall(!_.hasRegionsFor(region -> None))) {
// our region has no overlap with a mask block, so the predicate is trivial
pos: Long => false
val maskBlocks: Array[Long] = if (record.maskBlocks.isEmpty) {
Array(-1L)
} else {
// our region does have some kind of overlap with mask blocks, so we need to check each position
pos: Long => record.maskBlocks.get.findOverlappingRegions(ReferencePosition(region.referenceName, pos)).nonEmpty
record.maskBlocks.get.endpoints ++ Array(-1L)
}

// iterate over every position in the query region
(0 until region.width.toInt).foreach(i => {
// check whether we're in an N block
val nt = if (isNBlock(region.start + i)) {
var currentNBlock = 0
var currentMaskBlock = 0
while (nBlocks(currentNBlock) != -1 && region.start.toInt >= nBlocks(currentNBlock + 1)) {
currentNBlock += 2
}
while (maskBlocks(currentMaskBlock) != -1 && region.start.toInt >= maskBlocks(currentMaskBlock + 1)) {
currentMaskBlock += 2
}

for (i <- 0 until region.width.toInt) {
// we step into an N block
val nt = if (nBlocks(currentNBlock) != -1 && region.start.toInt + i >= nBlocks(currentNBlock)) {
if (region.start.toInt + i + 1 == nBlocks(currentNBlock + 1)) {
currentNBlock += 2
}
'N'
} else {
// TODO: this redundantly reads the byte at a given offset
Expand All @@ -160,9 +164,16 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
}
}
// if nt is masked then make it lower case
val maskedNt = if (mask && isMaskBlock(region.start + i)) nt.toLower else nt
val maskedNt = if (mask && maskBlocks(currentMaskBlock) != -1 && region.start.toInt + i >= maskBlocks(currentMaskBlock)) {
if (region.start.toInt + i + 1 == maskBlocks(currentMaskBlock + 1)) {
currentMaskBlock += 2
}
nt.toLower
} else {
nt
}
sb += maskedNt
})
}
sb.toString()
}

Expand Down

0 comments on commit 1eed8e8

Please sign in to comment.