From 1eed8e8e464f8f92a6e87afc1d334e751423e810 Mon Sep 17 00:00:00 2001 From: Blaok Date: Tue, 7 Mar 2017 11:06:21 -0800 Subject: [PATCH] feat: speed up 2bit file extract On a single node with 20 workers, BaseRecalibration time is reduced from 5.8 min to 1.7 min. --- .../org/bdgenomics/adam/util/TwoBitFile.scala | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala b/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala index b4fb0f371c..782328adda 100644 --- a/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala +++ b/adam-core/src/main/scala/org/bdgenomics/adam/util/TwoBitFile.scala @@ -119,28 +119,32 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile { val offset = record.dnaOffset val sb = StringBuilder.newBuilder - // define predicate for N blocks - val isNBlock = if (record.nBlocks.forall(!_.hasRegionsFor(region -> None))) { - // our region has no overlap with an N block, so the predicate is trivial - pos: Long => false + val nBlocks: Array[Long] = if (record.nBlocks.isEmpty) { + Array(-1L) } else { - // our region does have some kind of overlap with N blocks, so we need to check each position - pos: Long => record.nBlocks.get.findOverlappingRegions(ReferencePosition(region.referenceName, pos)).nonEmpty + record.nBlocks.get.endpoints ++ Array(-1L) } - - // define predicate for mask blocks - val isMaskBlock = if (record.maskBlocks.forall(!_.hasRegionsFor(region -> None))) { - // our region has no overlap with a mask block, so the predicate is trivial - pos: Long => false + val maskBlocks: Array[Long] = if (record.maskBlocks.isEmpty) { + Array(-1L) } else { - // our region does have some kind of overlap with mask blocks, so we need to check each position - pos: Long => record.maskBlocks.get.findOverlappingRegions(ReferencePosition(region.referenceName, pos)).nonEmpty + record.maskBlocks.get.endpoints ++ Array(-1L) } - // iterate over every position in the query region - (0 until region.width.toInt).foreach(i => { - // check whether we're in an N block - val nt = if (isNBlock(region.start + i)) { + var currentNBlock = 0 + var currentMaskBlock = 0 + while (nBlocks(currentNBlock) != -1 && region.start.toInt >= nBlocks(currentNBlock + 1)) { + currentNBlock += 2 + } + while (maskBlocks(currentMaskBlock) != -1 && region.start.toInt >= maskBlocks(currentMaskBlock + 1)) { + currentMaskBlock += 2 + } + + for (i <- 0 until region.width.toInt) { + // we step into an N block + val nt = if (nBlocks(currentNBlock) != -1 && region.start.toInt + i >= nBlocks(currentNBlock)) { + if (region.start.toInt + i + 1 == nBlocks(currentNBlock + 1)) { + currentNBlock += 2 + } 'N' } else { // TODO: this redundantly reads the byte at a given offset @@ -160,9 +164,16 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile { } } // if nt is masked then make it lower case - val maskedNt = if (mask && isMaskBlock(region.start + i)) nt.toLower else nt + val maskedNt = if (mask && maskBlocks(currentMaskBlock) != -1 && region.start.toInt + i >= maskBlocks(currentMaskBlock)) { + if (region.start.toInt + i + 1 == maskBlocks(currentMaskBlock + 1)) { + currentMaskBlock += 2 + } + nt.toLower + } else { + nt + } sb += maskedNt - }) + } sb.toString() }