Permalink
Browse files

[ADAM-1502] Preserve contig ordering in TwoBitFile sequence dictionary.

Resolves #1502.
  • Loading branch information...
fnothaft authored and heuermh committed Apr 29, 2017
1 parent dbe5c97 commit 36d8e0b2a739629314da4c7f03487b8f228a6ec4
@@ -63,17 +63,20 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
private[util] val numSeq = readHeader()
// hold current byte position of start of current index record
var indexRecordStart = TwoBitFile.FILE_INDEX_OFFSET
private val seqRecordStarts = (0 until numSeq).map(i => {
val tup = readIndexEntry(indexRecordStart)
indexRecordStart += TwoBitFile.NAME_SIZE_SIZE + tup._1.length + TwoBitFile.OFFSET_SIZE
tup
}).toMap
private[util] val seqRecords = seqRecordStarts.map(tup => tup._1 -> TwoBitRecord(bytes, tup._1, tup._2))
private[util] val seqRecords = (0 until numSeq).map(idx => {
val (name, startIdx) = readIndexEntry(indexRecordStart)
indexRecordStart += TwoBitFile.NAME_SIZE_SIZE + name.length + TwoBitFile.OFFSET_SIZE
(name, TwoBitRecord(bytes, name, startIdx, idx))
})
private val seqRecordsMap = seqRecords.toMap
/**
* The sequence dictionary corresponding to the contigs in this two bit file.
*/
val sequences = new SequenceDictionary(seqRecords.toVector.map(r => SequenceRecord(r._1, r._2.dnaSize)))
val sequences = new SequenceDictionary(seqRecords.toVector
.map(r => SequenceRecord(r._1,
r._2.dnaSize,
referenceIndex = Some(r._2.seqIdx))))
private def readHeader(): Int = {
// figure out proper byte order
@@ -107,10 +110,10 @@ class TwoBitFile(byteAccess: ByteAccess) extends ReferenceFile {
*/
def extract(region: ReferenceRegion, mask: Boolean): String = {
val record =
seqRecords.getOrElse(
seqRecordsMap.getOrElse(
region.referenceName,
throw new Exception(
s"Contig ${region.referenceName} not found in reference map with keys: ${seqRecords.keys.toList.sortBy(x => x).mkString(", ")}"
s"Contig ${region.referenceName} not found in reference map with keys: ${seqRecordsMap.keys.toList.sortBy(x => x).mkString(", ")}"
)
)
val contigLength = record.dnaSize
@@ -203,7 +206,7 @@ class TwoBitFileSerializer extends Serializer[TwoBitFile] {
}
private object TwoBitRecord {
def apply(twoBitBytes: ByteBuffer, name: String, seqRecordStart: Int): TwoBitRecord = {
def apply(twoBitBytes: ByteBuffer, name: String, seqRecordStart: Int, seqIdx: Int): TwoBitRecord = {
val dnaSize = twoBitBytes.getInt(seqRecordStart)
val nBlockCount = twoBitBytes.getInt(seqRecordStart + TwoBitFile.DNA_SIZE_SIZE)
val nBlockArraysOffset = seqRecordStart + TwoBitFile.DNA_SIZE_SIZE + TwoBitFile.BLOCK_COUNT_SIZE
@@ -222,12 +225,13 @@ private object TwoBitRecord {
ReferenceRegion(name, maskBlockStart, maskBlockStart + maskBlockSize) -> None
})))
val dnaOffset = maskBlockArraysOffset + (maskBlockCount * TwoBitFile.PER_BLOCK_SIZE) + TwoBitFile.SEQ_RECORD_RESERVED_SIZE
TwoBitRecord(dnaSize, nBlocks, maskBlocks, dnaOffset)
TwoBitRecord(dnaSize, nBlocks, maskBlocks, dnaOffset, seqIdx)
}
}
private case class TwoBitRecord(dnaSize: Int,
nBlocks: Option[NonoverlappingRegions],
maskBlocks: Option[NonoverlappingRegions],
dnaOffset: Int) {
dnaOffset: Int,
seqIdx: Int) {
}
@@ -27,7 +27,7 @@ class TwoBitFileSuite extends ADAMFunSuite {
val byteAccess = new LocalFileByteAccess(file)
val twoBitFile = new TwoBitFile(byteAccess)
assert(twoBitFile.numSeq == 1)
assert(twoBitFile.seqRecords.toSeq.length == 1)
assert(twoBitFile.seqRecords.size == 1)
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 0, 10)) == "GATCACAGGT")
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 503, 513)) == "CATCCTACCC")
assert(twoBitFile.extract(ReferenceRegion("hg19_chrM", 16561, 16571)) == "CATCACGATG")
@@ -55,5 +55,7 @@ class TwoBitFileSuite extends ADAMFunSuite {
val dict = twoBitFile.sequences
assert(dict.records.length == 1)
assert(dict.records.head.length == 16571)
assert(dict.records.head.referenceIndex.isDefined)
assert(dict.records.head.referenceIndex.get === 0)
}
}

0 comments on commit 36d8e0b

Please sign in to comment.