Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for mdtag issues with insertions #748

Merged
merged 1 commit into from
Jul 30, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class RichAlignmentRecord(val record: AlignmentRecord) {
}

lazy val samtoolsCigar: Cigar = {
TextCigarCodec.decode(record.getCigar.toString)
TextCigarCodec.decode(record.getCigar)
}

// Returns the MdTag if the read is mapped, None otherwise
Expand Down
103 changes: 64 additions & 39 deletions adam-core/src/main/scala/org/bdgenomics/adam/util/MdTag.scala
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ object MdTag {
var cigarIdx = 0
var mdTagStringOffset = 0
var referencePos = referenceStart
var cigarReferencePosition = referenceStart

var cigarOperatorIndex = 0
var usedMatchingBases = 0
if (mdTagInput == null || mdTagInput == "0") {
new MdTag(referenceStart, List(), Map(), Map())
} else {
Expand All @@ -68,74 +68,99 @@ object MdTag {
val cigarElement = cigar.getCigarElement(cigarIdx)
val nextDigit = digitPattern.findPrefixOf(mdTag.substring(mdTagStringOffset))
(cigarElement.getOperator, nextDigit) match {
case (_, None) if mdTagStringOffset == 0 => throw new IllegalArgumentException(s"MdTag $mdTagInput does not start with a digit")
case (_, Some(matchingBases)) if matchingBases.toInt == 0 => {

case (_, None) if mdTagStringOffset == 0 =>
throw new IllegalArgumentException(s"MdTag $mdTagInput does not start with a digit")

case (_, Some(matchingBases)) if matchingBases.toInt == 0 =>
mdTagStringOffset += 1
}
case (CigarOperator.MATCH_OR_MISMATCH | CigarOperator.M | CigarOperator.EQ, Some(matchingBases)) => {
val numMatchingBases = math.min(cigarElement.getLength, matchingBases.toInt)

case (CigarOperator.M | CigarOperator.EQ, Some(matchingBases)) =>
val numMatchingBases = math.min(cigarElement.getLength - cigarOperatorIndex, matchingBases.toInt - usedMatchingBases)

if (numMatchingBases > 0) {
matches ::= NumericRange(referencePos, referencePos + numMatchingBases, 1L)

// Move the reference position the length of the matching sequence
referencePos += numMatchingBases

// Move ahead in the current CIGAR operator
cigarOperatorIndex += numMatchingBases

// Move ahead in the current MdTag digit
usedMatchingBases += numMatchingBases
}

if (matchingBases.toInt == usedMatchingBases) {
mdTagStringOffset += matchingBases.length
usedMatchingBases = 0
}
// Move the reference position the length of the matching sequence
referencePos += numMatchingBases.toInt
if (matchingBases.toInt <= cigarElement.getLength) mdTagStringOffset += matchingBases.size

// If the M operator has been fully read move on to the next operator
if (referencePos >= cigarReferencePosition + cigarElement.getLength) {
if (cigarOperatorIndex == cigarElement.getLength) {
cigarIdx += 1
cigarReferencePosition += cigarElement.getLength
cigarOperatorIndex = 0
}
}
case (CigarOperator.MATCH_OR_MISMATCH | CigarOperator.M | CigarOperator.X, None) => {

case (CigarOperator.M | CigarOperator.X, None) =>
basesPattern.findPrefixOf(mdTag.substring(mdTagStringOffset)) match {
// Must have found a base or digit pattern in CigarOperator M
case None => throw new IllegalArgumentException(s"No match or mismatched bases found for ${cigar.toString} in MDTag $mdTag")
case Some(mismatchedBases) => {
case None =>
throw new IllegalArgumentException(
s"No match or mismatched bases found for ${cigar.toString} in MDTag $mdTag"
)
case Some(mismatchedBases) =>
mismatchedBases.foreach {
base =>
mismatches += (referencePos -> base)
referencePos += 1
}
mdTagStringOffset += mismatchedBases.size
}
mdTagStringOffset += mismatchedBases.length
cigarOperatorIndex += mismatchedBases.length
}
// If the M operator has been fully read move on to the next operator
if (referencePos >= cigarReferencePosition + cigarElement.getLength) {
if (cigarOperatorIndex == cigarElement.getLength) {
cigarIdx += 1
cigarReferencePosition += cigarElement.getLength
cigarOperatorIndex = 0
}
}
case (CigarOperator.DELETION, None) => {

case (CigarOperator.DELETION, None) =>
mdTag.charAt(mdTagStringOffset) match {
case '^' => {
case '^' =>
// Skip ahead of the deletion '^' character
mdTagStringOffset += 1
basesPattern.findPrefixOf(mdTag.substring(mdTagStringOffset)) match {
case None => throw new IllegalArgumentException(s"No deleted bases found ${cigar.toString} in MDTag $mdTag")
case Some(deletedBases) => {
case None =>
throw new IllegalArgumentException(s"No deleted bases found ${cigar.toString} in MDTag $mdTag")
case Some(deletedBases) if deletedBases.length == cigarElement.getLength =>
deletedBases.foreach {
base =>
deletions += (referencePos -> base)
referencePos += 1
}
mdTagStringOffset += deletedBases.size
}
mdTagStringOffset += deletedBases.length
case Some(deletedBases) =>
throw new IllegalArgumentException(
s"Element ${cigarElement.getLength}${cigarElement.getOperator.toString} in cigar ${cigar.toString} contradicts number of bases listed in MDTag: ^${deletedBases}"
)
}
cigarReferencePosition += cigarElement.getLength
cigarIdx += 1
}
case _ => throw new IllegalArgumentException(s"CIGAR ${cigar.toString} indicates deletion found but no deleted bases in MDTag $mdTagInput")
cigarOperatorIndex = 0

case _ =>
throw new IllegalArgumentException(
s"CIGAR ${cigar.toString} indicates deletion found but no deleted bases in MDTag $mdTagInput"
)
}
}
case _ if cigarElement.getOperator.consumesReferenceBases() => {

case (CigarOperator.N, _) =>
referencePos += cigarElement.getLength
cigarReferencePosition += cigarElement.getLength
cigarIdx += 1
}
case _ => {
cigarOperatorIndex = 0

case (CigarOperator.INSERTION | CigarOperator.H | CigarOperator.S, _) =>
cigarIdx += 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the cigarOperatorIndex = 0 not necessary here?

mind commenting these last two cases a little bit? I assume the main interesting cigar operator being passed over here is I/insertion, which MDTags don't care about... any other cases being handled here worth noting?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cigarOperatorIndex = 0 is not necessary, it shouldn't be updated, but just added it for clarity.

The last case is insertion or clipped bases

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it doesn't matter whether you set cigarOperatorIndex = 0 here or not? That's also confusing to me, I don't fully have my head around this.

Is the second-to-last case one that you specifically added for Ns? Is that the main focus there?

Could you add comments clarifying which operators you expect to hit each of these last two? At this point in the match I'm struggling to keep in my head what's left.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes the second to last case is for N

}

}
}
new MdTag(referenceStart, matches, mismatches, deletions)
Expand Down Expand Up @@ -400,7 +425,7 @@ class MdTag(
* @return True if this read has mismatches. We do not return true if the read has no mismatches but has deletions.
*/
def hasMismatches: Boolean = {
!mismatches.isEmpty
mismatches.nonEmpty
}

/**
Expand All @@ -419,7 +444,7 @@ class MdTag(
*/
def end(): Long = {
val ends = matches.map(_.end - 1) ::: mismatches.keys.toList ::: deletions.keys.toList
ends.reduce(_ max _)
ends.max
}

/**
Expand Down Expand Up @@ -496,7 +521,7 @@ class MdTag(
* @return MD string corresponding to [0-9]+(([A-Z]|\&#94;[A-Z]+)[0-9]+)
* @see http://zenfractal.com/2013/06/19/playing-with-matches/
*/
override def toString(): String = {
override def toString: String = {
if (matches.isEmpty && mismatches.isEmpty && deletions.isEmpty) {
"0"
} else {
Expand Down
151 changes: 150 additions & 1 deletion adam-core/src/test/scala/org/bdgenomics/adam/util/MdTagSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,138 @@ class MdTagSuite extends FunSuite {
assert(tag.toString === "20")
}

test("Get correct matches for mdtag with insertion") {
val tag = MdTag("10", 0L, TextCigarCodec.decode("5M3I5M"))
assert(tag.end === 9)

(0 until 9).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.toString === "10")
}

test("Get correct matches for mdtag with mismatches and insertion") {
val tag = MdTag("2A7", 0L, TextCigarCodec.decode("5M3I5M"))
assert(tag.end === 9)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2) === 'A')
(3 until 9).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.toString === "2A7")
}

test("Get correct matches for mdtag with insertion between mismatches") {
val tag = MdTag("2A4A2", 0L, TextCigarCodec.decode("5M3I5M"))
assert(tag.end === 9)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')

assert(tag.mismatches(7L) === 'A')
assert(tag.isMatch(8))
assert(tag.isMatch(9))

assert(tag.toString === "2A4A2")
}

test("Get correct matches for mdtag with intron between mismatches") {
val tag = MdTag("2A4A2", 0L, TextCigarCodec.decode("5M3N5M"))
assert(tag.end === 12)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')
assert(tag.isMatch(3))
assert(tag.isMatch(4))

assert(!tag.isMatch(5))
assert(!tag.isMatch(6))
assert(!tag.isMatch(7))

assert(tag.isMatch(8))
assert(tag.isMatch(9))

assert(tag.mismatches(10L) === 'A')
assert(tag.isMatch(11))
assert(tag.isMatch(12))

assert(tag.toString === "2A4A2")
}

test("Get correct matches for mdtag with intron and deletion between mismatches") {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm, copy mcpastington? same test as above, different name?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes! I should was trying to think of more cases, but didn't add the test actually

val tag = MdTag("2A4A0^AAA2", 0L, TextCigarCodec.decode("5M3N3M3D2M"))
assert(tag.end === 15)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')
assert(tag.isMatch(3))
assert(tag.isMatch(4))

assert(!tag.isMatch(5))
assert(!tag.isMatch(6))
assert(!tag.isMatch(7))

assert(tag.isMatch(8))
assert(tag.isMatch(9))

assert(tag.mismatches(10L) === 'A')

assert(tag.deletions(11L) === 'A')
assert(tag.deletions(12L) === 'A')
assert(tag.deletions(13L) === 'A')

assert(tag.toString === "2A4A0^AAA2")
}

test("Throw exception when number of deleted bases in mdtag disagrees with CIGAR") {
intercept[IllegalArgumentException] {
MdTag("2A4A0^AAA2", 0L, TextCigarCodec.decode("5M3N3M4D2M"))
}
}

test("Get correct matches for mdtag with mismatch, insertion and deletion") {
val tag = MdTag("2A3^AAA4", 0L, TextCigarCodec.decode("5M3I1M3D4M"))
assert(tag.end === 12)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')

(3 to 5).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.deletedBase(6L) === Some('A'))
assert(tag.deletedBase(7L) === Some('A'))
assert(tag.deletedBase(8L) === Some('A'))

(9 to 12).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.toString === "2A3^AAA4")
}

test("Get correct matches for mdtag with mismatches, insertion and deletion") {
val tag = MdTag("2A3^AAA2A1", 0L, TextCigarCodec.decode("5M3I1M3D4M"))
assert(tag.end === 12)

assert(tag.isMatch(0))
assert(tag.isMatch(1))
assert(tag.mismatches(2L) === 'A')

(3 to 5).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.deletedBase(6L) === Some('A'))
assert(tag.deletedBase(7L) === Some('A'))
assert(tag.deletedBase(8L) === Some('A'))

(9 to 10).foreach(locus => assert(tag.isMatch(locus)))

assert(tag.mismatches(11L) === 'A')

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: missing a last "match" check on idx 12?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea figured it was repetitive at this point, but I'll add it

assert(tag.toString === "2A3^AAA2A1")
}

test("Get correct matches for MDTag with mismatches and deletions") {

val tag1 = MdTag("40A5^TTT54", 0L, TextCigarCodec.decode("46M3D54M"))
Expand All @@ -237,6 +369,7 @@ class MdTagSuite extends FunSuite {
assert(tag1.isMatch(41) === true)

assert(tag1.mismatchedBase(40) === Some('A'))
assert(tag1.toString === "40A5^TTT54")

val tag2 = MdTag("40A5^TTT0G53", 0L, TextCigarCodec.decode("46M3D54M"))
assert(tag2.hasMismatches === true)
Expand All @@ -249,6 +382,22 @@ class MdTagSuite extends FunSuite {
assert(tag2.mismatchedBase(40) === Some('A'))
assert(tag2.mismatchedBase(49) === Some('G'))
assert(tag2.isMatch(50) === true)
assert(tag2.toString === "40A5^TTT0G53")

val tag3 = MdTag("2^GA5^TC6", 0L, TextCigarCodec.decode("2M2D1M2I2M4I2M2D6M"))
(0 to 1).foreach(l => assert(tag3.isMatch(l)))
(2 to 3).foreach(l => assert(!tag3.isMatch(l)))
(4 to 8).foreach(l => assert(tag3.isMatch(l)))
(9 to 10).foreach(l => assert(!tag3.isMatch(l)))
(11 to 16).foreach(l => assert(tag3.isMatch(l)))

assert(tag3.deletedBase(2) == Some('G'))
assert(tag3.deletedBase(3) == Some('A'))

assert(tag3.deletedBase(9) == Some('T'))
assert(tag3.deletedBase(10) == Some('C'))

assert(tag3.toString === "2^GA5^TC6")
}

test("Get correct matches base from MDTag and CIGAR with N") {
Expand Down Expand Up @@ -453,4 +602,4 @@ class MdTagSuite extends FunSuite {
test("handle '=' and 'X' operators") {
testTag("ACCCAAGT", "ACCATAGA", "3=2X2=1X", 0, "3A0T2A0", 0, 7)
}
}
}