Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADAM-578] Update end of read when trimming. #579

Merged
merged 1 commit into from Feb 11, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -252,7 +252,7 @@ private[correction] class TrimReads extends Serializable {
* @param startPos Start position of the alignment.
* @return Returns a tuple containing the (updated cigar, updated alignment start position).
*/
def trimCigar(cigar: String, trimStart: Int, trimEnd: Int, startPos: Long): (String, Long) = TrimCigar.time {
def trimCigar(cigar: String, trimStart: Int, trimEnd: Int, startPos: Long, endPos: Long): (String, Long, Long) = TrimCigar.time {
@tailrec def trimFront(c: String, trim: Int, start: Long): (String, Long) = {
if (trim <= 0) {
(c, start)
Expand Down Expand Up @@ -289,32 +289,43 @@ private[correction] class TrimReads extends Serializable {
}
}

def trimBack(c: String, trim: Int): String = {
@tailrec def trimBackHelper(ch: String, t: Int): String = {
def trimBack(c: String, trim: Int, end: Long): (String, Long) = {
@tailrec def trimBackHelper(ch: String, t: Int, e: Long): (String, Long) = {
if (t <= 0) {
ch.reverse
(ch.reverse, e)
} else {
val count = ch.drop(1).takeWhile(_.isDigit).reverse.toInt
val operator = ch.head
val withoutOp = ch.drop(1).dropWhile(_.isDigit)

val (cNew, tNew) = if (operator == 'D' || operator == 'N') {
def eNext(): Long = {
if (operator == 'M' ||
operator == '=' ||
operator == 'X') {
// if we are trimming into an alignment match, we must update the start position
e - 1
} else {
e
}
}

val (cNew, tNew, eNew) = if (operator == 'D' || operator == 'N') {
// must trim all the way through a reference deletion or skip
(withoutOp, t)
(withoutOp, t, e - count)
} else if (count == 1) {
(withoutOp, t - 1)
(withoutOp, t - 1, eNext())
} else {
(operator.toString + (count - 1).toString.reverse + withoutOp, t - 1)
(operator.toString + (count - 1).toString.reverse + withoutOp, t - 1, eNext())
}

trimBackHelper(cNew, tNew)
trimBackHelper(cNew, tNew, eNew)
}
}

if (trim <= 0) {
c
(c, end)
} else {
trimBackHelper(c.reverse, trim)
trimBackHelper(c.reverse, trim, end)
}
}

Expand All @@ -331,8 +342,9 @@ private[correction] class TrimReads extends Serializable {
}

val (cigarFrontTrimmed, newStart) = trimFront(cigar, trimStart, startPos)
val (cigarBackTrimmed, newEnd) = trimBack(cigarFrontTrimmed, trimEnd, endPos)

(trimPrefix + trimBack(cigarFrontTrimmed, trimEnd) + trimSuffix, newStart)
(trimPrefix + cigarBackTrimmed + trimSuffix, newStart, newEnd)
}

/**
Expand All @@ -358,11 +370,12 @@ private[correction] class TrimReads extends Serializable {

// clean up cigar and read start position
Option(read.getCigar).filter(_ != "*")
.map(trimCigar(_, trimStart, trimEnd, read.getStart))
.map(trimCigar(_, trimStart, trimEnd, read.getStart, read.getEnd))
.foreach(p => {
val (c, s) = p
val (c, s, e) = p
builder.setCigar(c)
.setStart(s)
.setEnd(e)

// also, clean up md tag
Option(read.getMismatchingPositions).map(trimMdTag(_, trimStart, trimEnd))
Expand Down
Expand Up @@ -45,31 +45,31 @@ class TrimReadsSuite extends ADAMFunSuite {

test("trim a few cigars that just have clipping and matches") {
// trim a single element incompletely
assert(ec.trimCigar("2S10M", 1, 0, 0L) === ("1H1S10M", 0L))
assert(ec.trimCigar("10M3S", 0, 2, 0L) === ("10M1S2H", 0L))
assert(ec.trimCigar("2S10M3S", 1, 2, 0L) === ("1H1S10M1S2H", 0L))
assert(ec.trimCigar("2S10M", 1, 0, 0L, 10L) === ("1H1S10M", 0L, 10L))
assert(ec.trimCigar("10M3S", 0, 2, 0L, 10L) === ("10M1S2H", 0L, 10L))
assert(ec.trimCigar("2S10M3S", 1, 2, 0L, 10L) === ("1H1S10M1S2H", 0L, 10L))

// trim a single element completely
assert(ec.trimCigar("2S10M", 2, 0, 0L) === ("2H10M", 0L))
assert(ec.trimCigar("10M3S", 0, 3, 0L) === ("10M3H", 0L))
assert(ec.trimCigar("2S10M3S", 2, 3, 0L) === ("2H10M3H", 0L))
assert(ec.trimCigar("2S10M", 2, 0, 0L, 10L) === ("2H10M", 0L, 10L))
assert(ec.trimCigar("10M3S", 0, 3, 0L, 10L) === ("10M3H", 0L, 10L))
assert(ec.trimCigar("2S10M3S", 2, 3, 0L, 10L) === ("2H10M3H", 0L, 10L))

// trim into multiple elements
assert(ec.trimCigar("2S10M", 3, 0, 0L) === ("3H9M", 1L))
assert(ec.trimCigar("10M3S", 0, 4, 0L) === ("9M4H", 0L))
assert(ec.trimCigar("2S10M3S", 3, 4, 0L) === ("3H8M4H", 1L))
assert(ec.trimCigar("2S10M", 3, 0, 0L, 10L) === ("3H9M", 1L, 10L))
assert(ec.trimCigar("10M3S", 0, 4, 0L, 10L) === ("9M4H", 0L, 9L))
assert(ec.trimCigar("2S10M3S", 3, 4, 0L, 10L) === ("3H8M4H", 1L, 9L))
}

test("trim cigars with indels") {
// trim through a deletion
assert(ec.trimCigar("2S2M2D4M", 5, 0, 0L) === ("5H3M", 5L))
assert(ec.trimCigar("4M1D1M", 0, 3, 0L) === ("2M3H", 0L))
assert(ec.trimCigar("2S2M2N4M", 5, 0, 0L) === ("5H3M", 5L))
assert(ec.trimCigar("4M1N1M", 0, 3, 0L) === ("2M3H", 0L))
assert(ec.trimCigar("2S2M2D4M", 5, 0, 0L, 8L) === ("5H3M", 5L, 8L))
assert(ec.trimCigar("4M1D1M", 0, 3, 0L, 6L) === ("2M3H", 0L, 2L))
assert(ec.trimCigar("2S2M2N4M", 5, 0, 0L, 8L) === ("5H3M", 5L, 8L))
assert(ec.trimCigar("4M1N1M", 0, 3, 0L, 6L) === ("2M3H", 0L, 2L))

// trim into an insert
assert(ec.trimCigar("2M2I10M", 3, 0, 0L) === ("3H1I10M", 2L))
assert(ec.trimCigar("10M3I1M", 0, 3, 0L) === ("10M1I3H", 0L))
assert(ec.trimCigar("2M2I10M", 3, 0, 0L, 12L) === ("3H1I10M", 2L, 12L))
assert(ec.trimCigar("10M3I1M", 0, 3, 0L, 11L) === ("10M1I3H", 0L, 10L))
}

test("trim a few reads") {
Expand All @@ -89,13 +89,31 @@ class TrimReadsSuite extends ADAMFunSuite {
.setQual("##/9:::::::::##")
.setCigar("2S11M2S")
.setStart(5L)
.setEnd(16L)
.build
val trimmedRead2 = ec.trimRead(read2, 2, 2)

assert(trimmedRead2.getSequence.toString === "TCGCCCACTCA")
assert(trimmedRead2.getQual.toString === "/9:::::::::")
assert(trimmedRead2.getStart === 5L)
assert(trimmedRead2.getEnd === 16L)
assert(trimmedRead2.getCigar.toString === "2H11M2H")

// trim from both ends, read with cigar
val read3 = AlignmentRecord.newBuilder
.setSequence("ACTCGCCCACTCAAA")
.setQual("##/9:::::::::##")
.setCigar("15M")
.setStart(5L)
.setEnd(20L)
.build
val trimmedRead3 = ec.trimRead(read3, 4, 3)

assert(trimmedRead3.getSequence.toString === "GCCCACTC")
assert(trimmedRead3.getQual.toString === "::::::::")
assert(trimmedRead3.getStart === 9L)
assert(trimmedRead3.getEnd === 17L)
assert(trimmedRead3.getCigar.toString === "4H8M3H")
}

sparkTest("correctly trim an RDD of reads") {
Expand Down