Skip to content

Commit

Permalink
[AVOCADO-243] Score unknown alleles seperately from other-alts.
Browse files Browse the repository at this point in the history
Resolves #243.
  • Loading branch information
fnothaft committed Jun 3, 2017
1 parent ce3a038 commit edbbfca
Show file tree
Hide file tree
Showing 8 changed files with 320 additions and 36 deletions.
Expand Up @@ -240,7 +240,22 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
}
})

obsMap
// check for overlapping other-alts
obsMap.map(p => {
val (dv, observed) = p
if (observed.isNonRef) {
val overlappingObs = obsMap.filter(kv => kv._1.overlaps(dv))
.map(_._2)

if (overlappingObs.exists(o => !o.isRef && !o.isOther && !o.isNonRef)) {
(dv, observed.otherAlt)
} else {
p
}
} else {
p
}
})
}
} catch {
case t: Throwable => ProcessException.time {
Expand Down Expand Up @@ -296,7 +311,8 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
observationsDf("_2.forwardStrand").as("forwardStrand"),
observationsDf("_2.optQuality").as("optQuality"),
observationsDf("_2.mapQ").as("mapQ"),
observationsDf("_2.isOther").as("isOther"))
observationsDf("_2.isOther").as("isOther"),
observationsDf("_2.isNonRef").as("isNonRef"))
val flatObservationsDf = observationsDf.select(flatFields: _*)

// create scored table and prepare for join
Expand All @@ -307,6 +323,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
val joinedObservationsDf = scoredDf.join(flatObservationsDf,
Seq("isRef",
"isOther",
"isNonRef",
"forwardStrand",
"optQuality",
"mapQ"))
Expand All @@ -324,6 +341,9 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
}).toSeq ++ (0 to ploidy).map(i => {
val field = "otherLogLikelihoods%d".format(i)
sum(field).as(field)
}).toSeq ++ (0 to ploidy).map(i => {
val field = "nonRefLogLikelihoods%d".format(i)
sum(field).as(field)
}).toSeq ++ Seq(
sum("alleleCoverage").as("alleleCoverage"),
sum("otherCoverage").as("otherCoverage"),
Expand Down Expand Up @@ -362,6 +382,11 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
aggregatedObservationsDf("otherLogLikelihoods%d".format(i))
})
array(fields: _*).as("otherLogLikelihoods")
}, {
val fields = (0 to ploidy).map(i => {
aggregatedObservationsDf("nonRefLogLikelihoods%d".format(i))
})
array(fields: _*).as("nonRefLogLikelihoods")
},
aggregatedObservationsDf("alleleCoverage")
.cast(IntegerType)
Expand Down Expand Up @@ -555,7 +580,7 @@ private[avocado] object BiallelicGenotyper extends Serializable with Logging {
}
}
mergeArrays(obs.alleleLogLikelihoods, obs.referenceLogLikelihoods, gl)
mergeArrays(obs.otherLogLikelihoods, obs.referenceLogLikelihoods, ol)
mergeArrays(obs.nonRefLogLikelihoods, obs.referenceLogLikelihoods, ol)

Genotype.newBuilder()
.setVariant(v)
Expand Down
Expand Up @@ -51,16 +51,22 @@ case class DiscoveredVariant(
referenceAllele: String,
alternateAllele: String) {

lazy val end: Int = start + referenceAllele.length

/**
* @return Returns an avro representation of this variant.
*/
def toVariant: Variant = {
Variant.newBuilder
.setContigName(contigName)
.setStart(start.toLong)
.setEnd((start + referenceAllele.length).toLong)
.setEnd(end.toLong)
.setReferenceAllele(referenceAllele)
.setAlternateAllele(alternateAllele)
.build
}

def overlaps(v: DiscoveredVariant): Boolean = {
contigName == v.contigName && start < v.end && end > v.start
}
}
Expand Up @@ -35,6 +35,7 @@ private[genotyping] object ScoredObservation extends Serializable {
*
* @param isRef Is this the reference allele?
* @param isOther Is this an other-alt allele?
* @param isNonRef Is this not a known allele?
* @param forwardStrand Was this on the forward strand?
* @param optQuality What is the quality of this observation, if defined?
* @param mapQ What is the mapping quality of the read this is from?
Expand All @@ -43,29 +44,33 @@ private[genotyping] object ScoredObservation extends Serializable {
*/
def apply(isRef: Boolean,
isOther: Boolean,
isNonRef: Boolean,
forwardStrand: Boolean,
optQuality: Option[Int],
mapQ: Int,
ploidy: Int): ScoredObservation = {
val mapSuccessProbability = PhredUtils.phredToSuccessProbability(mapQ)
val (altLikelihoods, nonRefLikelihoods) = Observer.likelihoods(
val (altLikelihoods, _) = Observer.likelihoods(
ploidy,
mapSuccessProbability,
optQuality)

val zeros = Array.fill(ploidy + 1)({ 0.0 })
val (referenceLikelihoods, alleleLikelihoods, otherLikelihoods) = if (isOther) {
(zeros, zeros, altLikelihoods)
val (referenceLikelihoods, alleleLikelihoods, otherLikelihoods, nonRefLikelihoods) = if (isOther) {
(zeros, zeros, altLikelihoods, zeros)
} else if (isNonRef) {
(zeros, zeros, zeros, altLikelihoods)
} else {
if (isRef) {
(altLikelihoods, zeros, zeros)
(altLikelihoods, zeros, zeros, zeros)
} else {
(zeros, altLikelihoods, zeros)
(zeros, altLikelihoods, zeros, zeros)
}
}

ScoredObservation(isRef,
isOther,
isNonRef,
forwardStrand,
optQuality,
mapQ,
Expand All @@ -75,6 +80,7 @@ private[genotyping] object ScoredObservation extends Serializable {
referenceLikelihoods,
alleleLikelihoods,
otherLikelihoods,
nonRefLikelihoods,
if (!isRef && !isOther) 1 else 0,
if (isRef) 1 else 0,
1)
Expand All @@ -100,28 +106,52 @@ private[genotyping] object ScoredObservation extends Serializable {
(1 to maxQuality).map(q => Some(q))).flatMap(optQ => {
(1 to maxMapQ).flatMap(mq => {
Seq(
ScoredObservation(true, true, true,
ScoredObservation(true, true, true, true,
optQ, mq,
ploidy),
ScoredObservation(false, true, true,
ScoredObservation(false, true, true, true,
optQ, mq,
ploidy),
ScoredObservation(true, false, true,
ScoredObservation(true, false, true, true,
optQ, mq,
ploidy),
ScoredObservation(false, false, true,
ScoredObservation(false, false, true, true,
optQ, mq,
ploidy),
ScoredObservation(true, true, false,
ScoredObservation(true, true, false, true,
optQ, mq,
ploidy),
ScoredObservation(false, true, false,
ScoredObservation(false, true, false, true,
optQ, mq,
ploidy),
ScoredObservation(true, false, false,
ScoredObservation(true, false, false, true,
optQ, mq,
ploidy),
ScoredObservation(false, false, false,
ScoredObservation(false, false, false, true,
optQ, mq,
ploidy),
ScoredObservation(true, true, true, false,
optQ, mq,
ploidy),
ScoredObservation(false, true, true, false,
optQ, mq,
ploidy),
ScoredObservation(true, false, true, false,
optQ, mq,
ploidy),
ScoredObservation(false, false, true, false,
optQ, mq,
ploidy),
ScoredObservation(true, true, false, false,
optQ, mq,
ploidy),
ScoredObservation(false, true, false, false,
optQ, mq,
ploidy),
ScoredObservation(true, false, false, false,
optQ, mq,
ploidy),
ScoredObservation(false, false, false, false,
optQ, mq,
ploidy))
})
Expand All @@ -148,6 +178,7 @@ private[genotyping] object ScoredObservation extends Serializable {
scoreDf.select((
Seq(scoreDf("isRef"),
scoreDf("isOther"),
scoreDf("isNonRef"),
scoreDf("forwardStrand"),
scoreDf("optQuality"),
scoreDf("mapQ"),
Expand All @@ -162,6 +193,9 @@ private[genotyping] object ScoredObservation extends Serializable {
}) ++ (0 to ploidy).map(p => {
scoreDf("otherLogLikelihoods").getItem(p)
.as("otherLogLikelihoods%d".format(p))
}) ++ (0 to ploidy).map(p => {
scoreDf("nonRefLogLikelihoods").getItem(p)
.as("nonRefLogLikelihoods%d".format(p))
}) ++ Seq(scoreDf("alleleCoverage"),
scoreDf("otherCoverage"),
scoreDf("totalCoverage"))): _*)
Expand All @@ -173,6 +207,7 @@ private[genotyping] object ScoredObservation extends Serializable {
*
* @param isRef Is this a reference allele?
* @param isOther Is this an other-alt allele?
* @param isNonRef Is this an unknown allele?
* @param forwardStrand Is this on the forward strand?
* @param optQuality What is the base quality of this observation, if defined?
* @param mapQ What is the mapping quality of this observation?
Expand All @@ -187,6 +222,8 @@ private[genotyping] object ScoredObservation extends Serializable {
* allele were observed.
* @param otherLogLikelihoods The log likelihoods that 0...n copies of another
* allele were observed.
* @param nonRefLogLikelihoods The log likelihoods that 0...n copies of an
* unknown allele were observed.
* @param alleleCoverage The total number of reads observed that cover the
* site and match the allele.
* @param otherCoverage The total number of reads observed that cover the site
Expand All @@ -196,6 +233,7 @@ private[genotyping] object ScoredObservation extends Serializable {
private[genotyping] case class ScoredObservation(
isRef: Boolean,
isOther: Boolean,
isNonRef: Boolean,
forwardStrand: Boolean,
optQuality: Option[Int],
mapQ: Int,
Expand All @@ -205,6 +243,7 @@ private[genotyping] case class ScoredObservation(
referenceLogLikelihoods: Array[Double],
alleleLogLikelihoods: Array[Double],
otherLogLikelihoods: Array[Double],
nonRefLogLikelihoods: Array[Double],
alleleCoverage: Int,
otherCoverage: Int,
totalCoverage: Int) {
Expand Down
Expand Up @@ -31,13 +31,15 @@ import org.bdgenomics.avocado.models.Observation
* @param optQuality What was the quality score of the bases in the observation,
* if known.
* @param mapQ What was the mapping quality of the bases in the observation?
* @param isOther Was this observation of a non-ref/non-alt allele?
* @param isOther Was this observation of an other-alt allele?
* @param isOther Was this observation of an unknown allele?
*/
private[genotyping] case class SummarizedObservation(isRef: Boolean,
forwardStrand: Boolean,
optQuality: Option[Int],
mapQ: Int,
isOther: Boolean = false) {
isOther: Boolean = false,
isNonRef: Boolean = false) {

/**
* @return Returns an observation that agrees with the reference allele.
Expand All @@ -60,16 +62,27 @@ private[genotyping] case class SummarizedObservation(isRef: Boolean,
}

/**
* @return Returns an observation that is neither alt nor ref.
* @return Returns an observation that is another alternate allele.
*/
def nullOut: SummarizedObservation = {
def otherAlt: SummarizedObservation = {
SummarizedObservation(false,
forwardStrand,
optQuality,
mapQ,
isOther = true)
}

/**
* @return Returns an observation that is an unknown allele.
*/
def nullOut: SummarizedObservation = {
SummarizedObservation(false,
forwardStrand,
optQuality,
mapQ,
isNonRef = true)
}

/**
* @param scores The previously scored observations.
* @return Returns a fully scored observation.
Expand All @@ -80,7 +93,8 @@ private[genotyping] case class SummarizedObservation(isRef: Boolean,
score.forwardStrand == forwardStrand &&
score.optQuality == optQuality &&
score.mapQ == mapQ &&
score.isOther == isOther
score.isOther == isOther &&
score.isNonRef == isNonRef
})
assert(optScore.isDefined)
val score = optScore.get
Expand All @@ -91,6 +105,7 @@ private[genotyping] case class SummarizedObservation(isRef: Boolean,
score.referenceLogLikelihoods,
score.alleleLogLikelihoods,
score.otherLogLikelihoods,
score.nonRefLogLikelihoods,
score.alleleCoverage,
score.otherCoverage,
score.totalCoverage,
Expand Down
Expand Up @@ -31,6 +31,8 @@ package org.bdgenomics.avocado.models
* allele were observed.
* @param otherLogLikelihoods The log likelihoods that 0...n copies of another
* allele were observed.
* @param otherLogLikelihoods The log likelihoods that 0...n copies of an
* unknown allele were observed.
* @param alleleCoverage The total number of reads observed that cover the
* site and match the allele.
* @param otherCoverage The total number of reads observed that cover the site
Expand All @@ -44,19 +46,21 @@ case class Observation(alleleForwardStrand: Int,
referenceLogLikelihoods: Array[Double],
alleleLogLikelihoods: Array[Double],
otherLogLikelihoods: Array[Double],
nonRefLogLikelihoods: Array[Double],
alleleCoverage: Int,
otherCoverage: Int,
totalCoverage: Int = 1,
isRef: Boolean = true) {

override def toString: String = {
"Observation(%d, %d, %f, Array(%s), Array(%s), Array(%s), %d, %d, %d, %s)".format(
"Observation(%d, %d, %f, Array(%s), Array(%s), Array(%s), Array(%s), %d, %d, %d, %s)".format(
alleleForwardStrand,
otherForwardStrand,
squareMapQ,
referenceLogLikelihoods.mkString(","),
alleleLogLikelihoods.mkString(","),
otherLogLikelihoods.mkString(","),
nonRefLogLikelihoods.mkString(","),
alleleCoverage,
otherCoverage,
totalCoverage,
Expand Down Expand Up @@ -91,6 +95,7 @@ case class Observation(alleleForwardStrand: Int,
referenceLogLikelihoods.map(v => v),
alleleLogLikelihoods.map(v => v),
otherLogLikelihoods.map(v => v),
nonRefLogLikelihoods.map(v => v),
alleleCoverage,
otherCoverage,
totalCoverage = totalCoverage,
Expand All @@ -109,6 +114,7 @@ case class Observation(alleleForwardStrand: Int,
referenceLogLikelihoods.map(v => v),
otherLogLikelihoods.map(v => v),
alleleLogLikelihoods.map(v => v),
nonRefLogLikelihoods.map(v => v),
otherCoverage,
alleleCoverage,
totalCoverage = totalCoverage,
Expand All @@ -128,6 +134,7 @@ case class Observation(alleleForwardStrand: Int,
referenceLogLikelihoods.map(v => v),
Array.fill(alleleLogLikelihoods.length)({ 0.0 }),
alleleLogLikelihoods.map(v => v),
nonRefLogLikelihoods.map(v => v),
0,
0,
totalCoverage = totalCoverage,
Expand Down

0 comments on commit edbbfca

Please sign in to comment.