New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BQSR updates #213

Merged
merged 7 commits into from Apr 22, 2014
Jump to file or symbol
Failed to load files and symbols.
+16,311 −474
Diff settings

Always

Just for now

Viewing a subset of changes. View all

Add CycleCovariate

  • Loading branch information...
jey committed Apr 9, 2014
commit 3db1484335f41ff9efe2648a843c0b11f9119c04
@@ -47,8 +47,7 @@ class BaseQualityRecalibration(
// Additional covariates to use when computing the correction
// TODO: parameterize
val covariates = CovariateSpace(
new DinucCovariate)
val covariates = CovariateSpace(new CycleCovariate, new DinucCovariate)
// Bases with quality less than this will be skipped and left alone
// TODO: parameterize
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2014 The Regents of the University of California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.bdgenomics.adam.rdd.recalibration
import org.bdgenomics.adam.rich.DecadentRead
import org.bdgenomics.adam.rich.DecadentRead._
// This is based on the CycleCovariate in GATK 1.6.
class CycleCovariate extends AbstractCovariate[Int] {
def compute(read: DecadentRead): Seq[Option[Int]] = {
val (initial, increment) = initialization(read)
Range(0, read.residues.length).map(pos => Some(initial + increment * pos))
}
// Returns (initialValue, increment)
private def initialization(read: DecadentRead): (Int, Int) = {
if (!read.isNegativeRead) {
if (read.isSecondOfPair) {
(-1, -1)
} else {
(1, 1)
}
} else {
if (read.isSecondOfPair) {
(-read.residues.length, 1)
} else {
(read.residues.length, -1)
}
}
}
override def csvFieldName: String = "Cycle"
override def equals(other: Any) = other match {
case that: CycleCovariate => true
case _ => false
}
override def hashCode = 0x83EFAB61
}
@@ -64,6 +64,11 @@ class RecalibrationTable(
val extraTables: IndexedSeq[Map[(String, QualityScore, Option[Covariate#Value]), Aggregate]])
extends (DecadentRead => Seq[QualityScore]) with Serializable {
// TODO: parameterize?
val maxQualScore = QualityScore(50)
val maxLogP = log(maxQualScore.errorProbability)
def apply(read: DecadentRead): Seq[QualityScore] =
covariates(read).map(lookup)
@@ -73,7 +78,12 @@ class RecalibrationTable(
val qualityDelta = computeQualityDelta(key, residueLogP + globalDelta)
val extrasDelta = computeExtrasDelta(key, residueLogP + globalDelta + qualityDelta)
val correctedLogP = residueLogP + globalDelta + qualityDelta + extrasDelta
QualityScore.fromErrorProbability(exp(correctedLogP))
qualityFromLogP(correctedLogP)
}
def qualityFromLogP(logP: Double): QualityScore = {
val boundedLogP = math.min(0.0, math.max(maxLogP, logP))
QualityScore.fromErrorProbability(exp(boundedLogP))
}
def computeGlobalDelta(key: CovariateKey): Double = {
@@ -139,6 +139,12 @@ class DecadentRead(val record: RichADAMRecord) extends Logging {
def isDuplicate: Boolean = record.getDuplicateRead
def isPaired: Boolean = record.getReadPaired

This comment has been minimized.

@fnothaft

fnothaft Apr 9, 2014

Member

Why do we keep these values around, since they can be accessed directly from the read?

This comment has been minimized.

@jey

jey Apr 22, 2014

Contributor

The intention here is to provide a unified clean interface to the underlying data, as opposed to having to know when to use a DecadentRead method and when to drop down to the ADAMRecord.

def isFirstOfPair: Boolean = isPaired && !record.getSecondOfPair
def isSecondOfPair: Boolean = isPaired && record.getSecondOfPair
def isNegativeRead: Boolean = record.getReadNegativeStrand
// Is this the most representative record for this read?
@@ -18,7 +18,7 @@ package org.bdgenomics.adam.util
class QualityScore(val phred: Int) extends Ordered[QualityScore] with Serializable {
// Valid range of phred + 33 is described by the regex "[!-~]".
require(phred + 33 >= '!'.toInt && phred + 33 <= '~'.toInt)
require(phred + 33 >= '!'.toInt && phred + 33 <= '~'.toInt, "Phred %s out of range".format(phred))
def successProbability = PhredUtils.phredToSuccessProbability(phred)
Oops, something went wrong.
ProTip! Use n and p to navigate between commits in a pull request.