From a066a44f5d2023e0c3c7c3e5c7dedc58e9158bd8 Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Sun, 2 Apr 2017 23:33:49 -0700 Subject: [PATCH 1/2] Fix SNP->INDEL error in filters. --- .../org/bdgenomics/avocado/util/HardFilterGenotypes.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala index c130fe68..944c81b5 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala @@ -226,13 +226,13 @@ private[avocado] object HardFilterGenotypes extends Serializable { Option(args.minHomSnpAltAllelicFraction).filter(_ > 0) .map(af => new VCFFilterHeaderLine("HOMSNPMINAF", "Allelic fraction was below %f for a hom SNP.".format(af))), - Option(args.minHetSnpAltAllelicFraction).filter(_ > 0) + Option(args.minHetIndelAltAllelicFraction).filter(_ > 0) .map(af => new VCFFilterHeaderLine("HETINDELMINAF", "Allelic fraction was below %f for a het INDEL.".format(af))), - Option(args.maxHetSnpAltAllelicFraction).filter(_ > 0) + Option(args.maxHetIndelAltAllelicFraction).filter(_ > 0) .map(af => new VCFFilterHeaderLine("HETINDELMAXAF", "Allelic fraction was above %f for a het INDEL.".format(af))), - Option(args.minHomSnpAltAllelicFraction).filter(_ > 0) + Option(args.minHomIndelAltAllelicFraction).filter(_ > 0) .map(af => new VCFFilterHeaderLine("HOMINDELMINAF", "Allelic fraction was below %f for a hom INDEL.".format(af)))) .flatten From dd1803ad3ce970ffe52e4016e12367b466670b8f Mon Sep 17 00:00:00 2001 From: Frank Austin Nothaft Date: Thu, 6 Apr 2017 12:07:29 -0700 Subject: [PATCH 2/2] [AVOCADO-222] Add code to process high allelic fraction het calls. Resolves #222. --- .../avocado/cli/BiallelicGenotyper.scala | 17 +- .../avocado/util/HardFilterGenotypes.scala | 2 +- .../bdgenomics/avocado/util/RewriteHets.scala | 166 +++++++++++++++ .../avocado/util/RewriteHetsSuite.scala | 195 ++++++++++++++++++ 4 files changed, 376 insertions(+), 4 deletions(-) create mode 100644 avocado-core/src/main/scala/org/bdgenomics/avocado/util/RewriteHets.scala create mode 100644 avocado-core/src/test/scala/org/bdgenomics/avocado/util/RewriteHetsSuite.scala diff --git a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/BiallelicGenotyper.scala b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/BiallelicGenotyper.scala index 5ae5a180..98d27a35 100644 --- a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/BiallelicGenotyper.scala +++ b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/BiallelicGenotyper.scala @@ -26,7 +26,9 @@ import org.bdgenomics.avocado.util.{ HardFilterGenotypes, HardFilterGenotypesArgs, PrefilterReads, - PrefilterReadsArgs + PrefilterReadsArgs, + RewriteHets, + RewriteHetsArgs } import org.bdgenomics.formats.avro.AlignmentRecord import org.bdgenomics.utils.cli._ @@ -41,7 +43,7 @@ object BiallelicGenotyper extends BDGCommandCompanion { } } -class BiallelicGenotyperArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs with PrefilterReadsArgs with HardFilterGenotypesArgs { +class BiallelicGenotyperArgs extends Args4jBase with ADAMSaveAnyArgs with ParquetArgs with PrefilterReadsArgs with HardFilterGenotypesArgs with RewriteHetsArgs { @Argument(required = true, metaVar = "INPUT", usage = "The ADAM, BAM or SAM file to call", @@ -172,6 +174,14 @@ class BiallelicGenotyperArgs extends Args4jBase with ADAMSaveAnyArgs with Parque name = "-min_hom_indel_allelic_fraction", usage = "Minimum (alt) allelic fraction for calling a hom INDEL. Default is 0.666. Set to a negative value to omit.") var minHomIndelAltAllelicFraction: Float = 0.666f + @Args4jOption(required = false, + name = "-disable_het_snp_rewriting", + usage = "If true, disables rewriting of high allelic fraction het SNPs as hom alt SNPs.") + var disableHetSnpRewriting: Boolean = false + @Args4jOption(required = false, + name = "-disable_het_indel_rewriting", + usage = "If true, disables rewriting of high allelic fraction het INDELs as hom alt INDELs.") + var disableHetIndelRewriting: Boolean = false // required by HardFilterGenotypesArgs var maxSnpPhredStrandBias: Float = -1.0f @@ -236,7 +246,8 @@ class BiallelicGenotyper( }) // hard filter the genotypes - val filteredGenotypes = HardFilterGenotypes(genotypes, args) + val filteredGenotypes = HardFilterGenotypes(RewriteHets(genotypes, args), + args) // save the variant calls filteredGenotypes.saveAsParquet(args) diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala index 944c81b5..25b59b52 100644 --- a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/HardFilterGenotypes.scala @@ -352,7 +352,7 @@ private[avocado] object HardFilterGenotypes extends Serializable { */ private[util] def filterQuality(genotype: Genotype, minQuality: Int): Boolean = { - genotype.getGenotypeQuality > minQuality + Option(genotype.getGenotypeQuality).fold(true)(_ > minQuality) } /** diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/util/RewriteHets.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/RewriteHets.scala new file mode 100644 index 00000000..539bd6a2 --- /dev/null +++ b/avocado-core/src/main/scala/org/bdgenomics/avocado/util/RewriteHets.scala @@ -0,0 +1,166 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.avocado.util + +import org.bdgenomics.adam.rdd.variant.GenotypeRDD +import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele } +import scala.collection.JavaConversions._ + +private[avocado] trait RewriteHetsArgs extends Serializable { + + /** + * The maximum allele fraction for the alternate allele in a het SNP call. + * + * Set to a negative value to omit. + */ + var maxHetSnpAltAllelicFraction: Float + + /** + * The maximum allele fraction for the alternate allele in a het SNP call. + * + * Set to a negative value to omit. + */ + var maxHetIndelAltAllelicFraction: Float + + /** + * If true, does not attempt to rewrite het SNPs. + */ + var disableHetSnpRewriting: Boolean + + /** + * If true, does not attempt to rewrite het INDELs. + */ + var disableHetIndelRewriting: Boolean +} + +/** + * Rewrites high allelic fraction het genotypes as homozygous alternate calls. + */ +object RewriteHets extends Serializable { + + /** + * Identifies high allelic fraction het calls in an RDD of genotypes and + * rewrites them as homozygous alt calls. + * + * @param rdd The RDD of genotypes to filter. + * @param args The arguments to configure the rewriter. + * @return Returns a new RDD of genotypes. + */ + def apply(rdd: GenotypeRDD, + args: RewriteHetsArgs): GenotypeRDD = { + + val maxSnpAllelicFraction = args.maxHetSnpAltAllelicFraction + val maxIndelAllelicFraction = args.maxHetIndelAltAllelicFraction + val rewriteHetSnps = !args.disableHetSnpRewriting + val rewriteHetIndels = !args.disableHetIndelRewriting + + if (rewriteHetSnps || rewriteHetIndels) { + rdd.transform(gtRdd => gtRdd.map(processGenotype(_, + maxSnpAllelicFraction, + maxIndelAllelicFraction, + rewriteHetSnps, + rewriteHetIndels))) + } else { + rdd + } + } + + /** + * Examines a single genotype to see if it should be rewritten. + * + * @param gt The genotype to examine. + * @param maxSnpAllelicFraction The threshold for considering a het SNP call + * to be a miscalled homozygous SNP. + * @param maxIndelAllelicFraction The threshold for considering a het INDEL + * call to be a miscalled homozygous INDEL + * @param rewriteHetSnps If false, disables SNP checking. + * @param rewriteHetIndels If false, disables INDEL checking. + * @return Returns true if the genotype should be rewritten.. + */ + private[util] def shouldRewrite(gt: Genotype, + maxSnpAllelicFraction: Float, + maxIndelAllelicFraction: Float, + rewriteHetSnps: Boolean, + rewriteHetIndels: Boolean): Boolean = { + val isSnp = ((gt.getVariant.getReferenceAllele.length == 1) && + (gt.getVariant.getAlternateAllele.length == 1)) + val numAlts = gt.getAlleles.count(_ == GenotypeAllele.ALT) + val isHet = (numAlts != 0) && (numAlts != gt.getAlleles.length) + + def checkAf(af: Float): Boolean = { + (Option(gt.getReadDepth), Option(gt.getAlternateReadDepth)) match { + case (Some(dp), Some(alt)) => (alt.toFloat / dp.toFloat) >= af + case _ => false + } + } + + if (rewriteHetSnps && isSnp && isHet) { + checkAf(maxSnpAllelicFraction) + } else if (rewriteHetIndels && !isSnp && isHet) { + checkAf(maxIndelAllelicFraction) + } else { + false + } + } + + /** + * Rewrites a het genotype as a hom alt call. + * + * @param gt The genotype to rewrite. + * @return Returns the rewritten genotype. + */ + private[util] def rewriteGenotype(gt: Genotype): Genotype = { + val numAlleles = gt.getAlleles.length + val newAlleles = List.fill(numAlleles) { GenotypeAllele.ALT } + + Genotype.newBuilder(gt) + .setGenotypeQuality(null) + .setAlleles(newAlleles) + .build + } + + /** + * Processes a single genotype, and rewrites it if it appears to be a + * miscalled hom alt. + * + * @param gt The genotype to examine. + * @param maxSnpAllelicFraction The threshold for considering a het SNP call + * to be a miscalled homozygous SNP. + * @param maxIndelAllelicFraction The threshold for considering a het INDEL + * call to be a miscalled homozygous INDEL + * @param rewriteHetSnps If false, disables SNP checking. + * @param rewriteHetIndels If false, disables INDEL checking. + * @return Returns the rewritten genotype if the genotype should be rewritten, + * else returns the original genotype. + */ + private[util] def processGenotype(gt: Genotype, + maxSnpAllelicFraction: Float, + maxIndelAllelicFraction: Float, + rewriteHetSnps: Boolean, + rewriteHetIndels: Boolean): Genotype = { + if (shouldRewrite(gt, + maxSnpAllelicFraction, + maxIndelAllelicFraction, + rewriteHetSnps, + rewriteHetIndels)) { + rewriteGenotype(gt) + } else { + gt + } + } +} diff --git a/avocado-core/src/test/scala/org/bdgenomics/avocado/util/RewriteHetsSuite.scala b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/RewriteHetsSuite.scala new file mode 100644 index 00000000..05084351 --- /dev/null +++ b/avocado-core/src/test/scala/org/bdgenomics/avocado/util/RewriteHetsSuite.scala @@ -0,0 +1,195 @@ +/** + * Licensed to Big Data Genomics (BDG) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The BDG licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.bdgenomics.avocado.util + +import org.bdgenomics.avocado.AvocadoFunSuite +import org.bdgenomics.adam.models.SequenceDictionary +import org.bdgenomics.adam.rdd.variant.GenotypeRDD +import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele, Variant } +import scala.collection.JavaConversions._ + +case class TestRewriteHetsArgs( + var maxHetSnpAltAllelicFraction: Float = 0.75f, + var maxHetIndelAltAllelicFraction: Float = 0.65f, + var disableHetSnpRewriting: Boolean = false, + var disableHetIndelRewriting: Boolean = false) extends RewriteHetsArgs { +} + +class RewriteHetsSuite extends AvocadoFunSuite { + + def buildGt(ref: String, + alt: String, + gq: Int, + dp: Int, + adp: Int, + alleles: List[GenotypeAllele]): Genotype = { + Genotype.newBuilder + .setGenotypeQuality(gq) + .setVariant(Variant.newBuilder + .setReferenceAllele(ref) + .setAlternateAllele(alt) + .build) + .setAlleles(alleles) + .setReadDepth(dp) + .setAlternateReadDepth(adp) + .build + } + + val goodHetSnp = buildGt("A", "T", 30, 30, 15, + List(GenotypeAllele.REF, GenotypeAllele.ALT)) + val badHetSnp = buildGt("A", "T", 30, 30, 25, + List(GenotypeAllele.REF, GenotypeAllele.ALT)) + val goodHetIndel = buildGt("AA", "T", 30, 50, 30, + List(GenotypeAllele.REF, GenotypeAllele.ALT)) + val badHetIndel = buildGt("A", "TCG", 30, 20, 20, + List(GenotypeAllele.REF, GenotypeAllele.ALT, GenotypeAllele.ALT)) + val homRefSnp = buildGt("A", "T", 30, 30, 3, + List(GenotypeAllele.REF, GenotypeAllele.REF)) + val homRefIndel = buildGt("A", "TCC", 30, 50, 5, + List(GenotypeAllele.REF)) + val homAltSnp = buildGt("A", "T", 30, 30, 29, + List(GenotypeAllele.ALT)) + val homAltIndel = buildGt("A", "TCC", 30, 50, 45, + List(GenotypeAllele.ALT, GenotypeAllele.ALT)) + + def shouldRewrite(gt: Genotype, + snps: Boolean = true, + indels: Boolean = true): Boolean = { + RewriteHets.shouldRewrite(gt, 0.75f, 0.65f, snps, indels) + } + + test("should rewrite a bad het snp") { + assert(shouldRewrite(badHetSnp)) + } + + test("should not rewrite het snp if snp filtering is disabled") { + assert(!shouldRewrite(badHetSnp, snps = false)) + } + + test("should rewrite a bad het indel") { + assert(shouldRewrite(badHetIndel)) + } + + test("should not rewrite het indel if indel filtering is disabled") { + assert(!shouldRewrite(badHetIndel, indels = false)) + } + + test("don't rewrite good het calls") { + assert(!shouldRewrite(goodHetSnp)) + assert(!shouldRewrite(goodHetIndel)) + } + + test("don't rewrite homozygous calls") { + assert(!shouldRewrite(homRefSnp)) + assert(!shouldRewrite(homRefIndel)) + assert(!shouldRewrite(homAltSnp)) + assert(!shouldRewrite(homAltIndel)) + } + + test("rewrite a het call as a hom alt snp") { + val rewrittenGt = RewriteHets.rewriteGenotype(badHetSnp) + assert(rewrittenGt.getVariant.getReferenceAllele === "A") + assert(rewrittenGt.getVariant.getAlternateAllele === "T") + assert(Option(rewrittenGt.getGenotypeQuality).isEmpty) + assert(rewrittenGt.getAlleles.length === 2) + assert(rewrittenGt.getAlleles.get(0) === GenotypeAllele.ALT) + assert(rewrittenGt.getAlleles.get(1) === GenotypeAllele.ALT) + assert(rewrittenGt.getReadDepth === 30) + assert(rewrittenGt.getAlternateReadDepth === 25) + } + + def processGenotype(gt: Genotype, + snps: Boolean = true, + indels: Boolean = true): Genotype = { + RewriteHets.processGenotype(gt, 0.75f, 0.65f, snps, indels) + } + + test("processing a valid call should not change the call") { + val goodCalls = Seq(goodHetSnp, goodHetIndel, + homRefSnp, homRefIndel, + homAltSnp, homAltIndel) + goodCalls.foreach(gt => assert(gt === processGenotype(gt))) + } + + test("if processing is disabled, don't rewrite bad calls") { + val badCalls = Seq(badHetSnp, badHetIndel) + badCalls.foreach(gt => assert(gt === processGenotype(gt, + snps = false, + indels = false))) + } + + test("process a bad het snp call") { + val rewrittenGt = processGenotype(badHetSnp) + assert(rewrittenGt != badHetSnp) + assert(rewrittenGt.getVariant.getReferenceAllele === "A") + assert(rewrittenGt.getVariant.getAlternateAllele === "T") + assert(Option(rewrittenGt.getGenotypeQuality).isEmpty) + assert(rewrittenGt.getAlleles.length === 2) + assert(rewrittenGt.getAlleles.get(0) === GenotypeAllele.ALT) + assert(rewrittenGt.getAlleles.get(1) === GenotypeAllele.ALT) + assert(rewrittenGt.getReadDepth === 30) + assert(rewrittenGt.getAlternateReadDepth === 25) + } + + test("process a bad het indel call") { + val rewrittenGt = processGenotype(badHetIndel) + assert(rewrittenGt != badHetIndel) + assert(rewrittenGt.getVariant.getReferenceAllele === "A") + assert(rewrittenGt.getVariant.getAlternateAllele === "TCG") + assert(Option(rewrittenGt.getGenotypeQuality).isEmpty) + assert(rewrittenGt.getAlleles.length === 3) + assert(rewrittenGt.getAlleles.get(0) === GenotypeAllele.ALT) + assert(rewrittenGt.getAlleles.get(1) === GenotypeAllele.ALT) + assert(rewrittenGt.getAlleles.get(2) === GenotypeAllele.ALT) + assert(rewrittenGt.getReadDepth === 20) + assert(rewrittenGt.getAlternateReadDepth === 20) + } + + val genotypes = Seq(badHetSnp, badHetIndel, + goodHetSnp, goodHetIndel, + homRefSnp, homRefIndel, + homAltSnp, homAltIndel) + + def gtRdd: GenotypeRDD = { + val rdd = sc.parallelize(genotypes) + GenotypeRDD(rdd, + SequenceDictionary.empty, + Seq.empty, + Seq.empty) + } + + sparkTest("disable processing for a whole rdd") { + val rewrittenRdd = RewriteHets(gtRdd, + TestRewriteHetsArgs(disableHetSnpRewriting = true, + disableHetIndelRewriting = true)) + val newGts = rewrittenRdd.rdd.collect + val oldGtSet = genotypes.toSet + newGts.foreach(gt => assert(oldGtSet(gt))) + } + + sparkTest("process a whole rdd") { + val rewrittenRdd = RewriteHets(gtRdd, TestRewriteHetsArgs()) + val newGts = rewrittenRdd.rdd.collect + val oldGtSet = genotypes.toSet + val (touchedGts, untouchedGts) = newGts.partition(_.getGenotypeQuality == null) + assert(untouchedGts.size === 6) + assert(touchedGts.size === 2) + untouchedGts.foreach(gt => assert(oldGtSet(gt))) + touchedGts.foreach(gt => assert(gt.getAlleles.forall(_ == GenotypeAllele.ALT))) + } +}