From cf6ce1b751126f7fd27baf55356af8b9b607b34b Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 5 Mar 2020 12:02:50 -0500 Subject: [PATCH] Mutect2 can filter MNVs with orientation bias (#6486) --- .../filtering/ReadOrientationFilter.java | 23 ++++-- .../ReadOrientationFilterUnitTest.java | 73 ++++++++++--------- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ReadOrientationFilter.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ReadOrientationFilter.java index 130c2e562dd..dc9820a626b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ReadOrientationFilter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ReadOrientationFilter.java @@ -15,6 +15,7 @@ import java.io.File; import java.util.*; +import java.util.stream.IntStream; public class ReadOrientationFilter extends Mutect2VariantFilter { private Map artifactPriorCollections = new HashMap<>(); @@ -39,8 +40,7 @@ public static int[] getF2R1(final Genotype g) { public ErrorType errorType() { return ErrorType.ARTIFACT; } public double calculateErrorProbability(final VariantContext vc, final Mutect2FilteringEngine filteringEngine, ReferenceContext referenceContext) { - - if (! vc.isSNP()){ + if (!vc.isSNP() && !vc.isMNP()){ return 0; } @@ -75,7 +75,7 @@ public Optional phredScaledPosteriorAnnotationName() { double artifactProbability(final ReferenceContext referenceContext, final VariantContext vc, final Genotype g) { // As of June 2018, genotype is hom ref iff we have the normal sample, but this may change in the future // TODO: handle MNVs - if (g.isHomRef() || !vc.isSNP() ){ + if (g.isHomRef() || (!vc.isSNP() && !vc.isMNP()) ){ return 0; } else if (!artifactPriorCollections.containsKey(g.getSampleName())) { return 0; @@ -84,9 +84,19 @@ public Optional phredScaledPosteriorAnnotationName() { final double[] tumorLods = VariantContextGetters.getAttributeAsDoubleArray(vc, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, -1); final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods); final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod); - final Nucleotide altBase = Nucleotide.valueOf(altAllele.toString()); + final byte[] altBases = altAllele.getBases(); + + // for MNVs, treat each base as an independent substitution and take the maximum of all error probabilities + return IntStream.range(0, altBases.length).mapToDouble(n -> { + final Nucleotide altBase = Nucleotide.valueOf(new String(new byte[] {altBases[n]})); + + return artifactProbability(referenceContext, vc.getStart() + n, g, indexOfMaxTumorLod, altBase); + }).max().orElse(0.0); + + } - final String refContext = referenceContext.getKmerAround(vc.getStart(), F1R2FilterConstants.REF_CONTEXT_PADDING); + private double artifactProbability(final ReferenceContext referenceContext, final int refPosition, final Genotype g, final int indexOfMaxTumorLod, final Nucleotide altBase) { + final String refContext = referenceContext.getKmerAround(refPosition, F1R2FilterConstants.REF_CONTEXT_PADDING); if (refContext == null || refContext.contains("N")){ return 0; } @@ -95,9 +105,6 @@ public Optional phredScaledPosteriorAnnotationName() { String.format("kmer must have length %d but got %d", 2 * F1R2FilterConstants.REF_CONTEXT_PADDING + 1, refContext.length())); final Nucleotide refAllele = F1R2FilterUtils.getMiddleBase(refContext); - Utils.validate(refAllele == Nucleotide.valueOf(vc.getReference().toString().replace("*", "")), - String.format("ref allele in the kmer, %s, does not match the ref allele in the variant context, %s", - refAllele, vc.getReference().toString().replace("*", ""))); if (!(g.hasExtendedAttribute(GATKVCFConstants.F1R2_KEY) && g.hasExtendedAttribute(GATKVCFConstants.F2R1_KEY))) { return 0; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ReadOrientationFilterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ReadOrientationFilterUnitTest.java index f9c131c14dc..d4c682a498f 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ReadOrientationFilterUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ReadOrientationFilterUnitTest.java @@ -8,6 +8,7 @@ import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.engine.ReferenceFileSource; import org.broadinstitute.hellbender.tools.walkers.readorientation.*; +import org.broadinstitute.hellbender.utils.BaseUtils; import org.broadinstitute.hellbender.utils.MathUtils; import org.broadinstitute.hellbender.utils.Nucleotide; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -59,50 +60,56 @@ public void test(final int depth, final double alleleFraction, final double altF final Optional expectedArtifactType) throws IOException { final ReferenceContext reference = new ReferenceContext(new ReferenceFileSource(IOUtils.getPath(hg19_chr1_1M_Reference)), new SimpleInterval("1", variantPosition -5, variantPosition +5)); - final String refContext = reference.getKmerAround(variantPosition, F1R2FilterConstants.REF_CONTEXT_PADDING); - final Allele refAllele = Allele.create(F1R2FilterUtils.getMiddleBase(refContext).toString(), true); - final Nucleotide altBase = Nucleotide.A; - final Allele altAllele = Allele.create(altBase.toString(), false); // C -> A transition - final List alleles = Arrays.asList(refAllele, altAllele); - final VariantContext vc = new VariantContextBuilder("source", Integer.toString(chromosomeIndex), variantPosition, variantPosition, alleles) - .attribute(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, new double[]{ 7.0 }) - .make(); - - /** - * The prior distribution of the states predicts that most of the time a site is hom ref, and the remaining - * probability mass is distributed equally among the remaining states - */ - double[] pi = createReasonablePrior(altBase); + + final int altCount = (int) (depth * alleleFraction); + final int refCount = depth - altCount; + final int altF1R2 = (int) (altCount * altF1R2Fraction); + final int[] f1r2 = new int[] { refCount/2, altF1R2}; + final int[] f2r1 = new int[] { refCount - refCount/2, altCount - altF1R2}; // Arbitrarily choose the number of ref and alt examples observed to compute the priors. They do not affect inference. int numRefExamples = 1_000_000; int numAltExamples = 1000; + + final Nucleotide[] altBases = new Nucleotide[] {Nucleotide.A, Nucleotide.C, Nucleotide.A}; + final StringBuilder altBasesStringBuilder = new StringBuilder(); + for (final Nucleotide nuc : altBases) { + altBasesStringBuilder.append((char) nuc.encodeAsByte()); + } + final String altBasesString = altBasesStringBuilder.toString(); + final ArtifactPriorCollection priors = new ArtifactPriorCollection(sampleName); - priors.set(new ArtifactPrior(refContext, pi, numRefExamples, numAltExamples)); + + for (int n = 0; n < altBases.length; n++) { + // most of the time a site is hom ref, and the remaining probability mass is distributed equally among the remaining states + double[] pi = createReasonablePrior(altBases[n]); + priors.set(new ArtifactPrior(reference.getKmerAround(variantPosition + n, F1R2FilterConstants.REF_CONTEXT_PADDING), pi, numRefExamples, numAltExamples)); + } final File table = File.createTempFile("prior", "table"); priors.writeArtifactPriors(table); final ReadOrientationFilter filter = new ReadOrientationFilter(Collections.singletonList(table)); - - final int altCount = (int) (depth * alleleFraction); - final int refCount = depth - altCount; - final int altF1R2 = (int) (altCount * altF1R2Fraction); - final int[] f1r2 = new int[] { refCount/2, altF1R2}; - final int[] f2r1 = new int[] { refCount - refCount/2, altCount - altF1R2}; - - final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sampleName) - .attribute(GATKVCFConstants.F1R2_KEY, f1r2) - .attribute(GATKVCFConstants.F2R1_KEY, f2r1); - - genotypeBuilder.alleles(alleles); - final double posterior = filter.artifactProbability(reference, vc, genotypeBuilder.make()); - - if (expectedArtifactType.isPresent()){ - Assert.assertEquals(posterior, expectedArtifactProb, epsilon); - } else { - Assert.assertTrue(posterior < 0.01); + for (int mnpLength = 1; mnpLength <= altBases.length; mnpLength++) { + final Allele refAllele = Allele.create(reference.getBases(new SimpleInterval("1", variantPosition, variantPosition + mnpLength - 1)), true); + final Allele altAllele = Allele.create(altBasesString.substring(0, mnpLength), false); // C -> A transition + final List alleles = Arrays.asList(refAllele, altAllele); + final VariantContext vc = new VariantContextBuilder("source", Integer.toString(chromosomeIndex), variantPosition, variantPosition + mnpLength - 1, alleles) + .attribute(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, new double[]{7.0}) + .make(); + + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sampleName) + .attribute(GATKVCFConstants.F1R2_KEY, f1r2) + .attribute(GATKVCFConstants.F2R1_KEY, f2r1); + genotypeBuilder.alleles(alleles); + final double posterior = filter.artifactProbability(reference, vc, genotypeBuilder.make()); + + if (expectedArtifactType.isPresent()) { + Assert.assertEquals(posterior, expectedArtifactProb, epsilon); + } else { + Assert.assertTrue(posterior < 0.01); + } } }