Skip to content

Commit

Permalink
Mutect2 can filter MNVs with orientation bias (#6486)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin committed Mar 5, 2020
1 parent 4d0ca27 commit cf6ce1b
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 41 deletions.
Expand Up @@ -15,6 +15,7 @@

import java.io.File;
import java.util.*;
import java.util.stream.IntStream;

public class ReadOrientationFilter extends Mutect2VariantFilter {
private Map<String, ArtifactPriorCollection> artifactPriorCollections = new HashMap<>();
Expand All @@ -39,8 +40,7 @@ public static int[] getF2R1(final Genotype g) {
public ErrorType errorType() { return ErrorType.ARTIFACT; }

public double calculateErrorProbability(final VariantContext vc, final Mutect2FilteringEngine filteringEngine, ReferenceContext referenceContext) {

if (! vc.isSNP()){
if (!vc.isSNP() && !vc.isMNP()){
return 0;
}

Expand Down Expand Up @@ -75,7 +75,7 @@ public Optional<String> phredScaledPosteriorAnnotationName() {
double artifactProbability(final ReferenceContext referenceContext, final VariantContext vc, final Genotype g) {
// As of June 2018, genotype is hom ref iff we have the normal sample, but this may change in the future
// TODO: handle MNVs
if (g.isHomRef() || !vc.isSNP() ){
if (g.isHomRef() || (!vc.isSNP() && !vc.isMNP()) ){
return 0;
} else if (!artifactPriorCollections.containsKey(g.getSampleName())) {
return 0;
Expand All @@ -84,9 +84,19 @@ public Optional<String> phredScaledPosteriorAnnotationName() {
final double[] tumorLods = VariantContextGetters.getAttributeAsDoubleArray(vc, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, -1);
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);
final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod);
final Nucleotide altBase = Nucleotide.valueOf(altAllele.toString());
final byte[] altBases = altAllele.getBases();

// for MNVs, treat each base as an independent substitution and take the maximum of all error probabilities
return IntStream.range(0, altBases.length).mapToDouble(n -> {
final Nucleotide altBase = Nucleotide.valueOf(new String(new byte[] {altBases[n]}));

return artifactProbability(referenceContext, vc.getStart() + n, g, indexOfMaxTumorLod, altBase);
}).max().orElse(0.0);

}

final String refContext = referenceContext.getKmerAround(vc.getStart(), F1R2FilterConstants.REF_CONTEXT_PADDING);
private double artifactProbability(final ReferenceContext referenceContext, final int refPosition, final Genotype g, final int indexOfMaxTumorLod, final Nucleotide altBase) {
final String refContext = referenceContext.getKmerAround(refPosition, F1R2FilterConstants.REF_CONTEXT_PADDING);
if (refContext == null || refContext.contains("N")){
return 0;
}
Expand All @@ -95,9 +105,6 @@ public Optional<String> phredScaledPosteriorAnnotationName() {
String.format("kmer must have length %d but got %d", 2 * F1R2FilterConstants.REF_CONTEXT_PADDING + 1, refContext.length()));

final Nucleotide refAllele = F1R2FilterUtils.getMiddleBase(refContext);
Utils.validate(refAllele == Nucleotide.valueOf(vc.getReference().toString().replace("*", "")),
String.format("ref allele in the kmer, %s, does not match the ref allele in the variant context, %s",
refAllele, vc.getReference().toString().replace("*", "")));

if (!(g.hasExtendedAttribute(GATKVCFConstants.F1R2_KEY) && g.hasExtendedAttribute(GATKVCFConstants.F2R1_KEY))) {
return 0;
Expand Down
Expand Up @@ -8,6 +8,7 @@
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.ReferenceFileSource;
import org.broadinstitute.hellbender.tools.walkers.readorientation.*;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand Down Expand Up @@ -59,50 +60,56 @@ public void test(final int depth, final double alleleFraction, final double altF
final Optional<ReadOrientation> expectedArtifactType) throws IOException {
final ReferenceContext reference = new ReferenceContext(new ReferenceFileSource(IOUtils.getPath(hg19_chr1_1M_Reference)),
new SimpleInterval("1", variantPosition -5, variantPosition +5));
final String refContext = reference.getKmerAround(variantPosition, F1R2FilterConstants.REF_CONTEXT_PADDING);
final Allele refAllele = Allele.create(F1R2FilterUtils.getMiddleBase(refContext).toString(), true);
final Nucleotide altBase = Nucleotide.A;
final Allele altAllele = Allele.create(altBase.toString(), false); // C -> A transition
final List<Allele> alleles = Arrays.asList(refAllele, altAllele);
final VariantContext vc = new VariantContextBuilder("source", Integer.toString(chromosomeIndex), variantPosition, variantPosition, alleles)
.attribute(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, new double[]{ 7.0 })
.make();

/**
* The prior distribution of the states predicts that most of the time a site is hom ref, and the remaining
* probability mass is distributed equally among the remaining states
*/
double[] pi = createReasonablePrior(altBase);

final int altCount = (int) (depth * alleleFraction);
final int refCount = depth - altCount;
final int altF1R2 = (int) (altCount * altF1R2Fraction);
final int[] f1r2 = new int[] { refCount/2, altF1R2};
final int[] f2r1 = new int[] { refCount - refCount/2, altCount - altF1R2};

// Arbitrarily choose the number of ref and alt examples observed to compute the priors. They do not affect inference.
int numRefExamples = 1_000_000;
int numAltExamples = 1000;


final Nucleotide[] altBases = new Nucleotide[] {Nucleotide.A, Nucleotide.C, Nucleotide.A};
final StringBuilder altBasesStringBuilder = new StringBuilder();
for (final Nucleotide nuc : altBases) {
altBasesStringBuilder.append((char) nuc.encodeAsByte());
}
final String altBasesString = altBasesStringBuilder.toString();

final ArtifactPriorCollection priors = new ArtifactPriorCollection(sampleName);
priors.set(new ArtifactPrior(refContext, pi, numRefExamples, numAltExamples));

for (int n = 0; n < altBases.length; n++) {
// most of the time a site is hom ref, and the remaining probability mass is distributed equally among the remaining states
double[] pi = createReasonablePrior(altBases[n]);
priors.set(new ArtifactPrior(reference.getKmerAround(variantPosition + n, F1R2FilterConstants.REF_CONTEXT_PADDING), pi, numRefExamples, numAltExamples));
}
final File table = File.createTempFile("prior", "table");
priors.writeArtifactPriors(table);

final ReadOrientationFilter filter = new ReadOrientationFilter(Collections.singletonList(table));


final int altCount = (int) (depth * alleleFraction);
final int refCount = depth - altCount;
final int altF1R2 = (int) (altCount * altF1R2Fraction);
final int[] f1r2 = new int[] { refCount/2, altF1R2};
final int[] f2r1 = new int[] { refCount - refCount/2, altCount - altF1R2};

final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sampleName)
.attribute(GATKVCFConstants.F1R2_KEY, f1r2)
.attribute(GATKVCFConstants.F2R1_KEY, f2r1);

genotypeBuilder.alleles(alleles);
final double posterior = filter.artifactProbability(reference, vc, genotypeBuilder.make());

if (expectedArtifactType.isPresent()){
Assert.assertEquals(posterior, expectedArtifactProb, epsilon);
} else {
Assert.assertTrue(posterior < 0.01);
for (int mnpLength = 1; mnpLength <= altBases.length; mnpLength++) {
final Allele refAllele = Allele.create(reference.getBases(new SimpleInterval("1", variantPosition, variantPosition + mnpLength - 1)), true);
final Allele altAllele = Allele.create(altBasesString.substring(0, mnpLength), false); // C -> A transition
final List<Allele> alleles = Arrays.asList(refAllele, altAllele);
final VariantContext vc = new VariantContextBuilder("source", Integer.toString(chromosomeIndex), variantPosition, variantPosition + mnpLength - 1, alleles)
.attribute(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, new double[]{7.0})
.make();

final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sampleName)
.attribute(GATKVCFConstants.F1R2_KEY, f1r2)
.attribute(GATKVCFConstants.F2R1_KEY, f2r1);
genotypeBuilder.alleles(alleles);
final double posterior = filter.artifactProbability(reference, vc, genotypeBuilder.make());

if (expectedArtifactType.isPresent()) {
Assert.assertEquals(posterior, expectedArtifactProb, epsilon);
} else {
Assert.assertTrue(posterior < 0.01);
}
}
}

Expand Down

0 comments on commit cf6ce1b

Please sign in to comment.