Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mutect2 can filter MNVs with orientation bias #6486

Merged
merged 2 commits into from Mar 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -15,6 +15,7 @@

import java.io.File;
import java.util.*;
import java.util.stream.IntStream;

public class ReadOrientationFilter extends Mutect2VariantFilter {
private Map<String, ArtifactPriorCollection> artifactPriorCollections = new HashMap<>();
Expand All @@ -39,8 +40,7 @@ public static int[] getF2R1(final Genotype g) {
public ErrorType errorType() { return ErrorType.ARTIFACT; }

public double calculateErrorProbability(final VariantContext vc, final Mutect2FilteringEngine filteringEngine, ReferenceContext referenceContext) {

if (! vc.isSNP()){
if (!vc.isSNP() && !vc.isMNP()){
return 0;
}

Expand Down Expand Up @@ -75,7 +75,7 @@ public Optional<String> phredScaledPosteriorAnnotationName() {
double artifactProbability(final ReferenceContext referenceContext, final VariantContext vc, final Genotype g) {
// As of June 2018, genotype is hom ref iff we have the normal sample, but this may change in the future
// TODO: handle MNVs
if (g.isHomRef() || !vc.isSNP() ){
if (g.isHomRef() || (!vc.isSNP() && !vc.isMNP()) ){
return 0;
} else if (!artifactPriorCollections.containsKey(g.getSampleName())) {
return 0;
Expand All @@ -84,9 +84,19 @@ public Optional<String> phredScaledPosteriorAnnotationName() {
final double[] tumorLods = VariantContextGetters.getAttributeAsDoubleArray(vc, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, -1);
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);
final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod);
final Nucleotide altBase = Nucleotide.valueOf(altAllele.toString());
final byte[] altBases = altAllele.getBases();

// for MNVs, treat each base as an independent substitution and take the maximum of all error probabilities
return IntStream.range(0, altBases.length).mapToDouble(n -> {
final Nucleotide altBase = Nucleotide.valueOf(new String(new byte[] {altBases[n]}));

return artifactProbability(referenceContext, vc.getStart() + n, g, indexOfMaxTumorLod, altBase);
}).max().orElse(0.0);

}

final String refContext = referenceContext.getKmerAround(vc.getStart(), F1R2FilterConstants.REF_CONTEXT_PADDING);
private double artifactProbability(final ReferenceContext referenceContext, final int refPosition, final Genotype g, final int indexOfMaxTumorLod, final Nucleotide altBase) {
final String refContext = referenceContext.getKmerAround(refPosition, F1R2FilterConstants.REF_CONTEXT_PADDING);
if (refContext == null || refContext.contains("N")){
return 0;
}
Expand All @@ -95,9 +105,6 @@ public Optional<String> phredScaledPosteriorAnnotationName() {
String.format("kmer must have length %d but got %d", 2 * F1R2FilterConstants.REF_CONTEXT_PADDING + 1, refContext.length()));

final Nucleotide refAllele = F1R2FilterUtils.getMiddleBase(refContext);
Utils.validate(refAllele == Nucleotide.valueOf(vc.getReference().toString().replace("*", "")),
String.format("ref allele in the kmer, %s, does not match the ref allele in the variant context, %s",
refAllele, vc.getReference().toString().replace("*", "")));

if (!(g.hasExtendedAttribute(GATKVCFConstants.F1R2_KEY) && g.hasExtendedAttribute(GATKVCFConstants.F2R1_KEY))) {
return 0;
Expand Down
Expand Up @@ -8,6 +8,7 @@
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.ReferenceFileSource;
import org.broadinstitute.hellbender.tools.walkers.readorientation.*;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand Down Expand Up @@ -59,50 +60,56 @@ public void test(final int depth, final double alleleFraction, final double altF
final Optional<ReadOrientation> expectedArtifactType) throws IOException {
final ReferenceContext reference = new ReferenceContext(new ReferenceFileSource(IOUtils.getPath(hg19_chr1_1M_Reference)),
new SimpleInterval("1", variantPosition -5, variantPosition +5));
final String refContext = reference.getKmerAround(variantPosition, F1R2FilterConstants.REF_CONTEXT_PADDING);
final Allele refAllele = Allele.create(F1R2FilterUtils.getMiddleBase(refContext).toString(), true);
final Nucleotide altBase = Nucleotide.A;
final Allele altAllele = Allele.create(altBase.toString(), false); // C -> A transition
final List<Allele> alleles = Arrays.asList(refAllele, altAllele);
final VariantContext vc = new VariantContextBuilder("source", Integer.toString(chromosomeIndex), variantPosition, variantPosition, alleles)
.attribute(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, new double[]{ 7.0 })
.make();

/**
* The prior distribution of the states predicts that most of the time a site is hom ref, and the remaining
* probability mass is distributed equally among the remaining states
*/
double[] pi = createReasonablePrior(altBase);

final int altCount = (int) (depth * alleleFraction);
final int refCount = depth - altCount;
final int altF1R2 = (int) (altCount * altF1R2Fraction);
final int[] f1r2 = new int[] { refCount/2, altF1R2};
final int[] f2r1 = new int[] { refCount - refCount/2, altCount - altF1R2};

// Arbitrarily choose the number of ref and alt examples observed to compute the priors. They do not affect inference.
int numRefExamples = 1_000_000;
int numAltExamples = 1000;


final Nucleotide[] altBases = new Nucleotide[] {Nucleotide.A, Nucleotide.C, Nucleotide.A};
final StringBuilder altBasesStringBuilder = new StringBuilder();
for (final Nucleotide nuc : altBases) {
altBasesStringBuilder.append((char) nuc.encodeAsByte());
}
final String altBasesString = altBasesStringBuilder.toString();

final ArtifactPriorCollection priors = new ArtifactPriorCollection(sampleName);
priors.set(new ArtifactPrior(refContext, pi, numRefExamples, numAltExamples));

for (int n = 0; n < altBases.length; n++) {
// most of the time a site is hom ref, and the remaining probability mass is distributed equally among the remaining states
double[] pi = createReasonablePrior(altBases[n]);
priors.set(new ArtifactPrior(reference.getKmerAround(variantPosition + n, F1R2FilterConstants.REF_CONTEXT_PADDING), pi, numRefExamples, numAltExamples));
}
final File table = File.createTempFile("prior", "table");
priors.writeArtifactPriors(table);

final ReadOrientationFilter filter = new ReadOrientationFilter(Collections.singletonList(table));


final int altCount = (int) (depth * alleleFraction);
final int refCount = depth - altCount;
final int altF1R2 = (int) (altCount * altF1R2Fraction);
final int[] f1r2 = new int[] { refCount/2, altF1R2};
final int[] f2r1 = new int[] { refCount - refCount/2, altCount - altF1R2};

final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sampleName)
.attribute(GATKVCFConstants.F1R2_KEY, f1r2)
.attribute(GATKVCFConstants.F2R1_KEY, f2r1);

genotypeBuilder.alleles(alleles);
final double posterior = filter.artifactProbability(reference, vc, genotypeBuilder.make());

if (expectedArtifactType.isPresent()){
Assert.assertEquals(posterior, expectedArtifactProb, epsilon);
} else {
Assert.assertTrue(posterior < 0.01);
for (int mnpLength = 1; mnpLength <= altBases.length; mnpLength++) {
final Allele refAllele = Allele.create(reference.getBases(new SimpleInterval("1", variantPosition, variantPosition + mnpLength - 1)), true);
final Allele altAllele = Allele.create(altBasesString.substring(0, mnpLength), false); // C -> A transition
final List<Allele> alleles = Arrays.asList(refAllele, altAllele);
final VariantContext vc = new VariantContextBuilder("source", Integer.toString(chromosomeIndex), variantPosition, variantPosition + mnpLength - 1, alleles)
.attribute(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, new double[]{7.0})
.make();

final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sampleName)
.attribute(GATKVCFConstants.F1R2_KEY, f1r2)
.attribute(GATKVCFConstants.F2R1_KEY, f2r1);
genotypeBuilder.alleles(alleles);
final double posterior = filter.artifactProbability(reference, vc, genotypeBuilder.make());

if (expectedArtifactType.isPresent()) {
Assert.assertEquals(posterior, expectedArtifactProb, epsilon);
} else {
Assert.assertTrue(posterior < 0.01);
}
}
}

Expand Down