Commit
Port of GATK3 version of VariantAnnotator with the necessary changes to the annotation engine and client annotations to support annotating variants independent of the haplotype caller.
jamesemery committed Jan 19, 2018
1 parent 9c35a3d commit a42e5c6
Showing 111 changed files with 4,956 additions and 143 deletions.
@@ -46,7 +46,7 @@ public final class WellformedReadFilter extends ReadFilter {
private ReadFilter wellFormedFilter = null;

// Command line parser requires a no-arg constructor
public WellformedReadFilter() {
}

@Override
@@ -221,7 +221,7 @@ public static JavaRDD<VariantContext> callVariantsWithHaplotypeCaller(
final Broadcast<ReferenceMultiSource> referenceBroadcast = ctx.broadcast(reference);
final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast = ctx.broadcast(hcArgs);

-final VariantAnnotatorEngine variantAnnotatorEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(hcArgs.variantAnnotationArgumentCollection, hcArgs.dbsnp.dbsnp, hcArgs.comps);
+final VariantAnnotatorEngine variantAnnotatorEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(hcArgs.variantAnnotationArgumentCollection, hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence == ReferenceConfidenceMode.GVCF);
final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast = ctx.broadcast(variantAnnotatorEngine);

final List<ShardBoundary> shardBoundaries = getShardBoundaries(header, intervals, shardingArgs.readShardSize, shardingArgs.readShardPadding);
@@ -207,7 +207,7 @@ private void resizeReferenceIfNeeded(SimpleInterval intervalToClose) {
@Override
public void onTraversalStart() {
// create the annotation engine
-annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, Collections.emptyList());
+annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, Collections.emptyList(), false);

vcfWriter = getVCFWriter();

@@ -153,7 +153,7 @@ public void onTraversalStart() {

final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples()); //todo should this be getSampleNamesInOrder?

-annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, Collections.emptyList());
+annotationEngine = VariantAnnotatorEngine.ofSelectedMinusExcluded(variantAnnotationArgumentCollection, dbsnp.dbsnp, Collections.emptyList(), false);

// We only want the engine to generate the AS_QUAL key if we are using AlleleSpecific annotations.
genotypingEngine = new MinimalGenotypingEngine(createUAC(), samples, new GeneralPloidyFailOverAFCalculatorProvider(genotypeArgs), annotationEngine.isRequestedReducibleRawKey(GATKVCFConstants.AS_QUAL_KEY));
@@ -1,6 +1,13 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMRecord;
import org.apache.commons.lang.StringUtils;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMM;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.ArrayList;
import java.util.List;
@@ -30,4 +37,97 @@ public static String encodeStringList( final List<String> stringList) {
return StringUtils.join(stringList, ",");
}

/**
* Get the position of a variant within a read with respect to the closer end, accounting for hard clipped bases and low quality ends
* Used by ReadPosRankSum annotations
*
* @param read a read containing the variant
* @param initialReadPosition the position based on the modified, post-hard-clipped CIGAR
* @return read position
*/
public static int getFinalVariantReadPosition(final GATKRead read, final int initialReadPosition) {
final int numAlignedBases = getNumAlignedBases(read);

int readPos = initialReadPosition;
//TODO: this doesn't work for the middle-right position if we index from zero
if (initialReadPosition > numAlignedBases / 2) {
readPos = numAlignedBases - (initialReadPosition + 1);
}
return readPos;

}

/**
*
* @param read a read containing the variant
* @return the number of hard clipped and low qual bases at the read start (where start is the leftmost end w.r.t. the reference)
*/
public static int getNumClippedBasesAtStart(final GATKRead read) {
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
final CigarElement first = c.getCigarElement(0);

int numStartClippedBases = 0;
if (first.getOperator() == CigarOperator.H) {
numStartClippedBases = first.getLength();
}
final byte[] unclippedReadBases = read.getBases();
final byte[] unclippedReadQuals = read.getBaseQualities();

// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
//TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
if (unclippedReadQuals[i] < 20) { //TODO the 20 hard value here is in lieu of PairHMM.BASE_QUALITY_SCORE_THRESHOLD in order to directly match GATK3 output
numStartClippedBases++;
} else {
break;
}
}

return numStartClippedBases;
}


/**
*
* @param read a read containing the variant
* @return number of non-hard clipped, aligned bases (excluding low quality bases at either end)
*/
//TODO: this is bizarre -- this code counts hard clips, but then subtracts them from the read length, which already doesn't count hard clips
public static int getNumAlignedBases(final GATKRead read) {
return read.getLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
}

/**
*
* @param read a read containing the variant
* @return number of hard clipped and low qual bases at the read end (where end is right end w.r.t. the reference)
*/
public static int getNumClippedBasesAtEnd(final GATKRead read) {
// check for hard clips (never consider these bases):
final Cigar c = read.getCigar();
CigarElement last = c.getCigarElement(c.numCigarElements() - 1);

int numEndClippedBases = 0;
if (last.getOperator() == CigarOperator.H) {
numEndClippedBases = last.getLength();
}
final byte[] unclippedReadBases = read.getBases();
final byte[] unclippedReadQuals = read.getBaseQualities();

// Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
// and may leave a string of Q2 bases still hanging off the reads.
//TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
for (int i = unclippedReadBases.length - numEndClippedBases - 1; i >= 0; i--) {
if (unclippedReadQuals[i] < 20) { //TODO the 20 hard value here is in lieu of PairHMM.BASE_QUALITY_SCORE_THRESHOLD in order to directly match GATK3 output

numEndClippedBases++;
} else {
break;
}
}

return numEndClippedBases;
}
}
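
For reference, the new read-position logic reduces to simple arithmetic: count the low-quality run hanging off each end, subtract those runs (plus any hard clips) from the read length to get the aligned-base count, and mirror any offset past the midpoint so the reported position is always the distance to the nearer end. Below is a minimal standalone sketch, not part of this commit; the names are hypothetical, and the quality cutoff of 20 mirrors the hard-coded GATK3-compatibility value above.

public final class ReadPosSketch {
    private static final int QUAL_CUTOFF = 20; // mirrors the hard-coded GATK3-compatibility value above

    // Low-quality run at the left end (hard-clipped bases never appear in the quality array).
    static int lowQualBasesAtStart(final byte[] quals) {
        int n = 0;
        while (n < quals.length && quals[n] < QUAL_CUTOFF) {
            n++;
        }
        return n;
    }

    // Low-quality run at the right end.
    static int lowQualBasesAtEnd(final byte[] quals) {
        int n = 0;
        while (n < quals.length && quals[quals.length - 1 - n] < QUAL_CUTOFF) {
            n++;
        }
        return n;
    }

    // Mirror an offset past the midpoint so the reported position is the distance
    // to the closer end, as getFinalVariantReadPosition does above.
    static int distanceFromNearerEnd(final int offset, final int numAlignedBases) {
        return offset > numAlignedBases / 2 ? numAlignedBases - (offset + 1) : offset;
    }

    public static void main(final String[] args) {
        final byte[] quals = {2, 2, 30, 30, 30, 30, 30, 30, 2};
        final int aligned = quals.length - lowQualBasesAtStart(quals) - lowQualBasesAtEnd(quals);
        System.out.println(aligned);                            // 6
        System.out.println(distanceFromNearerEnd(4, aligned));  // 4 > 6/2, so 6 - 5 = 1
    }
}
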
@@ -12,6 +12,7 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;

import java.util.*;
import java.util.stream.Collectors;
@@ -56,6 +57,43 @@ public void annotate(final ReferenceContext ref,
// make sure that there's a meaningful relationship between the alleles in the likelihoods and our VariantContext
Utils.validateArg(likelihoods.alleles().containsAll(alleles), () -> "VC alleles " + alleles + " not a subset of ReadLikelihoods alleles " + likelihoods.alleles());

int[] counts;
if (likelihoods.hasFilledLikelihoods()) {
counts = annotateWithLikelihoods(vc, g, alleles, likelihoods);
} else if (likelihoods.readCount()==0) {
return;
} else if (vc.isSNP()) {
counts = annotateWithPileup(vc, likelihoods.getStratifiedPileups(vc).get(g.getSampleName()));
} else {
counts = new int[vc.getNAlleles()];
}

gb.AD(counts);
}

private int[] annotateWithPileup(final VariantContext vc, List<PileupElement> pileupElements) {

final HashMap<Byte, Integer> alleleCounts = new HashMap<>();
for ( final Allele allele : vc.getAlleles() ) {
alleleCounts.put(allele.getBases()[0], 0);
}
for ( final PileupElement p : pileupElements) {
if ( alleleCounts.containsKey(p.getBase()) ) {
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase()) + 1);
}
}

// we need to add counts in the correct order
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++) {
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
}
return counts;
}

private int[] annotateWithLikelihoods(VariantContext vc, Genotype g, Set<Allele> alleles, ReadLikelihoods<Allele> likelihoods) {

final Map<Allele, Integer> alleleCounts = new LinkedHashMap<>();
for ( final Allele allele : vc.getAlleles() ) {
alleleCounts.put(allele, 0);
@@ -72,7 +110,7 @@ public void annotate(final ReferenceContext ref,
counts[i + 1] = alleleCounts.get(vc.getAlternateAllele(i));
}

-gb.AD(counts);
+return counts;
}

@Override
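
The pileup fallback above only fires when vc.isSNP(), so comparing the first base of each allele against each pileup base is sufficient. Below is a self-contained sketch of that counting scheme, not part of this commit; the names and the byte encoding of bases are assumptions.

import java.util.LinkedHashMap;
import java.util.Map;

public final class AdFromPileupSketch {
    public static int[] countAlleleDepths(final byte[] alleleFirstBases, final byte[] pileupBases) {
        // Seed counts in allele order: reference first, then alternates.
        final Map<Byte, Integer> counts = new LinkedHashMap<>();
        for (final byte b : alleleFirstBases) {
            counts.put(b, 0);
        }
        for (final byte b : pileupBases) {
            counts.computeIfPresent(b, (k, v) -> v + 1); // bases matching no allele are ignored
        }
        // Emit the counts in the same order the alleles were given.
        final int[] ad = new int[alleleFirstBases.length];
        for (int i = 0; i < ad.length; i++) {
            ad[i] = counts.get(alleleFirstBases[i]);
        }
        return ad;
    }

    public static void main(final String[] args) {
        // ref 'A', alt 'G'; the pileup has 5 A's, 3 G's, and one 'T' that counts for neither
        final byte[] alleles = {'A', 'G'};
        final byte[] pileup = {'A', 'A', 'G', 'A', 'T', 'G', 'A', 'A', 'G'};
        final int[] ad = countAlleleDepths(alleles, pileup);
        System.out.println(ad[0] + "," + ad[1]); // 5,3
    }
}
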
@@ -4,14 +4,13 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.utils.FisherExactTest;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.Collections;
import java.util.List;
@@ -44,6 +43,7 @@ public final class FisherStrand extends StrandBiasTest implements StandardAnnotation

static final double MIN_PVALUE = 1E-320;
private static final int MIN_COUNT = ARRAY_DIM;
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;

// how large do we want the normalized table to be? (ie, sum of all entries must be smaller that this)
private static final double TARGET_TABLE_SIZE = 200.0;
@@ -59,6 +59,14 @@ protected Map<String, Object> calculateAnnotationFromGTfield(final GenotypesContext
return ( tableFromPerSampleAnnotations != null )? annotationForOneTable(pValueForContingencyTable(tableFromPerSampleAnnotations)) : null;
}

@Override
protected Map<String, Object> calculateAnnotationFromStratifiedContexts(final Map<String, List<PileupElement>> stratifiedContexts,
final VariantContext vc){
final int[][] tableNoFiltering = getPileupContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), -1, MIN_COUNT);
final int[][] tableFiltering = getPileupContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), MIN_QUAL_FOR_FILTERED_TEST, MIN_COUNT);
return annotationForOneTable(Math.max(pValueForContingencyTable(tableFiltering), pValueForContingencyTable(tableNoFiltering)));
}

@Override
protected Map<String, Object> calculateAnnotationFromLikelihoods(final ReadLikelihoods<Allele> likelihoods,
final VariantContext vc){
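
The new calculateAnnotationFromStratifiedContexts builds the strand-bias contingency table twice, once unfiltered and once keeping only bases that pass MIN_QUAL_FOR_FILTERED_TEST, and conservatively reports the larger (less significant) of the two p-values. For readers unfamiliar with the underlying test, here is a self-contained sketch of a two-sided Fisher exact test on such a 2x2 table; it is not part of this commit. Here a/b are reference-supporting reads on the forward/reverse strand and c/d the same for the alternate allele.

public final class FisherStrandSketch {
    // Natural log of n factorial, computed directly; fine for read-count-sized tables.
    private static double logFactorial(final int n) {
        double s = 0;
        for (int i = 2; i <= n; i++) {
            s += Math.log(i);
        }
        return s;
    }

    // Log of the hypergeometric probability of one particular 2x2 table with fixed margins.
    private static double logHypergeom(final int a, final int b, final int c, final int d) {
        return logFactorial(a + b) + logFactorial(c + d) + logFactorial(a + c) + logFactorial(b + d)
                - logFactorial(a) - logFactorial(b) - logFactorial(c) - logFactorial(d)
                - logFactorial(a + b + c + d);
    }

    // Two-sided p-value: sum the probabilities of every table with the same margins
    // that is no more likely than the observed one.
    public static double twoSidedP(final int a, final int b, final int c, final int d) {
        final double observed = logHypergeom(a, b, c, d);
        final int rowRef = a + b;
        final int colFwd = a + c;
        final int n = a + b + c + d;
        double p = 0;
        for (int x = Math.max(0, rowRef + colFwd - n); x <= Math.min(rowRef, colFwd); x++) {
            final double lp = logHypergeom(x, rowRef - x, colFwd - x, n - rowRef - colFwd + x);
            if (lp <= observed + 1e-12) { // small tolerance for floating-point ties
                p += Math.exp(lp);
            }
        }
        return Math.min(1.0, p);
    }

    public static void main(final String[] args) {
        // ref: 12 forward / 10 reverse; alt: 2 forward / 11 reverse (mild strand bias)
        System.out.println(twoSidedP(12, 10, 2, 11));
    }
}
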
@@ -45,6 +45,7 @@ protected OptionalDouble getElementForRead(final GATKRead read, final int refLoc
@Override
protected OptionalDouble getElementForRead(final GATKRead read, final int refLoc) {
Utils.nonNull(read);
-throw new IllegalStateException("This method should never have been called as getElementForRead(read,refloc,mostLikelyAllele) was overriden");
+//throw new IllegalStateException("This method should never have been called as getElementForRead(read,refloc,mostLikelyAllele) was overriden");
+return OptionalDouble.empty();
}
}
@@ -85,8 +85,11 @@ public Map<String, Object> annotate(final ReferenceContext ref,
ADrestrictedDepth += totalADdepth;
}
depth += totalADdepth;
+continue;
}
-} else if (likelihoods != null) {
+}
+
+if (likelihoods != null) {
depth += likelihoods.sampleReadCount(likelihoods.indexOfSample(genotype.getSampleName()));
} else if ( genotype.hasDP() ) {
depth += genotype.getDP();
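
The restructured control flow above amounts to a three-way fallback for each sample's depth: use the summed AD values when present (the added continue moves on to the next genotype once AD has been counted), otherwise the read count from the likelihoods, otherwise the genotype's stored DP. Below is a minimal sketch of that priority order, not part of this commit; the nullable parameters stand in for absent values.

public final class DepthFallbackSketch {
    static int sampleDepth(final int[] ad, final Integer likelihoodReadCount, final Integer genotypeDP) {
        if (ad != null) {
            int sum = 0;
            for (final int d : ad) {
                sum += d; // total AD depth, as in the loop above
            }
            return sum; // like the new 'continue': once AD is used, the other sources are skipped
        }
        if (likelihoodReadCount != null) {
            return likelihoodReadCount;
        }
        return genotypeDP != null ? genotypeDP : 0;
    }

    public static void main(final String[] args) {
        System.out.println(sampleDepth(new int[]{12, 7}, 25, 30)); // 19: AD wins
        System.out.println(sampleDepth(null, 25, 30));             // 25: likelihoods next
        System.out.println(sampleDepth(null, null, 30));           // 30: stored DP last
    }
}
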
@@ -114,21 +114,21 @@ public Map<String, Object> finalizeRawData(final VariantContext vc, final VariantContext
ReducibleAnnotationData myData = new ReducibleAnnotationData(rawMQdata);
parseRawDataString(myData);

-String annotationString = makeFinalizedAnnotationString(vc, myData.getAttributeMap());
+String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc), myData.getAttributeMap());
return Collections.singletonMap(getKeyNames().get(0), (Object)annotationString);
}

-public String makeFinalizedAnnotationString(final VariantContext vc, final Map<Allele, Number> perAlleleData) {
-int numOfReads = getNumOfReads(vc);
+public String makeFinalizedAnnotationString(final int numOfReads, final Map<Allele, Number> perAlleleData) {
return String.format("%.2f", Math.sqrt((double)perAlleleData.get(Allele.NO_CALL)/numOfReads));
}


public void combineAttributeMap(ReducibleAnnotationData<Number> toAdd, ReducibleAnnotationData<Number> combined) {
-if (combined.getAttribute(Allele.NO_CALL) != null)
+if (combined.getAttribute(Allele.NO_CALL) != null) {
combined.putAttribute(Allele.NO_CALL, (Double) combined.getAttribute(Allele.NO_CALL) + (Double) toAdd.getAttribute(Allele.NO_CALL));
-else
+} else {
combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL));
+}

}

@@ -150,7 +150,17 @@ public void calculateRawData(final VariantContext vc,
public Map<String, Object> annotate(final ReferenceContext ref,
final VariantContext vc,
final ReadLikelihoods<Allele> likelihoods) {
-return annotateRawData(ref, vc, likelihoods);
+Utils.nonNull(vc);
+if (likelihoods == null || likelihoods.readCount() < 1 ) {
+return new HashMap<>();
+}
+
+final Map<String, Object> annotations = new HashMap<>();
+final ReducibleAnnotationData<Number> myData = new ReducibleAnnotationData<>(null);
+calculateRawData(vc, likelihoods, myData);
+final String annotationString = makeFinalizedAnnotationString(getNumOfReads(vc, likelihoods), myData.getAttributeMap());
+annotations.put(getKeyNames().get(0), annotationString);
+return annotations;
}

@VisibleForTesting
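
The refactored annotate method above finalizes the value directly instead of delegating to annotateRawData: the raw attribute accumulated under Allele.NO_CALL is the running sum of squared mapping qualities, and makeFinalizedAnnotationString now simply divides by the supplied read count and takes the square root, i.e. the root mean square. Below is a standalone sketch of that arithmetic, not part of this commit, with hypothetical names.

public final class RmsMappingQualitySketch {
    // The "raw" running total: a sum of squares that can be combined across shards.
    static double rawSumOfSquares(final int[] mappingQualities) {
        double sum = 0;
        for (final int mq : mappingQualities) {
            sum += (double) mq * mq;
        }
        return sum;
    }

    // The finalized annotation value, matching the String.format above.
    static String finalizedMQ(final double rawSumOfSquares, final int numOfReads) {
        return String.format("%.2f", Math.sqrt(rawSumOfSquares / numOfReads));
    }

    public static void main(final String[] args) {
        final int[] mqs = {60, 60, 40, 20};
        // sqrt((3600 + 3600 + 1600 + 400) / 4) = sqrt(2300), about 47.96
        System.out.println(finalizedMQ(rawSumOfSquares(mqs), mqs.length));
    }
}
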