Fix non-symmetry bug in FingerprintChecker (#1455)
* Found a bug in the fingerprinting code that resulted in non-symmetric results: LOD(a,b) != LOD(b,a)

* The problem was that the __normalized__ posterior was used as the prior in the denominator of the LOD calculation.
* The correct thing to do is to use the __unnormalized__ posterior array, which is what this commit does (a sketch of the algebra follows below).

Other changes include:

- more tests
- cleanup
- Found a disabled test lying around; enabled it and did more cleanup.
- Moved "randomSublist" to a more appropriate place.
Yossi Farjoun committed Feb 21, 2020
1 parent ab0435a commit a7f9963
Showing 12 changed files with 558 additions and 270 deletions.
87 changes: 33 additions & 54 deletions src/main/java/picard/fingerprint/FingerprintChecker.java
@@ -47,6 +47,7 @@
import htsjdk.variant.vcf.VCFFileReader;
import picard.PicardException;
import picard.util.AlleleSubsettingUtils;
import picard.util.MathUtil;
import picard.util.ThreadPoolExecutorWithExceptions;

import java.io.File;
@@ -89,6 +90,9 @@ public class FingerprintChecker {
public static final int DEFAULT_MINIMUM_BASE_QUALITY = 20;
public static final int DEFAULT_MAXIMAL_PL_DIFFERENCE = 30;

// used sometimes to subset loci. Fix the random seed so that the results are deterministic
private static final Random random = new Random(42);

private final HaplotypeMap haplotypes;
private int minimumBaseQuality = DEFAULT_MINIMUM_BASE_QUALITY;
private int minimumMappingQuality = DEFAULT_MINIMUM_MAPPING_QUALITY;
@@ -104,7 +108,9 @@ public void setValidationStringency(final ValidationStringency validationStringe
this.validationStringency = validationStringency;
}

public File getReferenceFasta() { return referenceFasta;}
public File getReferenceFasta() {
return referenceFasta;
}

private ValidationStringency validationStringency = ValidationStringency.DEFAULT_STRINGENCY;

@@ -197,9 +203,7 @@ public Map<String, Fingerprint> loadFingerprints(final Path fingerprintFile, fin
fingerprints = loadFingerprintsFromVariantContexts(reader, specificSample, fingerprintFile);
}
//add an entry for each sample which was not fingerprinted
reader.getFileHeader().getGenotypeSamples().forEach(sample -> {
fingerprints.computeIfAbsent(sample, s -> new Fingerprint(s, fingerprintFile, null));
});
reader.getFileHeader().getGenotypeSamples().forEach(sample -> fingerprints.computeIfAbsent(sample, s -> new Fingerprint(s, fingerprintFile, null)));

return fingerprints;
}
@@ -235,7 +239,6 @@ private static SAMSequenceDictionary getActiveDictionary(final HaplotypeMap hapl
* @param specificSample - null to load genotypes for all samples contained in the file or the name
* of an individual sample to load (and exclude all others).
* @return a Map of Sample name to Fingerprint
*
*/
public Map<String, Fingerprint> loadFingerprintsFromNonIndexedVcf(final Path fingerprintFile, final String specificSample) {
final VCFFileReader reader = new VCFFileReader(fingerprintFile, false);
@@ -261,7 +264,9 @@ public Map<String, Fingerprint> loadFingerprintsFromVariantContexts(final Iterab

for (final VariantContext ctx : iterable) {
// Setup the sample names set if needed
if (ctx == null) continue;
if (ctx == null) {
continue;
}

if (samples == null) {
if (specificSample != null) {
@@ -280,7 +285,7 @@ public Map<String, Fingerprint> loadFingerprintsFromVariantContexts(final Iterab
try {
getFingerprintFromVc(fingerprints, ctx);
} catch (final IllegalArgumentException e) {
log.warn(e,"There was a genotyping error in File: " + source.toUri().toString() + "\n" + e.getMessage());
log.warn(e, "There was a genotyping error in File: " + source.toUri().toString() + "\n" + e.getMessage());
}
}

@@ -336,7 +341,9 @@ public Map<String, Fingerprint> loadFingerprintsFromQueriableReader(final VCFFil
*/
private void getFingerprintFromVc(final Map<String, Fingerprint> fingerprints, final VariantContext ctx) throws IllegalArgumentException {
final HaplotypeBlock h = this.haplotypes.getHaplotype(ctx.getContig(), ctx.getStart());
if (h == null) return;
if (h == null) {
return;
}

final Snp snp = this.haplotypes.getSnp(ctx.getContig(), ctx.getStart());

@@ -388,13 +395,17 @@ private void getFingerprintFromVc(final Map<String, Fingerprint> fingerprints, f
fp.add(hFp);
} else {

if (genotype.isNoCall()) continue;
if (genotype.isNoCall()) {
continue;
}

// TODO: when multiple genotypes are available for a Haplotype check that they
// TODO: agree. Not urgent since DownloadGenotypes already does this.
// TODO: more urgent now as we convert vcfs to haplotypeProbabilities and
// TODO: there could be different VCs with information we'd like to use...
if (fp.containsKey(h)) continue;
if (fp.containsKey(h)) {
continue;
}

final boolean hom = genotype.isHom();
final byte allele = StringUtil.toUpperCase(genotype.getAllele(0).getBases()[0]);
@@ -556,7 +567,6 @@ public Map<FingerprintIdDetails, Fingerprint> fingerprintSamFile(final Path samF
log.error(e);
throw e;
}

}
}

@@ -572,13 +582,13 @@ private FingerprintIdDetails createUnknownFP(final Path samFile, final SAMRecord
readGroupRecord.setPlatformUnit("<UNKNOWN>.0.ZZZ");

if (validationStringency == ValidationStringency.LENIENT) {
log.warn(e);
log.warn(e.getMessage());
log.warn("further messages from this file will be suppressed");
}

return new FingerprintIdDetails(readGroupRecord, samFile.toUri().toString());
} else {
log.error(e);
log.error(e.getMessage());
throw e;
}
}
@@ -633,7 +643,7 @@ public Map<String, Fingerprint> identifyContaminant(final Path samFile, final do
final Snp snp = this.haplotypes.getSnp(info.getSequenceName(), info.getPosition());

// randomly select locusMaxReads elements from the list
final List<SamLocusIterator.RecordAndOffset> recordAndOffsetList = randomSublist(info.getRecordAndPositions(), locusMaxReads);
final List<SamLocusIterator.RecordAndOffset> recordAndOffsetList = MathUtil.randomSublist(info.getRecordAndOffsets(), locusMaxReads, random);

for (final SamLocusIterator.RecordAndOffset rec : recordAndOffsetList) {
final SAMReadGroupRecord rg = rec.getRecord().getReadGroup();
@@ -661,33 +671,6 @@ public Map<String, Fingerprint> identifyContaminant(final Path samFile, final do
return fingerprintsBySample;
}

/**
* A small utility function to choose n random elements (un-shuffled) from a list
*
* @param list A list of elements
* @param n a number of elements requested from list
* @return a list of n randomly chosen (but in the original order) elements from list.
* If the list has less than n elements it is returned in its entirety.
*/
protected static <T> List<T> randomSublist(final List<T> list, final int n) {
int availableElements = list.size();
if (availableElements <= n) return list;

int stillNeeded = n;
final Random rg = new Random();
final List<T> shortList = new ArrayList<>(n);
for (final T aList : list) {
if (rg.nextDouble() < stillNeeded / (double) availableElements) {
shortList.add(aList);
stillNeeded--;
}
if (stillNeeded == 0) break; // fast out if do not need more elements
availableElements--;
}

return shortList;
}
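For reference, the relocated helper presumably keeps this selection-sampling logic but takes the Random as a parameter, since the call site in identifyContaminant above passes the class's fixed-seed random field. A minimal sketch under that assumption (the actual MathUtil implementation may differ):

    // Sketch of the relocated helper; assumed to live in picard.util.MathUtil,
    // with the (seeded) Random supplied by the caller for deterministic results.
    public static <T> List<T> randomSublist(final List<T> list, final int n, final Random random) {
        if (list.size() <= n) {
            return list;
        }
        final List<T> shortList = new ArrayList<>(n);
        int stillNeeded = n;
        int availableElements = list.size();
        // Selection sampling (Knuth's Algorithm S): take each element with probability
        // stillNeeded / availableElements, preserving the original order and never
        // selecting more than n elements.
        for (final T element : list) {
            if (random.nextDouble() < stillNeeded / (double) availableElements) {
                shortList.add(element);
                stillNeeded--;
            }
            if (stillNeeded == 0) {
                break; // fast out if no more elements are needed
            }
            availableElements--;
        }
        return shortList;
    }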

/**
* Fingerprints one or more SAM/BAM/VCF files at all available loci within the haplotype map, using multiple threads
* to speed up the processing.
@@ -715,11 +698,10 @@ public Map<FingerprintIdDetails, Fingerprint> fingerprintFiles(final Collection<
}

if (oneFileFingerprints.isEmpty()) {
log.warn("No fingerprint data was found in file:" + p);
log.warn("No fingerprint data was found in file:" + p);
}
retval.putAll(oneFileFingerprints);


log.debug("Processed file: " + p.toUri().toString() + " (" + filesRead.get() + ")");
if (filesRead.incrementAndGet() % 100 == 0) {
log.info("Processed " + filesRead.get() + " out of " + files.size());
@@ -868,18 +850,18 @@ public static MatchResults calculateMatchResults(final Fingerprint observedFp, f
public static MatchResults calculateMatchResults(final Fingerprint observedFp, final Fingerprint expectedFp, final double minPExpected, final double pLoH, final boolean calculateLocusInfo, final boolean calculateTumorAwareLod) {
final List<LocusResult> locusResults = calculateLocusInfo ? new ArrayList<>() : null;

double llThisSample = 0;
double llOtherSample = 0;
double llNoSwapModel = 0;
double llSwapModel = 0;

double lodExpectedSampleTumorNormal = 0;
double lodExpectedSampleNormalTumor = 0;

final double lminPExpected = Math.log10(minPExpected);

for (final HaplotypeProbabilities probs2 : expectedFp.values()) {
final HaplotypeBlock haplotypeBlock = probs2.getHaplotype();
final HaplotypeProbabilities probs1 = observedFp.get(haplotypeBlock);
if (probs1 == null) continue;
if (probs1 == null) {
continue;
}

final HaplotypeProbabilityOfNormalGivenTumor prob1AssumingDataFromTumor;
final HaplotypeProbabilityOfNormalGivenTumor prob2AssumingDataFromTumor;
@@ -917,11 +899,8 @@ public static MatchResults calculateMatchResults(final Fingerprint observedFp, f
locusResults.add(lr);
}
if (probs1.hasEvidence() && probs2.hasEvidence()) {
//TODO: what's the mathematics behind the lminPexpected?
llThisSample += Math.max(lminPExpected,
probs1.shiftedLogEvidenceProbabilityGivenOtherEvidence(probs2));

llOtherSample += probs1.shiftedLogEvidenceProbability();
llNoSwapModel += probs1.shiftedLogEvidenceProbabilityGivenOtherEvidence(probs2);
llSwapModel += probs1.shiftedLogEvidenceProbability() + probs2.shiftedLogEvidenceProbability();

if (calculateTumorAwareLod) {
lodExpectedSampleTumorNormal += prob1AssumingDataFromTumor.shiftedLogEvidenceProbabilityGivenOtherEvidence(probs2) -
Expand All @@ -934,7 +913,7 @@ public static MatchResults calculateMatchResults(final Fingerprint observedFp, f
}

// TODO: prune the set of LocusResults for things that are too close together?
return new MatchResults(expectedFp.getSource(), expectedFp.getSample(), llThisSample, llOtherSample, lodExpectedSampleTumorNormal, lodExpectedSampleNormalTumor, locusResults);
return new MatchResults(expectedFp.getSource(), expectedFp.getSample(), llNoSwapModel, llSwapModel, lodExpectedSampleTumorNormal, lodExpectedSampleNormalTumor, locusResults);
}
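The new llNoSwapModel/llSwapModel names map directly onto the symmetric formula: the LOD is the log-evidence under the "same sample" model minus the log-evidence under the "two independent samples" model. A self-contained toy check (plain arrays and a hypothetical class, not the Picard API) of why the fixed computation is order-independent:

    public class LodSymmetryToy {
        // prior over {AA, Aa, aa} and per-fingerprint likelihoods for one haplotype block
        static double lodContribution(final double[] prior, final double[] likA, final double[] likB) {
            double joint = 0; // sum_h L_a(h) * L_b(h) * P(h): evidence under "same sample"
            double evA = 0;   // sum_h L_a(h) * P(h): evidence for fingerprint a alone
            double evB = 0;   // sum_h L_b(h) * P(h): evidence for fingerprint b alone
            for (int h = 0; h < prior.length; h++) {
                joint += likA[h] * likB[h] * prior[h];
                evA += likA[h] * prior[h];
                evB += likB[h] * prior[h];
            }
            // log of the joint evidence minus log of the product of the marginals;
            // every term is symmetric under swapping a and b
            return Math.log10(joint) - Math.log10(evA) - Math.log10(evB);
        }

        public static void main(final String[] args) {
            final double[] prior = {0.49, 0.42, 0.09}; // Hardy-Weinberg prior, allele frequency 0.7
            final double[] likA = {0.90, 0.09, 0.01};  // fingerprint a: strong evidence for AA
            final double[] likB = {0.70, 0.25, 0.05};  // fingerprint b: weaker evidence for AA
            final double ab = lodContribution(prior, likA, likB);
            final double ba = lodContribution(prior, likB, likA);
            System.out.println("LOD(a,b) = " + ab + ", LOD(b,a) = " + ba); // prints identical values
        }
    }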

/**
62 changes: 32 additions & 30 deletions src/main/java/picard/fingerprint/HaplotypeProbabilities.java
@@ -24,7 +24,6 @@

package picard.fingerprint;

import htsjdk.utils.ValidationUtils;
import picard.util.MathUtil;

/**
@@ -57,28 +56,39 @@ public double[] getPriorProbablities() {
* Mathematically, this is P(H | D, F) where H is the vector of possible haplotypes {AA,Aa,aa}.
* D is the data seen by the class, and
* F is the population frequency of each genotype.
*/
/**
* Returns the posterior probabilities using the population frequency as a prior.
*
*
* Returns the posterior normalized probabilities using the population frequency as a prior.
*/
public double[] getPosteriorProbabilities() {
return MathUtil.pNormalizeVector(MathUtil.multiply(getLikelihoods(), getPriorProbablities()));
return MathUtil.pNormalizeVector(getPosteriorLikelihoods());
}

/** Returns the probabilities, in order, of the AA, Aa and aa haplotypes.
* Mathematically, this is P(H | D, F) where H is the vector of possible haplotypes {AA,Aa,aa}.
* D is the data seen by the class, and
* F is the population frequency of each genotype.
*
*
* Returns the unnormalized likelihoods using the population frequency as a prior.
*/
public double[] getPosteriorLikelihoods() {
return MathUtil.multiply(getLikelihoods(), getPriorProbablities());
}


/**
* Returns the likelihoods, in order, of the AA, Aa and aa haplotypes given the evidence
* <p>
* Mathematically this is P(evidence | haplotype) where haplotype={AA,Aa,aa}.
*
* Will be normalized.
*/
public abstract double[] getLikelihoods();

public double[] getLogLikelihoods() {
final double[] likelihoods = getLikelihoods();
final double[] lLikelihoods = new double[NUM_GENOTYPES];
for (final Genotype g : Genotype.values()){
lLikelihoods[g.v] = Math.log10(likelihoods[g.v]);
}
return lLikelihoods;
return MathUtil.getLogFromProbability(getLikelihoods());
}

/**
@@ -164,7 +174,7 @@ public DiploidGenotype getMostLikelyGenotype(final Snp snp) {
*/
void assertSnpPartOfHaplotype(final Snp snp) {
if (!this.haplotypeBlock.contains(snp)) {
throw new IllegalArgumentException("Snp " + snp + " does not belong to haplotype " + this.haplotypeBlock);
throw new IllegalArgumentException("Snp " + snp + " does not belong to haplotype " + this.haplotypeBlock + ".");
}
}

@@ -188,15 +198,7 @@ void assertSnpPartOfHaplotype(final Snp snp) {
*/

public double scaledEvidenceProbabilityUsingGenotypeFrequencies(final double[] genotypeFrequencies) {
final double[] likelihoods = getLikelihoods();
ValidationUtils.validateArg(genotypeFrequencies.length == NUM_GENOTYPES, "provided genotype frequencies must be length 3");
ValidationUtils.validateArg(likelihoods.length == NUM_GENOTYPES, "internal liklihoods must be length 3");

double result = 0;
for (Genotype g : Genotype.values()) {
result += likelihoods[g.v] * genotypeFrequencies[g.v];
}
return result;
return MathUtil.sum(MathUtil.multiply(getLikelihoods(), genotypeFrequencies));
}

public double shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(final double[] genotypeFrequencies) {
@@ -213,16 +215,16 @@ public double shiftedLogEvidenceProbabilityGivenOtherEvidence(final HaplotypePro
if (!this.haplotypeBlock.equals(otherHp.getHaplotype())) {
throw new IllegalArgumentException("Haplotypes are from different HaplotypeBlocks!");
}
/** Get the posterior from the other otherHp. Use this posterior as the prior to calculate probability.
*
* P(hap|x,y) = P(x|hap,y) P(hap|y) / P(x|y)
* = P(x | hap) * P(hap | y) / P(x)
* likelihood * other.posterior
*
* = P(x|hap) P(y|hap) P(hap)/P(x)P(y)
* = A P(x| hap) P(y| hap) P(hap) # where A is an unknown scaling factor
/* Get the posterior from otherHp and use it as the prior when calculating the evidence probability.
P(hap|x,y) = P(x|hap,y) P(hap|y) / P(x|y)
= P(x | hap) * P(hap | y) / P(x)
likelihood * other.posterior
= P(x|hap) P(y|hap) P(hap)/P(x)P(y)
= A P(x| hap) P(y| hap) P(hap) # where A is an unknown scaling factor
*/
return shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(otherHp.getPosteriorProbabilities());
return shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(otherHp.getPosteriorLikelihoods());
}
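In the notation of the sketch near the top of this page (editorial, not from the source), the chain above gives, for data x and y from the two fingerprints,

    P(h \mid x, y) \;\propto\; P(x \mid h)\, P(y \mid h)\, P(h) = L_x(h)\, L_y(h)\, P(h)

so passing the unnormalized posterior L_y(h)P(h) as the "genotype frequencies" makes scaledEvidenceProbabilityUsingGenotypeFrequencies return sum_h L_x(h) L_y(h) P(h), the joint evidence that is symmetric in x and y; the previously used normalized posterior carried an extra 1/Z_y factor that depended on argument order.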

/**
src/main/java/picard/fingerprint/HaplotypeProbabilitiesFromContaminatorSequence.java
@@ -42,6 +42,7 @@ public class HaplotypeProbabilitiesFromContaminatorSequence extends HaplotypePro
// for each model (contGenotype, mainGenotype) there's a likelihood of the data. These need to be collected separately
// and only collated once all the data is in.
private final double[][] likelihoodMap = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
private boolean valuesNeedUpdating = true;

public HaplotypeProbabilitiesFromContaminatorSequence(final HaplotypeBlock haplotypeBlock, final double contamination) {
super(haplotypeBlock);
@@ -68,6 +69,7 @@ public HaplotypeProbabilitiesFromContaminatorSequence(HaplotypeProbabilitiesFrom
*/
public void addToProbs(final Snp snp, final byte base, final byte qual) {
assertSnpPartOfHaplotype(snp);
valuesNeedUpdating = true;

// Skip bases that don't match either expected allele for this SNP
final boolean altAllele;
@@ -91,20 +93,24 @@ public void addToProbs(final Snp snp, final byte base, final byte qual) {
for (final Genotype mainGeno : Genotype.values()) {
//theta is the expected frequency of the alternate allele
final double theta = 0.5 * ((1 - contamination) * mainGeno.v + contamination * contGeno.v);
likelihoodMap[contGeno.v][mainGeno.v] *= (( altAllele ? theta : (1 - theta)) * (1 - pErr) +
(!altAllele ? theta : (1 - theta)) * pErr);
likelihoodMap[contGeno.v][mainGeno.v] *= ((altAllele ? theta : (1 - theta)) * (1 - pErr) +
(!altAllele ? theta : (1 - theta)) * pErr);
}
}
}

//a function needed to update the logLikelihoods from the likelihoodMap.
private void updateLikelihoods() {
if (!valuesNeedUpdating) {
return;
}
final double[] ll = new double[NUM_GENOTYPES];
for (final Genotype contGeno : Genotype.values()) {
// p(a | g_c) = \sum_g_m { P(g_m) \prod_i P(a_i| g_m, g_c)}
ll[contGeno.v] = Math.log10(MathUtil.sum(MathUtil.multiply(this.getPriorProbablities(), likelihoodMap[contGeno.v])));
}
setLogLikelihoods(ll);
valuesNeedUpdating = false;
}
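Putting the per-base update in addToProbs together with this collation, the model being computed is (notation mirrors the code: g_m and g_c count alternate alleles in the main and contaminating genotypes, c is the contamination fraction, and \varepsilon the base error probability implied by qual):

    \theta = \tfrac{1}{2}\bigl((1-c)\,g_m + c\,g_c\bigr), \qquad
    P(\text{alt base} \mid g_m, g_c) = \theta\,(1-\varepsilon) + (1-\theta)\,\varepsilon

and, marginalizing over the main-sample genotype with the population prior,

    P(\text{data} \mid g_c) = \sum_{g_m} P(g_m) \prod_i P(a_i \mid g_m, g_c)

which is exactly the sum-product computed in the loop above. The new valuesNeedUpdating flag memoizes this collation so it reruns only after new evidence arrives or a merge.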

@Override
@@ -133,12 +139,18 @@ public HaplotypeProbabilitiesFromContaminatorSequence merge(final HaplotypeProba
for (final Genotype contGeno : Genotype.values()) {
this.likelihoodMap[contGeno.v] = MathUtil.multiply(this.likelihoodMap[contGeno.v], o.likelihoodMap[contGeno.v]);
}
valuesNeedUpdating = true;
return this;
}

@Override
public double[] getLogLikelihoods() {
public double[] getLikelihoods() {
updateLikelihoods();
return super.getLikelihoods();
}

@Override
public double[] getLogLikelihoods() {
return super.getLogLikelihoods();
}
}