Fix non-symmetry bug in FingerprintChecker (#1455)
* Found a bug in the fingerprinting code that resulted in non-symmetric results: LOD(a,b) != LOD(b,a)

* The problem was that the __normalized__ posterior was used as the prior in the denominator of the LOD calculation.
* The correct thing to do is to use the __unnormalized__ posterior array, which is what this commit does (a sketch of the algebra follows below).

Other changes include:

- more tests
- cleanup
- Found a disabled test lying around; enabled it and did more cleanup.
- Moved "randomSublist" to a more appropriate place.
Yossi Farjoun committed Feb 21, 2020
1 parent ab0435a commit a7f9963
Showing 12 changed files with 558 additions and 270 deletions.
87 changes: 33 additions & 54 deletions src/main/java/picard/fingerprint/FingerprintChecker.java
@@ -47,6 +47,7 @@
import htsjdk.variant.vcf.VCFFileReader;
import picard.PicardException;
import picard.util.AlleleSubsettingUtils;
import picard.util.MathUtil;
import picard.util.ThreadPoolExecutorWithExceptions;

import java.io.File;
@@ -89,6 +90,9 @@ public class FingerprintChecker {
public static final int DEFAULT_MINIMUM_BASE_QUALITY = 20;
public static final int DEFAULT_MAXIMAL_PL_DIFFERENCE = 30;

// used sometimes to subset loci. Fix the random seed so that the results are deterministic
private static final Random random = new Random(42);

private final HaplotypeMap haplotypes;
private int minimumBaseQuality = DEFAULT_MINIMUM_BASE_QUALITY;
private int minimumMappingQuality = DEFAULT_MINIMUM_MAPPING_QUALITY;
@@ -104,7 +108,9 @@ public void setValidationStringency(final ValidationStringency validationStringe
this.validationStringency = validationStringency;
}

public File getReferenceFasta() { return referenceFasta;}
public File getReferenceFasta() {
return referenceFasta;
}

private ValidationStringency validationStringency = ValidationStringency.DEFAULT_STRINGENCY;

@@ -197,9 +203,7 @@ public Map<String, Fingerprint> loadFingerprints(final Path fingerprintFile, fin
fingerprints = loadFingerprintsFromVariantContexts(reader, specificSample, fingerprintFile);
}
//add an entry for each sample which was not fingerprinted
reader.getFileHeader().getGenotypeSamples().forEach(sample -> {
fingerprints.computeIfAbsent(sample, s -> new Fingerprint(s, fingerprintFile, null));
});
reader.getFileHeader().getGenotypeSamples().forEach(sample -> fingerprints.computeIfAbsent(sample, s -> new Fingerprint(s, fingerprintFile, null)));

return fingerprints;
}
@@ -235,7 +239,6 @@ private static SAMSequenceDictionary getActiveDictionary(final HaplotypeMap hapl
* @param specificSample - null to load genotypes for all samples contained in the file or the name
* of an individual sample to load (and exclude all others).
* @return a Map of Sample name to Fingerprint
*
*/
public Map<String, Fingerprint> loadFingerprintsFromNonIndexedVcf(final Path fingerprintFile, final String specificSample) {
final VCFFileReader reader = new VCFFileReader(fingerprintFile, false);
@@ -261,7 +264,9 @@ public Map<String, Fingerprint> loadFingerprintsFromVariantContexts(final Iterab

for (final VariantContext ctx : iterable) {
// Setup the sample names set if needed
if (ctx == null) continue;
if (ctx == null) {
continue;
}

if (samples == null) {
if (specificSample != null) {
@@ -280,7 +285,7 @@ public Map<String, Fingerprint> loadFingerprintsFromVariantContexts(final Iterab
try {
getFingerprintFromVc(fingerprints, ctx);
} catch (final IllegalArgumentException e) {
log.warn(e,"There was a genotyping error in File: " + source.toUri().toString() + "\n" + e.getMessage());
log.warn(e, "There was a genotyping error in File: " + source.toUri().toString() + "\n" + e.getMessage());
}
}

@@ -336,7 +341,9 @@ public Map<String, Fingerprint> loadFingerprintsFromQueriableReader(final VCFFil
*/
private void getFingerprintFromVc(final Map<String, Fingerprint> fingerprints, final VariantContext ctx) throws IllegalArgumentException {
final HaplotypeBlock h = this.haplotypes.getHaplotype(ctx.getContig(), ctx.getStart());
if (h == null) return;
if (h == null) {
return;
}

final Snp snp = this.haplotypes.getSnp(ctx.getContig(), ctx.getStart());

@@ -388,13 +395,17 @@ private void getFingerprintFromVc(final Map<String, Fingerprint> fingerprints, f
fp.add(hFp);
} else {

if (genotype.isNoCall()) continue;
if (genotype.isNoCall()) {
continue;
}

// TODO: when multiple genotypes are available for a Haplotype check that they
// TODO: agree. Not urgent since DownloadGenotypes already does this.
// TODO: more urgent now as we convert vcfs to haplotypeProbabilities and
// TODO: there could be different VCs with information we'd like to use...
if (fp.containsKey(h)) continue;
if (fp.containsKey(h)) {
continue;
}

final boolean hom = genotype.isHom();
final byte allele = StringUtil.toUpperCase(genotype.getAllele(0).getBases()[0]);
@@ -556,7 +567,6 @@ public Map<FingerprintIdDetails, Fingerprint> fingerprintSamFile(final Path samF
log.error(e);
throw e;
}

}
}

@@ -572,13 +582,13 @@ private FingerprintIdDetails createUnknownFP(final Path samFile, final SAMRecord
readGroupRecord.setPlatformUnit("<UNKNOWN>.0.ZZZ");

if (validationStringency == ValidationStringency.LENIENT) {
log.warn(e);
log.warn(e.getMessage());
log.warn("further messages from this file will be suppressed");
}

return new FingerprintIdDetails(readGroupRecord, samFile.toUri().toString());
} else {
log.error(e);
log.error(e.getMessage());
throw e;
}
}
@@ -633,7 +643,7 @@ public Map<String, Fingerprint> identifyContaminant(final Path samFile, final do
final Snp snp = this.haplotypes.getSnp(info.getSequenceName(), info.getPosition());

// randomly select locusMaxReads elements from the list
final List<SamLocusIterator.RecordAndOffset> recordAndOffsetList = randomSublist(info.getRecordAndPositions(), locusMaxReads);
final List<SamLocusIterator.RecordAndOffset> recordAndOffsetList = MathUtil.randomSublist(info.getRecordAndOffsets(), locusMaxReads, random);

for (final SamLocusIterator.RecordAndOffset rec : recordAndOffsetList) {
final SAMReadGroupRecord rg = rec.getRecord().getReadGroup();
@@ -661,33 +671,6 @@ public Map<String, Fingerprint> identifyContaminant(final Path samFile, final do
return fingerprintsBySample;
}

/**
* A small utility function to choose n random elements (un-shuffled) from a list
*
* @param list A list of elements
* @param n a number of elements requested from list
* @return a list of n randomly chosen (but in the original order) elements from list.
* If the list has less than n elements it is returned in its entirety.
*/
protected static <T> List<T> randomSublist(final List<T> list, final int n) {
int availableElements = list.size();
if (availableElements <= n) return list;

int stillNeeded = n;
final Random rg = new Random();
final List<T> shortList = new ArrayList<>(n);
for (final T aList : list) {
if (rg.nextDouble() < stillNeeded / (double) availableElements) {
shortList.add(aList);
stillNeeded--;
}
if (stillNeeded == 0) break; // fast out if do not need more elements
availableElements--;
}

return shortList;
}
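For reference, the relocated helper presumably keeps this selection-sampling logic but takes the Random as a parameter, since the call site in identifyContaminant above passes the class's fixed-seed random field. A minimal sketch under that assumption (the actual MathUtil implementation may differ):

    // Sketch of the relocated helper; assumed to live in picard.util.MathUtil,
    // with the (seeded) Random supplied by the caller for deterministic results.
    public static <T> List<T> randomSublist(final List<T> list, final int n, final Random random) {
        if (list.size() <= n) {
            return list;
        }
        final List<T> shortList = new ArrayList<>(n);
        int stillNeeded = n;
        int availableElements = list.size();
        // Selection sampling (Knuth's Algorithm S): take each element with probability
        // stillNeeded / availableElements, preserving the original order and never
        // selecting more than n elements.
        for (final T element : list) {
            if (random.nextDouble() < stillNeeded / (double) availableElements) {
                shortList.add(element);
                stillNeeded--;
            }
            if (stillNeeded == 0) {
                break; // fast out if no more elements are needed
            }
            availableElements--;
        }
        return shortList;
    }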

/**
* Fingerprints one or more SAM/BAM/VCF files at all available loci within the haplotype map, using multiple threads
* to speed up the processing.
@@ -715,11 +698,10 @@ public Map<FingerprintIdDetails, Fingerprint> fingerprintFiles(final Collection<
}

if (oneFileFingerprints.isEmpty()) {
log.warn("No fingerprint data was found in file:" + p);
log.warn("No fingerprint data was found in file:" + p);
}
retval.putAll(oneFileFingerprints);


log.debug("Processed file: " + p.toUri().toString() + " (" + filesRead.get() + ")");
if (filesRead.incrementAndGet() % 100 == 0) {
log.info("Processed " + filesRead.get() + " out of " + files.size());
@@ -868,18 +850,18 @@ public static MatchResults calculateMatchResults(final Fingerprint observedFp, f
public static MatchResults calculateMatchResults(final Fingerprint observedFp, final Fingerprint expectedFp, final double minPExpected, final double pLoH, final boolean calculateLocusInfo, final boolean calculateTumorAwareLod) {
final List<LocusResult> locusResults = calculateLocusInfo ? new ArrayList<>() : null;

double llThisSample = 0;
double llOtherSample = 0;
double llNoSwapModel = 0;
double llSwapModel = 0;

double lodExpectedSampleTumorNormal = 0;
double lodExpectedSampleNormalTumor = 0;

final double lminPExpected = Math.log10(minPExpected);

for (final HaplotypeProbabilities probs2 : expectedFp.values()) {
final HaplotypeBlock haplotypeBlock = probs2.getHaplotype();
final HaplotypeProbabilities probs1 = observedFp.get(haplotypeBlock);
if (probs1 == null) continue;
if (probs1 == null) {
continue;
}

final HaplotypeProbabilityOfNormalGivenTumor prob1AssumingDataFromTumor;
final HaplotypeProbabilityOfNormalGivenTumor prob2AssumingDataFromTumor;
@@ -917,11 +899,8 @@ public static MatchResults calculateMatchResults(final Fingerprint observedFp, f
locusResults.add(lr);
}
if (probs1.hasEvidence() && probs2.hasEvidence()) {
//TODO: what's the mathematics behind the lminPexpected?
llThisSample += Math.max(lminPExpected,
probs1.shiftedLogEvidenceProbabilityGivenOtherEvidence(probs2));

llOtherSample += probs1.shiftedLogEvidenceProbability();
llNoSwapModel += probs1.shiftedLogEvidenceProbabilityGivenOtherEvidence(probs2);
llSwapModel += probs1.shiftedLogEvidenceProbability() + probs2.shiftedLogEvidenceProbability();

if (calculateTumorAwareLod) {
lodExpectedSampleTumorNormal += prob1AssumingDataFromTumor.shiftedLogEvidenceProbabilityGivenOtherEvidence(probs2) -
Expand All @@ -934,7 +913,7 @@ public static MatchResults calculateMatchResults(final Fingerprint observedFp, f
}

// TODO: prune the set of LocusResults for things that are too close together?
return new MatchResults(expectedFp.getSource(), expectedFp.getSample(), llThisSample, llOtherSample, lodExpectedSampleTumorNormal, lodExpectedSampleNormalTumor, locusResults);
return new MatchResults(expectedFp.getSource(), expectedFp.getSample(), llNoSwapModel, llSwapModel, lodExpectedSampleTumorNormal, lodExpectedSampleNormalTumor, locusResults);
}
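The new llNoSwapModel/llSwapModel names map directly onto the symmetric formula: the LOD is the log-evidence under the "same sample" model minus the log-evidence under the "two independent samples" model. A self-contained toy check (plain arrays and a hypothetical class, not the Picard API) of why the fixed computation is order-independent:

    public class LodSymmetryToy {
        // prior over {AA, Aa, aa} and per-fingerprint likelihoods for one haplotype block
        static double lodContribution(final double[] prior, final double[] likA, final double[] likB) {
            double joint = 0; // sum_h L_a(h) * L_b(h) * P(h): evidence under "same sample"
            double evA = 0;   // sum_h L_a(h) * P(h): evidence for fingerprint a alone
            double evB = 0;   // sum_h L_b(h) * P(h): evidence for fingerprint b alone
            for (int h = 0; h < prior.length; h++) {
                joint += likA[h] * likB[h] * prior[h];
                evA += likA[h] * prior[h];
                evB += likB[h] * prior[h];
            }
            // log of the joint evidence minus log of the product of the marginals;
            // every term is symmetric under swapping a and b
            return Math.log10(joint) - Math.log10(evA) - Math.log10(evB);
        }

        public static void main(final String[] args) {
            final double[] prior = {0.49, 0.42, 0.09}; // Hardy-Weinberg prior, allele frequency 0.7
            final double[] likA = {0.90, 0.09, 0.01};  // fingerprint a: strong evidence for AA
            final double[] likB = {0.70, 0.25, 0.05};  // fingerprint b: weaker evidence for AA
            final double ab = lodContribution(prior, likA, likB);
            final double ba = lodContribution(prior, likB, likA);
            System.out.println("LOD(a,b) = " + ab + ", LOD(b,a) = " + ba); // prints identical values
        }
    }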

/**
62 changes: 32 additions & 30 deletions src/main/java/picard/fingerprint/HaplotypeProbabilities.java
@@ -24,7 +24,6 @@

package picard.fingerprint;

import htsjdk.utils.ValidationUtils;
import picard.util.MathUtil;

/**
@@ -57,28 +56,39 @@ public double[] getPriorProbablities() {
* Mathematically, this is P(H | D, F) where H is the vector of possible haplotypes {AA,Aa,aa}.
* D is the data seen by the class, and
* F is the population frequency of each genotype.
*/
/**
* Returns the posterior probabilities using the population frequency as a prior.
*
*
* Returns the posterior normalized probabilities using the population frequency as a prior.
*/
public double[] getPosteriorProbabilities() {
return MathUtil.pNormalizeVector(MathUtil.multiply(getLikelihoods(), getPriorProbablities()));
return MathUtil.pNormalizeVector(getPosteriorLikelihoods());
}

/** Returns the probabilities, in order, of the AA, Aa and aa haplotypes.
* Mathematically, this is P(H | D, F) where H is the vector of possible haplotypes {AA,Aa,aa}.
* D is the data seen by the class, and
* F is the population frequency of each genotype.
*
*
* Returns the unnormalized likelihoods using the population frequency as a prior.
*/
public double[] getPosteriorLikelihoods() {
return MathUtil.multiply(getLikelihoods(), getPriorProbablities());
}


/**
* Returns the likelihoods, in order, of the AA, Aa and aa haplotypes given the evidence
* <p>
* Mathematically this is P(evidence | haplotype) where haplotype={AA,Aa,aa}.
*
* Will be normalized.
*/
public abstract double[] getLikelihoods();

public double[] getLogLikelihoods() {
final double[] likelihoods = getLikelihoods();
final double[] lLikelihoods = new double[NUM_GENOTYPES];
for (final Genotype g : Genotype.values()){
lLikelihoods[g.v] = Math.log10(likelihoods[g.v]);
}
return lLikelihoods;
return MathUtil.getLogFromProbability(getLikelihoods());
}

/**
@@ -164,7 +174,7 @@ public DiploidGenotype getMostLikelyGenotype(final Snp snp) {
*/
void assertSnpPartOfHaplotype(final Snp snp) {
if (!this.haplotypeBlock.contains(snp)) {
throw new IllegalArgumentException("Snp " + snp + " does not belong to haplotype " + this.haplotypeBlock);
throw new IllegalArgumentException("Snp " + snp + " does not belong to haplotype " + this.haplotypeBlock + ".");
}
}

@@ -188,15 +198,7 @@ void assertSnpPartOfHaplotype(final Snp snp) {
*/

public double scaledEvidenceProbabilityUsingGenotypeFrequencies(final double[] genotypeFrequencies) {
final double[] likelihoods = getLikelihoods();
ValidationUtils.validateArg(genotypeFrequencies.length == NUM_GENOTYPES, "provided genotype frequencies must be length 3");
ValidationUtils.validateArg(likelihoods.length == NUM_GENOTYPES, "internal liklihoods must be length 3");

double result = 0;
for (Genotype g : Genotype.values()) {
result += likelihoods[g.v] * genotypeFrequencies[g.v];
}
return result;
return MathUtil.sum(MathUtil.multiply(getLikelihoods(), genotypeFrequencies));
}

public double shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(final double[] genotypeFrequencies) {
@@ -213,16 +215,16 @@ public double shiftedLogEvidenceProbabilityGivenOtherEvidence(final HaplotypePro
if (!this.haplotypeBlock.equals(otherHp.getHaplotype())) {
throw new IllegalArgumentException("Haplotypes are from different HaplotypeBlocks!");
}
/** Get the posterior from the other otherHp. Use this posterior as the prior to calculate probability.
*
* P(hap|x,y) = P(x|hap,y) P(hap|y) / P(x|y)
* = P(x | hap) * P(hap | y) / P(x)
* likelihood * other.posterior
*
* = P(x|hap) P(y|hap) P(hap)/P(x)P(y)
* = A P(x| hap) P(y| hap) P(hap) # where A is an unknown scaling factor
/* Get the posterior from otherHp and use it as the prior when calculating the evidence probability.
P(hap|x,y) = P(x|hap,y) P(hap|y) / P(x|y)
= P(x | hap) * P(hap | y) / P(x)
likelihood * other.posterior
= P(x|hap) P(y|hap) P(hap)/P(x)P(y)
= A P(x| hap) P(y| hap) P(hap) # where A is an unknown scaling factor
*/
return shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(otherHp.getPosteriorProbabilities());
return shiftedLogEvidenceProbabilityUsingGenotypeFrequencies(otherHp.getPosteriorLikelihoods());
}
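In the notation of the sketch near the top of this page (editorial, not from the source), the chain above gives, for data x and y from the two fingerprints,

    P(h \mid x, y) \;\propto\; P(x \mid h)\, P(y \mid h)\, P(h) = L_x(h)\, L_y(h)\, P(h)

so passing the unnormalized posterior L_y(h)P(h) as the "genotype frequencies" makes scaledEvidenceProbabilityUsingGenotypeFrequencies return sum_h L_x(h) L_y(h) P(h), the joint evidence that is symmetric in x and y; the previously used normalized posterior carried an extra 1/Z_y factor that depended on argument order.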

/**
src/main/java/picard/fingerprint/HaplotypeProbabilitiesFromContaminatorSequence.java
@@ -42,6 +42,7 @@ public class HaplotypeProbabilitiesFromContaminatorSequence extends HaplotypePro
// for each model (contGenotype, mainGenotype) there's a likelihood of the data. These need to be collected separately
// and only collated once all the data is in.
private final double[][] likelihoodMap = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
private boolean valuesNeedUpdating = true;

public HaplotypeProbabilitiesFromContaminatorSequence(final HaplotypeBlock haplotypeBlock, final double contamination) {
super(haplotypeBlock);
@@ -68,6 +69,7 @@ public HaplotypeProbabilitiesFromContaminatorSequence(HaplotypeProbabilitiesFrom
*/
public void addToProbs(final Snp snp, final byte base, final byte qual) {
assertSnpPartOfHaplotype(snp);
valuesNeedUpdating = true;

// Skip bases that don't match either expected allele for this SNP
final boolean altAllele;
@@ -91,20 +93,24 @@ public void addToProbs(final Snp snp, final byte base, final byte qual) {
for (final Genotype mainGeno : Genotype.values()) {
//theta is the expected frequency of the alternate allele
final double theta = 0.5 * ((1 - contamination) * mainGeno.v + contamination * contGeno.v);
likelihoodMap[contGeno.v][mainGeno.v] *= (( altAllele ? theta : (1 - theta)) * (1 - pErr) +
(!altAllele ? theta : (1 - theta)) * pErr);
likelihoodMap[contGeno.v][mainGeno.v] *= ((altAllele ? theta : (1 - theta)) * (1 - pErr) +
(!altAllele ? theta : (1 - theta)) * pErr);
}
}
}

//a function needed to update the logLikelihoods from the likelihoodMap.
private void updateLikelihoods() {
if (!valuesNeedUpdating) {
return;
}
final double[] ll = new double[NUM_GENOTYPES];
for (final Genotype contGeno : Genotype.values()) {
// p(a | g_c) = \sum_g_m { P(g_m) \prod_i P(a_i| g_m, g_c)}
ll[contGeno.v] = Math.log10(MathUtil.sum(MathUtil.multiply(this.getPriorProbablities(), likelihoodMap[contGeno.v])));
}
setLogLikelihoods(ll);
valuesNeedUpdating = false;
}
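Putting the per-base update in addToProbs together with this collation, the model being computed is (notation mirrors the code: g_m and g_c count alternate alleles in the main and contaminating genotypes, c is the contamination fraction, and \varepsilon the base error probability implied by qual):

    \theta = \tfrac{1}{2}\bigl((1-c)\,g_m + c\,g_c\bigr), \qquad
    P(\text{alt base} \mid g_m, g_c) = \theta\,(1-\varepsilon) + (1-\theta)\,\varepsilon

and, marginalizing over the main-sample genotype with the population prior,

    P(\text{data} \mid g_c) = \sum_{g_m} P(g_m) \prod_i P(a_i \mid g_m, g_c)

which is exactly the sum-product computed in the loop above. The new valuesNeedUpdating flag memoizes this collation so it reruns only after new evidence arrives or a merge.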

@Override
@@ -133,12 +139,18 @@ public HaplotypeProbabilitiesFromContaminatorSequence merge(final HaplotypeProba
for (final Genotype contGeno : Genotype.values()) {
this.likelihoodMap[contGeno.v] = MathUtil.multiply(this.likelihoodMap[contGeno.v], o.likelihoodMap[contGeno.v]);
}
valuesNeedUpdating = true;
return this;
}

@Override
public double[] getLogLikelihoods() {
public double[] getLikelihoods() {
updateLikelihoods();
return super.getLikelihoods();
}

@Override
public double[] getLogLikelihoods() {
return super.getLogLikelihoods();
}
}