// AlleleFrequencyCalculator.java
package org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
import org.apache.commons.math3.special.Gamma;
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.MathArrays;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeIndexCalculator;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingLikelihoods;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.dragstr.DragstrParams;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * Bayesian estimator of allele frequencies and of the probability that each allele segregates,
 * using a Dirichlet prior over allele frequencies (separate ref / SNP / indel pseudocounts)
 * refined by an EM-style loop over per-sample genotype posteriors.
 *
 * @author David Benjamin <davidben@broadinstitute.org>
 */
public final class AlleleFrequencyCalculator {

    // EM termination criterion: stop iterating once no allele's expected count moves by more than this.
    private static final double THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE = 0.1;

    // In the canonical genotype ordering, the genotype containing only reference alleles has index 0.
    private static final int HOM_REF_GENOTYPE_INDEX = 0;

    // Dirichlet pseudocounts defining the prior over allele frequencies.
    private final double refPseudocount;
    private final double snpPseudocount;
    private final double indelPseudocount;

    // Ploidy assumed for genotypes that do not declare one (ploidy == 0).
    private final int defaultPloidy;

    /**
     * @param refPseudocount   prior pseudocount for the reference allele
     * @param snpPseudocount   prior pseudocount for alt alleles whose length equals the ref's (SNP-like)
     * @param indelPseudocount prior pseudocount for alt alleles whose length differs from the ref's (indel-like)
     * @param defaultPloidy    ploidy used when a genotype's own ploidy is absent
     */
    public AlleleFrequencyCalculator(final double refPseudocount, final double snpPseudocount,
                                     final double indelPseudocount, final int defaultPloidy) {
        this.refPseudocount = refPseudocount;
        this.snpPseudocount = snpPseudocount;
        this.indelPseudocount = indelPseudocount;
        this.defaultPloidy = defaultPloidy;
    }

    /**
     * Build a calculator from standard genotyping arguments, deriving the pseudocounts from the
     * SNP/indel heterozygosity values and the heterozygosity standard deviation.
     */
    public static AlleleFrequencyCalculator makeCalculator(GenotypeCalculationArgumentCollection genotypeArgs) {
        final double refPseudocount = genotypeArgs.snpHeterozygosity / Math.pow(genotypeArgs.heterozygosityStandardDeviation,2);
        final double snpPseudocount = genotypeArgs.snpHeterozygosity * refPseudocount;
        final double indelPseudocount = genotypeArgs.indelHeterozygosity * refPseudocount;
        return new AlleleFrequencyCalculator(refPseudocount, snpPseudocount, indelPseudocount, genotypeArgs.samplePloidy);
    }

    /**
     * Build a calculator from DRAGstr STR parameters.  The API value (a phred-scaled indel prior for
     * the given period/repeat count) fixes the indel frequency; SNP and ref frequencies follow from
     * {@code snpHeterozygosity}, and all three pseudocounts are rescaled to sum to {@code scale}.
     */
    public static AlleleFrequencyCalculator makeCalculator(final DragstrParams dragstrParms, final int period,
                                                           final int repeats, final int ploidy,
                                                           final double snpHeterozygosity, final double scale) {
        final double api = dragstrParms.api(period, repeats);
        final double log10IndelFreq = api * -.1;  // phred -> log10 is a factor of -1/10
        final double log10RefFreq = MathUtils.log10OneMinusPow10(log10IndelFreq);
        final double log10SnpFreq = log10RefFreq + Math.log10(snpHeterozygosity);
        final double log10Sum = MathUtils.log10SumLog10(log10RefFreq, log10IndelFreq, log10SnpFreq);
        // shift so that the three pseudocounts sum to "scale"
        final double log10ScaleUp = Math.log10(scale) - log10Sum;
        final double refPseudoCount = Math.pow(10, log10ScaleUp + log10RefFreq);
        final double indelPseudoCount = Math.pow(10, log10ScaleUp + log10IndelFreq);
        final double snpPseudoCount = Math.pow(10, log10ScaleUp + log10SnpFreq);
        return new AlleleFrequencyCalculator(refPseudoCount, snpPseudoCount, indelPseudoCount, ploidy);
    }

    /**
     * Combine a genotype's log10 likelihoods with log10 allele-frequency priors (plus the multinomial
     * combination count of each genotype) and normalize to obtain log10 genotype posteriors.
     *
     * @param g must have likelihoods or (for diploid hom-ref/no-call genotypes) GQ, from which
     *          approximate likelihoods are reconstructed
     * @param log10AlleleFrequencies log10 frequency of each allele; length is the allele count
     * @return normalized log10 posterior for every genotype of this ploidy/allele count
     */
    private static double[] log10NormalizedGenotypePosteriors(final Genotype g, final double[] log10AlleleFrequencies) {
        final double[] log10Likelihoods;
        if (g.hasLikelihoods()) {
            log10Likelihoods = g.getLikelihoods().getAsVector();
        } else if ( g.isHomRef() || g.isNoCall()) {
            // GQ-based reconstruction is only defined for diploids
            Utils.validate(g.getPloidy() == 2,() -> "Likelihoods are required to calculate posteriors for hom-refs with ploidy != 2, " +
                    "but were not found for genotype " + g);
            Utils.validate(g.hasGQ(), () -> "Genotype " + g + " does not contain GQ necessary to calculate posteriors.");
            log10Likelihoods = GenotypeUtils.makeApproximateDiploidLog10LikelihoodsFromGQ(g, log10AlleleFrequencies.length);
        } else {
            //no-call with no PLs are too risky -- don't assume they're reblocked hom-refs
            throw new IllegalStateException("Genotype " + g + " does not contain likelihoods necessary to calculate posteriors.");
        }
        final int ploidy = g.getPloidy();
        final int alleleCount = log10AlleleFrequencies.length;
        final double[] log10Posteriors = new double[GenotypeIndexCalculator.genotypeCount(ploidy, alleleCount)];
        Utils.validate(log10Likelihoods.length == log10Posteriors.length, "Ploidy, allele count, and genotype likelihoods are inconsistent");

        // posterior ∝ likelihood * multinomial coefficient * prod_a freq(a)^count(a), all in log10 space
        for (final GenotypeAlleleCounts gac : GenotypeAlleleCounts.iterable(ploidy, alleleCount)) {
            log10Posteriors[gac.index()] = gac.log10CombinationCount() + log10Likelihoods[gac.index()]
                    + gac.sumOverAlleleIndicesAndCounts((index, count) -> count * log10AlleleFrequencies[index]);
        }
        return MathUtils.normalizeLog10(log10Posteriors);
    }

    /**
     * Indices of all genotypes made up solely of ref and spanning-deletion alleles (the "non-variant"
     * genotypes).  Without a spanning deletion this is just the hom-ref genotype.
     */
    private static int[] genotypeIndicesWithOnlyRefAndSpanDel(final int ploidy, final List<Allele> alleles) {
        final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
        if (!spanningDeletionPresent) {
            return new int[] {HOM_REF_GENOTYPE_INDEX};
        } else {
            final int spanDelIndex = alleles.indexOf(Allele.SPAN_DEL);
            // allele counts are in the GenotypeLikelihoodCalculator format of {ref index, ref count, span del index, span del count}
            return new IndexRange(0, ploidy + 1).mapToInteger(n -> GenotypeIndexCalculator.alleleCountsToIndex(0, ploidy - n, spanDelIndex, n));
        }
    }

    /** @return the ploidy assumed for genotypes that do not declare one */
    public int getPloidy() {
        return defaultPloidy;
    }

    /** Convenience overload of {@link #calculate(VariantContext, int)} using this calculator's default ploidy. */
    public AFCalculationResult calculate(final VariantContext vc) {
        // maxAltAlleles is not used by getLog10PNonRef, so don't worry about the 0
        return calculate(vc, defaultPloidy);
    }

    /**
     * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
     *
     * @param vc the VariantContext holding the alleles and sample information.  The VariantContext
     *           must have at least 1 alternative allele and at least one variant genotype with likelihoods.
     *           Hom-ref genotype likelihoods can be approximated, but the result will be zero allele counts without
     *           likelihoods for a non-reference genotype.
     * @param defaultPloidy ploidy assumed for genotypes whose own ploidy is 0
     * @return result (for programming convenience)
     */
    public AFCalculationResult calculate(final VariantContext vc, final int defaultPloidy) {
        Utils.nonNull(vc, "VariantContext cannot be null");
        // NOTE: fixed missing space before "must" in this message
        Utils.validate(vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods),
                "VariantContext at " + vc.getContig() + ":" + vc.getStart() + " must contain at least one " +
                        "genotype with likelihoods -- did this VC exceed the max number of alt alleles?");
        final int numAlleles = vc.getNAlleles();
        final List<Allele> alleles = vc.getAlleles();
        // NOTE: fixed missing space and missing "one" in this message
        Utils.validateArg( numAlleles > 1, () -> "VariantContext at " + vc.getContig() + ":" + vc.getStart() +
                " has only a single reference allele, but getLog10PNonRef requires at least one alternate allele");
        return calculate(numAlleles, alleles, vc.getGenotypes(), defaultPloidy, vc.getReference().length());
    }

    /**
     * Identical to AlleleFrequencyCalculator but does not require VariantContext, and assumes
     * only a single alternative allele.  This is useful in the context of AlleleFiltering where
     * we are basically genotyping the allele (all haplotypes that contain the allele) versus ~allele
     * (all haplotypes that do not contain the allele).
     *
     * @param gls the GenotypingLikelihoods holding the alleles and sample information; must hold exactly two alleles
     * @param defaultPloidy ploidy assumed for genotypes whose own ploidy is 0
     * @return result (for programming convenience)
     */
    public AFCalculationResult fastCalculateDiploidBasedOnGLs(final GenotypingLikelihoods<Allele> gls, final int defaultPloidy) {
        // NOTE: reworded a previously garbled null-check message ("Likelihoods can only be non-null")
        Utils.nonNull(gls, "Genotyping likelihoods cannot be null");
        Utils.validateArg(gls.numberOfAlleles()==2, "Only case of two alleles is supported");
        final int numAlleles = gls.numberOfAlleles();
        final List<Allele> alleles = gls.asListOfAlleles();
        // use the longest allele length so that the shorter allele is treated as the indel-prior case
        final int alleleLength = gls.asListOfAlleles().stream().map(Allele::length).max(Integer::compare).get();

        final List<String> samples = gls.asListOfSamples();
        // wrap each sample's PLs in a synthetic Genotype so the shared calculate() path can be reused
        final List<Genotype> genotypes = IntStream.range(0, samples.size()).mapToObj(idx -> new GenotypeBuilder(samples.get(idx)).alleles(alleles).PL(gls.sampleLikelihoods(idx).getAsPLs()).make()).collect(Collectors.toList());
        return calculate(numAlleles, alleles, genotypes, defaultPloidy, alleleLength);
    }

    /**
     * Private function that actually calculates allele frequencies etc.
     *
     * Runs the EM loop over effective allele counts, then computes the log10 probability that each
     * allele has zero counts (absent) and the overall log10 probability of no variant.
     *
     * @param refLength length of the reference allele, used to classify alt alleles as SNP-like or indel-like
     */
    private AFCalculationResult calculate(final int numAlleles,
                                          final List<Allele> alleles,
                                          final List<Genotype> genotypes,
                                          final int defaultPloidy,
                                          final int refLength) {
        // alt alleles of the same length as the ref get the SNP pseudocount, others the indel pseudocount
        final double[] priorPseudocounts = alleles.stream()
                .mapToDouble(a -> a.isReference() ? refPseudocount : (a.length() == refLength ? snpPseudocount : indelPseudocount)).toArray();

        double[] alleleCounts = new double[numAlleles];
        final double flatLog10AlleleFrequency = -Math.log10(numAlleles); // log10(1/numAlleles)
        double[] log10AlleleFrequencies = new IndexRange(0, numAlleles).mapToDouble(n -> flatLog10AlleleFrequency);

        // EM loop: alternate between expected allele counts given frequencies, and Dirichlet-posterior
        // mean frequencies given counts, until counts converge
        for (double alleleCountsMaximumDifference = Double.POSITIVE_INFINITY; alleleCountsMaximumDifference > AlleleFrequencyCalculator.THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE; ) {
            final double[] newAlleleCounts = effectiveAlleleCounts(genotypes, log10AlleleFrequencies);
            alleleCountsMaximumDifference = Arrays.stream(MathArrays.ebeSubtract(alleleCounts, newAlleleCounts)).map(Math::abs).max().getAsDouble();
            alleleCounts = newAlleleCounts;
            final double[] posteriorPseudocounts = MathArrays.ebeAdd(priorPseudocounts, alleleCounts);

            // first iteration uses flat prior in order to avoid local minimum where the prior + no pseudocounts gives such a low
            // effective allele frequency that it overwhelms the genotype likelihood of a real variant
            // basically, we want a chance to get non-zero pseudocounts before using a prior that's biased against a variant
            log10AlleleFrequencies = new Dirichlet(posteriorPseudocounts).log10MeanWeights();
        }

        double[] log10POfZeroCountsByAllele = new double[numAlleles];
        double log10PNoVariant = 0;

        final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
        // cache of non-variant genotype indices, keyed by ploidy (primitive-keyed map avoids boxing)
        final Map<Integer, int[]> nonVariantIndicesByPloidy = new Int2ObjectArrayMap<>();

        // re-usable buffers of the log10 genotype posteriors of genotypes missing each allele
        final List<DoubleArrayList> log10AbsentPosteriors = IntStream.range(0,numAlleles).mapToObj(n -> new DoubleArrayList()).collect(Collectors.toList());

        for (final Genotype g : genotypes) {
            if (!GenotypeUtils.genotypeIsUsableForAFCalculation(g)) {
                continue;
            }
            final int ploidy = g.getPloidy() == 0 ? defaultPloidy : g.getPloidy();
            final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g,log10AlleleFrequencies);

            //the total probability
            if (!spanningDeletionPresent) {
                log10PNoVariant += log10GenotypePosteriors[AlleleFrequencyCalculator.HOM_REF_GENOTYPE_INDEX];
            } else {
                nonVariantIndicesByPloidy.computeIfAbsent(ploidy, p -> genotypeIndicesWithOnlyRefAndSpanDel(p, alleles));
                final int[] nonVariantIndices = nonVariantIndicesByPloidy.get(ploidy);
                final double[] nonVariantLog10Posteriors = MathUtils.applyToArray(nonVariantIndices, n -> log10GenotypePosteriors[n]);
                // when the only alt allele is the spanning deletion the probability that the site is non-variant
                //  may be so close to 1 that finite precision error in log10SumLog10 yields a positive value,
                //  which is bogus.  Thus we cap it at 0.
                log10PNoVariant += Math.min(0,MathUtils.log10SumLog10(nonVariantLog10Posteriors));
            }

            // if the VC is biallelic the allele-specific qual equals the variant qual
            if (numAlleles == 2 && !spanningDeletionPresent) {
                continue;
            }

            // for each allele, we collect the log10 probabilities of genotypes in which the allele is absent, then add (in log space)
            // to get the log10 probability that the allele is absent in this sample
            log10AbsentPosteriors.forEach(DoubleArrayList::clear);  // clear the buffers.  Note that this is O(1) due to the primitive backing array
            for (final GenotypeAlleleCounts gac : GenotypeAlleleCounts.iterable(ploidy, numAlleles)) {
                final double log10GenotypePosterior = log10GenotypePosteriors[gac.index()];
                gac.forEachAbsentAlleleIndex(a -> log10AbsentPosteriors.get(a).add(log10GenotypePosterior), numAlleles);
            }

            final double[] log10PNoAllele = log10AbsentPosteriors.stream()
                    .mapToDouble(buffer -> MathUtils.log10SumLog10(buffer.toDoubleArray()))
                    .map(x -> Math.min(0, x)).toArray();    // if prob of non hom ref > 1 due to finite precision, short-circuit to avoid NaN

            // multiply the cumulative probabilities of alleles being absent, which is addition of logs
            MathUtils.addToArrayInPlace(log10POfZeroCountsByAllele, log10PNoAllele);
        }

        // for biallelic the allele-specific qual equals the variant qual, and we short-circuited the calculation above
        if (numAlleles == 2 && !spanningDeletionPresent) {
            log10POfZeroCountsByAllele[1] = log10PNoVariant;
        }

        // unfortunately AFCalculationResult expects integers for the MLE.  We really should emit the EM no-integer values
        // which are valuable (eg in CombineGVCFs) as the sufficient statistics of the Dirichlet posterior on allele frequencies
        final int[] integerAlleleCounts = Arrays.stream(alleleCounts).mapToInt(x -> (int) Math.round(x)).toArray();
        final int[] integerAltAlleleCounts = Arrays.copyOfRange(integerAlleleCounts, 1, numAlleles);

        //skip the ref allele (index 0)
        final Map<Allele, Double> log10PRefByAllele = IntStream.range(1, numAlleles).boxed()
                .collect(Collectors.toMap(alleles::get, a -> log10POfZeroCountsByAllele[a]));

        return new AFCalculationResult(integerAltAlleleCounts, alleles, log10PNoVariant, log10PRefByAllele);
    }

    /**
     * Calculate the posterior probability that a single biallelic genotype is non-ref
     *
     * The nth genotype (n runs from 0 to the sample ploidy, inclusive) contains n copies of the alt allele
     *
     * @param log10GenotypeLikelihoods log10 likelihoods indexed by alt-allele copy number; length is ploidy + 1
     * @param returnZeroIfRefIsMax if true, return 0 whenever hom-ref has the max likelihood or max posterior
     */
    public double calculateSingleSampleBiallelicNonRefPosterior(final double[] log10GenotypeLikelihoods, final boolean returnZeroIfRefIsMax) {
        Utils.nonNull(log10GenotypeLikelihoods);

        if (returnZeroIfRefIsMax && MathUtils.maxElementIndex(log10GenotypeLikelihoods) == 0) {
            return 0;
        }

        final int ploidy = log10GenotypeLikelihoods.length - 1;

        // unnormalized posterior = likelihood * Dirichlet-multinomial prior over alt copy number n
        final double[] log10UnnormalizedPosteriors = new IndexRange(0, ploidy + 1)
                .mapToDouble(n -> log10GenotypeLikelihoods[n] + MathUtils.logToLog10(CombinatoricsUtils.binomialCoefficientLog(ploidy, n)
                        + Gamma.logGamma(n + snpPseudocount ) + Gamma.logGamma(ploidy - n + refPseudocount)));

        return (returnZeroIfRefIsMax && MathUtils.maxElementIndex(log10UnnormalizedPosteriors) == 0) ? 0.0 :
                1 - MathUtils.normalizeFromLog10ToLinearSpace(log10UnnormalizedPosteriors)[0];
    }

    // effectiveAlleleCounts[allele a] = SUM_{genotypes g} (posterior_probability(g) * num_copies of a in g), which we denote as SUM [n_g p_g]
    // for numerical stability we will do this in log space:
    //    count = SUM 10^(log (n_g p_g)) = SUM 10^(log n_g + log p_g)
    // thanks to the log-sum-exp trick this lets us work with log posteriors alone
    private double[] effectiveAlleleCounts(List<Genotype> genotypes, final double[] log10AlleleFrequencies) {
        final int numAlleles = log10AlleleFrequencies.length;
        // NOTE: a previous version validated numAlleles == log10AlleleFrequencies.length here, which was
        // a tautology (numAlleles is assigned from that very length) left over from a refactor; removed.
        final double[] log10Result = new double[numAlleles];
        Arrays.fill(log10Result, Double.NEGATIVE_INFINITY);
        for (final Genotype g : genotypes) {
            if (!GenotypeUtils.genotypeIsUsableForAFCalculation(g)) {
                continue;
            }
            final double[] log10GenotypePosteriors = log10NormalizedGenotypePosteriors(g, log10AlleleFrequencies);

            // accumulate log10(count * posterior) per allele via log-sum-exp
            for (final GenotypeAlleleCounts gac : GenotypeAlleleCounts.iterable(g.getPloidy(), numAlleles)) {
                gac.forEachAlleleIndexAndCount((alleleIndex, count) -> log10Result[alleleIndex] =
                        MathUtils.log10SumLog10(log10Result[alleleIndex], log10GenotypePosteriors[gac.index()] + Math.log10(count)));
            }
        }
        return MathUtils.applyToArrayInPlace(log10Result, x -> Math.pow(10.0, x));
    }
}