Skip to content

Commit

Permalink
Makes Fisher's exact test match R and GATK4
Browse files Browse the repository at this point in the history
  • Loading branch information
meganshand committed Dec 5, 2016
1 parent 66aa735 commit 3265a66
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 92 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@

package org.broadinstitute.gatk.tools.walkers.annotator;

import cern.jet.math.Arithmetic;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.apache.commons.math3.distribution.HypergeometricDistribution;

import java.util.ArrayList;
import java.util.List;
Expand Down Expand Up @@ -83,22 +83,51 @@ public static Double FisherExactPValueForContingencyTable(int[][] originalTable)

int[][] table = copyContingencyTable(normalizedTable);

double pCutoff = computePValue(table);

double pValue = pCutoff;
while (rotateTable(table)) {
double pValuePiece = computePValue(table);

if (pValuePiece <= pCutoff) {
pValue += pValuePiece;
}
int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
int N = rowSums[0] + rowSums[1];
int sampleSize = colSums[0];
int numberOfNonSuccesses = rowSums[1];
int numberOfSuccesses = rowSums[0];
/*
* The lowest possible number of successes you can sample is what's left of your sample size after
* drawing every non-success in the urn. If the number of non-successes in the urn is at least the
* sample size, then the minimum number of drawn successes is 0.
*/
int lo = Math.max(0, sampleSize - numberOfNonSuccesses);
/*
* The highest possible number of successes you can draw is either the total sample size or the number of
* successes in the urn. (Whichever is smaller)
*/
int hi = Math.min(sampleSize, numberOfSuccesses);

/**
* If the support of the distribution is only one value, creating the HypergeometricDistribution
* doesn't make sense. There would be only one possible observation so the p-value has to be 1.
*/
if (lo == hi) {
return 1.0;
}

table = copyContingencyTable(normalizedTable);
while (unrotateTable(table)) {
double pValuePiece = computePValue(table);

if (pValuePiece <= pCutoff) {
/**
* For the hypergeometric distribution from which to calculate the probabilities:
* The population (N = a+b+c+d) is the sum of all four numbers in the contingency table. Then the number of
* "successes" (K = a+b) is the sum of the top row, and the sample size (n = a+c) is the sum of the first column.
*/
final HypergeometricDistribution dist = new HypergeometricDistribution(N, numberOfSuccesses, sampleSize);

//Then we determine a given probability with the sampled successes (k = a) from the first entry in the table.
double pCutoff = dist.probability(table[0][0]);

double pValue = 0.0;
/**
* Now cycle through all possible k's, adding to the p-value the probability of every k whose
* probability is no greater than that of our observed k, since this is a two-sided test
*/
for(int i = lo; i <= hi; i++){
double pValuePiece = dist.probability(i);

if(pValuePiece <= pCutoff) {
pValue += pValuePiece;
}
}
Expand Down Expand Up @@ -190,45 +219,6 @@ protected static int[][] normalizeContingencyTable(final int[][] table) {
return c;
}

/**
 * Shifts one unit of count out of each diagonal cell and into each off-diagonal
 * cell of the 2x2 table, mutating it in place (marginal row/column sums are preserved).
 *
 * @param table 2x2 contingency table, modified in place
 * @return true while the rotation is still valid, i.e. neither decremented
 *         (diagonal) cell has gone negative
 */
protected static boolean rotateTable(int[][] table) {
    // Diagonal cells lose one count each...
    table[0][0] -= 1;
    table[1][1] -= 1;
    // ...and the off-diagonal cells gain one each, keeping the margins fixed.
    table[0][1] += 1;
    table[1][0] += 1;

    final boolean stillValid = table[0][0] >= 0 && table[1][1] >= 0;
    return stillValid;
}

/**
 * Inverse of rotateTable: shifts one unit of count into each diagonal cell and
 * out of each off-diagonal cell of the 2x2 table, mutating it in place
 * (marginal row/column sums are preserved).
 *
 * @param table 2x2 contingency table, modified in place
 * @return true while the un-rotation is still valid, i.e. neither decremented
 *         (off-diagonal) cell has gone negative
 */
protected static boolean unrotateTable(int[][] table) {
    // Diagonal cells gain one count each...
    table[0][0] += 1;
    table[1][1] += 1;
    // ...and the off-diagonal cells lose one each, keeping the margins fixed.
    table[0][1] -= 1;
    table[1][0] -= 1;

    final boolean stillValid = table[0][1] >= 0 && table[1][0] >= 0;
    return stillValid;
}

/**
 * Computes the hypergeometric probability of observing exactly this 2x2 table
 * given its marginal sums: the product of the factorials of the four margins,
 * divided by the factorial of the grand total and the factorials of the four
 * cells. Evaluated in log space for better precision, then exponentiated.
 *
 * @param table 2x2 contingency table of non-negative counts
 * @return the probability of this exact table configuration
 */
protected static double computePValue(int[][] table) {
    final int margin0 = sumRow(table, 0);
    final int margin1 = sumRow(table, 1);
    final int colMargin0 = sumColumn(table, 0);
    final int colMargin1 = sumColumn(table, 1);
    final int grandTotal = margin0 + margin1;

    // Accumulate log(margins!) - log(total!) ...
    double logProbability = Arithmetic.logFactorial(margin0)
            + Arithmetic.logFactorial(margin1)
            + Arithmetic.logFactorial(colMargin0)
            + Arithmetic.logFactorial(colMargin1)
            - Arithmetic.logFactorial(grandTotal);

    // ... then subtract log(cell!) for each of the four table entries.
    for (int r = 0; r < 2; r++) {
        for (int c = 0; c < 2; c++) {
            logProbability -= Arithmetic.logFactorial(table[r][c]);
        }
    }

    return Math.exp(logProbability);
}

private static int sumRow(int[][] table, int column) {
int sum = 0;
for (int r = 0; r < table.length; r++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,23 +72,33 @@ public Object[][] makeUsingTable() {
//> fisher(c(2068, 6796, 1133, 0))

final List<Object[]> tests = new ArrayList<>();
tests.add(new Object[]{0, 0, 0, 0, 1.0});
tests.add(new Object[]{100000, 100000, 100000, 100000, 1.0});
tests.add(new Object[]{1, 2, 3, 4, 1.0});
tests.add(new Object[]{0, 0, 100000, 100000, 1.0});
tests.add(new Object[]{100000, 100000, 100000, 0, 0.0}); //below R's or Java's precision
tests.add(new Object[]{9,11,12,10, 0.7578618});
tests.add(new Object[]{12,10,9,11, 0.7578618});
tests.add(new Object[]{9,10,12,10, 0.7578618});
tests.add(new Object[]{9,9,12,10, 1.0});
tests.add(new Object[]{9,13,12,10, 0.5466948});
tests.add(new Object[]{12,10,9,13, 0.5466948});
tests.add(new Object[]{9,12,11,9, 0.5377362});

tests.add(new Object[]{200000, 100000, 1, 2, 1.0}); //differs from GATK4 implementation
tests.add(new Object[]{100, 100, 100, 0, 3.730187e-23});
tests.add(new Object[]{13736, 9047, 41, 1433, 1.232E-4}); //differs from GATK4 implementation
tests.add(new Object[]{66, 14, 64, 4, 0.0688244});
tests.add(new Object[]{351169, 306836, 153739, 2379, 0.0}); //below R's or Java's precision
tests.add(new Object[]{116449, 131216, 289, 16957, 0.0026801}); //differs from GATK4 implementation
tests.add(new Object[]{137, 159, 9, 23, 0.10752410}); //differs from GATK4 implementation
tests.add(new Object[]{129, 90, 21, 20, 0.6450772}); //differs from GATK4 implementation
tests.add(new Object[]{14054, 9160, 16, 7827, 0.0}); //below R's or Java's precision
tests.add(new Object[]{32803, 9184, 32117, 3283, 0.0289540}); //differs from GATK4 implementation
tests.add(new Object[]{2068, 6796, 1133, 0, 0.0}); //below R's or Java's precision
tests.add(new Object[]{0, 0, 0, 0, 1.0});
tests.add(new Object[]{100000, 100000, 100000, 100000, 1.0} );
tests.add(new Object[]{0, 0, 100000, 100000, 1.0});
tests.add(new Object[]{100000,100000,100000,0, 1.312515e-15});

tests.add(new Object[]{0, 0, 0, 3, 1.0});
tests.add(new Object[]{9, 0, 0, 0, 1.0});
tests.add(new Object[]{200000, 100000, 1, 2, 1.0});

tests.add(new Object[]{100,100,100,0, 3.730187e-23});
tests.add(new Object[]{13736,9047,41,1433, 6.162592e-05});
tests.add(new Object[]{66, 14, 64, 4, 4.243330e-02});
tests.add(new Object[]{351169, 306836, 153739, 2379, 2.193607e-09});
tests.add(new Object[]{116449, 131216, 289, 16957, 1.340052e-03});
tests.add(new Object[]{137, 159, 9, 23, 6.088506e-02});
tests.add(new Object[]{129, 90, 21, 20, 3.919603e-01});
tests.add(new Object[]{14054, 9160, 16, 7827, 7.466277e-17});
tests.add(new Object[]{32803, 9184, 32117, 3283, 1.795855e-02});
tests.add(new Object[]{2068, 6796, 1133, 0, 5.919091e-13});

return tests.toArray(new Object[][]{});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ public void testMultiSampleIndels1() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("75b5a925c2009a8c14ea34fff3d04443"));
Arrays.asList("751334df2c7b14cc0e57df231825da57"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ public void testUserPrior() {
public void emitPLsAtAllSites() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1,
Arrays.asList("afcb9c4fd4a0e9ba4694d911bc75a7b2"));
Arrays.asList("067d0a9d90c978c5563ea56a86fc682f"));
// GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail
spec1.disableShadowBCF();

Expand All @@ -190,12 +190,12 @@ public void emitPLsAtAllSites() {

@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "8abbec54f4bf82d7c48bf40b43fdaa91" );
testHeterozosity( 0.01, "0175a3d4dedea857f87149522d133d78" );
}

@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "4221b00b0d10c005fb69fd2a298e384c" );
testHeterozosity( 1.0 / 1850, "080a14940e4d3f5c14c533d019b99341" );
}

private void testHeterozosity(final double arg, final String md5) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ public void testWithAllelesPassedIn2() {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("5919639094d775cdd6b6965e1210753a"));
Arrays.asList("ac9905e26a2c51129d22b5c617679c6a"));
executeTest("test SingleSample Pilot2", spec);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public Object[][] makeMyDataProviderHaploid() {
//TODO this might need to be addressed at some point.
//TODO the following test is commented out for the record
//tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "c87ff5438d4a6559b33970f1294f77c6"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "c64e8f169b40dfcdac5bea71156753b5"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "2cc9f789100e138ffc0c383b12a1322a"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "44cc8f78e28d905efc30c218d821cc7c"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"});
Expand All @@ -106,7 +106,7 @@ public Object[][] makeMyDataProvider() {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;

// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "cbf988eca3f368ef5b17108595cd8c5e"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "f2807ff921854059746da2954dc44a7b"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "d146c8dc4fc0605b3776ab5fec837d53"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "c317193f0d1c9a8168f2625c8bf1dd2b"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"});
Expand All @@ -129,7 +129,7 @@ public Object[][] makeMyDataProviderTetraploid() {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;

// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "85749762dac9684f4c8f9da18a210109"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "126527c225d24a2a0bb329ad9b3f682a"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "6c727b804084a2324ecd1c98b72734b9"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "190cef14684c95ba290d7a5fa13fdc07"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"});
Expand All @@ -147,7 +147,7 @@ public Object[][] makeMyDataProviderManyploid() {
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;

// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "532188686870c7edd2ea3352ea93f66a"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8e17f26d07fbba596d3cfd2e344c4cd2"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "48521b89cecceb9846e4dfc0dd415874"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "eaacbeaff99a37ffa07e1f11e7f1deb2"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ private void HCTestWithBAMOut(String bam, String args, String md5Variants, Strin

@Test
public void testHaplotypeBAMOutFlags() throws IOException {
HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "2e4cd93b4cad12259728d19a41d2a6ff", "9d6bd79cdae3e3222fa93f542fbca153");
HCTestWithBAMOut(NA12878_BAM, " -L 20:10000000-10100000 ", "6588123afd06ff6acc9f10ea25250f54", "9d6bd79cdae3e3222fa93f542fbca153");
}

@Test
Expand Down Expand Up @@ -312,7 +312,7 @@ public void testLeftAlignmentBamOutBugFix() {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("fc71471b01f93bc531e3cf19cdf78b1f"));
Arrays.asList("04ff9b301bd6f50df848800fbe09de5c"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}

Expand All @@ -329,7 +329,7 @@ public void HCTestDBSNPAnnotationWEx() {
public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,090,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("ec9a1fb56882c21f3e4793e5f71f4e9e"));
Arrays.asList("dbae51c7903e088b2e62cbada6ea2d50"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}

Expand Down Expand Up @@ -361,7 +361,7 @@ public void HCTestGraphBasedPCRFreePositiveLogLkFix() {
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -pairHMMSub " + HMM_SUB_IMPLEMENTATION + " " + ALWAYS_LOAD_VECTOR_HMM + " -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,270,000-10,300,000", 1,
Arrays.asList("fcc81209c562f3c7f1627b187a4dfab4"));
Arrays.asList("8c3ae4dc3d8af2aa8c71deaadb26cc14"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}

Expand Down
Loading

0 comments on commit 3265a66

Please sign in to comment.