Navigation Menu

Skip to content

Commit

Permalink
Modified exactP to correctly handle ties. JIRA: MATH-1246.
Browse files Browse the repository at this point in the history
  • Loading branch information
psteitz committed Sep 9, 2015
1 parent 2091cfb commit ce98d00
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 12 deletions.
Expand Up @@ -19,6 +19,7 @@

import java.math.BigDecimal;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.commons.math4.util.Precision;
Expand Down Expand Up @@ -220,7 +221,10 @@ public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[]
* {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This
* is accomplished by enumerating all partitions of the combined sample into two subsamples of
* the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the
* proportion of partitions that give \(D\) values exceeding the observed value.</li>
* proportion of partitions that give \(D\) values exceeding the observed value. In the very
* small sample case, if there are ties in the data, the actual sample values (including ties)
* are used in generating the partitions (which are basically multi-set partitions in this
* case).</li>
* <li>For mid-size samples (product of sample sizes greater than or equal to
* {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo
* simulation is used to compute the p-value. The simulation randomly generates partitions and
Expand All @@ -243,6 +247,9 @@ public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[]
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
final long lengthProduct = (long) x.length * y.length;
if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
if (hasTies(x, y)) {
return exactP(x, y, strict);
}
return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict);
}
if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
Expand Down Expand Up @@ -908,6 +915,67 @@ public double exactP(double d, int n, int m, boolean strict) {
return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
}

/**
* Computes the exact p value for a two-sample Kolmogorov-Smirnov test with
* {@code x} and {@code y} as samples, possibly containing ties. This method
* uses the same implementation as {@link #exactP(double, int, int, boolean)}
* with the exception that it examines partitions of the combined sample,
* preserving ties in the data. What is returned is the exact probability
* that a random partition of the combined dataset into a subset of size
* {@code x.length} and another of size {@code y.length} yields a \(D\)
* value greater than (resp greater than or equal to) \(D(x,y)\).
* <p>
* This method should not be used on large samples (a good rule of thumb is
* to keep the product of the sample sizes less than
* {@link #SMALL_SAMPLE_PRODUCT} when using this method). If the data do
* not contain ties, {@link #exactP(double[], double[], boolean)} should be
* used instead of this method.</p>
*
* @param x first sample
* @param y second sample
* @param strict whether or not the inequality in the null hypothesis is strict
* @return p-value
*/
public double exactP(double[] x, double[] y, boolean strict) {
final double d = kolmogorovSmirnovStatistic(x, y);
final int n = x.length;
final int m = y.length;

// Concatenate x and y into universe, preserving ties in the data
final double[] universe = new double[n + m];
System.arraycopy(x, 0, universe, 0, n);
System.arraycopy(y, 0, universe, n, m);

// Iterate over all n, m partitions of the n + m elements in the universe,
// Computing D for each one
Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n);
long tail = 0;
final double[] nSet = new double[n];
final double[] mSet = new double[m];
final double tol = 1e-12; // d-values within tol of one another are considered equal
while (combinationsIterator.hasNext()) {
// Generate an n-set
final int[] nSetI = combinationsIterator.next();
// Copy the elements of the universe in the n-set to nSet
// and the others to mSet
int j = 0;
int k = 0;
for (int i = 0; i < n + m; i++) {
if (j < n && nSetI[j] == i) {
nSet[j++] = universe[i];
} else {
mSet[k++] = universe[i];
}
}
final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
final int order = Precision.compareTo(curD, d, tol);
if (order > 0 || (order == 0 && !strict)) {
tail++;
}
}
return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n);
}

/**
* Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
* is the 2-sample Kolmogorov-Smirnov statistic. See
Expand Down Expand Up @@ -1009,4 +1077,27 @@ public double monteCarloP(final double d, final int n, final int m, final boolea
}
return (double) tail / iterations;
}

/**
* Returns true iff there are ties in the combined sample
* formed from x and y.
*
* @param x first sample
* @param y second sample
* @return true if x and y together contain ties
*/
private boolean hasTies(double[] x, double[] y) {
HashSet<Double> values = new HashSet<Double>();
for (int i = 0; i < x.length; i++) {
if (!values.add(x[i])) {
return true;
}
}
for (int i = 0; i < y.length; i++) {
if (!values.add(y[i])) {
return true;
}
}
return false;
}
}
Expand Up @@ -185,6 +185,62 @@ public void testTwoSampleSmallSampleExact() {
Assert.assertEquals(0.5, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE);
}

/** Small sample no ties, exactP methods should agree */
@Test
public void testExactPConsistency() {
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
final double[] x = {
1, 7, 9, 13, 19, 21, 22, 23, 24
};
final double[] y = {
3, 4, 12, 16, 20, 27, 28, 32, 44, 54
};
Assert.assertEquals(test.exactP(x, y, true),
test.exactP(test.kolmogorovSmirnovStatistic(x, y),
x.length, y.length, true), Double.MIN_VALUE);
Assert.assertEquals(test.exactP(x, y, false),
test.exactP(test.kolmogorovSmirnovStatistic(x, y),
x.length, y.length, false), Double.MIN_VALUE);
}

/**
* Extreme case for ties - all values the same. Strict p-value should be 0,
* non-strict should be 1
*/
@Test
public void testExactPNoVariance() {
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
final double[] x = {
1, 1, 1, 1, 1, 1
};
final double[] y = {
1, 1, 1, 1
};
Assert.assertEquals(0, test.exactP(x, y, true), Double.MIN_VALUE);
Assert.assertEquals(1, test.exactP(x, y, false), Double.MIN_VALUE);
Assert.assertEquals(0, test.kolmogorovSmirnovTest(x, y, true), Double.MIN_VALUE);
Assert.assertEquals(1, test.kolmogorovSmirnovTest(x, y, false), Double.MIN_VALUE);
}

/**
* Split {0, 0, 0, 1, 1, 1} into 3-sets. Most extreme is 0's vs 1's. Non-strict
* p-value for this split should be 2 / (6 choose 3); strict should be 0.
*/
@Test
public void testExactPSimpleSplit() {
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
final double[] x = {
0, 0, 0
};
final double[] y = {
1, 1, 1
};
// Above is one way to do this - other way is s/x/y - so 2 in strict test below
Assert.assertEquals(0, test.exactP(x, y, true), Double.MIN_VALUE);
Assert.assertEquals(2 / (double) CombinatoricsUtils.binomialCoefficient(6, 3),
test.exactP(x, y, false), Double.MIN_VALUE);
}

/**
* Checks exact p-value computations using critical values from Table 9 in V.K Rohatgi, An
* Introduction to Probability and Mathematical Statistics, Wiley, 1976, ISBN 0-471-73135-8.
Expand Down Expand Up @@ -430,10 +486,10 @@ public void testTwoSamplesAllEqual() {
Assert.assertEquals(1.0, test.approximateP(0, values.length, values.length), 0.);
}
}

/**
* JIRA: MATH-1245
*
*
* Verify that D-values are not viewed as distinct when they are mathematically equal
* when computing p-statistics for small sample tests. Reference values are from R 3.2.0.
*/
Expand All @@ -444,19 +500,19 @@ public void testDRounding() {
final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18};
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
Assert.assertEquals(0.0027495724090154106, test.kolmogorovSmirnovTest(x, y,false), tol);

final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13};
final double[] y1 = {0, 1, 3, 5, 7};
Assert.assertEquals(0.085914085914085896, test.kolmogorovSmirnovTest(x1, y1, false), tol);

final double[] x2 = {4, 6, 7, 8, 9, 10, 11};
final double[] y2 = {0, 1, 2, 3, 5};
Assert.assertEquals(0.015151515151515027, test.kolmogorovSmirnovTest(x2, y2, false), tol);
Assert.assertEquals(0.015151515151515027, test.kolmogorovSmirnovTest(x2, y2, false), tol);
}

/**
* JIRA: MATH-1245
*
*
* Verify that D-values are not viewed as distinct when they are mathematically equal
* when computing p-statistics for small sample tests. Reference values are from R 3.2.0.
*/
Expand All @@ -465,17 +521,17 @@ public void testDRoundingMonteCarlo() {
final double tol = 1e-2;
final int iterations = 1000000;
final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000));

final double[] x = {0, 2, 3, 4, 5, 6, 7, 8, 9, 12};
final double[] y = {1, 10, 11, 13, 14, 15, 16, 17, 18};
double d = test.kolmogorovSmirnovStatistic(x, y);
Assert.assertEquals(0.0027495724090154106, test.monteCarloP(d, x.length, y.length, false, iterations), tol);

final double[] x1 = {2, 4, 6, 8, 9, 10, 11, 12, 13};
final double[] y1 = {0, 1, 3, 5, 7};
d = test.kolmogorovSmirnovStatistic(x1, y1);
Assert.assertEquals(0.085914085914085896, test.monteCarloP(d, x1.length, y1.length, false, iterations), tol);

final double[] x2 = {4, 6, 7, 8, 9, 10, 11};
final double[] y2 = {0, 1, 2, 3, 5};
d = test.kolmogorovSmirnovStatistic(x2, y2);
Expand Down Expand Up @@ -566,4 +622,4 @@ private void checkApproximateTable(int n, int m, double criticalValue, double al
Assert.assertEquals(alpha, test.approximateP(criticalValue, n, m), epsilon);
}

}
}

0 comments on commit ce98d00

Please sign in to comment.