From 49d07ab0d28b2c1c6fc5deb6f264be8c4b555a5a Mon Sep 17 00:00:00 2001 From: MechCoder Date: Thu, 16 Jul 2015 02:35:05 +0530 Subject: [PATCH 1/3] [SPARK-8996] [MLlib] Python API for Kolmogorov-Smirnov Test --- .../mllib/api/python/PythonMLLibAPI.scala | 23 +++++++- python/pyspark/mllib/stat/_statistics.py | 54 ++++++++++++++++++- python/pyspark/mllib/stat/test.py | 35 +++++++----- python/pyspark/mllib/tests.py | 15 ++++++ 4 files changed, 113 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index c58a64001d9a0..331cc1b9a971c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -43,7 +43,7 @@ import org.apache.spark.mllib.recommendation._ import org.apache.spark.mllib.regression._ import org.apache.spark.mllib.stat.correlation.CorrelationNames import org.apache.spark.mllib.stat.distribution.MultivariateGaussian -import org.apache.spark.mllib.stat.test.ChiSqTestResult +import org.apache.spark.mllib.stat.test.{ChiSqTestResult, KolmogorovSmirnovTestResult} import org.apache.spark.mllib.stat.{ KernelDensity, MultivariateStatisticalSummary, Statistics} import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy, Strategy} @@ -1093,6 +1093,27 @@ private[python] class PythonMLLibAPI extends Serializable { LinearDataGenerator.generateLinearRDD( sc, nexamples, nfeatures, eps, nparts, intercept) } + + /** + * Wrapper around Statistics.kolmogorovSmirnovTestWrapper with default params. + */ + def kolmogorovSmirnovTestWrapper( + data: JavaRDD[Double], + distName: String): KolmogorovSmirnovTestResult = { + Statistics.kolmogorovSmirnovTest(data, distName) + } + + /** + * Wrapper around Statistics.kolmogorovSmirnovTestWrapper. 
+ */ + def kolmogorovSmirnovTestWrapper( + data: JavaRDD[Double], + distName: String, + params: JList[Double]): KolmogorovSmirnovTestResult = { + val seqParams = params.asScala.toSeq + Statistics.kolmogorovSmirnovTest(data, distName, seqParams: _*) + } + } /** diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index b475be4b4d953..240a8049c47d1 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -15,11 +15,15 @@ # limitations under the License. # +import sys +if sys.version >= '3': + basestring = str + from pyspark.rdd import RDD, ignore_unicode_prefix from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import Matrix, _convert_to_vector from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.stat.test import ChiSqTestResult +from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult __all__ = ['MultivariateStatisticalSummary', 'Statistics'] @@ -238,6 +242,54 @@ def chiSqTest(observed, expected=None): jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected) return ChiSqTestResult(jmodel) + @staticmethod + @ignore_unicode_prefix + def kolmogorovSmirnovTest(data, distName="norm", *params): + """ + .. note:: Experimental + + Performs the Kolmogorov Smirnov (KS) test for data sampled from a continuous + distribution. It tests the null hypothesis that the data is generated from a + particular distribution. + + The given data is sorted, the Empirical Cumulative Distribution Function (ECDF) + is calculated which is the number of points having a CDF value lesser than a given point + divided by the total number of points. Since the data is sorted, this is a step function + that rises by (1 / length of data) for every ordered point. + + The KS statistic gives us the maximum distance between the ECDF and the CDF. 
Intuitively + if this value is large, the probabilty that the null hypothesis is true becomes small. + For specific details of the implementation, please have a look at the Scala documentation. + + :param data: RDD, samples from the data + :param distName: string, currently only "norm" is suuported. (Normal distribution) + :param params: additional values which need to be provided for a certain distribution. + If not provided, the default values are used. + :return: KolmogorovSmirnovTestResult object containing the test statistic, degrees + of freedom, p-value, the method used, and the null hypothesis. + + >>> kstest = Statistics.kolmogorovSmirnovTest + >>> data = sc.parallelize([-1.0, 0.0, 1.0]) + >>> ksmodel = kstest(data, "norm") + >>> print(round(ksmodel.pValue, 3)) + 1.0 + >>> print(round(ksmodel.statistic, 3)) + 0.175 + >>> ksmodel.nullHypothesis + u'Sample follows theoretical distribution' + + """ + if not isinstance(data, RDD): + raise TypeError("data should be an RDD, got %s." % type(data)) + if not isinstance(distName, str): + raise TypeError("distName should be a string, got %s." % type(distname)) + + if len(params) == 0: + jmodel = callMLlibFunc("kolmogorovSmirnovTestWrapper", data, distName) + else: + jmodel = callMLlibFunc("kolmogorovSmirnovTestWrapper", data, distName, list(params)) + return KolmogorovSmirnovTestResult(jmodel) + def _test(): import doctest diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py index 762506e952b43..53f61ccd2c842 100644 --- a/python/pyspark/mllib/stat/test.py +++ b/python/pyspark/mllib/stat/test.py @@ -15,24 +15,16 @@ # limitations under the License. # -from pyspark.mllib.common import JavaModelWrapper +from pyspark.mllib.common import inherit_doc, JavaModelWrapper -__all__ = ["ChiSqTestResult"] +__all__ = ["ChiSqTestResult", "KolmogorovSmirnovTestResult"] -class ChiSqTestResult(JavaModelWrapper): +class TestResult(JavaModelWrapper): """ - .. 
note:: Experimental - - Object containing the test results for the chi-squared hypothesis test. + Base class for all test results. """ - @property - def method(self): - """ - Name of the test method - """ - return self._java_model.method() @property def pValue(self): @@ -67,3 +59,22 @@ def nullHypothesis(self): def __str__(self): return self._java_model.toString() + + +class ChiSqTestResult(TestResult): + """ + Object containing the test results for the chi-squared hypothesis test. + """ + + @property + def method(self): + """ + Name of the test method + """ + return self._java_model.method() + + +class KolmogorovSmirnovTestResult(TestResult): + """ + Object containing the test results for the ks tests. + """ diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index f2eab5b18f077..9b0f5577adef8 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -869,6 +869,21 @@ def test_right_number_of_results(self): self.assertIsNotNone(chi[1000]) +class KolmogorovSmirnovTest(MLlibTestCase): + + def test_R_implementation_equivalence(self): + data = self.sc.parallelize([ + 1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501, + -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555, + -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063, + -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691, + 0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942 + ]) + model = Statistics.kolmogorovSmirnovTest(data, "norm") + self.assertAlmostEqual(model.statistic, 0.189, 3) + self.assertAlmostEqual(model.pValue, 0.422, 3) + + class SerDeTest(MLlibTestCase): def test_to_java_object_rdd(self): # SPARK-6660 data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0) From 021d23386b42cd28ceeb40e5fcecb4c469c1e044 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Fri, 17 Jul 2015 16:23:50 +0530 Subject: [PATCH 2/3] Remove one wrapper and other 
minor stuff --- .../mllib/api/python/PythonMLLibAPI.scala | 17 ++----- python/pyspark/mllib/stat/_statistics.py | 48 +++++++++++-------- python/pyspark/mllib/stat/test.py | 6 ++- python/pyspark/mllib/tests.py | 4 ++ 4 files changed, 39 insertions(+), 36 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 331cc1b9a971c..fda8d5a0b048f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -1095,23 +1095,14 @@ private[python] class PythonMLLibAPI extends Serializable { } /** - * Wrapper around Statistics.kolmogorovSmirnovTestWrapper with default params. + * Java stub for Statistics.kolmogorovSmirnovTest() */ - def kolmogorovSmirnovTestWrapper( - data: JavaRDD[Double], - distName: String): KolmogorovSmirnovTestResult = { - Statistics.kolmogorovSmirnovTest(data, distName) - } - - /** - * Wrapper around Statistics.kolmogorovSmirnovTestWrapper. - */ - def kolmogorovSmirnovTestWrapper( + def kolmogorovSmirnovTest( data: JavaRDD[Double], distName: String, params: JList[Double]): KolmogorovSmirnovTestResult = { - val seqParams = params.asScala.toSeq - Statistics.kolmogorovSmirnovTest(data, distName, seqParams: _*) + val paramsSeq = params.asScala.toSeq + Statistics.kolmogorovSmirnovTest(data, distName, paramsSeq: _*) } } diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index 240a8049c47d1..ddc453bfba5ce 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -248,25 +248,34 @@ def kolmogorovSmirnovTest(data, distName="norm", *params): """ .. note:: Experimental - Performs the Kolmogorov Smirnov (KS) test for data sampled from a continuous - distribution. 
It tests the null hypothesis that the data is generated from a - particular distribution. + Performs the Kolmogorov Smirnov (KS) test for data sampled from + a continuous distribution. It tests the null hypothesis that + the data is generated from a particular distribution. - The given data is sorted, the Empirical Cumulative Distribution Function (ECDF) - is calculated which is the number of points having a CDF value lesser than a given point - divided by the total number of points. Since the data is sorted, this is a step function + The given data is sorted and the Empirical Cumulative + Distribution Function (ECDF) is calculated + which for a given point is the number of points having a CDF + value lesser than it divided by the total number of points. + + Since the data is sorted, this is a step function that rises by (1 / length of data) for every ordered point. - The KS statistic gives us the maximum distance between the ECDF and the CDF. Intuitively - if this value is large, the probabilty that the null hypothesis is true becomes small. - For specific details of the implementation, please have a look at the Scala documentation. + The KS statistic gives us the maximum distance between the + ECDF and the CDF. Intuitively if this statistic is large, the + probability that the null hypothesis is true becomes small. + For specific details of the implementation, please have a look + at the Scala documentation. :param data: RDD, samples from the data - :param distName: string, currently only "norm" is suuported. (Normal distribution) - :param params: additional values which need to be provided for a certain distribution. + :param distName: string, currently only "norm" is supported. + (Normal distribution) to calculate the + theoretical distribution of the data. + :param params: additional values which need to be provided for + a certain distribution. If not provided, the default values are used. 
- :return: KolmogorovSmirnovTestResult object containing the test statistic, degrees - of freedom, p-value, the method used, and the null hypothesis. + :return: KolmogorovSmirnovTestResult object containing the test + statistic, degrees of freedom, p-value, + the method used, and the null hypothesis. >>> kstest = Statistics.kolmogorovSmirnovTest >>> data = sc.parallelize([-1.0, 0.0, 1.0]) @@ -277,18 +286,15 @@ def kolmogorovSmirnovTest(data, distName="norm", *params): 0.175 >>> ksmodel.nullHypothesis u'Sample follows theoretical distribution' - """ if not isinstance(data, RDD): raise TypeError("data should be an RDD, got %s." % type(data)) - if not isinstance(distName, str): - raise TypeError("distName should be a string, got %s." % type(distname)) + if not isinstance(distName, basestring): + raise TypeError("distName should be a string, got %s." % type(distName)) - if len(params) == 0: - jmodel = callMLlibFunc("kolmogorovSmirnovTestWrapper", data, distName) - else: - jmodel = callMLlibFunc("kolmogorovSmirnovTestWrapper", data, distName, list(params)) - return KolmogorovSmirnovTestResult(jmodel) + params = [float(param) for param in params] + return KolmogorovSmirnovTestResult( + callMLlibFunc("kolmogorovSmirnovTest", data, distName, params)) def _test(): diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py index 53f61ccd2c842..0abe104049ff9 100644 --- a/python/pyspark/mllib/stat/test.py +++ b/python/pyspark/mllib/stat/test.py @@ -61,9 +61,10 @@ def __str__(self): return self._java_model.toString() +@inherit_doc class ChiSqTestResult(TestResult): """ - Object containing the test results for the chi-squared hypothesis test. + Contains test results for the chi-squared hypothesis test. """ @property @@ -74,7 +75,8 @@ def method(self): return self._java_model.method() +@inherit_doc class KolmogorovSmirnovTestResult(TestResult): """ - Object containing the test results for the ks tests. 
+ Contains test results for the Kolmogorov-Smirnov test. """ diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 9b0f5577adef8..3f5a02af12e39 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -883,6 +883,10 @@ def test_R_implementation_equivalence(self): self.assertAlmostEqual(model.statistic, 0.189, 3) self.assertAlmostEqual(model.pValue, 0.422, 3) + model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1) + self.assertAlmostEqual(model.statistic, 0.189, 3) + self.assertAlmostEqual(model.pValue, 0.422, 3) + class SerDeTest(MLlibTestCase): def test_to_java_object_rdd(self): # SPARK-6660 From 2dd009dcf76639db46332f4b3bd29925e7e27898 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Sun, 19 Jul 2015 02:10:08 +0530 Subject: [PATCH 3/3] minor --- python/pyspark/mllib/stat/_statistics.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py index ddc453bfba5ce..36c8f48a4a882 100644 --- a/python/pyspark/mllib/stat/_statistics.py +++ b/python/pyspark/mllib/stat/_statistics.py @@ -248,7 +248,7 @@ def kolmogorovSmirnovTest(data, distName="norm", *params): """ .. note:: Experimental - Performs the Kolmogorov Smirnov (KS) test for data sampled from + Performs the Kolmogorov-Smirnov (KS) test for data sampled from a continuous distribution. It tests the null hypothesis that the data is generated from a particular distribution. @@ -286,6 +286,13 @@ def kolmogorovSmirnovTest(data, distName="norm", *params): 0.175 >>> ksmodel.nullHypothesis u'Sample follows theoretical distribution' + + >>> data = sc.parallelize([2.0, 3.0, 4.0]) + >>> ksmodel = kstest(data, "norm", 3.0, 1.0) + >>> print(round(ksmodel.pValue, 3)) + 1.0 + >>> print(round(ksmodel.statistic, 3)) + 0.175 """ if not isinstance(data, RDD): raise TypeError("data should be an RDD, got %s." % type(data))