From 49d07ab0d28b2c1c6fc5deb6f264be8c4b555a5a Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Thu, 16 Jul 2015 02:35:05 +0530
Subject: [PATCH 1/3] [SPARK-8996] [MLlib] Python API for Kolmogorov-Smirnov
 Test

---
 .../mllib/api/python/PythonMLLibAPI.scala     | 23 +++++++-
 python/pyspark/mllib/stat/_statistics.py      | 54 ++++++++++++++++++-
 python/pyspark/mllib/stat/test.py             | 35 +++++++-----
 python/pyspark/mllib/tests.py                 | 15 ++++++
 4 files changed, 113 insertions(+), 14 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index c58a64001d9a0..331cc1b9a971c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -43,7 +43,7 @@ import org.apache.spark.mllib.recommendation._
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.stat.correlation.CorrelationNames
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
-import org.apache.spark.mllib.stat.test.ChiSqTestResult
+import org.apache.spark.mllib.stat.test.{ChiSqTestResult, KolmogorovSmirnovTestResult}
 import org.apache.spark.mllib.stat.{
   KernelDensity, MultivariateStatisticalSummary, Statistics}
 import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy, Strategy}
@@ -1093,6 +1093,27 @@ private[python] class PythonMLLibAPI extends Serializable {
     LinearDataGenerator.generateLinearRDD(
       sc, nexamples, nfeatures, eps, nparts, intercept)
   }
+
+  /**
+   * Wrapper around Statistics.kolmogorovSmirnovTestWrapper with default params.
+   */
+  def kolmogorovSmirnovTestWrapper(
+      data: JavaRDD[Double],
+      distName: String): KolmogorovSmirnovTestResult = {
+    Statistics.kolmogorovSmirnovTest(data, distName)
+  }
+
+  /**
+   * Wrapper around Statistics.kolmogorovSmirnovTestWrapper.
+   */
+  def kolmogorovSmirnovTestWrapper(
+      data: JavaRDD[Double],
+      distName: String,
+      params: JList[Double]): KolmogorovSmirnovTestResult = {
+    val seqParams = params.asScala.toSeq
+    Statistics.kolmogorovSmirnovTest(data, distName, seqParams: _*)
+  }
+
 }
 
 /**
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index b475be4b4d953..240a8049c47d1 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -15,11 +15,15 @@
 # limitations under the License.
 #
 
+import sys
+if sys.version >= '3':
+    basestring = str
+
 from pyspark.rdd import RDD, ignore_unicode_prefix
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
 from pyspark.mllib.linalg import Matrix, _convert_to_vector
 from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.stat.test import ChiSqTestResult
+from pyspark.mllib.stat.test import ChiSqTestResult, KolmogorovSmirnovTestResult
 
 
 __all__ = ['MultivariateStatisticalSummary', 'Statistics']
@@ -238,6 +242,54 @@ def chiSqTest(observed, expected=None):
             jmodel = callMLlibFunc("chiSqTest", _convert_to_vector(observed), expected)
         return ChiSqTestResult(jmodel)
 
+    @staticmethod
+    @ignore_unicode_prefix
+    def kolmogorovSmirnovTest(data, distName="norm", *params):
+        """
+        .. note:: Experimental
+
+        Performs the Kolmogorov Smirnov (KS) test for data sampled from a continuous
+        distribution. It tests the null hypothesis that the data is generated from a
+        particular distribution.
+
+        The given data is sorted, the Empirical Cumulative Distribution Function (ECDF)
+        is calculated which is the number of points having a CDF value lesser than a given point
+        divided by the total number of points. Since the data is sorted, this is a step function
+        that rises by (1 / length of data) for every ordered point.
+
+        The KS statistic gives us the maximum distance between the ECDF and the CDF. Intuitively
+        if this value is large, the probabilty that the null hypothesis is true becomes small.
+        For specific details of the implementation, please have a look at the Scala documentation.
+
+        :param data: RDD, samples from the data
+        :param distName: string, currently only "norm" is suuported. (Normal distribution)
+        :param params: additional values which need to be provided for a certain distribution.
+                       If not provided, the default values are used.
+        :return: KolmogorovSmirnovTestResult object containing the test statistic, degrees
+                 of freedom, p-value, the method used, and the null hypothesis.
+
+        >>> kstest = Statistics.kolmogorovSmirnovTest
+        >>> data = sc.parallelize([-1.0, 0.0, 1.0])
+        >>> ksmodel = kstest(data, "norm")
+        >>> print(round(ksmodel.pValue, 3))
+        1.0
+        >>> print(round(ksmodel.statistic, 3))
+        0.175
+        >>> ksmodel.nullHypothesis
+        u'Sample follows theoretical distribution'
+
+        """
+        if not isinstance(data, RDD):
+            raise TypeError("data should be an RDD, got %s." % type(data))
+        if not isinstance(distName, str):
+            raise TypeError("distName should be a string, got %s." % type(distname))
+
+        if len(params) == 0:
+            jmodel = callMLlibFunc("kolmogorovSmirnovTestWrapper", data, distName)
+        else:
+            jmodel = callMLlibFunc("kolmogorovSmirnovTestWrapper", data, distName, list(params))
+        return KolmogorovSmirnovTestResult(jmodel)
+
 
 def _test():
     import doctest
diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py
index 762506e952b43..53f61ccd2c842 100644
--- a/python/pyspark/mllib/stat/test.py
+++ b/python/pyspark/mllib/stat/test.py
@@ -15,24 +15,16 @@
 # limitations under the License.
 #
 
-from pyspark.mllib.common import JavaModelWrapper
+from pyspark.mllib.common import inherit_doc, JavaModelWrapper
 
 
-__all__ = ["ChiSqTestResult"]
+__all__ = ["ChiSqTestResult", "KolmogorovSmirnovTestResult"]
 
 
-class ChiSqTestResult(JavaModelWrapper):
+class TestResult(JavaModelWrapper):
     """
-    .. note:: Experimental
-
-    Object containing the test results for the chi-squared hypothesis test.
+    Base class for all test results.
     """
-    @property
-    def method(self):
-        """
-        Name of the test method
-        """
-        return self._java_model.method()
 
     @property
     def pValue(self):
@@ -67,3 +59,22 @@ def nullHypothesis(self):
 
     def __str__(self):
         return self._java_model.toString()
+
+
+class ChiSqTestResult(TestResult):
+    """
+    Object containing the test results for the chi-squared hypothesis test.
+    """
+
+    @property
+    def method(self):
+        """
+        Name of the test method
+        """
+        return self._java_model.method()
+
+
+class KolmogorovSmirnovTestResult(TestResult):
+    """
+    Object containing the test results for the ks tests.
+    """
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index f2eab5b18f077..9b0f5577adef8 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -869,6 +869,21 @@ def test_right_number_of_results(self):
         self.assertIsNotNone(chi[1000])
 
 
+class KolmogorovSmirnovTest(MLlibTestCase):
+
+    def test_R_implementation_equivalence(self):
+        data = self.sc.parallelize([
+            1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
+            -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
+            -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
+            -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
+            0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
+        ])
+        model = Statistics.kolmogorovSmirnovTest(data, "norm")
+        self.assertAlmostEqual(model.statistic, 0.189, 3)
+        self.assertAlmostEqual(model.pValue, 0.422, 3)
+
+
 class SerDeTest(MLlibTestCase):
     def test_to_java_object_rdd(self):  # SPARK-6660
         data = RandomRDDs.uniformRDD(self.sc, 10, 5, seed=0)

From 021d23386b42cd28ceeb40e5fcecb4c469c1e044 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Fri, 17 Jul 2015 16:23:50 +0530
Subject: [PATCH 2/3] Remove redundant wrapper, fix distName check, reflow docs

---
 .../mllib/api/python/PythonMLLibAPI.scala     | 17 ++-----
 python/pyspark/mllib/stat/_statistics.py      | 48 +++++++++++--------
 python/pyspark/mllib/stat/test.py             |  6 ++-
 python/pyspark/mllib/tests.py                 |  4 ++
 4 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 331cc1b9a971c..fda8d5a0b048f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -1095,23 +1095,14 @@ private[python] class PythonMLLibAPI extends Serializable {
   }
 
   /**
-   * Wrapper around Statistics.kolmogorovSmirnovTestWrapper with default params.
+   * Java stub for Statistics.kolmogorovSmirnovTest()
    */
-  def kolmogorovSmirnovTestWrapper(
-      data: JavaRDD[Double],
-      distName: String): KolmogorovSmirnovTestResult = {
-    Statistics.kolmogorovSmirnovTest(data, distName)
-  }
-
-  /**
-   * Wrapper around Statistics.kolmogorovSmirnovTestWrapper.
-   */
-  def kolmogorovSmirnovTestWrapper(
+  def kolmogorovSmirnovTest(
       data: JavaRDD[Double],
       distName: String,
       params: JList[Double]): KolmogorovSmirnovTestResult = {
-    val seqParams = params.asScala.toSeq
-    Statistics.kolmogorovSmirnovTest(data, distName, seqParams: _*)
+    val paramsSeq = params.asScala.toSeq
+    Statistics.kolmogorovSmirnovTest(data, distName, paramsSeq: _*)
   }
 
 }
diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index 240a8049c47d1..ddc453bfba5ce 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -248,25 +248,34 @@ def kolmogorovSmirnovTest(data, distName="norm", *params):
         """
         .. note:: Experimental
 
-        Performs the Kolmogorov Smirnov (KS) test for data sampled from a continuous
-        distribution. It tests the null hypothesis that the data is generated from a
-        particular distribution.
+        Performs the Kolmogorov Smirnov (KS) test for data sampled from
+        a continuous distribution. It tests the null hypothesis that
+        the data is generated from a particular distribution.
 
-        The given data is sorted, the Empirical Cumulative Distribution Function (ECDF)
-        is calculated which is the number of points having a CDF value lesser than a given point
-        divided by the total number of points. Since the data is sorted, this is a step function
+        The given data is sorted and the Empirical Cumulative
+        Distribution Function (ECDF) is calculated
+        which for a given point is the number of points having a CDF
+        value less than it, divided by the total number of points.
+
+        Since the data is sorted, this is a step function
         that rises by (1 / length of data) for every ordered point.
 
-        The KS statistic gives us the maximum distance between the ECDF and the CDF. Intuitively
-        if this value is large, the probabilty that the null hypothesis is true becomes small.
-        For specific details of the implementation, please have a look at the Scala documentation.
+        The KS statistic gives us the maximum distance between the
+        ECDF and the CDF. Intuitively if this statistic is large, the
+        probability that the null hypothesis is true becomes small.
+        For specific details of the implementation, please have a look
+        at the Scala documentation.
 
         :param data: RDD, samples from the data
-        :param distName: string, currently only "norm" is suuported. (Normal distribution)
-        :param params: additional values which need to be provided for a certain distribution.
+        :param distName: string, name of the theoretical distribution
+                         to test the data against. Currently only
+                         "norm" (Normal distribution) is supported.
+        :param params: additional values which need to be provided for
+                       a certain distribution.
                        If not provided, the default values are used.
-        :return: KolmogorovSmirnovTestResult object containing the test statistic, degrees
-                 of freedom, p-value, the method used, and the null hypothesis.
+        :return: KolmogorovSmirnovTestResult object containing the test
+                 statistic, degrees of freedom, p-value,
+                 and the null hypothesis.
 
         >>> kstest = Statistics.kolmogorovSmirnovTest
         >>> data = sc.parallelize([-1.0, 0.0, 1.0])
@@ -277,18 +286,15 @@ def kolmogorovSmirnovTest(data, distName="norm", *params):
         0.175
         >>> ksmodel.nullHypothesis
         u'Sample follows theoretical distribution'
-
         """
         if not isinstance(data, RDD):
             raise TypeError("data should be an RDD, got %s." % type(data))
-        if not isinstance(distName, str):
-            raise TypeError("distName should be a string, got %s." % type(distname))
+        if not isinstance(distName, basestring):
+            raise TypeError("distName should be a string, got %s." % type(distName))
 
-        if len(params) == 0:
-            jmodel = callMLlibFunc("kolmogorovSmirnovTestWrapper", data, distName)
-        else:
-            jmodel = callMLlibFunc("kolmogorovSmirnovTestWrapper", data, distName, list(params))
-        return KolmogorovSmirnovTestResult(jmodel)
+        params = [float(param) for param in params]
+        return KolmogorovSmirnovTestResult(
+            callMLlibFunc("kolmogorovSmirnovTest", data, distName, params))
 
 
 def _test():
diff --git a/python/pyspark/mllib/stat/test.py b/python/pyspark/mllib/stat/test.py
index 53f61ccd2c842..0abe104049ff9 100644
--- a/python/pyspark/mllib/stat/test.py
+++ b/python/pyspark/mllib/stat/test.py
@@ -61,9 +61,10 @@ def __str__(self):
         return self._java_model.toString()
 
 
+@inherit_doc
 class ChiSqTestResult(TestResult):
     """
-    Object containing the test results for the chi-squared hypothesis test.
+    Contains test results for the chi-squared hypothesis test.
     """
 
     @property
@@ -74,7 +75,8 @@ def method(self):
         return self._java_model.method()
 
 
+@inherit_doc
 class KolmogorovSmirnovTestResult(TestResult):
     """
-    Object containing the test results for the ks tests.
+    Contains test results for the Kolmogorov-Smirnov test.
     """
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 9b0f5577adef8..3f5a02af12e39 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -883,6 +883,10 @@ def test_R_implementation_equivalence(self):
         self.assertAlmostEqual(model.statistic, 0.189, 3)
         self.assertAlmostEqual(model.pValue, 0.422, 3)
 
+        model = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
+        self.assertAlmostEqual(model.statistic, 0.189, 3)
+        self.assertAlmostEqual(model.pValue, 0.422, 3)
+
 
 class SerDeTest(MLlibTestCase):
     def test_to_java_object_rdd(self):  # SPARK-6660

From 2dd009dcf76639db46332f4b3bd29925e7e27898 Mon Sep 17 00:00:00 2001
From: MechCoder <manojkumarsivaraj334@gmail.com>
Date: Sun, 19 Jul 2015 02:10:08 +0530
Subject: [PATCH 3/3] Hyphenate Kolmogorov-Smirnov; add doctest with explicit params

---
 python/pyspark/mllib/stat/_statistics.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/mllib/stat/_statistics.py b/python/pyspark/mllib/stat/_statistics.py
index ddc453bfba5ce..36c8f48a4a882 100644
--- a/python/pyspark/mllib/stat/_statistics.py
+++ b/python/pyspark/mllib/stat/_statistics.py
@@ -248,7 +248,7 @@ def kolmogorovSmirnovTest(data, distName="norm", *params):
         """
         .. note:: Experimental
 
-        Performs the Kolmogorov Smirnov (KS) test for data sampled from
+        Performs the Kolmogorov-Smirnov (KS) test for data sampled from
         a continuous distribution. It tests the null hypothesis that
         the data is generated from a particular distribution.
 
@@ -286,6 +286,13 @@ def kolmogorovSmirnovTest(data, distName="norm", *params):
         0.175
         >>> ksmodel.nullHypothesis
         u'Sample follows theoretical distribution'
+
+        >>> data = sc.parallelize([2.0, 3.0, 4.0])
+        >>> ksmodel = kstest(data, "norm", 3.0, 1.0)
+        >>> print(round(ksmodel.pValue, 3))
+        1.0
+        >>> print(round(ksmodel.statistic, 3))
+        0.175
         """
         if not isinstance(data, RDD):
             raise TypeError("data should be an RDD, got %s." % type(data))