From 7dda1f496465eb1b9972adf7219f4878cfe766e3 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 21 Jul 2015 16:22:55 +0800
Subject: [PATCH 1/5] support ml.NaiveBayes for Python

---
 python/pyspark/ml/classification.py | 112 +++++++++++++++++++++++++++-
 python/pyspark/ml/wrapper.py        |   2 +-
 2 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 89117e492846b..9a1a63d9450bb 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -25,7 +25,8 @@
 
 __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier',
            'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel',
-           'RandomForestClassifier', 'RandomForestClassificationModel']
+           'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes',
+           'NaiveBayesModel']
 
 
 @inherit_doc
@@ -576,6 +577,115 @@ class GBTClassificationModel(TreeEnsembleModels):
     """
 
 
+@inherit_doc
+class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol):
+    """
+    Naive Bayes Classifiers.
+
+    >>> from pyspark.sql import Row
+    >>> from pyspark.mllib.linalg import Vectors
+    >>> df = sc.parallelize([
+    ...     Row(label=0.0, features=Vectors.dense([0.0, 0.0])),
+    ...     Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
+    ...     Row(label=1.0, features=Vectors.dense([1.0, 0.0]))]).toDF()
+    >>> nb = NaiveBayes(lambda_=1.0, modelType="multinomial")
+    >>> model = nb.fit(df)
+    >>> model.pi
+    DenseVector([-0.51..., -0.91...])
+    >>> model.theta
+    DenseMatrix(2, 2, [-1.09..., -0.40..., -0.40..., -1.09...], 1)
+    >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
+    >>> model.transform(test0).head().prediction
+    1.0
+    >>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
+    >>> model.transform(test1).head().prediction
+    1.0
+    """
+
+    # a placeholder to make it appear in the generated doc
+    lambda_ = Param(Params._dummy(), "lambda_", "The smoothing parameter, should be >= 0.")
+    modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
+        "(case-sensitive). Supported options: multinomial (default) and bernoulli.")
+
+    @keyword_only
+    def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+                 lambda_=1.0, modelType="multinomial"):
+        """
+        __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+                 lambda_=1.0, modelType="multinomial")
+        """
+        super(NaiveBayes, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.classification.NaiveBayes", self.uid)
+        #: param for the smoothing parameter.
+        self.lambda_ = Param(self, "lambda_", "")
+        #: param for the model type.
+        self.modelType = Param(self, "modelType", "")
+        self._setDefault(lambda_=1.0, modelType="multinomial")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+                  lambda_=1.0, modelType="multinomial"):
+        """
+        setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
+                  lambda_=1.0, modelType="multinomial")
+        Sets params for Naive Bayes.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def _create_model(self, java_model):
+        return NaiveBayesModel(java_model)
+
+    def setLambda(self, value):
+        """
+        Sets the value of :py:attr:`lambda_`.
+        """
+        self._paramMap[self.lambda_] = value
+        return self
+
+    def getLambda(self):
+        """
+        Gets the value of lambda_ or its default value.
+        """
+        return self.getOrDefault(self.lambda_)
+
+    def setModelType(self, value):
+        """
+        Sets the value of :py:attr:`modelType`.
+        """
+        self._paramMap[self.modelType] = value
+        return self
+
+    def getModelType(self):
+        """
+        Gets the value of modelType or its default value.
+        """
+        return self.getOrDefault(self.modelType)
+
+
+class NaiveBayesModel(JavaModel):
+    """
+    Model fitted by NaiveBayes.
+    """
+
+    @property
+    def pi(self):
+        """
+        log of class priors.
+        """
+        return self._call_java("pi")
+
+    @property
+    def theta(self):
+        """
+        log of class conditional probabilities.
+        """
+        return self._call_java("theta")
+
+
 if __name__ == "__main__":
     import doctest
     from pyspark.context import SparkContext
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index 253705bde913e..79c1474a84ac1 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -68,7 +68,7 @@ def _make_java_param_pair(self, param, value):
         """
         sc = SparkContext._active_spark_context
         param = self._resolveParam(param)
-        java_param = self._java_obj.getParam(param.name)
+        java_param = self._java_obj.getParam(param.name.rstrip("_"))
         java_value = _py2java(sc, value)
         return java_param.w(java_value)
 

From 180452ae9c3a63b86b5d7a71bec8416ff2e2bc15 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 21 Jul 2015 16:50:11 +0800
Subject: [PATCH 2/5] fix continuation-line indentation of modelType param doc

---
 python/pyspark/ml/classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 9a1a63d9450bb..55d0bd3d76941 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -605,7 +605,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol):
     # a placeholder to make it appear in the generated doc
     lambda_ = Param(Params._dummy(), "lambda_", "The smoothing parameter, should be >= 0.")
     modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
-        "(case-sensitive). Supported options: multinomial (default) and bernoulli.")
+                      "(case-sensitive). Supported options: multinomial (default) and bernoulli.")
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

From f9c94d1015e0e328aa265b86c9b95ec8185f9ba6 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 30 Jul 2015 20:45:28 +0800
Subject: [PATCH 3/5] change lambda_ to smoothing and fix other issues

---
 .../spark/ml/classification/NaiveBayes.scala  | 10 +++---
 .../classification/JavaNaiveBayesSuite.java   |  4 +--
 .../ml/classification/NaiveBayesSuite.scala   |  6 ++--
 python/pyspark/ml/classification.py           | 34 +++++++++++--------
 python/pyspark/ml/wrapper.py                  |  2 +-
 5 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index 1f547e4a98af7..5be35fe209291 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -38,11 +38,11 @@ private[ml] trait NaiveBayesParams extends PredictorParams {
    * (default = 1.0).
    * @group param
    */
-  final val lambda: DoubleParam = new DoubleParam(this, "lambda", "The smoothing parameter.",
+  final val smoothing: DoubleParam = new DoubleParam(this, "smoothing", "The smoothing parameter.",
     ParamValidators.gtEq(0))
 
   /** @group getParam */
-  final def getLambda: Double = $(lambda)
+  final def getSmoothing: Double = $(smoothing)
 
   /**
    * The model type which is a string (case-sensitive).
@@ -79,8 +79,8 @@ class NaiveBayes(override val uid: String)
    * Default is 1.0.
    * @group setParam
    */
-  def setLambda(value: Double): this.type = set(lambda, value)
-  setDefault(lambda -> 1.0)
+  def setSmoothing(value: Double): this.type = set(smoothing, value)
+  setDefault(smoothing -> 1.0)
 
   /**
    * Set the model type using a string (case-sensitive).
@@ -92,7 +92,7 @@ class NaiveBayes(override val uid: String)
 
   override protected def train(dataset: DataFrame): NaiveBayesModel = {
     val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
-    val oldModel = OldNaiveBayes.train(oldDataset, $(lambda), $(modelType))
+    val oldModel = OldNaiveBayes.train(oldDataset, $(smoothing), $(modelType))
     NaiveBayesModel.fromOld(oldModel, this)
   }
 
diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
index 09a9fba0c19cf..a700c9cddb206 100644
--- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
@@ -68,7 +68,7 @@ public void naiveBayesDefaultParams() {
     assert(nb.getLabelCol() == "label");
     assert(nb.getFeaturesCol() == "features");
     assert(nb.getPredictionCol() == "prediction");
-    assert(nb.getLambda() == 1.0);
+    assert(nb.getSmoothing() == 1.0);
     assert(nb.getModelType() == "multinomial");
   }
 
@@ -89,7 +89,7 @@ public void testNaiveBayes() {
     });
 
     DataFrame dataset = jsql.createDataFrame(jrdd, schema);
-    NaiveBayes nb = new NaiveBayes().setLambda(0.5).setModelType("multinomial");
+    NaiveBayes nb = new NaiveBayes().setSmoothing(0.5).setModelType("multinomial");
     NaiveBayesModel model = nb.fit(dataset);
 
     DataFrame predictionAndLabels = model.transform(dataset).select("prediction", "label");
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
index 76381a2741296..264bde3703c5f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
@@ -58,7 +58,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(nb.getLabelCol === "label")
     assert(nb.getFeaturesCol === "features")
     assert(nb.getPredictionCol === "prediction")
-    assert(nb.getLambda === 1.0)
+    assert(nb.getSmoothing === 1.0)
     assert(nb.getModelType === "multinomial")
   }
 
@@ -75,7 +75,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     val testDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
       piArray, thetaArray, nPoints, 42, "multinomial"))
-    val nb = new NaiveBayes().setLambda(1.0).setModelType("multinomial")
+    val nb = new NaiveBayes().setSmoothing(1.0).setModelType("multinomial")
     val model = nb.fit(testDataset)
 
     validateModelFit(pi, theta, model)
@@ -101,7 +101,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext {
 
     val testDataset = sqlContext.createDataFrame(generateNaiveBayesInput(
       piArray, thetaArray, nPoints, 45, "bernoulli"))
-    val nb = new NaiveBayes().setLambda(1.0).setModelType("bernoulli")
+    val nb = new NaiveBayes().setSmoothing(1.0).setModelType("bernoulli")
     val model = nb.fit(testDataset)
 
     validateModelFit(pi, theta, model)
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 55d0bd3d76941..9fbc6283ed1f2 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -584,11 +584,11 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol):
 
     >>> from pyspark.sql import Row
     >>> from pyspark.mllib.linalg import Vectors
-    >>> df = sc.parallelize([
+    >>> df = sqlContext.createDataFrame([
     ...     Row(label=0.0, features=Vectors.dense([0.0, 0.0])),
     ...     Row(label=0.0, features=Vectors.dense([0.0, 1.0])),
-    ...     Row(label=1.0, features=Vectors.dense([1.0, 0.0]))]).toDF()
-    >>> nb = NaiveBayes(lambda_=1.0, modelType="multinomial")
+    ...     Row(label=1.0, features=Vectors.dense([1.0, 0.0]))])
+    >>> nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
     >>> model = nb.fit(df)
     >>> model.pi
     DenseVector([-0.51..., -0.91...])
@@ -603,34 +603,38 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol):
     """
 
     # a placeholder to make it appear in the generated doc
-    lambda_ = Param(Params._dummy(), "lambda_", "The smoothing parameter, should be >= 0.")
+    smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " +
+                    "default is 1.0")
     modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
                       "(case-sensitive). Supported options: multinomial (default) and bernoulli.")
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                 lambda_=1.0, modelType="multinomial"):
+                 smoothing=1.0, modelType="multinomial"):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                 lambda_=1.0, modelType="multinomial")
+                 smoothing=1.0, modelType="multinomial")
         """
         super(NaiveBayes, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.NaiveBayes", self.uid)
         #: param for the smoothing parameter.
-        self.lambda_ = Param(self, "lambda_", "")
+        self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " +
+                             "default is 1.0")
         #: param for the model type.
-        self.modelType = Param(self, "modelType", "")
-        self._setDefault(lambda_=1.0, modelType="multinomial")
+        self.modelType = Param(self, "modelType", "The model type which is a string " +
+                               "(case-sensitive). Supported options: multinomial (default) " +
+                               "and bernoulli.")
+        self._setDefault(smoothing=1.0, modelType="multinomial")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                  lambda_=1.0, modelType="multinomial"):
+                  smoothing=1.0, modelType="multinomial"):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                  lambda_=1.0, modelType="multinomial")
+                  smoothing=1.0, modelType="multinomial")
         Sets params for Naive Bayes.
         """
         kwargs = self.setParams._input_kwargs
@@ -641,16 +645,16 @@ def _create_model(self, java_model):
 
     def setLambda(self, value):
         """
-        Sets the value of :py:attr:`lambda_`.
+        Sets the value of :py:attr:`smoothing`.
         """
-        self._paramMap[self.lambda_] = value
+        self._paramMap[self.smoothing] = value
         return self
 
     def getLambda(self):
         """
-        Gets the value of lambda_ or its default value.
+        Gets the value of smoothing or its default value.
         """
-        return self.getOrDefault(self.lambda_)
+        return self.getOrDefault(self.smoothing)
 
     def setModelType(self, value):
         """
diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py
index 79c1474a84ac1..253705bde913e 100644
--- a/python/pyspark/ml/wrapper.py
+++ b/python/pyspark/ml/wrapper.py
@@ -68,7 +68,7 @@ def _make_java_param_pair(self, param, value):
         """
         sc = SparkContext._active_spark_context
         param = self._resolveParam(param)
-        java_param = self._java_obj.getParam(param.name.rstrip("_"))
+        java_param = self._java_obj.getParam(param.name)
         java_value = _py2java(sc, value)
         return java_param.w(java_value)
 

From 3ecd0467b43d706b30ff427ef17c3336e4f55a9b Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 30 Jul 2015 23:19:19 +0800
Subject: [PATCH 4/5] fix continuation-line indentation of smoothing param doc

---
 python/pyspark/ml/classification.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 9fbc6283ed1f2..fc58f22a37fb6 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -604,7 +604,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol):
 
     # a placeholder to make it appear in the generated doc
     smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " +
-                    "default is 1.0")
+                      "default is 1.0")
     modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
                       "(case-sensitive). Supported options: multinomial (default) and bernoulli.")
 
@@ -620,7 +620,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
             "org.apache.spark.ml.classification.NaiveBayes", self.uid)
         #: param for the smoothing parameter.
         self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " +
-                             "default is 1.0")
+                               "default is 1.0")
         #: param for the model type.
         self.modelType = Param(self, "modelType", "The model type which is a string " +
                                "(case-sensitive). Supported options: multinomial (default) " +

From 5ee3fd60a462f10b6cf74ad4331b3c033b5453f1 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 31 Jul 2015 12:08:57 +0800
Subject: [PATCH 5/5] rename setLambda/getLambda to setSmoothing/getSmoothing

---
 python/pyspark/ml/classification.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index fc58f22a37fb6..df658902b32f0 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -643,14 +643,14 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre
     def _create_model(self, java_model):
         return NaiveBayesModel(java_model)
 
-    def setLambda(self, value):
+    def setSmoothing(self, value):
         """
         Sets the value of :py:attr:`smoothing`.
         """
         self._paramMap[self.smoothing] = value
         return self
 
-    def getLambda(self):
+    def getSmoothing(self):
         """
         Gets the value of smoothing or its default value.
         """