From 7dda1f496465eb1b9972adf7219f4878cfe766e3 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 21 Jul 2015 16:22:55 +0800 Subject: [PATCH 1/5] support ml.NaiveBayes for Python --- python/pyspark/ml/classification.py | 112 +++++++++++++++++++++++++++- python/pyspark/ml/wrapper.py | 2 +- 2 files changed, 112 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 89117e492846b..9a1a63d9450bb 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -25,7 +25,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', - 'RandomForestClassifier', 'RandomForestClassificationModel'] + 'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes', + 'NaiveBayesModel'] @inherit_doc @@ -576,6 +577,115 @@ class GBTClassificationModel(TreeEnsembleModels): """ +@inherit_doc +class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol): + """ + Naive Bayes Classifiers. + + >>> from pyspark.sql import Row + >>> from pyspark.mllib.linalg import Vectors + >>> df = sc.parallelize([ + ... Row(label=0.0, features=Vectors.dense([0.0, 0.0])), + ... Row(label=0.0, features=Vectors.dense([0.0, 1.0])), + ... Row(label=1.0, features=Vectors.dense([1.0, 0.0]))]).toDF() + >>> nb = NaiveBayes(lambda_=1.0, modelType="multinomial") + >>> model = nb.fit(df) + >>> model.pi + DenseVector([-0.51..., -0.91...]) + >>> model.theta + DenseMatrix(2, 2, [-1.09..., -0.40..., -0.40..., -1.09...], 1) + >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() + >>> model.transform(test0).head().prediction + 1.0 + >>> test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF() + >>> model.transform(test1).head().prediction + 1.0 + """ + + # a placeholder to make it appear in the generated doc + lambda_ = Param(Params._dummy(), "lambda_", "The smoothing parameter, should be >= 0.") + modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + + "(case-sensitive). Supported options: multinomial (default) and bernoulli.") + + @keyword_only + def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + lambda_=1.0, modelType="multinomial"): + """ + __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + lambda_=1.0, modelType="multinomial") + """ + super(NaiveBayes, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.classification.NaiveBayes", self.uid) + #: param for the smoothing parameter. + self.lambda_ = Param(self, "lambda_", "") + #: param for the model type. + self.modelType = Param(self, "modelType", "") + self._setDefault(lambda_=1.0, modelType="multinomial") + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + lambda_=1.0, modelType="multinomial"): + """ + setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + lambda_=1.0, modelType="multinomial") + Sets params for Naive Bayes. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def _create_model(self, java_model): + return NaiveBayesModel(java_model) + + def setLambda(self, value): + """ + Sets the value of :py:attr:`lambda_`. + """ + self._paramMap[self.lambda_] = value + return self + + def getLambda(self): + """ + Gets the value of lambda_ or its default value. + """ + return self.getOrDefault(self.lambda_) + + def setModelType(self, value): + """ + Sets the value of :py:attr:`modelType`. + """ + self._paramMap[self.modelType] = value + return self + + def getModelType(self): + """ + Gets the value of modelType or its default value. + """ + return self.getOrDefault(self.modelType) + + +class NaiveBayesModel(JavaModel): + """ + Model fitted by NaiveBayes. + """ + + @property + def pi(self): + """ + log of class priors. + """ + return self._call_java("pi") + + @property + def theta(self): + """ + log of class conditional probabilities. + """ + return self._call_java("theta") + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 253705bde913e..79c1474a84ac1 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -68,7 +68,7 @@ def _make_java_param_pair(self, param, value): """ sc = SparkContext._active_spark_context param = self._resolveParam(param) - java_param = self._java_obj.getParam(param.name) + java_param = self._java_obj.getParam(param.name.rstrip("_")) java_value = _py2java(sc, value) return java_param.w(java_value) From 180452ae9c3a63b86b5d7a71bec8416ff2e2bc15 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 21 Jul 2015 16:50:11 +0800 Subject: [PATCH 2/5] fix typos --- python/pyspark/ml/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 9a1a63d9450bb..55d0bd3d76941 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -605,7 +605,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol): # a placeholder to make it appear in the generated doc lambda_ = Param(Params._dummy(), "lambda_", "The smoothing parameter, should be >= 0.") modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + - "(case-sensitive). Supported options: multinomial (default) and bernoulli.") + "(case-sensitive). Supported options: multinomial (default) and bernoulli.") @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", From f9c94d1015e0e328aa265b86c9b95ec8185f9ba6 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 30 Jul 2015 20:45:28 +0800 Subject: [PATCH 3/5] change lambda_ to smoothing and fix other issues --- .../spark/ml/classification/NaiveBayes.scala | 10 +++--- .../classification/JavaNaiveBayesSuite.java | 4 +-- .../ml/classification/NaiveBayesSuite.scala | 6 ++-- python/pyspark/ml/classification.py | 34 +++++++++++-------- python/pyspark/ml/wrapper.py | 2 +- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index 1f547e4a98af7..5be35fe209291 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -38,11 +38,11 @@ private[ml] trait NaiveBayesParams extends PredictorParams { * (default = 1.0). * @group param */ - final val lambda: DoubleParam = new DoubleParam(this, "lambda", "The smoothing parameter.", + final val smoothing: DoubleParam = new DoubleParam(this, "smoothing", "The smoothing parameter.", ParamValidators.gtEq(0)) /** @group getParam */ - final def getLambda: Double = $(lambda) + final def getSmoothing: Double = $(smoothing) /** * The model type which is a string (case-sensitive). @@ -79,8 +79,8 @@ class NaiveBayes(override val uid: String) * Default is 1.0. * @group setParam */ - def setLambda(value: Double): this.type = set(lambda, value) - setDefault(lambda -> 1.0) + def setSmoothing(value: Double): this.type = set(smoothing, value) + setDefault(smoothing -> 1.0) /** * Set the model type using a string (case-sensitive). @@ -92,7 +92,7 @@ class NaiveBayes(override val uid: String) override protected def train(dataset: DataFrame): NaiveBayesModel = { val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset) - val oldModel = OldNaiveBayes.train(oldDataset, $(lambda), $(modelType)) + val oldModel = OldNaiveBayes.train(oldDataset, $(smoothing), $(modelType)) NaiveBayesModel.fromOld(oldModel, this) } diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java index 09a9fba0c19cf..a700c9cddb206 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java @@ -68,7 +68,7 @@ public void naiveBayesDefaultParams() { assert(nb.getLabelCol() == "label"); assert(nb.getFeaturesCol() == "features"); assert(nb.getPredictionCol() == "prediction"); - assert(nb.getLambda() == 1.0); + assert(nb.getSmoothing() == 1.0); assert(nb.getModelType() == "multinomial"); } @@ -89,7 +89,7 @@ public void testNaiveBayes() { }); DataFrame dataset = jsql.createDataFrame(jrdd, schema); - NaiveBayes nb = new NaiveBayes().setLambda(0.5).setModelType("multinomial"); + NaiveBayes nb = new NaiveBayes().setSmoothing(0.5).setModelType("multinomial"); NaiveBayesModel model = nb.fit(dataset); DataFrame predictionAndLabels = model.transform(dataset).select("prediction", "label"); diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala index 76381a2741296..264bde3703c5f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala @@ -58,7 +58,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext { assert(nb.getLabelCol === "label") assert(nb.getFeaturesCol === "features") assert(nb.getPredictionCol === "prediction") - assert(nb.getLambda === 1.0) + assert(nb.getSmoothing === 1.0) assert(nb.getModelType === "multinomial") } @@ -75,7 +75,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext { val testDataset = sqlContext.createDataFrame(generateNaiveBayesInput( piArray, thetaArray, nPoints, 42, "multinomial")) - val nb = new NaiveBayes().setLambda(1.0).setModelType("multinomial") + val nb = new NaiveBayes().setSmoothing(1.0).setModelType("multinomial") val model = nb.fit(testDataset) validateModelFit(pi, theta, model) @@ -101,7 +101,7 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext { val testDataset = sqlContext.createDataFrame(generateNaiveBayesInput( piArray, thetaArray, nPoints, 45, "bernoulli")) - val nb = new NaiveBayes().setLambda(1.0).setModelType("bernoulli") + val nb = new NaiveBayes().setSmoothing(1.0).setModelType("bernoulli") val model = nb.fit(testDataset) validateModelFit(pi, theta, model) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 55d0bd3d76941..9fbc6283ed1f2 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -584,11 +584,11 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol): >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors - >>> df = sc.parallelize([ + >>> df = sqlContext.createDataFrame([ ... Row(label=0.0, features=Vectors.dense([0.0, 0.0])), ... Row(label=0.0, features=Vectors.dense([0.0, 1.0])), - ... Row(label=1.0, features=Vectors.dense([1.0, 0.0]))]).toDF() - >>> nb = NaiveBayes(lambda_=1.0, modelType="multinomial") + ... Row(label=1.0, features=Vectors.dense([1.0, 0.0]))]) + >>> nb = NaiveBayes(smoothing=1.0, modelType="multinomial") >>> model = nb.fit(df) >>> model.pi DenseVector([-0.51..., -0.91...]) @@ -603,34 +603,38 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol): """ # a placeholder to make it appear in the generated doc - lambda_ = Param(Params._dummy(), "lambda_", "The smoothing parameter, should be >= 0.") + smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " + + "default is 1.0") modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + "(case-sensitive). Supported options: multinomial (default) and bernoulli.") @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - lambda_=1.0, modelType="multinomial"): + smoothing=1.0, modelType="multinomial"): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - lambda_=1.0, modelType="multinomial") + smoothing=1.0, modelType="multinomial") """ super(NaiveBayes, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.NaiveBayes", self.uid) #: param for the smoothing parameter. - self.lambda_ = Param(self, "lambda_", "") + self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " + + "default is 1.0") #: param for the model type. - self.modelType = Param(self, "modelType", "") - self._setDefault(lambda_=1.0, modelType="multinomial") + self.modelType = Param(self, "modelType", "The model type which is a string " + + "(case-sensitive). Supported options: multinomial (default) " + + "and bernoulli.") + self._setDefault(smoothing=1.0, modelType="multinomial") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - lambda_=1.0, modelType="multinomial"): + smoothing=1.0, modelType="multinomial"): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - lambda_=1.0, modelType="multinomial") + smoothing=1.0, modelType="multinomial") Sets params for Naive Bayes. """ kwargs = self.setParams._input_kwargs @@ -641,16 +645,16 @@ def _create_model(self, java_model): def setLambda(self, value): """ - Sets the value of :py:attr:`lambda_`. + Sets the value of :py:attr:`smoothing`. """ - self._paramMap[self.lambda_] = value + self._paramMap[self.smoothing] = value return self def getLambda(self): """ - Gets the value of lambda_ or its default value. + Gets the value of smoothing or its default value. """ - return self.getOrDefault(self.lambda_) + return self.getOrDefault(self.smoothing) def setModelType(self, value): """ diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 79c1474a84ac1..253705bde913e 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -68,7 +68,7 @@ def _make_java_param_pair(self, param, value): """ sc = SparkContext._active_spark_context param = self._resolveParam(param) - java_param = self._java_obj.getParam(param.name.rstrip("_")) + java_param = self._java_obj.getParam(param.name) java_value = _py2java(sc, value) return java_param.w(java_value) From 3ecd0467b43d706b30ff427ef17c3336e4f55a9b Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 30 Jul 2015 23:19:19 +0800 Subject: [PATCH 4/5] fix typos --- python/pyspark/ml/classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 9fbc6283ed1f2..fc58f22a37fb6 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -604,7 +604,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol): # a placeholder to make it appear in the generated doc smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " + - "default is 1.0") + "default is 1.0") modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + "(case-sensitive). Supported options: multinomial (default) and bernoulli.") @@ -620,7 +620,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred "org.apache.spark.ml.classification.NaiveBayes", self.uid) #: param for the smoothing parameter. self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " + - "default is 1.0") + "default is 1.0") #: param for the model type. self.modelType = Param(self, "modelType", "The model type which is a string " + "(case-sensitive). Supported options: multinomial (default) " + From 5ee3fd60a462f10b6cf74ad4331b3c033b5453f1 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 31 Jul 2015 12:08:57 +0800 Subject: [PATCH 5/5] fix typos --- python/pyspark/ml/classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index fc58f22a37fb6..df658902b32f0 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -643,14 +643,14 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return NaiveBayesModel(java_model) - def setLambda(self, value): + def setSmoothing(self, value): """ Sets the value of :py:attr:`smoothing`. """ self._paramMap[self.smoothing] = value return self - def getLambda(self): + def getSmoothing(self): """ Gets the value of smoothing or its default value. """