From 70d1da9dd1ecf910c0d933c3d24d62a950b5b4c4 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 10 Aug 2015 19:18:01 +0800 Subject: [PATCH 1/5] Add Python API for MultilayerPerceptronClassifier and fix bug --- .../MultilayerPerceptronClassifier.scala | 4 +- python/pyspark/ml/classification.py | 119 ++++++++++++++++++ 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 8cd2103d7d5e6..16687bc9bb050 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -172,8 +172,8 @@ class MultilayerPerceptronClassifier(override val uid: String) @Experimental class MultilayerPerceptronClassifierModel private[ml] ( override val uid: String, - layers: Array[Int], - weights: Vector) + val layers: Array[Int], + val weights: Vector) extends PredictionModel[Vector, MultilayerPerceptronClassifierModel] with Serializable { diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5978d8f4d3a01..b89428dbdc54b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -774,6 +774,125 @@ def theta(self): return self._call_java("theta") +@inherit_doc +class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, + HasMaxIter, HasTol, HasSeed): + """ + Classifier trainer based on the Multilayer Perceptron. + Each layer has sigmoid activation function, output layer has softmax. + Number of inputs has to be equal to the size of feature vectors. + Number of outputs has to be equal to the total number of labels. + + >>> from pyspark.sql import Row + >>> from pyspark.mllib.linalg import Vectors + >>> df = sc.parallelize([ + ... Row(label=0.0, features=Vectors.dense([0.0, 0.0])), + ... Row(label=1.0, features=Vectors.dense([0.0, 1.0])), + ... Row(label=1.0, features=Vectors.dense([1.0, 0.0])), + ... Row(label=0.0, features=Vectors.dense([1.0, 1.0]))]).toDF() + >>> layers = [2, 5, 2] + >>> lr = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=11) + >>> model = lr.fit(df) + >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() + >>> model.transform(test0).head().prediction + 1.0 + >>> test1 = sc.parallelize([Row(features=Vectors.dense([0.0, 0.0]))]).toDF() + >>> model.transform(test1).head().prediction + 0.0 + """ + + # a placeholder to make it appear in the generated doc + layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " + + "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " + + "neurons and output layer of 10 neurons, default is [1, 1].") + blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " + + "matrices. Data is stacked within partitions. If block size is more than " + + "remaining data in a partition then it is adjusted to the size of this " + + "data. 
Recommended size is between 10 and 1000, default is 128.") + + @keyword_only + def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128): + """ + __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) + """ + super(MultilayerPerceptronClassifier, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) + self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " + + "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " + + "100 neurons and output layer of 10 neurons, default is [1, 1].") + self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " + + "matrices. Data is stacked within partitions. If block size is " + + "more than remaining data in a partition then it is adjusted to " + + "the size of this data. Recommended size is between 10 and 1000, " + + "default is 128.") + self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128): + """ + setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) + Sets params for MultilayerPerceptronClassifier. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def _create_model(self, java_model): + return MultilayerPerceptronClassifierModel(java_model) + + def setLayers(self, value): + """ + Sets the value of :py:attr:`layers`. + """ + self._paramMap[self.layers] = value + return self + + def getLayers(self): + """ + Gets the value of layers or its default value. + """ + return self.getOrDefault(self.layers) + + def setBlockSize(self, value): + """ + Sets the value of :py:attr:`blockSize`. + """ + self._paramMap[self.blockSize] = value + return self + + def getBlockSize(self): + """ + Gets the value of blockSize or its default value. + """ + return self.getOrDefault(self.blockSize) + + +class MultilayerPerceptronClassifierModel(JavaModel): + """ + Model fitted by MultilayerPerceptronClassifier. + """ + + @property + def layers(self): + """ + array of layer sizes including input and output layers. + """ + return self._call_java("layers") + + @property + def weights(self): + """ + vector of initial weights for the model that consists of the weights of layers. 
+ """ + return self._call_java("weights") + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext From 851081755992a37eced11a57772b410ed55f7e98 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 11 Aug 2015 16:56:46 +0800 Subject: [PATCH 2/5] workaround for python 2&3 compatibility --- python/pyspark/ml/classification.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b89428dbdc54b..09dd2855c3931 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -26,7 +26,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', 'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes', - 'NaiveBayesModel'] + 'NaiveBayesModel', 'MultilayerPerceptronClassifier', + 'MultilayerPerceptronClassifierModel'] @inherit_doc @@ -790,9 +791,9 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, ... Row(label=1.0, features=Vectors.dense([0.0, 1.0])), ... Row(label=1.0, features=Vectors.dense([1.0, 0.0])), ... Row(label=0.0, features=Vectors.dense([1.0, 1.0]))]).toDF() - >>> layers = [2, 5, 2] - >>> lr = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=11) - >>> model = lr.fit(df) + >>> myLayers = [2, 5, 2] + >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=myLayers, blockSize=1, seed=11) + >>> model = mlp.fit(df) >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() >>> model.transform(test0).head().prediction 1.0 @@ -878,13 +879,6 @@ class MultilayerPerceptronClassifierModel(JavaModel): Model fitted by MultilayerPerceptronClassifier. """ - @property - def layers(self): - """ - array of layer sizes including input and output layers. - """ - return self._call_java("layers") - @property def weights(self): """ From 8c94570d932ed09838a1593af507bb88e1605598 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 15 Aug 2015 17:43:09 +0800 Subject: [PATCH 3/5] Rename MultilayerPerceptronClassifierModel to MultilayerPerceptronClassificationModel --- python/pyspark/ml/classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index f784d868deb34..c14a819d0b975 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -27,7 +27,7 @@ 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', 'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes', 'NaiveBayesModel', 'MultilayerPerceptronClassifier', - 'MultilayerPerceptronClassifierModel'] + 'MultilayerPerceptronClassificationModel'] @inherit_doc @@ -879,7 +879,7 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre return self._set(**kwargs) def _create_model(self, java_model): - return MultilayerPerceptronClassifierModel(java_model) + return MultilayerPerceptronClassificationModel(java_model) def setLayers(self, value): """ @@ -908,7 +908,7 @@ def getBlockSize(self): return self.getOrDefault(self.blockSize) -class MultilayerPerceptronClassifierModel(JavaModel): +class MultilayerPerceptronClassificationModel(JavaModel): """ Model fitted by MultilayerPerceptronClassifier. 
""" From b09386241d21c5dab46b4e87176bfe2579f37135 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 10 Sep 2015 17:45:50 +0800 Subject: [PATCH 4/5] add Java-friendly method javaLayers --- .../MultilayerPerceptronClassifier.scala | 9 +++++++++ python/pyspark/ml/classification.py | 11 +++++++++++ 2 files changed, 20 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 82fc80c58054f..5f60dea91fcfa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.classification +import scala.collection.JavaConverters._ + import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.shared.{HasTol, HasMaxIter, HasSeed} import org.apache.spark.ml.{PredictorParams, PredictionModel, Predictor} @@ -181,6 +183,13 @@ class MultilayerPerceptronClassificationModel private[ml] ( private val mlpModel = FeedForwardTopology.multiLayerPerceptron(layers, true).getInstance(weights) + /** + * Returns layers in a Java List. + */ + private[ml] def javaLayers: java.util.List[Int] = { + layers.toList.asJava + } + /** * Predict label for the given features. * This internal method is used to implement [[transform()]] and output [[predictionCol]]. diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index c14a819d0b975..eca3dec84fb5c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -828,6 +828,10 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, >>> myLayers = [2, 5, 2] >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=myLayers, blockSize=1, seed=11) >>> model = mlp.fit(df) + >>> model.layers + [2, 5, 2] + >>> model.weights.size + 27 >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() >>> model.transform(test0).head().prediction 1.0 @@ -913,6 +917,13 @@ class MultilayerPerceptronClassificationModel(JavaModel): Model fitted by MultilayerPerceptronClassifier. """ + @property + def layers(self): + """ + array of layer sizes including input and output layers. + """ + return self._call_java("javaLayers") + @property def weights(self): """ From 5ac6a70c91c9330f7b04753cd36bf7ff5b12eb4e Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 11 Sep 2015 15:28:55 +0800 Subject: [PATCH 5/5] remove mutable default arguments and fix doc test --- python/pyspark/ml/classification.py | 44 ++++++++++++++++------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index eca3dec84fb5c..4cb32f547d4c5 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -818,26 +818,29 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, Number of inputs has to be equal to the size of feature vectors. Number of outputs has to be equal to the total number of labels. - >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors - >>> df = sc.parallelize([ - ... Row(label=0.0, features=Vectors.dense([0.0, 0.0])), - ... Row(label=1.0, features=Vectors.dense([0.0, 1.0])), - ... Row(label=1.0, features=Vectors.dense([1.0, 0.0])), - ... 
Row(label=0.0, features=Vectors.dense([1.0, 1.0]))]).toDF() - >>> myLayers = [2, 5, 2] - >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=myLayers, blockSize=1, seed=11) + >>> df = sqlContext.createDataFrame([ + ... (0.0, Vectors.dense([0.0, 0.0])), + ... (1.0, Vectors.dense([0.0, 1.0])), + ... (1.0, Vectors.dense([1.0, 0.0])), + ... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"]) + >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2], blockSize=1, seed=11) >>> model = mlp.fit(df) >>> model.layers [2, 5, 2] >>> model.weights.size 27 - >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() - >>> model.transform(test0).head().prediction - 1.0 - >>> test1 = sc.parallelize([Row(features=Vectors.dense([0.0, 0.0]))]).toDF() - >>> model.transform(test1).head().prediction - 0.0 + >>> testDF = sqlContext.createDataFrame([ + ... (Vectors.dense([1.0, 0.0]),), + ... (Vectors.dense([0.0, 0.0]),)], ["features"]) + >>> model.transform(testDF).show() + +---------+----------+ + | features|prediction| + +---------+----------+ + |[1.0,0.0]| 1.0| + |[0.0,0.0]| 0.0| + +---------+----------+ + ... """ # a placeholder to make it appear in the generated doc @@ -851,9 +854,9 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128): + maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128): """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) """ super(MultilayerPerceptronClassifier, self).__init__() @@ -873,14 +876,17 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128): + maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128): """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) Sets params for MultilayerPerceptronClassifier. """ kwargs = self.setParams._input_kwargs - return self._set(**kwargs) + if layers is None: + return self._set(**kwargs).setLayers([1, 1]) + else: + return self._set(**kwargs) def _create_model(self, java_model): return MultilayerPerceptronClassificationModel(java_model)
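
Taken together, the five patches expose the multilayer perceptron to PySpark as
MultilayerPerceptronClassifier / MultilayerPerceptronClassificationModel, with the
model's layers surfaced through the Java-friendly javaLayers helper added in patch 4.
A minimal end-to-end sketch of the resulting Python API, following the final doctest
above and assuming a build with this series applied plus an existing sqlContext:

    from pyspark.mllib.linalg import Vectors
    from pyspark.ml.classification import MultilayerPerceptronClassifier

    # XOR-style toy data: two input features, two output classes.
    df = sqlContext.createDataFrame([
        (0.0, Vectors.dense([0.0, 0.0])),
        (1.0, Vectors.dense([0.0, 1.0])),
        (1.0, Vectors.dense([1.0, 0.0])),
        (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"])

    # Same settings as the doctest: 2 inputs, one hidden layer of 5 neurons, 2 outputs.
    mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2],
                                         blockSize=1, seed=11)
    model = mlp.fit(df)

    print(model.layers)        # [2, 5, 2], read back via the javaLayers helper
    print(model.weights.size)  # 27 = (2 + 1) * 5 + (5 + 1) * 2

    test = sqlContext.createDataFrame([
        (Vectors.dense([1.0, 0.0]),),
        (Vectors.dense([0.0, 0.0]),)], ["features"])
    model.transform(test).select("features", "prediction").show()

Note that patch 5 replaces the mutable layers=[1, 1] default argument with
layers=None; setParams then falls back to setLayers([1, 1]) when no layers are
given, so the documented default of [1, 1] is preserved without sharing a mutable
list between instances.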