From 70d1da9dd1ecf910c0d933c3d24d62a950b5b4c4 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Mon, 10 Aug 2015 19:18:01 +0800 Subject: [PATCH 1/5] Add Python API for MultilayerPerceptronClassifier and fix bug --- .../MultilayerPerceptronClassifier.scala | 4 +- python/pyspark/ml/classification.py | 119 ++++++++++++++++++ 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 8cd2103d7d5e6..16687bc9bb050 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -172,8 +172,8 @@ class MultilayerPerceptronClassifier(override val uid: String) @Experimental class MultilayerPerceptronClassifierModel private[ml] ( override val uid: String, - layers: Array[Int], - weights: Vector) + val layers: Array[Int], + val weights: Vector) extends PredictionModel[Vector, MultilayerPerceptronClassifierModel] with Serializable { diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5978d8f4d3a01..b89428dbdc54b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -774,6 +774,125 @@ def theta(self): return self._call_java("theta") +@inherit_doc +class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, + HasMaxIter, HasTol, HasSeed): + """ + Classifier trainer based on the Multilayer Perceptron. + Each layer has sigmoid activation function, output layer has softmax. + Number of inputs has to be equal to the size of feature vectors. + Number of outputs has to be equal to the total number of labels. + + >>> from pyspark.sql import Row + >>> from pyspark.mllib.linalg import Vectors + >>> df = sc.parallelize([ + ... Row(label=0.0, features=Vectors.dense([0.0, 0.0])), + ... Row(label=1.0, features=Vectors.dense([0.0, 1.0])), + ... Row(label=1.0, features=Vectors.dense([1.0, 0.0])), + ... Row(label=0.0, features=Vectors.dense([1.0, 1.0]))]).toDF() + >>> layers = [2, 5, 2] + >>> lr = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=11) + >>> model = lr.fit(df) + >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() + >>> model.transform(test0).head().prediction + 1.0 + >>> test1 = sc.parallelize([Row(features=Vectors.dense([0.0, 0.0]))]).toDF() + >>> model.transform(test1).head().prediction + 0.0 + """ + + # a placeholder to make it appear in the generated doc + layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " + + "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " + + "neurons and output layer of 10 neurons, default is [1, 1].") + blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " + + "matrices. Data is stacked within partitions. If block size is more than " + + "remaining data in a partition then it is adjusted to the size of this " + + "data. 
Recommended size is between 10 and 1000, default is 128.") + + @keyword_only + def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128): + """ + __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) + """ + super(MultilayerPerceptronClassifier, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) + self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " + + "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " + + "100 neurons and output layer of 10 neurons, default is [1, 1].") + self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " + + "matrices. Data is stacked within partitions. If block size is " + + "more than remaining data in a partition then it is adjusted to " + + "the size of this data. Recommended size is between 10 and 1000, " + + "default is 128.") + self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128) + kwargs = self.__init__._input_kwargs + self.setParams(**kwargs) + + @keyword_only + def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128): + """ + setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) + Sets params for MultilayerPerceptronClassifier. + """ + kwargs = self.setParams._input_kwargs + return self._set(**kwargs) + + def _create_model(self, java_model): + return MultilayerPerceptronClassifierModel(java_model) + + def setLayers(self, value): + """ + Sets the value of :py:attr:`layers`. + """ + self._paramMap[self.layers] = value + return self + + def getLayers(self): + """ + Gets the value of layers or its default value. + """ + return self.getOrDefault(self.layers) + + def setBlockSize(self, value): + """ + Sets the value of :py:attr:`blockSize`. + """ + self._paramMap[self.blockSize] = value + return self + + def getBlockSize(self): + """ + Gets the value of blockSize or its default value. + """ + return self.getOrDefault(self.blockSize) + + +class MultilayerPerceptronClassifierModel(JavaModel): + """ + Model fitted by MultilayerPerceptronClassifier. + """ + + @property + def layers(self): + """ + array of layer sizes including input and output layers. + """ + return self._call_java("layers") + + @property + def weights(self): + """ + vector of initial weights for the model that consists of the weights of layers. 
+ """ + return self._call_java("weights") + + if __name__ == "__main__": import doctest from pyspark.context import SparkContext From 851081755992a37eced11a57772b410ed55f7e98 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 11 Aug 2015 16:56:46 +0800 Subject: [PATCH 2/5] workaround for python 2&3 compatibility --- python/pyspark/ml/classification.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b89428dbdc54b..09dd2855c3931 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -26,7 +26,8 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassifier', 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', 'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes', - 'NaiveBayesModel'] + 'NaiveBayesModel', 'MultilayerPerceptronClassifier', + 'MultilayerPerceptronClassifierModel'] @inherit_doc @@ -790,9 +791,9 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, ... Row(label=1.0, features=Vectors.dense([0.0, 1.0])), ... Row(label=1.0, features=Vectors.dense([1.0, 0.0])), ... Row(label=0.0, features=Vectors.dense([1.0, 1.0]))]).toDF() - >>> layers = [2, 5, 2] - >>> lr = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=1, seed=11) - >>> model = lr.fit(df) + >>> myLayers = [2, 5, 2] + >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=myLayers, blockSize=1, seed=11) + >>> model = mlp.fit(df) >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() >>> model.transform(test0).head().prediction 1.0 @@ -878,13 +879,6 @@ class MultilayerPerceptronClassifierModel(JavaModel): Model fitted by MultilayerPerceptronClassifier. """ - @property - def layers(self): - """ - array of layer sizes including input and output layers. - """ - return self._call_java("layers") - @property def weights(self): """ From 8c94570d932ed09838a1593af507bb88e1605598 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Sat, 15 Aug 2015 17:43:09 +0800 Subject: [PATCH 3/5] Rename MultilayerPerceptronClassifierModel to MultilayerPerceptronClassificationModel --- python/pyspark/ml/classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index f784d868deb34..c14a819d0b975 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -27,7 +27,7 @@ 'DecisionTreeClassificationModel', 'GBTClassifier', 'GBTClassificationModel', 'RandomForestClassifier', 'RandomForestClassificationModel', 'NaiveBayes', 'NaiveBayesModel', 'MultilayerPerceptronClassifier', - 'MultilayerPerceptronClassifierModel'] + 'MultilayerPerceptronClassificationModel'] @inherit_doc @@ -879,7 +879,7 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre return self._set(**kwargs) def _create_model(self, java_model): - return MultilayerPerceptronClassifierModel(java_model) + return MultilayerPerceptronClassificationModel(java_model) def setLayers(self, value): """ @@ -908,7 +908,7 @@ def getBlockSize(self): return self.getOrDefault(self.blockSize) -class MultilayerPerceptronClassifierModel(JavaModel): +class MultilayerPerceptronClassificationModel(JavaModel): """ Model fitted by MultilayerPerceptronClassifier. 
""" From b09386241d21c5dab46b4e87176bfe2579f37135 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Thu, 10 Sep 2015 17:45:50 +0800 Subject: [PATCH 4/5] add Java-friendly method javaLayers --- .../MultilayerPerceptronClassifier.scala | 9 +++++++++ python/pyspark/ml/classification.py | 11 +++++++++++ 2 files changed, 20 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 82fc80c58054f..5f60dea91fcfa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.classification +import scala.collection.JavaConverters._ + import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.shared.{HasTol, HasMaxIter, HasSeed} import org.apache.spark.ml.{PredictorParams, PredictionModel, Predictor} @@ -181,6 +183,13 @@ class MultilayerPerceptronClassificationModel private[ml] ( private val mlpModel = FeedForwardTopology.multiLayerPerceptron(layers, true).getInstance(weights) + /** + * Returns layers in a Java List. + */ + private[ml] def javaLayers: java.util.List[Int] = { + layers.toList.asJava + } + /** * Predict label for the given features. * This internal method is used to implement [[transform()]] and output [[predictionCol]]. diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index c14a819d0b975..eca3dec84fb5c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -828,6 +828,10 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, >>> myLayers = [2, 5, 2] >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=myLayers, blockSize=1, seed=11) >>> model = mlp.fit(df) + >>> model.layers + [2, 5, 2] + >>> model.weights.size + 27 >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() >>> model.transform(test0).head().prediction 1.0 @@ -913,6 +917,13 @@ class MultilayerPerceptronClassificationModel(JavaModel): Model fitted by MultilayerPerceptronClassifier. """ + @property + def layers(self): + """ + array of layer sizes including input and output layers. + """ + return self._call_java("javaLayers") + @property def weights(self): """ From 5ac6a70c91c9330f7b04753cd36bf7ff5b12eb4e Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Fri, 11 Sep 2015 15:28:55 +0800 Subject: [PATCH 5/5] remove mutable default arguments and fix doc test --- python/pyspark/ml/classification.py | 44 ++++++++++++++++------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index eca3dec84fb5c..4cb32f547d4c5 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -818,26 +818,29 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, Number of inputs has to be equal to the size of feature vectors. Number of outputs has to be equal to the total number of labels. - >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors - >>> df = sc.parallelize([ - ... Row(label=0.0, features=Vectors.dense([0.0, 0.0])), - ... Row(label=1.0, features=Vectors.dense([0.0, 1.0])), - ... Row(label=1.0, features=Vectors.dense([1.0, 0.0])), - ... 
Row(label=0.0, features=Vectors.dense([1.0, 1.0]))]).toDF() - >>> myLayers = [2, 5, 2] - >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=myLayers, blockSize=1, seed=11) + >>> df = sqlContext.createDataFrame([ + ... (0.0, Vectors.dense([0.0, 0.0])), + ... (1.0, Vectors.dense([0.0, 1.0])), + ... (1.0, Vectors.dense([1.0, 0.0])), + ... (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"]) + >>> mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2], blockSize=1, seed=11) >>> model = mlp.fit(df) >>> model.layers [2, 5, 2] >>> model.weights.size 27 - >>> test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF() - >>> model.transform(test0).head().prediction - 1.0 - >>> test1 = sc.parallelize([Row(features=Vectors.dense([0.0, 0.0]))]).toDF() - >>> model.transform(test1).head().prediction - 0.0 + >>> testDF = sqlContext.createDataFrame([ + ... (Vectors.dense([1.0, 0.0]),), + ... (Vectors.dense([0.0, 0.0]),)], ["features"]) + >>> model.transform(testDF).show() + +---------+----------+ + | features|prediction| + +---------+----------+ + |[1.0,0.0]| 1.0| + |[0.0,0.0]| 0.0| + +---------+----------+ + ... """ # a placeholder to make it appear in the generated doc @@ -851,9 +854,9 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128): + maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128): """ - __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) """ super(MultilayerPerceptronClassifier, self).__init__() @@ -873,14 +876,17 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", - maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128): + maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128): """ - setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, tol=1e-4, seed=None, layers=[1, 1], blockSize=128) Sets params for MultilayerPerceptronClassifier. """ kwargs = self.setParams._input_kwargs - return self._set(**kwargs) + if layers is None: + return self._set(**kwargs).setLayers([1, 1]) + else: + return self._set(**kwargs) def _create_model(self, java_model): return MultilayerPerceptronClassificationModel(java_model)
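
Taken together, the five patches expose the multilayer perceptron to PySpark as
MultilayerPerceptronClassifier / MultilayerPerceptronClassificationModel, with the
model's layers surfaced through the Java-friendly javaLayers helper added in patch 4.
A minimal end-to-end sketch of the resulting Python API, following the final doctest
above and assuming a build with this series applied plus an existing sqlContext:

    from pyspark.mllib.linalg import Vectors
    from pyspark.ml.classification import MultilayerPerceptronClassifier

    # XOR-style toy data: two input features, two output classes.
    df = sqlContext.createDataFrame([
        (0.0, Vectors.dense([0.0, 0.0])),
        (1.0, Vectors.dense([0.0, 1.0])),
        (1.0, Vectors.dense([1.0, 0.0])),
        (0.0, Vectors.dense([1.0, 1.0]))], ["label", "features"])

    # Same settings as the doctest: 2 inputs, one hidden layer of 5 neurons, 2 outputs.
    mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[2, 5, 2],
                                         blockSize=1, seed=11)
    model = mlp.fit(df)

    print(model.layers)        # [2, 5, 2], read back via the javaLayers helper
    print(model.weights.size)  # 27 = (2 + 1) * 5 + (5 + 1) * 2

    test = sqlContext.createDataFrame([
        (Vectors.dense([1.0, 0.0]),),
        (Vectors.dense([0.0, 0.0]),)], ["features"])
    model.transform(test).select("features", "prediction").show()

Note that patch 5 replaces the mutable layers=[1, 1] default argument with
layers=None; setParams then falls back to setLayers([1, 1]) when no layers are
given, so the documented default of [1, 1] is preserved without sharing a mutable
list between instances.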