From 565d47627953bd5e420b81d48a9a80afe4e6f66b Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 17 Nov 2014 11:18:07 -0800
Subject: [PATCH 1/9] add python api for random forest

---
 .../mllib/api/python/PythonMLLibAPI.scala     |  38 +++-
 .../tree/model/WeightedEnsembleModel.scala    |  12 +
 python/pyspark/mllib/classification.py        |   3 +-
 python/pyspark/mllib/tree.py                  | 214 ++++++++++++++++--
 4 files changed, 243 insertions(+), 24 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index c8476a5370b6c..a3a0fc3b69d81 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -40,10 +40,10 @@ import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
 import org.apache.spark.mllib.stat.correlation.CorrelationNames
 import org.apache.spark.mllib.stat.test.ChiSqTestResult
-import org.apache.spark.mllib.tree.DecisionTree
+import org.apache.spark.mllib.tree.{RandomForest, DecisionTree}
 import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}
 import org.apache.spark.mllib.tree.impurity._
-import org.apache.spark.mllib.tree.model.DecisionTreeModel
+import org.apache.spark.mllib.tree.model.{WeightedEnsembleModel, DecisionTreeModel}
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
@@ -464,6 +464,40 @@ class PythonMLLibAPI extends Serializable {
     DecisionTree.train(data.rdd, strategy)
   }
 
+  /**
+   * Java stub for Python mllib RandomForest.train().
+   * This stub returns a handle to the Java object instead of the content of the Java object.
+   * Extra care needs to be taken in the Python code to ensure it gets freed on exit;
+   * see the Py4J documentation.
+   */
+  def trainRandomForestModel(
+    data: JavaRDD[LabeledPoint],
+    algoStr: String,
+    numClasses: Int,
+    categoricalFeaturesInfo: JMap[Int, Int],
+    impurityStr: String,
+    maxDepth: Int,
+    maxBins: Int,
+    numTrees: Int,
+    featureSubsetStrategy: String,
+    seed: Int): WeightedEnsembleModel = {
+
+    val algo = Algo.fromString(algoStr)
+    val impurity = Impurities.fromString(impurityStr)
+    val strategy = new Strategy(
+      algo = algo,
+      impurity = impurity,
+      maxDepth = maxDepth,
+      numClassesForClassification = numClasses,
+      maxBins = maxBins,
+      categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap)
+    if (algo == Algo.Classification) {
+      RandomForest.trainClassifier(data.rdd, strategy, numTrees, featureSubsetStrategy, seed)
+    } else {
+      RandomForest.trainRegressor(data.rdd, strategy, numTrees, featureSubsetStrategy, seed)
+    }
+  }
+
   /**
    * Java stub for mllib Statistics.colStats(X: RDD[Vector]).
    * TODO figure out return type.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/WeightedEnsembleModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/WeightedEnsembleModel.scala
index 7b052d9163a13..287afbca6708d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/WeightedEnsembleModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/WeightedEnsembleModel.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.mllib.tree.model
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.EnsembleCombiningStrategy._
@@ -118,6 +119,17 @@ class WeightedEnsembleModel(
    */
   def predict(features: RDD[Vector]): RDD[Double] = features.map(x => predict(x))
 
+
+  /**
+   * Predict values for the given data set.
+   *
+   * @param features JavaRDD representing data points to be predicted
+   * @return JavaRDD[Double] where each entry contains the corresponding prediction
+   */
+  def predict(features: JavaRDD[Vector]): RDD[Double] = {
+    features.rdd.map(x => predict(x)).toJavaRDD()
+  }
+
   /**
    * Print a summary of the model.
    */
diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index b654813fb4cf6..ce53a8a523e5b 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -69,7 +69,8 @@ def predict(self, x):
         else:
             exp_margin = exp(margin)
             prob = exp_margin / (1 + exp_margin)
-        return 1 if prob > 0.5 else 0
+        return 1 if prob  0.5 else 0
+        return 0 if prob < 0.5 else 1
 
 
 class LogisticRegressionWithSGD(object):
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index ef0d556fac7bc..5a4815333e2fa 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -15,21 +15,22 @@
 # limitations under the License.
 #
 
+from __future__ import absolute_import
+
+import random
+
 from pyspark import SparkContext, RDD
 from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
 from pyspark.mllib.linalg import _convert_to_vector
 from pyspark.mllib.regression import LabeledPoint
 
-__all__ = ['DecisionTreeModel', 'DecisionTree']
+__all__ = ['DecisionTreeModel', 'DecisionTree', 'WeightedEnsembleModel', 'RandomForest']
 
 
 class DecisionTreeModel(JavaModelWrapper):
 
     """
     A decision tree model for classification or regression.
-
-    EXPERIMENTAL: This is an experimental API.
-                  It will probably be modified in future.
     """
     def predict(self, x):
         """
@@ -51,11 +52,11 @@ def depth(self):
         return self._java_model.depth()
 
     def __repr__(self):
-        """ Print summary of model. """
+        """ summary of model. """
         return self._java_model.toString()
 
     def toDebugString(self):
-        """ Print full model. """
+        """ full model. """
         return self._java_model.toDebugString()
 
 
@@ -64,14 +65,10 @@ class DecisionTree(object):
     """
     Learning algorithm for a decision tree model
     for classification or regression.
-
-    EXPERIMENTAL: This is an experimental API.
-                  It will probably be modified for Spark v1.2.
-
     """
 
-    @staticmethod
-    def _train(data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32,
+    @classmethod
+    def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0):
         first = data.first()
         assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
@@ -79,8 +76,8 @@ def _train(data, type, numClasses, features, impurity="gini", maxDepth=5, maxBin
                               impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
         return DecisionTreeModel(model)
 
-    @staticmethod
-    def trainClassifier(data, numClasses, categoricalFeaturesInfo,
+    @classmethod
+    def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo,
                         impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
                         minInfoGain=0.0):
         """
@@ -132,11 +129,11 @@ def trainClassifier(data, numClasses, categoricalFeaturesInfo,
         >>> model.predict(rdd).collect()
         [1.0, 0.0]
         """
-        return DecisionTree._train(data, "classification", numClasses, categoricalFeaturesInfo,
-                                   impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+        return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
+                          impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
 
-    @staticmethod
-    def trainRegressor(data, categoricalFeaturesInfo,
+    @classmethod
+    def trainRegressor(cls, data, categoricalFeaturesInfo,
                        impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
                        minInfoGain=0.0):
         """
@@ -160,7 +157,6 @@ def trainRegressor(data, categoricalFeaturesInfo,
 
         Example usage:
 
-        >>> from numpy import array
         >>> from pyspark.mllib.regression import LabeledPoint
         >>> from pyspark.mllib.tree import DecisionTree
         >>> from pyspark.mllib.linalg import SparseVector
@@ -181,8 +177,184 @@ def trainRegressor(data, categoricalFeaturesInfo,
         >>> model.predict(rdd).collect()
         [1.0, 0.0]
         """
-        return DecisionTree._train(data, "regression", 0, categoricalFeaturesInfo,
-                                   impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+        return cls._train(data, "regression", 0, categoricalFeaturesInfo,
+                          impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
+
+
+class WeightedEnsembleModel(JavaModelWrapper):
+    """
+    A model trained by RandomForest
+    """
+    def predict(self, x):
+        """
+        Predict values for a single data point or an RDD of points using the model trained.
+        """
+        if isinstance(x, RDD):
+            return self.call("predict", x.map(_convert_to_vector))
+
+        else:
+            return self.call("predict", _convert_to_vector(x))
+
+    def numWeakHypotheses(self):
+        """
+        Get number of trees in forest.
+        """
+        return self.call("numWeakHypotheses")
+
+    def totalNumNodes(self):
+        """
+        Get total number of nodes, summed over all trees in the forest.
+        """
+        return self.call("totalNumNodes")
+
+    def __repr__(self):
+        """ Summary of model """
+        return self._java_model.toString()
+
+    def toDebugString(self):
+        """Full model """
+        return self._java_model.toDebugString()
+
+
+class RandomForest(object):
+
+    supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird")
+
+    @classmethod
+    def _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins,
+               numTrees, featureSubsetStrategy, seed):
+        first = data.first()
+        assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
+        if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
+            raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
+        if seed is None:
+            seed = random.randint(0, 1 << 30)
+        model = callMLlibFunc("trainRandomForestModel", data, type, numClasses, features,
+                              impurity, maxDepth, maxBins, numTrees, featureSubsetStrategy, seed)
+        return WeightedEnsembleModel(model)
+
+    @classmethod
+    def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees,
+                        featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32,
+                        seed=None):
+        """
+        Method to train a decision tree model for binary or multiclass classification.
+
+        :param data: Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
+                     Labels should take values {0, 1, ..., numClasses-1}.
+        :param numClassesForClassification: number of classes for classification.
+        :param categoricalFeaturesInfo: Map storing arity of categorical features.
+                                    E.g., an entry (n -> k) indicates that feature n is categorical
+                                    with k categories indexed from 0: {0, 1, ..., k-1}.
+        :param numTrees: Number of trees in the random forest.
+        :param featureSubsetStrategy: Number of features to consider for splits at each node.
+                                  Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+                                  If "auto" is set, this parameter is set based on numTrees:
+                                    if numTrees == 1, set to "all";
+                                    if numTrees > 1 (forest) set to "sqrt" for classification and
+                                      to "onethird" for regression.
+        :param impurity: Criterion used for information gain calculation.
+                     Supported values: "gini" (recommended) or "entropy".
+        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means
+                         1 internal node + 2 leaf nodes. (default: 4)
+        :param maxBins: maximum number of bins used for splitting features (default: 100)
+        :param seed:  Random seed for bootstrapping and choosing feature subsets.
+        :return: WeightedEnsembleModel that can be used for prediction
+
+        Example usage:
+
+        >>> from pyspark.mllib.regression import LabeledPoint
+        >>> from pyspark.mllib.tree import RandomForest
+        >>>
+        >>> data = [
+        ...     LabeledPoint(0.0, [0.0]),
+        ...     LabeledPoint(0.0, [1.0]),
+        ...     LabeledPoint(1.0, [4.0]),
+        ...     LabeledPoint(1.0, [5.0])
+        ... ]
+        >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 2, seed=42)
+        >>> model.numWeakHypotheses()
+        2
+        >>> model.totalNumNodes()
+        4
+        >>> print model,
+        WeightedEnsembleModel classifier with 2 trees
+        >>> print model.toDebugString(),
+        WeightedEnsembleModel classifier with 2 trees
+        <BLANKLINE>
+          Tree 0:
+            Predict: 1.0
+          Tree 1:
+            If (feature 0 <= 1.0)
+             Predict: 0.0
+            Else (feature 0 > 1.0)
+             Predict: 1.0
+        >>> model.predict([2.0])
+        1.0
+        >>> model.predict([0.0])  #TODO: will fix it later
+        1.0
+        >>> rdd = sc.parallelize([[1.0], [0.0]])
+        >>> model.predict(rdd).collect()
+        [1.0, 1.0]
+        """
+        return cls._train(data, "classification", numClassesForClassification,
+                          categoricalFeaturesInfo, impurity, maxDepth, maxBins, numTrees,
+                          featureSubsetStrategy, seed)
+
+    @classmethod
+    def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto",
+                       impurity="variance", maxDepth=4, maxBins=32, seed=None):
+        """
+        Method to train a decision tree model for regression.
+
+        :param data: Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
+                     Labels are real numbers.
+        :param categoricalFeaturesInfo: Map storing arity of categorical features.
+                                     E.g., an entry (n -> k) indicates that feature n is categorical
+                                     with k categories indexed from 0: {0, 1, ..., k-1}.
+        :param numTrees: Number of trees in the random forest.
+        :param featureSubsetStrategy: Number of features to consider for splits at each node.
+                                   Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+                                   If "auto" is set, this parameter is set based on numTrees:
+                                   if numTrees == 1, set to "all";
+                                   if numTrees > 1 (forest) set to "sqrt" for classification and
+                                   to "onethird" for regression.
+        :param impurity: Criterion used for information gain calculation.
+                         Supported values: "variance".
+        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means
+                         1 internal node + 2 leaf nodes.(default: 4)
+        :param maxBins: maximum number of bins used for splitting features (default: 100)
+        :param seed:  Random seed for bootstrapping and choosing feature subsets.
+        :return: WeightedEnsembleModel that can be used for prediction
+
+        Example usage:
+
+        >>> from pyspark.mllib.regression import LabeledPoint
+        >>> from pyspark.mllib.tree import RandomForest
+        >>> from pyspark.mllib.linalg import SparseVector
+        >>>
+        >>> sparse_data = [
+        ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
+        ...     LabeledPoint(1.0, SparseVector(2, {1: 1.0})),
+        ...     LabeledPoint(0.0, SparseVector(2, {0: 1.0})),
+        ...     LabeledPoint(1.0, SparseVector(2, {1: 2.0}))
+        ... ]
+        >>>
+        >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42)
+        >>> model.numWeakHypotheses()
+        2
+        >>> model.totalNumNodes()
+        4
+        >>> model.predict(SparseVector(2, {1: 1.0}))
+        1.0
+        >>> model.predict(SparseVector(2, {0: 1.0}))
+        0.5
+        >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])
+        >>> model.predict(rdd).collect()
+        [1.0, 0.5]
+        """
+        return cls._train(data, "regression", 0, categoricalFeaturesInfo, impurity,
+                          maxDepth, maxBins, numTrees, featureSubsetStrategy, seed)
 
 
 def _test():

From 89a000fd8e6e15c2ba83d702bbe2f294727f0a4d Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 17 Nov 2014 13:11:11 -0800
Subject: [PATCH 2/9] fix docs

---
 python/pyspark/mllib/classification.py |  3 +--
 python/pyspark/mllib/tree.py           | 16 +++++++++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py
index ce53a8a523e5b..b654813fb4cf6 100644
--- a/python/pyspark/mllib/classification.py
+++ b/python/pyspark/mllib/classification.py
@@ -69,8 +69,7 @@ def predict(self, x):
         else:
             exp_margin = exp(margin)
             prob = exp_margin / (1 + exp_margin)
-        return 1 if prob  0.5 else 0
-        return 0 if prob < 0.5 else 1
+        return 1 if prob > 0.5 else 0
 
 
 class LogisticRegressionWithSGD(object):
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 5a4815333e2fa..1a1b580b6b656 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -31,6 +31,8 @@ class DecisionTreeModel(JavaModelWrapper):
 
     """
     A decision tree model for classification or regression.
+
+    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
     """
     def predict(self, x):
         """
@@ -63,8 +65,9 @@ def toDebugString(self):
 class DecisionTree(object):
 
     """
-    Learning algorithm for a decision tree model
-    for classification or regression.
+    Learning algorithm for a decision tree model for classification or regression.
+
+    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
     """
 
     @classmethod
@@ -183,7 +186,9 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
 
 class WeightedEnsembleModel(JavaModelWrapper):
     """
-    A model trained by RandomForest
+    A model trained by :class:`RandomForest`
+
+    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
     """
     def predict(self, x):
         """
@@ -217,6 +222,11 @@ def toDebugString(self):
 
 
 class RandomForest(object):
+    """
+    Learning algorithm for a random forest model for classification or regression.
+
+    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
+    """
 
     supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird")
 

From dae7fc01d1df78e3d4f9a18b90ed553eff48edaa Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 17 Nov 2014 17:54:48 -0800
Subject: [PATCH 3/9] address comments

---
 python/pyspark/mllib/tree.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 1a1b580b6b656..52ac1c001c215 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -24,7 +24,7 @@
 from pyspark.mllib.linalg import _convert_to_vector
 from pyspark.mllib.regression import LabeledPoint
 
-__all__ = ['DecisionTreeModel', 'DecisionTree', 'WeightedEnsembleModel', 'RandomForest']
+__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', 'RandomForest']
 
 
 class DecisionTreeModel(JavaModelWrapper):
@@ -185,11 +185,6 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
 
 
 class WeightedEnsembleModel(JavaModelWrapper):
-    """
-    A model trained by :class:`RandomForest`
-
-    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
-    """
     def predict(self, x):
         """
         Predict values for a single data point or an RDD of points using the model trained.
@@ -217,10 +212,18 @@ def __repr__(self):
         return self._java_model.toString()
 
     def toDebugString(self):
-        """Full model """
+        """ Full model """
         return self._java_model.toDebugString()
 
 
+class RandomForestModel(WeightedEnsembleModel):
+    """
+    A model trained by :class:`RandomForest`
+
+    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
+    """
+
+
 class RandomForest(object):
     """
     Learning algorithm for a random forest model for classification or regression.
@@ -241,7 +244,7 @@ def _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins,
             seed = random.randint(0, 1 << 30)
         model = callMLlibFunc("trainRandomForestModel", data, type, numClasses, features,
                               impurity, maxDepth, maxBins, numTrees, featureSubsetStrategy, seed)
-        return WeightedEnsembleModel(model)
+        return RandomForestModel(model)
 
     @classmethod
     def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees,
@@ -250,7 +253,7 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI
         """
         Method to train a decision tree model for binary or multiclass classification.
 
-        :param data: Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
+        :param data: Training dataset: RDD of LabeledPoint.
                      Labels should take values {0, 1, ..., numClasses-1}.
         :param numClassesForClassification: number of classes for classification.
         :param categoricalFeaturesInfo: Map storing arity of categorical features.
@@ -317,11 +320,12 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
         """
         Method to train a decision tree model for regression.
 
-        :param data: Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
+        :param data: Training dataset: RDD of LabeledPoint.
                      Labels are real numbers.
         :param categoricalFeaturesInfo: Map storing arity of categorical features.
-                                     E.g., an entry (n -> k) indicates that feature n is categorical
-                                     with k categories indexed from 0: {0, 1, ..., k-1}.
+                                     E.g., an entry (n -> k) indicates that feature
+                                     n is categorical with k categories indexed from 0:
+                                     {0, 1, ..., k-1}.
         :param numTrees: Number of trees in the random forest.
         :param featureSubsetStrategy: Number of features to consider for splits at each node.
                                    Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
@@ -331,8 +335,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
                                    to "onethird" for regression.
         :param impurity: Criterion used for information gain calculation.
                          Supported values: "variance".
-        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means
-                         1 internal node + 2 leaf nodes.(default: 4)
+        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
+                         depth 1 means 1 internal node + 2 leaf nodes.(default: 4)
         :param maxBins: maximum number of bins used for splitting features (default: 100)
         :param seed:  Random seed for bootstrapping and choosing feature subsets.
         :return: WeightedEnsembleModel that can be used for prediction

From 885abee042bb64771f53dab7814ed914a68b62a1 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Mon, 17 Nov 2014 17:55:45 -0800
Subject: [PATCH 4/9] address comments

---
 .../mllib/api/python/PythonMLLibAPI.scala     | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index a3a0fc3b69d81..3f946ebe554ca 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -471,16 +471,16 @@ class PythonMLLibAPI extends Serializable {
    * see the Py4J documentation.
    */
   def trainRandomForestModel(
-    data: JavaRDD[LabeledPoint],
-    algoStr: String,
-    numClasses: Int,
-    categoricalFeaturesInfo: JMap[Int, Int],
-    impurityStr: String,
-    maxDepth: Int,
-    maxBins: Int,
-    numTrees: Int,
-    featureSubsetStrategy: String,
-    seed: Int): WeightedEnsembleModel = {
+      data: JavaRDD[LabeledPoint],
+      algoStr: String,
+      numClasses: Int,
+      categoricalFeaturesInfo: JMap[Int, Int],
+      impurityStr: String,
+      maxDepth: Int,
+      maxBins: Int,
+      numTrees: Int,
+      featureSubsetStrategy: String,
+      seed: Int): WeightedEnsembleModel = {
 
     val algo = Algo.fromString(algoStr)
     val impurity = Impurities.fromString(impurityStr)

From 0431746062ce7160777a1034787294aa7893be1d Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 20 Nov 2014 09:31:47 -0800
Subject: [PATCH 5/9] rebased

---
 .../mllib/api/python/PythonMLLibAPI.scala     |  4 +-
 python/pyspark/mllib/tree.py                  | 50 ++++++++++---------
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 9cdbd8e5ae1f5..d9a4e4da8e94a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -43,7 +43,7 @@ import org.apache.spark.mllib.stat.test.ChiSqTestResult
 import org.apache.spark.mllib.tree.{RandomForest, DecisionTree}
 import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}
 import org.apache.spark.mllib.tree.impurity._
-import org.apache.spark.mllib.tree.model.{WeightedEnsembleModel, DecisionTreeModel}
+import org.apache.spark.mllib.tree.model.{RandomForestModel, DecisionTreeModel}
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
@@ -515,7 +515,7 @@ class PythonMLLibAPI extends Serializable {
       maxBins: Int,
       numTrees: Int,
       featureSubsetStrategy: String,
-      seed: Int): WeightedEnsembleModel = {
+      seed: Int): RandomForestModel = {
 
     val algo = Algo.fromString(algoStr)
     val impurity = Impurities.fromString(impurityStr)
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 52ac1c001c215..f472a45a2f92d 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -184,7 +184,12 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
                           impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
 
 
-class WeightedEnsembleModel(JavaModelWrapper):
+class RandomForestModel(JavaModelWrapper):
+    """
+    Represents a random forest model.
+
+    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
+    """
     def predict(self, x):
         """
         Predict values for a single data point or an RDD of points using the model trained.
@@ -195,11 +200,11 @@ def predict(self, x):
         else:
             return self.call("predict", _convert_to_vector(x))
 
-    def numWeakHypotheses(self):
+    def numTrees(self):
         """
         Get number of trees in forest.
         """
-        return self.call("numWeakHypotheses")
+        return self.call("numTrees")
 
     def totalNumNodes(self):
         """
@@ -216,14 +221,6 @@ def toDebugString(self):
         return self._java_model.toDebugString()
 
 
-class RandomForestModel(WeightedEnsembleModel):
-    """
-    A model trained by :class:`RandomForest`
-
-    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
-    """
-
-
 class RandomForest(object):
     """
     Learning algorithm for a random forest model for classification or regression.
@@ -282,18 +279,18 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI
         >>> data = [
         ...     LabeledPoint(0.0, [0.0]),
         ...     LabeledPoint(0.0, [1.0]),
-        ...     LabeledPoint(1.0, [4.0]),
-        ...     LabeledPoint(1.0, [5.0])
+        ...     LabeledPoint(1.0, [2.0]),
+        ...     LabeledPoint(1.0, [3.0])
         ... ]
-        >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 2, seed=42)
-        >>> model.numWeakHypotheses()
-        2
+        >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42)
+        >>> model.numTrees()
+        3
         >>> model.totalNumNodes()
-        4
+        7
         >>> print model,
-        WeightedEnsembleModel classifier with 2 trees
+        TreeEnsembleModel classifier with 3 trees
         >>> print model.toDebugString(),
-        WeightedEnsembleModel classifier with 2 trees
+        TreeEnsembleModel classifier with 3 trees
         <BLANKLINE>
           Tree 0:
             Predict: 1.0
@@ -302,13 +299,18 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI
              Predict: 0.0
             Else (feature 0 > 1.0)
              Predict: 1.0
+          Tree 2:
+            If (feature 0 <= 1.0)
+             Predict: 0.0
+            Else (feature 0 > 1.0)
+             Predict: 1.0
         >>> model.predict([2.0])
         1.0
-        >>> model.predict([0.0])  #TODO: will fix it later
-        1.0
-        >>> rdd = sc.parallelize([[1.0], [0.0]])
+        >>> model.predict([0.0])
+        0.0
+        >>> rdd = sc.parallelize([[3.0], [1.0]])
         >>> model.predict(rdd).collect()
-        [1.0, 1.0]
+        [1.0, 0.0]
         """
         return cls._train(data, "classification", numClassesForClassification,
                           categoricalFeaturesInfo, impurity, maxDepth, maxBins, numTrees,
@@ -355,7 +357,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
         ... ]
         >>>
         >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42)
-        >>> model.numWeakHypotheses()
+        >>> model.numTrees()
         2
         >>> model.totalNumNodes()
         4

From e0df852ab4f353b9f800fe5374195fee5a06aa52 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 20 Nov 2014 10:15:48 -0800
Subject: [PATCH 6/9] fix docs

---
 python/pyspark/mllib/tree.py | 99 ++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 44 deletions(-)

diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index f472a45a2f92d..8f3cfe6840ec2 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -32,7 +32,8 @@ class DecisionTreeModel(JavaModelWrapper):
     """
     A decision tree model for classification or regression.
 
-    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
+    EXPERIMENTAL: This is an experimental API.
+                  It will probably be modified in future.
     """
     def predict(self, x):
         """
@@ -67,7 +68,8 @@ class DecisionTree(object):
     """
     Learning algorithm for a decision tree model for classification or regression.
 
-    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
+    EXPERIMENTAL: This is an experimental API.
+                  It will probably be modified in future.
     """
 
     @classmethod
@@ -98,8 +100,8 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo,
                          E.g., depth 0 means 1 leaf node.
                          Depth 1 means 1 internal node + 2 leaf nodes.
         :param maxBins: Number of bins used for finding splits at each node.
-        :param minInstancesPerNode: Min number of instances required at child nodes to create
-                                    the parent split
+        :param minInstancesPerNode: Min number of instances required at child
+                                    nodes to create the parent split
         :param minInfoGain: Min info gain required to create a split
         :return: DecisionTreeModel
 
@@ -153,8 +155,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
                          E.g., depth 0 means 1 leaf node.
                          Depth 1 means 1 internal node + 2 leaf nodes.
         :param maxBins: Number of bins used for finding splits at each node.
-        :param minInstancesPerNode: Min number of instances required at child nodes to create
-                                    the parent split
+        :param minInstancesPerNode: Min number of instances required at child
+                                    nodes to create the parent split
         :param minInfoGain: Min info gain required to create a split
         :return: DecisionTreeModel
 
@@ -188,11 +190,13 @@ class RandomForestModel(JavaModelWrapper):
     """
     Represents a random forest model.
 
-    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
+    EXPERIMENTAL: This is an experimental API.
+                  It will probably be modified in future.
     """
     def predict(self, x):
         """
-        Predict values for a single data point or an RDD of points using the model trained.
+        Predict values for a single data point or an RDD of points using
+        the model trained.
         """
         if isinstance(x, RDD):
             return self.call("predict", x.map(_convert_to_vector))
@@ -225,7 +229,8 @@ class RandomForest(object):
     """
     Learning algorithm for a random forest model for classification or regression.
 
-    EXPERIMENTAL: This is an experimental API. It will probably be modified in future.
+    EXPERIMENTAL: This is an experimental API.
+                  It will probably be modified in future.
     """
 
     supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird")
@@ -248,28 +253,31 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI
                         featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32,
                         seed=None):
         """
-        Method to train a decision tree model for binary or multiclass classification.
+        Method to train a decision tree model for binary or multiclass
+        classification.
 
-        :param data: Training dataset: RDD of LabeledPoint.
-                     Labels should take values {0, 1, ..., numClasses-1}.
+        :param data: Training dataset: RDD of LabeledPoint. Labels should take
+               values {0, 1, ..., numClasses-1}.
         :param numClassesForClassification: number of classes for classification.
         :param categoricalFeaturesInfo: Map storing arity of categorical features.
-                                    E.g., an entry (n -> k) indicates that feature n is categorical
-                                    with k categories indexed from 0: {0, 1, ..., k-1}.
+               E.g., an entry (n -> k) indicates that feature n is categorical
+               with k categories indexed from 0: {0, 1, ..., k-1}.
         :param numTrees: Number of trees in the random forest.
-        :param featureSubsetStrategy: Number of features to consider for splits at each node.
-                                  Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
-                                  If "auto" is set, this parameter is set based on numTrees:
-                                    if numTrees == 1, set to "all";
-                                    if numTrees > 1 (forest) set to "sqrt" for classification and
-                                      to "onethird" for regression.
+        :param featureSubsetStrategy: Number of features to consider for splits at
+               each node.
+               Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+               If "auto" is set, this parameter is set based on numTrees:
+               if numTrees == 1, set to "all";
+               if numTrees > 1 (forest) set to "sqrt" for classification and to
+               "onethird" for regression.
         :param impurity: Criterion used for information gain calculation.
-                     Supported values: "gini" (recommended) or "entropy".
-        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means
-                         1 internal node + 2 leaf nodes. (default: 4)
-        :param maxBins: maximum number of bins used for splitting features (default: 100)
-        :param seed:  Random seed for bootstrapping and choosing feature subsets.
-        :return: WeightedEnsembleModel that can be used for prediction
+               Supported values: "gini" (recommended) or "entropy".
+        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
+               depth 1 means 1 internal node + 2 leaf nodes. (default: 4)
+        :param maxBins: maximum number of bins used for splitting features
+               (default: 100)
+        :param seed: Random seed for bootstrapping and choosing feature subsets.
+        :return: RandomForestModel that can be used for prediction
 
         Example usage:
 
@@ -322,26 +330,29 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
         """
         Method to train a decision tree model for regression.
 
-        :param data: Training dataset: RDD of LabeledPoint.
-                     Labels are real numbers.
-        :param categoricalFeaturesInfo: Map storing arity of categorical features.
-                                     E.g., an entry (n -> k) indicates that feature
-                                     n is categorical with k categories indexed from 0:
-                                     {0, 1, ..., k-1}.
+        :param data: Training dataset: RDD of LabeledPoint. Labels are
+               real numbers.
+        :param categoricalFeaturesInfo: Map storing arity of categorical
+               features. E.g., an entry (n -> k) indicates that feature
+               n is categorical with k categories indexed from 0:
+               {0, 1, ..., k-1}.
         :param numTrees: Number of trees in the random forest.
-        :param featureSubsetStrategy: Number of features to consider for splits at each node.
-                                   Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
-                                   If "auto" is set, this parameter is set based on numTrees:
-                                   if numTrees == 1, set to "all";
-                                   if numTrees > 1 (forest) set to "sqrt" for classification and
-                                   to "onethird" for regression.
+        :param featureSubsetStrategy: Number of features to consider for
+               splits at each node.
+               Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+               If "auto" is set, this parameter is set based on numTrees:
+               if numTrees == 1, set to "all";
+               if numTrees > 1 (forest) set to "sqrt" for classification and
+               to "onethird" for regression.
         :param impurity: Criterion used for information gain calculation.
-                         Supported values: "variance".
-        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
-                         depth 1 means 1 internal node + 2 leaf nodes.(default: 4)
-        :param maxBins: maximum number of bins used for splitting features (default: 100)
-        :param seed:  Random seed for bootstrapping and choosing feature subsets.
-        :return: WeightedEnsembleModel that can be used for prediction
+               Supported values: "variance".
+        :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1
+               leaf node; depth 1 means 1 internal node + 2 leaf nodes.
+               (default: 4)
+        :param maxBins: maximum number of bins used for splitting features
+               (default: 100)
+        :param seed: Random seed for bootstrapping and choosing feature subsets.
+        :return: RandomForestModel that can be used for prediction
 
         Example usage:
 

From 4ca593d2751a20e9b53b4b047c3dbb5eed09773d Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 20 Nov 2014 10:45:14 -0800
Subject: [PATCH 7/9] fix docs

---
 python/docs/epytext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/docs/epytext.py b/python/docs/epytext.py
index 19fefbfc057a4..e884d5e6b19c7 100644
--- a/python/docs/epytext.py
+++ b/python/docs/epytext.py
@@ -1,7 +1,7 @@
 import re
 
 RULES = (
-    (r"<[\w.]+>", r""),
+    (r"<(!BLANKLINE)[\w.]+>", r""),
     (r"L{([\w.()]+)}", r":class:`\1`"),
     (r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"),
     (r"C{([\w.()]+)}", r":class:`\1`"),

From 53cf510ca2cfaefd6c7b1e466308b486b2b46df1 Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 20 Nov 2014 13:00:47 -0800
Subject: [PATCH 8/9] fix docs

---
 .../org/apache/spark/mllib/tree/RandomForest.scala   | 12 ++++--------
 python/pyspark/mllib/tree.py                         |  6 ++----
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index ca0b6eea9aeb6..3ae6fa2a0ec2f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -230,8 +230,7 @@ object RandomForest extends Serializable with Logging {
    *                              Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
    *                              If "auto" is set, this parameter is set based on numTrees:
    *                                if numTrees == 1, set to "all";
-   *                                if numTrees > 1 (forest) set to "sqrt" for classification and
-   *                                  to "onethird" for regression.
+   *                                if numTrees > 1 (forest) set to "sqrt".
    * @param seed  Random seed for bootstrapping and choosing feature subsets.
    * @return a random forest model that can be used for prediction
    */
@@ -261,8 +260,7 @@ object RandomForest extends Serializable with Logging {
    *                              Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
    *                              If "auto" is set, this parameter is set based on numTrees:
    *                                if numTrees == 1, set to "all";
-   *                                if numTrees > 1 (forest) set to "sqrt" for classification and
-   *                                  to "onethird" for regression.
+   *                                if numTrees > 1 (forest) set to "sqrt".
    * @param impurity Criterion used for information gain calculation.
    *                 Supported values: "gini" (recommended) or "entropy".
    * @param maxDepth Maximum depth of the tree.
@@ -318,8 +316,7 @@ object RandomForest extends Serializable with Logging {
    *                              Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
    *                              If "auto" is set, this parameter is set based on numTrees:
    *                                if numTrees == 1, set to "all";
-   *                                if numTrees > 1 (forest) set to "sqrt" for classification and
-   *                                  to "onethird" for regression.
+   *                                if numTrees > 1 (forest) set to "onethird".
    * @param seed  Random seed for bootstrapping and choosing feature subsets.
    * @return a random forest model that can be used for prediction
    */
@@ -348,8 +345,7 @@ object RandomForest extends Serializable with Logging {
    *                              Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
    *                              If "auto" is set, this parameter is set based on numTrees:
    *                                if numTrees == 1, set to "all";
-   *                                if numTrees > 1 (forest) set to "sqrt" for classification and
-   *                                  to "onethird" for regression.
+   *                                if numTrees > 1 (forest) set to "onethird".
    * @param impurity Criterion used for information gain calculation.
    *                 Supported values: "variance".
    * @param maxDepth Maximum depth of the tree.
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 8f3cfe6840ec2..15a868e0daf2d 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -268,8 +268,7 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI
                Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
                If "auto" is set, this parameter is set based on numTrees:
                if numTrees == 1, set to "all";
-               if numTrees > 1 (forest) set to "sqrt" for classification and to
-               "onethird" for regression.
+               if numTrees > 1 (forest) set to "sqrt".
         :param impurity: Criterion used for information gain calculation.
                Supported values: "gini" (recommended) or "entropy".
         :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
@@ -342,8 +341,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
                Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
                If "auto" is set, this parameter is set based on numTrees:
                if numTrees == 1, set to "all";
-               if numTrees > 1 (forest) set to "sqrt" for classification and
-               to "onethird" for regression.
+               if numTrees > 1 (forest) set to "onethird" for regression.
         :param impurity: Criterion used for information gain calculation.
                Supported values: "variance".
         :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1

From 8003dfc674fedeca520cfceaa6e48845cd5138be Mon Sep 17 00:00:00 2001
From: Davies Liu <davies@databricks.com>
Date: Thu, 20 Nov 2014 13:09:15 -0800
Subject: [PATCH 9/9] reorder

---
 .../spark/mllib/api/python/PythonMLLibAPI.scala |  4 ++--
 python/pyspark/mllib/tree.py                    | 17 +++++++++--------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index d9a4e4da8e94a..b6f7618171224 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -510,11 +510,11 @@ class PythonMLLibAPI extends Serializable {
       algoStr: String,
       numClasses: Int,
       categoricalFeaturesInfo: JMap[Int, Int],
+      numTrees: Int,
+      featureSubsetStrategy: String,
       impurityStr: String,
       maxDepth: Int,
       maxBins: Int,
-      numTrees: Int,
-      featureSubsetStrategy: String,
       seed: Int): RandomForestModel = {
 
     val algo = Algo.fromString(algoStr)
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 15a868e0daf2d..46e253991aa56 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -236,16 +236,17 @@ class RandomForest(object):
     supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird")
 
     @classmethod
-    def _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins,
-               numTrees, featureSubsetStrategy, seed):
+    def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees,
+               featureSubsetStrategy, impurity, maxDepth, maxBins, seed):
         first = data.first()
         assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
         if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies:
             raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy)
         if seed is None:
             seed = random.randint(0, 1 << 30)
-        model = callMLlibFunc("trainRandomForestModel", data, type, numClasses, features,
-                              impurity, maxDepth, maxBins, numTrees, featureSubsetStrategy, seed)
+        model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses,
+                              categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
+                              maxDepth, maxBins, seed)
         return RandomForestModel(model)
 
     @classmethod
@@ -320,8 +321,8 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI
         [1.0, 0.0]
         """
         return cls._train(data, "classification", numClassesForClassification,
-                          categoricalFeaturesInfo, impurity, maxDepth, maxBins, numTrees,
-                          featureSubsetStrategy, seed)
+                          categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,
+                          maxDepth, maxBins, seed)
 
     @classmethod
     def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto",
@@ -378,8 +379,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
         >>> model.predict(rdd).collect()
         [1.0, 0.5]
         """
-        return cls._train(data, "regression", 0, categoricalFeaturesInfo, impurity,
-                          maxDepth, maxBins, numTrees, featureSubsetStrategy, seed)
+        return cls._train(data, "regression", 0, categoricalFeaturesInfo, numTrees,
+                          featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
 
 
 def _test():