From 565d47627953bd5e420b81d48a9a80afe4e6f66b Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 17 Nov 2014 11:18:07 -0800 Subject: [PATCH 1/9] add python api for random forest --- .../mllib/api/python/PythonMLLibAPI.scala | 38 +++- .../tree/model/WeightedEnsembleModel.scala | 12 + python/pyspark/mllib/classification.py | 3 +- python/pyspark/mllib/tree.py | 214 ++++++++++++++++-- 4 files changed, 243 insertions(+), 24 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index c8476a5370b6c..a3a0fc3b69d81 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -40,10 +40,10 @@ import org.apache.spark.mllib.regression._ import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} import org.apache.spark.mllib.stat.correlation.CorrelationNames import org.apache.spark.mllib.stat.test.ChiSqTestResult -import org.apache.spark.mllib.tree.DecisionTree +import org.apache.spark.mllib.tree.{RandomForest, DecisionTree} import org.apache.spark.mllib.tree.configuration.{Algo, Strategy} import org.apache.spark.mllib.tree.impurity._ -import org.apache.spark.mllib.tree.model.DecisionTreeModel +import org.apache.spark.mllib.tree.model.{WeightedEnsembleModel, DecisionTreeModel} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -464,6 +464,40 @@ class PythonMLLibAPI extends Serializable { DecisionTree.train(data.rdd, strategy) } + /** + * Java stub for Python mllib RandomForest.train(). + * This stub returns a handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on exit; + * see the Py4J documentation. + */ + def trainRandomForestModel( + data: JavaRDD[LabeledPoint], + algoStr: String, + numClasses: Int, + categoricalFeaturesInfo: JMap[Int, Int], + impurityStr: String, + maxDepth: Int, + maxBins: Int, + numTrees: Int, + featureSubsetStrategy: String, + seed: Int): WeightedEnsembleModel = { + + val algo = Algo.fromString(algoStr) + val impurity = Impurities.fromString(impurityStr) + val strategy = new Strategy( + algo = algo, + impurity = impurity, + maxDepth = maxDepth, + numClassesForClassification = numClasses, + maxBins = maxBins, + categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap) + if (algo == Algo.Classification) { + RandomForest.trainClassifier(data.rdd, strategy, numTrees, featureSubsetStrategy, seed) + } else { + RandomForest.trainRegressor(data.rdd, strategy, numTrees, featureSubsetStrategy, seed) + } + } + /** * Java stub for mllib Statistics.colStats(X: RDD[Vector]). * TODO figure out return type. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/WeightedEnsembleModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/WeightedEnsembleModel.scala index 7b052d9163a13..287afbca6708d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/model/WeightedEnsembleModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/model/WeightedEnsembleModel.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.tree.model import org.apache.spark.annotation.Experimental +import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.EnsembleCombiningStrategy._ @@ -118,6 +119,17 @@ class WeightedEnsembleModel( */ def predict(features: RDD[Vector]): RDD[Double] = features.map(x => predict(x)) + + /** + * Predict values for the given data set. + * + * @param features JavaRDD representing data points to be predicted + * @return JavaRDD[Double] where each entry contains the corresponding prediction + */ + def predict(features: JavaRDD[Vector]): RDD[Double] = { + features.rdd.map(x => predict(x)).toJavaRDD() + } + /** * Print a summary of the model. */ diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index b654813fb4cf6..ce53a8a523e5b 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -69,7 +69,8 @@ def predict(self, x): else: exp_margin = exp(margin) prob = exp_margin / (1 + exp_margin) - return 1 if prob > 0.5 else 0 + return 1 if prob 0.5 else 0 + return 0 if prob < 0.5 else 1 class LogisticRegressionWithSGD(object): diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index ef0d556fac7bc..5a4815333e2fa 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -15,21 +15,22 @@ # limitations under the License. # +from __future__ import absolute_import + +import random + from pyspark import SparkContext, RDD from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint -__all__ = ['DecisionTreeModel', 'DecisionTree'] +__all__ = ['DecisionTreeModel', 'DecisionTree', 'WeightedEnsembleModel', 'RandomForest'] class DecisionTreeModel(JavaModelWrapper): """ A decision tree model for classification or regression. - - EXPERIMENTAL: This is an experimental API. - It will probably be modified in future. """ def predict(self, x): """ @@ -51,11 +52,11 @@ def depth(self): return self._java_model.depth() def __repr__(self): - """ Print summary of model. """ + """ summary of model. """ return self._java_model.toString() def toDebugString(self): - """ Print full model. """ + """ full model. """ return self._java_model.toDebugString() @@ -64,14 +65,10 @@ class DecisionTree(object): """ Learning algorithm for a decision tree model for classification or regression. - - EXPERIMENTAL: This is an experimental API. - It will probably be modified for Spark v1.2. - """ - @staticmethod - def _train(data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32, + @classmethod + def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" @@ -79,8 +76,8 @@ def _train(data, type, numClasses, features, impurity="gini", maxDepth=5, maxBin impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) return DecisionTreeModel(model) - @staticmethod - def trainClassifier(data, numClasses, categoricalFeaturesInfo, + @classmethod + def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ @@ -132,11 +129,11 @@ def trainClassifier(data, numClasses, categoricalFeaturesInfo, >>> model.predict(rdd).collect() [1.0, 0.0] """ - return DecisionTree._train(data, "classification", numClasses, categoricalFeaturesInfo, - impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) + return cls._train(data, "classification", numClasses, categoricalFeaturesInfo, + impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) - @staticmethod - def trainRegressor(data, categoricalFeaturesInfo, + @classmethod + def trainRegressor(cls, data, categoricalFeaturesInfo, impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ @@ -160,7 +157,6 @@ def trainRegressor(data, categoricalFeaturesInfo, Example usage: - >>> from numpy import array >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import DecisionTree >>> from pyspark.mllib.linalg import SparseVector @@ -181,8 +177,184 @@ def trainRegressor(data, categoricalFeaturesInfo, >>> model.predict(rdd).collect() [1.0, 0.0] """ - return DecisionTree._train(data, "regression", 0, categoricalFeaturesInfo, - impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) + return cls._train(data, "regression", 0, categoricalFeaturesInfo, + impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) + + +class WeightedEnsembleModel(JavaModelWrapper): + """ + A model trained by RandomForest + """ + def predict(self, x): + """ + Predict values for a single data point or an RDD of points using the model trained. + """ + if isinstance(x, RDD): + return self.call("predict", x.map(_convert_to_vector)) + + else: + return self.call("predict", _convert_to_vector(x)) + + def numWeakHypotheses(self): + """ + Get number of trees in forest. + """ + return self.call("numWeakHypotheses") + + def totalNumNodes(self): + """ + Get total number of nodes, summed over all trees in the forest. + """ + return self.call("totalNumNodes") + + def __repr__(self): + """ Summary of model """ + return self._java_model.toString() + + def toDebugString(self): + """Full model """ + return self._java_model.toDebugString() + + +class RandomForest(object): + + supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") + + @classmethod + def _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, + numTrees, featureSubsetStrategy, seed): + first = data.first() + assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" + if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies: + raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy) + if seed is None: + seed = random.randint(0, 1 << 30) + model = callMLlibFunc("trainRandomForestModel", data, type, numClasses, features, + impurity, maxDepth, maxBins, numTrees, featureSubsetStrategy, seed) + return WeightedEnsembleModel(model) + + @classmethod + def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, + seed=None): + """ + Method to train a decision tree model for binary or multiclass classification. + + :param data: Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + Labels should take values {0, 1, ..., numClasses-1}. + :param numClassesForClassification: number of classes for classification. + :param categoricalFeaturesInfo: Map storing arity of categorical features. + E.g., an entry (n -> k) indicates that feature n is categorical + with k categories indexed from 0: {0, 1, ..., k-1}. + :param numTrees: Number of trees in the random forest. + :param featureSubsetStrategy: Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt" for classification and + to "onethird" for regression. + :param impurity: Criterion used for information gain calculation. + Supported values: "gini" (recommended) or "entropy". + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means + 1 internal node + 2 leaf nodes. (default: 4) + :param maxBins: maximum number of bins used for splitting features (default: 100) + :param seed: Random seed for bootstrapping and choosing feature subsets. + :return: WeightedEnsembleModel that can be used for prediction + + Example usage: + + >>> from pyspark.mllib.regression import LabeledPoint + >>> from pyspark.mllib.tree import RandomForest + >>> + >>> data = [ + ... LabeledPoint(0.0, [0.0]), + ... LabeledPoint(0.0, [1.0]), + ... LabeledPoint(1.0, [4.0]), + ... LabeledPoint(1.0, [5.0]) + ... ] + >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 2, seed=42) + >>> model.numWeakHypotheses() + 2 + >>> model.totalNumNodes() + 4 + >>> print model, + WeightedEnsembleModel classifier with 2 trees + >>> print model.toDebugString(), + WeightedEnsembleModel classifier with 2 trees + + Tree 0: + Predict: 1.0 + Tree 1: + If (feature 0 <= 1.0) + Predict: 0.0 + Else (feature 0 > 1.0) + Predict: 1.0 + >>> model.predict([2.0]) + 1.0 + >>> model.predict([0.0]) #TODO: will fix it later + 1.0 + >>> rdd = sc.parallelize([[1.0], [0.0]]) + >>> model.predict(rdd).collect() + [1.0, 1.0] + """ + return cls._train(data, "classification", numClassesForClassification, + categoricalFeaturesInfo, impurity, maxDepth, maxBins, numTrees, + featureSubsetStrategy, seed) + + @classmethod + def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", + impurity="variance", maxDepth=4, maxBins=32, seed=None): + """ + Method to train a decision tree model for regression. + + :param data: Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + Labels are real numbers. + :param categoricalFeaturesInfo: Map storing arity of categorical features. + E.g., an entry (n -> k) indicates that feature n is categorical + with k categories indexed from 0: {0, 1, ..., k-1}. + :param numTrees: Number of trees in the random forest. + :param featureSubsetStrategy: Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt" for classification and + to "onethird" for regression. + :param impurity: Criterion used for information gain calculation. + Supported values: "variance". + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means + 1 internal node + 2 leaf nodes.(default: 4) + :param maxBins: maximum number of bins used for splitting features (default: 100) + :param seed: Random seed for bootstrapping and choosing feature subsets. + :return: WeightedEnsembleModel that can be used for prediction + + Example usage: + + >>> from pyspark.mllib.regression import LabeledPoint + >>> from pyspark.mllib.tree import RandomForest + >>> from pyspark.mllib.linalg import SparseVector + >>> + >>> sparse_data = [ + ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), + ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), + ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), + ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) + ... ] + >>> + >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42) + >>> model.numWeakHypotheses() + 2 + >>> model.totalNumNodes() + 4 + >>> model.predict(SparseVector(2, {1: 1.0})) + 1.0 + >>> model.predict(SparseVector(2, {0: 1.0})) + 0.5 + >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]]) + >>> model.predict(rdd).collect() + [1.0, 0.5] + """ + return cls._train(data, "regression", 0, categoricalFeaturesInfo, impurity, + maxDepth, maxBins, numTrees, featureSubsetStrategy, seed) def _test(): From 89a000fd8e6e15c2ba83d702bbe2f294727f0a4d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 17 Nov 2014 13:11:11 -0800 Subject: [PATCH 2/9] fix docs --- python/pyspark/mllib/classification.py | 3 +-- python/pyspark/mllib/tree.py | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index ce53a8a523e5b..b654813fb4cf6 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -69,8 +69,7 @@ def predict(self, x): else: exp_margin = exp(margin) prob = exp_margin / (1 + exp_margin) - return 1 if prob 0.5 else 0 - return 0 if prob < 0.5 else 1 + return 1 if prob > 0.5 else 0 class LogisticRegressionWithSGD(object): diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 5a4815333e2fa..1a1b580b6b656 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -31,6 +31,8 @@ class DecisionTreeModel(JavaModelWrapper): """ A decision tree model for classification or regression. + + EXPERIMENTAL: This is an experimental API. It will probably be modified in future. """ def predict(self, x): """ @@ -63,8 +65,9 @@ def toDebugString(self): class DecisionTree(object): """ - Learning algorithm for a decision tree model - for classification or regression. + Learning algorithm for a decision tree model for classification or regression. + + EXPERIMENTAL: This is an experimental API. It will probably be modified in future. """ @classmethod @@ -183,7 +186,9 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, class WeightedEnsembleModel(JavaModelWrapper): """ - A model trained by RandomForest + A model trained by :class:`RandomForest` + + EXPERIMENTAL: This is an experimental API. It will probably be modified in future. """ def predict(self, x): """ @@ -217,6 +222,11 @@ def toDebugString(self): class RandomForest(object): + """ + Learning algorithm for a random forest model for classification or regression. + + EXPERIMENTAL: This is an experimental API. It will probably be modified in future. + """ supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") From dae7fc01d1df78e3d4f9a18b90ed553eff48edaa Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 17 Nov 2014 17:54:48 -0800 Subject: [PATCH 3/9] address comments --- python/pyspark/mllib/tree.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 1a1b580b6b656..52ac1c001c215 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -24,7 +24,7 @@ from pyspark.mllib.linalg import _convert_to_vector from pyspark.mllib.regression import LabeledPoint -__all__ = ['DecisionTreeModel', 'DecisionTree', 'WeightedEnsembleModel', 'RandomForest'] +__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel', 'RandomForest'] class DecisionTreeModel(JavaModelWrapper): @@ -185,11 +185,6 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, class WeightedEnsembleModel(JavaModelWrapper): - """ - A model trained by :class:`RandomForest` - - EXPERIMENTAL: This is an experimental API. It will probably be modified in future. - """ def predict(self, x): """ Predict values for a single data point or an RDD of points using the model trained. @@ -217,10 +212,18 @@ def __repr__(self): return self._java_model.toString() def toDebugString(self): - """Full model """ + """ Full model """ return self._java_model.toDebugString() +class RandomForestModel(WeightedEnsembleModel): + """ + A model trained by :class:`RandomForest` + + EXPERIMENTAL: This is an experimental API. It will probably be modified in future. + """ + + class RandomForest(object): """ Learning algorithm for a random forest model for classification or regression. @@ -241,7 +244,7 @@ def _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, seed = random.randint(0, 1 << 30) model = callMLlibFunc("trainRandomForestModel", data, type, numClasses, features, impurity, maxDepth, maxBins, numTrees, featureSubsetStrategy, seed) - return WeightedEnsembleModel(model) + return RandomForestModel(model) @classmethod def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesInfo, numTrees, @@ -250,7 +253,7 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI """ Method to train a decision tree model for binary or multiclass classification. - :param data: Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1, ..., numClasses-1}. :param numClassesForClassification: number of classes for classification. :param categoricalFeaturesInfo: Map storing arity of categorical features. @@ -317,11 +320,12 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt """ Method to train a decision tree model for regression. - :param data: Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: Map storing arity of categorical features. - E.g., an entry (n -> k) indicates that feature n is categorical - with k categories indexed from 0: {0, 1, ..., k-1}. + E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: + {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: Number of features to consider for splits at each node. Supported: "auto" (default), "all", "sqrt", "log2", "onethird". @@ -331,8 +335,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt to "onethird" for regression. :param impurity: Criterion used for information gain calculation. Supported values: "variance". - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means - 1 internal node + 2 leaf nodes.(default: 4) + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes.(default: 4) :param maxBins: maximum number of bins used for splitting features (default: 100) :param seed: Random seed for bootstrapping and choosing feature subsets. :return: WeightedEnsembleModel that can be used for prediction From 885abee042bb64771f53dab7814ed914a68b62a1 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 17 Nov 2014 17:55:45 -0800 Subject: [PATCH 4/9] address comments --- .../mllib/api/python/PythonMLLibAPI.scala | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index a3a0fc3b69d81..3f946ebe554ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -471,16 +471,16 @@ class PythonMLLibAPI extends Serializable { * see the Py4J documentation. */ def trainRandomForestModel( - data: JavaRDD[LabeledPoint], - algoStr: String, - numClasses: Int, - categoricalFeaturesInfo: JMap[Int, Int], - impurityStr: String, - maxDepth: Int, - maxBins: Int, - numTrees: Int, - featureSubsetStrategy: String, - seed: Int): WeightedEnsembleModel = { + data: JavaRDD[LabeledPoint], + algoStr: String, + numClasses: Int, + categoricalFeaturesInfo: JMap[Int, Int], + impurityStr: String, + maxDepth: Int, + maxBins: Int, + numTrees: Int, + featureSubsetStrategy: String, + seed: Int): WeightedEnsembleModel = { val algo = Algo.fromString(algoStr) val impurity = Impurities.fromString(impurityStr) From 0431746062ce7160777a1034787294aa7893be1d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 20 Nov 2014 09:31:47 -0800 Subject: [PATCH 5/9] rebased --- .../mllib/api/python/PythonMLLibAPI.scala | 4 +- python/pyspark/mllib/tree.py | 50 ++++++++++--------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 9cdbd8e5ae1f5..d9a4e4da8e94a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -43,7 +43,7 @@ import org.apache.spark.mllib.stat.test.ChiSqTestResult import org.apache.spark.mllib.tree.{RandomForest, DecisionTree} import org.apache.spark.mllib.tree.configuration.{Algo, Strategy} import org.apache.spark.mllib.tree.impurity._ -import org.apache.spark.mllib.tree.model.{WeightedEnsembleModel, DecisionTreeModel} +import org.apache.spark.mllib.tree.model.{RandomForestModel, DecisionTreeModel} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -515,7 +515,7 @@ class PythonMLLibAPI extends Serializable { maxBins: Int, numTrees: Int, featureSubsetStrategy: String, - seed: Int): WeightedEnsembleModel = { + seed: Int): RandomForestModel = { val algo = Algo.fromString(algoStr) val impurity = Impurities.fromString(impurityStr) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 52ac1c001c215..f472a45a2f92d 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -184,7 +184,12 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) -class WeightedEnsembleModel(JavaModelWrapper): +class RandomForestModel(JavaModelWrapper): + """ + Represents a random forest model. + + EXPERIMENTAL: This is an experimental API. It will probably be modified in future. + """ def predict(self, x): """ Predict values for a single data point or an RDD of points using the model trained. @@ -195,11 +200,11 @@ def predict(self, x): else: return self.call("predict", _convert_to_vector(x)) - def numWeakHypotheses(self): + def numTrees(self): """ Get number of trees in forest. """ - return self.call("numWeakHypotheses") + return self.call("numTrees") def totalNumNodes(self): """ @@ -216,14 +221,6 @@ def toDebugString(self): return self._java_model.toDebugString() -class RandomForestModel(WeightedEnsembleModel): - """ - A model trained by :class:`RandomForest` - - EXPERIMENTAL: This is an experimental API. It will probably be modified in future. - """ - - class RandomForest(object): """ Learning algorithm for a random forest model for classification or regression. @@ -282,18 +279,18 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI >>> data = [ ... LabeledPoint(0.0, [0.0]), ... LabeledPoint(0.0, [1.0]), - ... LabeledPoint(1.0, [4.0]), - ... LabeledPoint(1.0, [5.0]) + ... LabeledPoint(1.0, [2.0]), + ... LabeledPoint(1.0, [3.0]) ... ] - >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 2, seed=42) - >>> model.numWeakHypotheses() - 2 + >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42) + >>> model.numTrees() + 3 >>> model.totalNumNodes() - 4 + 7 >>> print model, - WeightedEnsembleModel classifier with 2 trees + TreeEnsembleModel classifier with 3 trees >>> print model.toDebugString(), - WeightedEnsembleModel classifier with 2 trees + TreeEnsembleModel classifier with 3 trees Tree 0: Predict: 1.0 @@ -302,13 +299,18 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI Predict: 0.0 Else (feature 0 > 1.0) Predict: 1.0 + Tree 2: + If (feature 0 <= 1.0) + Predict: 0.0 + Else (feature 0 > 1.0) + Predict: 1.0 >>> model.predict([2.0]) 1.0 - >>> model.predict([0.0]) #TODO: will fix it later - 1.0 - >>> rdd = sc.parallelize([[1.0], [0.0]]) + >>> model.predict([0.0]) + 0.0 + >>> rdd = sc.parallelize([[3.0], [1.0]]) >>> model.predict(rdd).collect() - [1.0, 1.0] + [1.0, 0.0] """ return cls._train(data, "classification", numClassesForClassification, categoricalFeaturesInfo, impurity, maxDepth, maxBins, numTrees, @@ -355,7 +357,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt ... ] >>> >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42) - >>> model.numWeakHypotheses() + >>> model.numTrees() 2 >>> model.totalNumNodes() 4 From e0df852ab4f353b9f800fe5374195fee5a06aa52 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 20 Nov 2014 10:15:48 -0800 Subject: [PATCH 6/9] fix docs --- python/pyspark/mllib/tree.py | 99 ++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index f472a45a2f92d..8f3cfe6840ec2 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -32,7 +32,8 @@ class DecisionTreeModel(JavaModelWrapper): """ A decision tree model for classification or regression. - EXPERIMENTAL: This is an experimental API. It will probably be modified in future. + EXPERIMENTAL: This is an experimental API. + It will probably be modified in future. """ def predict(self, x): """ @@ -67,7 +68,8 @@ class DecisionTree(object): """ Learning algorithm for a decision tree model for classification or regression. - EXPERIMENTAL: This is an experimental API. It will probably be modified in future. + EXPERIMENTAL: This is an experimental API. + It will probably be modified in future. """ @classmethod @@ -98,8 +100,8 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. - :param minInstancesPerNode: Min number of instances required at child nodes to create - the parent split + :param minInstancesPerNode: Min number of instances required at child + nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel @@ -153,8 +155,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. - :param minInstancesPerNode: Min number of instances required at child nodes to create - the parent split + :param minInstancesPerNode: Min number of instances required at child + nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel @@ -188,11 +190,13 @@ class RandomForestModel(JavaModelWrapper): """ Represents a random forest model. - EXPERIMENTAL: This is an experimental API. It will probably be modified in future. + EXPERIMENTAL: This is an experimental API. + It will probably be modified in future. """ def predict(self, x): """ - Predict values for a single data point or an RDD of points using the model trained. + Predict values for a single data point or an RDD of points using + the model trained. """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -225,7 +229,8 @@ class RandomForest(object): """ Learning algorithm for a random forest model for classification or regression. - EXPERIMENTAL: This is an experimental API. It will probably be modified in future. + EXPERIMENTAL: This is an experimental API. + It will probably be modified in future. """ supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") @@ -248,28 +253,31 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, seed=None): """ - Method to train a decision tree model for binary or multiclass classification. + Method to train a decision tree model for binary or multiclass + classification. - :param data: Training dataset: RDD of LabeledPoint. - Labels should take values {0, 1, ..., numClasses-1}. + :param data: Training dataset: RDD of LabeledPoint. Labels should take + values {0, 1, ..., numClasses-1}. :param numClassesForClassification: number of classes for classification. :param categoricalFeaturesInfo: Map storing arity of categorical features. - E.g., an entry (n -> k) indicates that feature n is categorical - with k categories indexed from 0: {0, 1, ..., k-1}. + E.g., an entry (n -> k) indicates that feature n is categorical + with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt" for classification and - to "onethird" for regression. + :param featureSubsetStrategy: Number of features to consider for splits at + each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt" for classification and to + "onethird" for regression. :param impurity: Criterion used for information gain calculation. - Supported values: "gini" (recommended) or "entropy". - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means - 1 internal node + 2 leaf nodes. (default: 4) - :param maxBins: maximum number of bins used for splitting features (default: 100) - :param seed: Random seed for bootstrapping and choosing feature subsets. - :return: WeightedEnsembleModel that can be used for prediction + Supported values: "gini" (recommended) or "entropy". + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 4) + :param maxBins: maximum number of bins used for splitting features + (default: 100) + :param seed: Random seed for bootstrapping and choosing feature subsets. + :return: RandomForestModel that can be used for prediction Example usage: @@ -322,26 +330,29 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt """ Method to train a decision tree model for regression. - :param data: Training dataset: RDD of LabeledPoint. - Labels are real numbers. - :param categoricalFeaturesInfo: Map storing arity of categorical features. - E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. + :param data: Training dataset: RDD of LabeledPoint. Labels are + real numbers. + :param categoricalFeaturesInfo: Map storing arity of categorical + features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: + {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt" for classification and - to "onethird" for regression. + :param featureSubsetStrategy: Number of features to consider for + splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt" for classification and + to "onethird" for regression. :param impurity: Criterion used for information gain calculation. - Supported values: "variance". - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; - depth 1 means 1 internal node + 2 leaf nodes.(default: 4) - :param maxBins: maximum number of bins used for splitting features (default: 100) - :param seed: Random seed for bootstrapping and choosing feature subsets. - :return: WeightedEnsembleModel that can be used for prediction + Supported values: "variance". + :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 + leaf node; depth 1 means 1 internal node + 2 leaf nodes. + (default: 4) + :param maxBins: maximum number of bins used for splitting features + (default: 100) + :param seed: Random seed for bootstrapping and choosing feature subsets. + :return: RandomForestModel that can be used for prediction Example usage: From 4ca593d2751a20e9b53b4b047c3dbb5eed09773d Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 20 Nov 2014 10:45:14 -0800 Subject: [PATCH 7/9] fix docs --- python/docs/epytext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/epytext.py b/python/docs/epytext.py index 19fefbfc057a4..e884d5e6b19c7 100644 --- a/python/docs/epytext.py +++ b/python/docs/epytext.py @@ -1,7 +1,7 @@ import re RULES = ( - (r"<[\w.]+>", r""), + (r"<(!BLANKLINE)[\w.]+>", r""), (r"L{([\w.()]+)}", r":class:`\1`"), (r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"), (r"C{([\w.()]+)}", r":class:`\1`"), From 53cf510ca2cfaefd6c7b1e466308b486b2b46df1 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 20 Nov 2014 13:00:47 -0800 Subject: [PATCH 8/9] fix docs --- .../org/apache/spark/mllib/tree/RandomForest.scala | 12 ++++-------- python/pyspark/mllib/tree.py | 6 ++---- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala index ca0b6eea9aeb6..3ae6fa2a0ec2f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala @@ -230,8 +230,7 @@ object RandomForest extends Serializable with Logging { * Supported: "auto" (default), "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees > 1 (forest) set to "sqrt". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction */ @@ -261,8 +260,7 @@ object RandomForest extends Serializable with Logging { * Supported: "auto" (default), "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees > 1 (forest) set to "sqrt". * @param impurity Criterion used for information gain calculation. * Supported values: "gini" (recommended) or "entropy". * @param maxDepth Maximum depth of the tree. @@ -318,8 +316,7 @@ object RandomForest extends Serializable with Logging { * Supported: "auto" (default), "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees > 1 (forest) set to "onethird". * @param seed Random seed for bootstrapping and choosing feature subsets. * @return a random forest model that can be used for prediction */ @@ -348,8 +345,7 @@ object RandomForest extends Serializable with Logging { * Supported: "auto" (default), "all", "sqrt", "log2", "onethird". * If "auto" is set, this parameter is set based on numTrees: * if numTrees == 1, set to "all"; - * if numTrees > 1 (forest) set to "sqrt" for classification and - * to "onethird" for regression. + * if numTrees > 1 (forest) set to "onethird". * @param impurity Criterion used for information gain calculation. * Supported values: "variance". * @param maxDepth Maximum depth of the tree. diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 8f3cfe6840ec2..15a868e0daf2d 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -268,8 +268,7 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI Supported: "auto" (default), "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt" for classification and to - "onethird" for regression. + if numTrees > 1 (forest) set to "sqrt". :param impurity: Criterion used for information gain calculation. Supported values: "gini" (recommended) or "entropy". :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; @@ -342,8 +341,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt Supported: "auto" (default), "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt" for classification and - to "onethird" for regression. + if numTrees > 1 (forest) set to "onethird" for regression. :param impurity: Criterion used for information gain calculation. Supported values: "variance". :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 From 8003dfc674fedeca520cfceaa6e48845cd5138be Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 20 Nov 2014 13:09:15 -0800 Subject: [PATCH 9/9] reorder --- .../spark/mllib/api/python/PythonMLLibAPI.scala | 4 ++-- python/pyspark/mllib/tree.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index d9a4e4da8e94a..b6f7618171224 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -510,11 +510,11 @@ class PythonMLLibAPI extends Serializable { algoStr: String, numClasses: Int, categoricalFeaturesInfo: JMap[Int, Int], + numTrees: Int, + featureSubsetStrategy: String, impurityStr: String, maxDepth: Int, maxBins: Int, - numTrees: Int, - featureSubsetStrategy: String, seed: Int): RandomForestModel = { val algo = Algo.fromString(algoStr) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 15a868e0daf2d..46e253991aa56 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -236,16 +236,17 @@ class RandomForest(object): supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") @classmethod - def _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, - numTrees, featureSubsetStrategy, seed): + def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy, impurity, maxDepth, maxBins, seed): first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies: raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy) if seed is None: seed = random.randint(0, 1 << 30) - model = callMLlibFunc("trainRandomForestModel", data, type, numClasses, features, - impurity, maxDepth, maxBins, numTrees, featureSubsetStrategy, seed) + model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses, + categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, + maxDepth, maxBins, seed) return RandomForestModel(model) @classmethod @@ -320,8 +321,8 @@ def trainClassifier(cls, data, numClassesForClassification, categoricalFeaturesI [1.0, 0.0] """ return cls._train(data, "classification", numClassesForClassification, - categoricalFeaturesInfo, impurity, maxDepth, maxBins, numTrees, - featureSubsetStrategy, seed) + categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, + maxDepth, maxBins, seed) @classmethod def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", @@ -378,8 +379,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt >>> model.predict(rdd).collect() [1.0, 0.5] """ - return cls._train(data, "regression", 0, categoricalFeaturesInfo, impurity, - maxDepth, maxBins, numTrees, featureSubsetStrategy, seed) + return cls._train(data, "regression", 0, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy, impurity, maxDepth, maxBins, seed) def _test():