From 1ffb02fcf6e502baa3dfdf750f480fb16cf06c42 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 4 May 2016 21:23:39 -0700 Subject: [PATCH 01/13] Add totalNumNodes and toDebugString to TreeEnsembleModels --- python/pyspark/ml/regression.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 04f566dfecd60..51b0671981a97 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -743,6 +743,18 @@ def treeWeights(self): """Return the weights for each tree""" return list(self._call_java("javaTreeWeights")) + @property + @since("2.0.0") + def totalNumNodes(self): + """Total number of nodes, summed over all trees in the ensemble.""" + return self._call_java("totalNumNodes") + + @property + @since("2.0.0") + def toDebugString(self): + """Full description of model.""" + return self._call_java("toDebugString") + def __repr__(self): return self._call_java("toString") From 6400e9e3d7e096b05475cca548e31aabca8cbaff Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 4 May 2016 21:23:53 -0700 Subject: [PATCH 02/13] Add tests using GBTClassifier --- python/pyspark/ml/classification.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index f032963334469..db9f9d6153537 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -760,6 +760,10 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 + >>> model.totalNumNodes + 15 + >>> print(str(model.toDebugString)) + GBTClassificationModel (uid=...)...with 5 trees... 
>>> gbtc_path = temp_path + "gbtc" >>> gbt.save(gbtc_path) >>> gbt2 = GBTClassifier.load(gbtc_path) From 91c293af911d173670fb71857a38601745322285 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 5 May 2016 11:15:19 -0700 Subject: [PATCH 03/13] Add trees method to TreeEnsembleModels and override with type specific in the ensemble models --- python/pyspark/ml/classification.py | 12 ++++++++++++ python/pyspark/ml/regression.py | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index db9f9d6153537..cc320f40cf3b9 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -728,6 +728,12 @@ def featureImportances(self): """ return self._call_java("featureImportances") + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. Warning: These have null parent Estimators.""" + return [DecisionTreeClassificationModel(m) for m in list(self._call_java("trees"))] + @inherit_doc class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, @@ -861,6 +867,12 @@ def featureImportances(self): """ return self._call_java("featureImportances") + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. Warning: These have null parent Estimators.""" + return [DecisionTreeClassificationModel(m) for m in list(self._call_java("trees"))] + @inherit_doc class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol, diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 51b0671981a97..69fdf60ebc88e 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -737,6 +737,12 @@ class TreeEnsembleModels(JavaModel): .. versionadded:: 1.5.0 """ + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. 
Warning: These have null parent Estimators.""" + return [DecisionTreeModel(m) for m in list(self._call_java("trees"))] + @property @since("1.5.0") def treeWeights(self): @@ -881,6 +887,12 @@ class RandomForestRegressionModel(TreeEnsembleModels, JavaMLWritable, JavaMLRead .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. Warning: These have null parent Estimators.""" + return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] + @property @since("2.0.0") def featureImportances(self): @@ -1020,6 +1032,12 @@ def featureImportances(self): """ return self._call_java("featureImportances") + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. Warning: These have null parent Estimators.""" + return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] + @inherit_doc class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, From 2d8c40ff3a76e5605f2663c0a68554c6ea60e231 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 5 May 2016 11:32:04 -0700 Subject: [PATCH 04/13] Check the two types of trees calls --- python/pyspark/ml/classification.py | 2 ++ python/pyspark/ml/regression.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index cc320f40cf3b9..0b0ba9b91f46a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -648,6 +648,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 + >>> model.trees + [DecisionTreeClassificationModel (uid=...) of depth..., DecisionTreeClassificationModel...] 
>>> rfc_path = temp_path + "/rfc" >>> rf.save(rfc_path) >>> rf2 = RandomForestClassifier.load(rfc_path) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 69fdf60ebc88e..97652c2ed7476 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -818,6 +818,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 + >>> model.trees + [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...] >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 0.5 From 26d7fed3c22f181574c9f77acf27634aaf7e14fd Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 5 May 2016 12:54:47 -0700 Subject: [PATCH 05/13] Add toDebugString to DecisionTreeModel --- python/pyspark/ml/classification.py | 2 ++ python/pyspark/ml/regression.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 0b0ba9b91f46a..cc57177421abc 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -510,6 +510,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred 1 >>> model.featureImportances SparseVector(1, {0: 1.0}) + >>> print(model.toDebugString) + DecisionTreeClassificationModel (uid=...) of depth 1 with 3 nodes... 
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> result = model.transform(test0).head() >>> result.prediction diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 97652c2ed7476..b2d1a9d42b9b1 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -726,6 +726,12 @@ def depth(self): """Return depth of the decision tree.""" return self._call_java("depth") + @property + @since("2.0.0") + def toDebugString(self): + """Full description of model.""" + return self._call_java("toDebugString") + def __repr__(self): return self._call_java("toString") From 5f8b0cec5f9dcd4f6715835edb50248f21faa27e Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sat, 7 May 2016 20:16:23 -0700 Subject: [PATCH 06/13] Make the change to DecisionTreeRegressionModel in the base and just depend on Classification and other tree ensemble models overriding if they don't match DecisionTreeRegressionModel --- python/pyspark/ml/regression.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index b2d1a9d42b9b1..0c8a6c0cf9896 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -747,7 +747,7 @@ class TreeEnsembleModels(JavaModel): @since("2.0.0") def trees(self): """Trees in this ensemble. Warning: These have null parent Estimators.""" - return [DecisionTreeModel(m) for m in list(self._call_java("trees"))] + return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] @property @since("1.5.0") @@ -895,12 +895,6 @@ class RandomForestRegressionModel(TreeEnsembleModels, JavaMLWritable, JavaMLRead .. versionadded:: 1.4.0 """ - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. 
Warning: These have null parent Estimators.""" - return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] - @property @since("2.0.0") def featureImportances(self): @@ -1040,12 +1034,6 @@ def featureImportances(self): """ return self._call_java("featureImportances") - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. Warning: These have null parent Estimators.""" - return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] - @inherit_doc class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, From 17cae03515bd51d6107eabaf44269004db4ac50b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 9 May 2016 11:27:10 -0700 Subject: [PATCH 07/13] Remove override trees from GBT since it isn't boosted classifier trees rather boosted regression trees for classification --- python/pyspark/ml/classification.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index cc57177421abc..c74694620d99b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -871,12 +871,6 @@ def featureImportances(self): """ return self._call_java("featureImportances") - @property - @since("2.0.0") - def trees(self): - """Trees in this ensemble. 
Warning: These have null parent Estimators.""" - return [DecisionTreeClassificationModel(m) for m in list(self._call_java("trees"))] - @inherit_doc class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol, From 3cfc9965f7d158565fe75e056c26cfd73e93a09d Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 10 May 2016 11:28:16 -0700 Subject: [PATCH 08/13] Add new option to PyDoc from https://github.com/apache/spark/pull/11989 --- python/pyspark/ml/regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 0c8a6c0cf9896..e3637b212e1e0 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -580,7 +580,7 @@ class RandomForestParams(TreeEnsembleParams): featureSubsetStrategy = \ Param(Params._dummy(), "featureSubsetStrategy", "The number of features to consider for splits at each tree node. Supported " + - "options: " + ", ".join(supportedFeatureSubsetStrategies), + "options: " + ", ".join(supportedFeatureSubsetStrategies) + " (0.0-1.0], [1-n].", typeConverter=TypeConverters.toString) def __init__(self): From 4b183ec2eb6d32d0c9da2ed63891bc2e96290c53 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 13 May 2016 10:53:15 -0700 Subject: [PATCH 09/13] Revert "Make the change to DecisionTreeRegressionModel in the base and just depend on Classification and other tree ensemble models overriding if they don't match DecisionTreeRegressionModel" This reverts commit 5f8b0cec5f9dcd4f6715835edb50248f21faa27e. --- python/pyspark/ml/regression.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index e3637b212e1e0..c3af33d6b8c3a 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -747,7 +747,7 @@ class TreeEnsembleModels(JavaModel): @since("2.0.0") def trees(self): """Trees in this ensemble. 
Warning: These have null parent Estimators.""" - return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] + return [DecisionTreeModel(m) for m in list(self._call_java("trees"))] @property @since("1.5.0") @@ -895,6 +895,12 @@ class RandomForestRegressionModel(TreeEnsembleModels, JavaMLWritable, JavaMLRead .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. Warning: These have null parent Estimators.""" + return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] + @property @since("2.0.0") def featureImportances(self): @@ -1034,6 +1040,12 @@ def featureImportances(self): """ return self._call_java("featureImportances") + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. Warning: These have null parent Estimators.""" + return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] + @inherit_doc class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, From 30219a29415abea441ff5fa7baca3db15581777b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 16 May 2016 01:23:54 -0700 Subject: [PATCH 10/13] Override trees in GBTClassificationModel to return DecisionTreeRegressionModels --- python/pyspark/ml/classification.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 9a3420b3397e6..d045c91b4a180 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -883,6 +883,12 @@ def featureImportances(self): """ return self._call_java("featureImportances") + @property + @since("2.0.0") + def trees(self): + """Trees in this ensemble. 
Warning: These have null parent Estimators.""" + return [DecisionTreeRegressionModel(m) for m in list(self._call_java("trees"))] + @inherit_doc class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol, From 05ec32a2a8d304c9c3c55cc050e0698c6b02b175 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 18 May 2016 14:55:07 -0700 Subject: [PATCH 11/13] remove superfluous str in docstring print --- python/pyspark/ml/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index d045c91b4a180..de8849a82b68d 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -784,7 +784,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol 1.0 >>> model.totalNumNodes 15 - >>> print(str(model.toDebugString)) + >>> print(model.toDebugString) GBTClassificationModel (uid=...)...with 5 trees... >>> gbtc_path = temp_path + "gbtc" >>> gbt.save(gbtc_path) From 2e2c976266792dec171663eafe7d5cc2cc7a4742 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 19 May 2016 11:58:18 -0700 Subject: [PATCH 12/13] Add getNumTrees as well --- python/pyspark/ml/regression.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 2eef22cfdc1f6..62ba5a4ab0a25 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -766,6 +766,12 @@ def trees(self): """Trees in this ensemble. 
Warning: These have null parent Estimators.""" return [DecisionTreeModel(m) for m in list(self._call_java("trees"))] + @property + @since("2.0.0") + def getNumTrees(self): + """Number of trees in ensemble.""" + return self._call_java("getNumTrees") + @property @since("1.5.0") def treeWeights(self): From e498db7dc06b41cf9856c3f3796da419a3dc58bd Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 23 May 2016 15:39:55 -0700 Subject: [PATCH 13/13] Add a doctest with getNumTrees --- python/pyspark/ml/regression.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 764a6db41bb34..bec0acd02d243 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -853,6 +853,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi 0.0 >>> model.trees [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...] + >>> model.getNumTrees + 2 >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 0.5