From 47d702399b35580977d2e47a3a344f06059c860e Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 30 Jun 2015 10:27:55 +0530 Subject: [PATCH] Use np.allclose and treeEnsembleModels -> TreeEnsembleModels --- python/pyspark/ml/classification.py | 16 +++++++++------- python/pyspark/ml/regression.py | 20 +++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 53572c67a384e..89117e492846b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -19,7 +19,7 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * from pyspark.ml.regression import ( - RandomForestParams, DecisionTreeModel, treeEnsembleModels) + RandomForestParams, DecisionTreeModel, TreeEnsembleModels) from pyspark.mllib.common import inherit_doc @@ -290,6 +290,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred It supports both binary and multiclass labels, as well as both continuous and categorical features. + >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer >>> df = sqlContext.createDataFrame([ @@ -300,8 +301,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> td = si_model.transform(df) >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42) >>> model = rf.fit(td) - >>> model.treeWeights - [1.0, 1.0] + >>> allclose(model.treeWeights, [1.0, 1.0]) + True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -431,7 +432,7 @@ def getFeatureSubsetStrategy(self): return self.getOrDefault(self.featureSubsetStrategy) -class RandomForestClassificationModel(treeEnsembleModels): +class RandomForestClassificationModel(TreeEnsembleModels): """ Model fitted by RandomForestClassifier. 
""" @@ -446,6 +447,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol It supports binary labels, as well as both continuous and categorical features. Note: Multiclass labels are not currently supported. + >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer >>> df = sqlContext.createDataFrame([ @@ -456,8 +458,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> td = si_model.transform(df) >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed") >>> model = gbt.fit(td) - >>> model.treeWeights - [1.0, 0.1, 0.1, 0.1, 0.1] + >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) + True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -568,7 +570,7 @@ def getStepSize(self): return self.getOrDefault(self.stepSize) -class GBTClassificationModel(treeEnsembleModels): +class GBTClassificationModel(TreeEnsembleModels): """ Model fitted by GBTClassifier. """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index ffa43459eea01..2142b2a7bd966 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -173,9 +173,9 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> dt = DecisionTreeRegressor(maxDepth=2) >>> model = dt.fit(df) >>> model.depth - 2 - >>> model.numNodes 1 + >>> model.numNodes + 3 >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -261,7 +261,7 @@ def __repr__(self): @inherit_doc -class treeEnsembleModels(JavaModel): +class TreeEnsembleModels(JavaModel): @property def treeWeights(self): @@ -286,14 +286,15 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi learning algorithm for regression. 
It supports both continuous and categorical features. + >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) >>> model = rf.fit(df) - >>> model.treeWeights - [1.0, 1.0] + >>> allclose(model.treeWeights, [1.0, 1.0]) + True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -424,7 +425,7 @@ def getFeatureSubsetStrategy(self): return self.getOrDefault(self.featureSubsetStrategy) -class RandomForestRegressionModel(treeEnsembleModels): +class RandomForestRegressionModel(TreeEnsembleModels): """ Model fitted by RandomForestRegressor. """ @@ -438,14 +439,15 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, learning algorithm for regression. It supports both continuous and categorical features. + >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> gbt = GBTRegressor(maxIter=5, maxDepth=2) >>> model = gbt.fit(df) - >>> model.treeWeights - [1.0, 0.1, 0.1, 0.1, 0.1] + >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) + True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -555,7 +557,7 @@ def getStepSize(self): return self.getOrDefault(self.stepSize) -class GBTRegressionModel(treeEnsembleModels): +class GBTRegressionModel(TreeEnsembleModels): """ Model fitted by GBTRegressor. """