From 7daf4adf08460b2bfa4477a2ebcd75202fb0f288 Mon Sep 17 00:00:00 2001
From: vijaykiran
Date: Tue, 5 Jan 2016 12:36:19 +0100
Subject: [PATCH 1/9] [SPARK-12634][DOC] Update param descriptions

Updates the `param` descriptions to be consistent. See [SPARK-11219]
for more details.
---
 python/pyspark/mllib/tree.py | 285 ++++++++++++++++++++---------------
 1 file changed, 165 insertions(+), 120 deletions(-)

diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 0001b60093a69..954717289cacb 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -92,8 +92,8 @@ def predict(self, x):
         transformation or action.
         Call predict directly on the RDD instead.
 
-        :param x: Data point (feature vector),
-            or an RDD of data points (feature vectors).
+        :param x:
+          Data point (feature vector), or an RDD of data points (feature vectors).
         """
         if isinstance(x, RDD):
             return self.call("predict", x.map(_convert_to_vector))
@@ -154,21 +154,32 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo,
         """
         Train a DecisionTreeModel for classification.
 
-        :param data: Training data: RDD of LabeledPoint.
-            Labels are integers {0,1,...,numClasses}.
-        :param numClasses: Number of classes for classification.
-        :param categoricalFeaturesInfo: Map from categorical feature index
-            to number of categories.
-            Any feature not in this map
-            is treated as continuous.
-        :param impurity: Supported values: "entropy" or "gini"
-        :param maxDepth: Max depth of tree.
-            E.g., depth 0 means 1 leaf node.
-            Depth 1 means 1 internal node + 2 leaf nodes.
-        :param maxBins: Number of bins used for finding splits at each node.
-        :param minInstancesPerNode: Min number of instances required at child
-            nodes to create the parent split
-        :param minInfoGain: Min info gain required to create a split
+        :param data:
+          Training data: RDD of LabeledPoint. Labels are integers
+          {0,1,...,numClasses}.
+        :param numClasses:
+          Number of classes for classification.
+        :param categoricalFeaturesInfo:
+          Map from categorical feature index to number of categories.
+          Any feature not in this map is treated as continuous.
+        :param impurity:
+          Supported values: "entropy" or "gini".
+          (default: "gini")
+        :param maxDepth:
+          Max depth of tree. E.g., depth 0 means 1 leaf node.
+          Depth 1 means 1 internal node + 2 leaf nodes.
+          (default: 5)
+        :param maxBins:
+          Number of bins used for finding splits at each node.
+          (default: 32)
+        :param minInstancesPerNode:
+          Min number of instances required at child nodes to create
+          the parent split.
+          (default: 1)
+        :param minInfoGain:
+          Min info gain required to create a split.
+          (default: 0.0)
+        :return: DecisionTreeModel
 
         Example usage:
@@ -213,20 +224,28 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
         """
         Train a DecisionTreeModel for regression.
 
-        :param data: Training data: RDD of LabeledPoint.
-            Labels are real numbers.
-        :param categoricalFeaturesInfo: Map from categorical feature
-            index to number of categories.
-            Any feature not in this map is treated as continuous.
-        :param impurity: Supported values: "variance"
-        :param maxDepth: Max depth of tree.
-            E.g., depth 0 means 1 leaf node.
-            Depth 1 means 1 internal node + 2 leaf nodes.
-        :param maxBins: Number of bins used for finding splits at each
-            node.
-        :param minInstancesPerNode: Min number of instances required at
-            child nodes to create the parent split
-        :param minInfoGain: Min info gain required to create a split
+        :param data:
+          Training data: RDD of LabeledPoint. Labels are real numbers.
+ :param categoricalFeaturesInfo: + Map from categorical feature index to number of categories. Any + feature not in this map is treated as continuous. + :param impurity: + Supported values: "variance" + (default: "variance") + :param maxDepth: + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means + 1 internal node + 2 leaf nodes. + (default: 5) + :param maxBins: + Number of bins used for finding splits at each node. + (default: 32) + :param minInstancesPerNode: + Min number of instances required at child nodes to create the + parent split. + (default: 1) + :param minInfoGain: + Min info gain required to create a split. + (default: 0.0) :return: DecisionTreeModel Example usage: @@ -305,30 +324,39 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Method to train a decision tree model for binary or multiclass classification. - :param data: Training dataset: RDD of LabeledPoint. Labels - should take values {0, 1, ..., numClasses-1}. - :param numClasses: number of classes for classification. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that - feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. - :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for - splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt". - :param impurity: Criterion used for information gain calculation. - Supported values: "gini" (recommended) or "entropy". - :param maxDepth: Maximum depth of the tree. - E.g., depth 0 means 1 leaf node; depth 1 means - 1 internal node + 2 leaf nodes. (default: 4) - :param maxBins: maximum number of bins used for splitting - features - (default: 32) - :param seed: Random seed for bootstrapping and choosing feature - subsets. + :param data: Training dataset: + RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + :param numClasses: + Number of classes for classification. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. + :param numTrees: + Number of trees in the random forest. + :param featureSubsetStrategy: + Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt". + (default: "auto") + :param impurity: + Criterion used for information gain calculation. + Supported values: "gini" (recommended) or "entropy". + (default: "gini") + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. + (default: 4) + :param maxBins: + Maximum number of bins used for splitting features + (default: 32) + :param seed: + Random seed for bootstrapping and choosing feature subsets. + (default: None) + :return: RandomForestModel that can be used for prediction Example usage: @@ -385,29 +413,35 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt """ Method to train a decision tree model for regression. - :param data: Training dataset: RDD of LabeledPoint. 
Labels are - real numbers. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for - splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "onethird" for regression. - :param impurity: Criterion used for information gain - calculation. - Supported values: "variance". - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. (default: 4) - :param maxBins: maximum number of bins used for splitting - features (default: 32) - :param seed: Random seed for bootstrapping and choosing feature - subsets. + :param data: + Training dataset: RDD of LabeledPoint. Labels are real numbers. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param numTrees: + Number of trees in the random forest. + :param featureSubsetStrategy: + Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "onethird" for regression. + (default: "auto") + :param impurity: + Criterion used for information gain calculation. + Supported values: "variance". + (default: "variance") + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 + means 1 internal node + 2 leaf nodes. + (default: 4) + :param maxBins: + Maximum number of bins used for splitting features. + (default: 32) + :param seed: + Random seed for bootstrapping and choosing feature subsets. + (default: None) :return: RandomForestModel that can be used for prediction Example usage: @@ -483,28 +517,33 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Method to train a gradient-boosted trees model for classification. - :param data: Training dataset: RDD of LabeledPoint. - Labels should take values {0, 1}. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param loss: Loss function used for minimization during gradient - boosting. Supported: {"logLoss" (default), - "leastSquaresError", "leastAbsoluteError"}. - :param numIterations: Number of iterations of boosting. - (default: 100) - :param learningRate: Learning rate for shrinking the - contribution of each estimator. The learning rate - should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. (default: 3) - :param maxBins: maximum number of bins used for splitting - features (default: 32) DecisionTree requires maxBins >= max categories + :param data: + Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. 
E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param loss: + Loss function used for minimization during gradient boosting. + Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + (default: "logLoss") + :param numIterations: + Number of iterations of boosting. + (default: 100) + :param learningRate: + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. + (default: 3) + :param maxBins: + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories + (default: 32) + :return: GradientBoostedTreesModel that can be used for - prediction + prediction Example usage: @@ -545,28 +584,34 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, """ Method to train a gradient-boosted trees model for regression. - :param data: Training dataset: RDD of LabeledPoint. Labels are - real numbers. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param loss: Loss function used for minimization during gradient - boosting. Supported: {"logLoss" (default), - "leastSquaresError", "leastAbsoluteError"}. - :param numIterations: Number of iterations of boosting. - (default: 100) - :param learningRate: Learning rate for shrinking the - contribution of each estimator. The learning rate - should be between in the interval (0, 1]. - (default: 0.1) - :param maxBins: maximum number of bins used for splitting - features (default: 32) DecisionTree requires maxBins >= max categories - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. (default: 3) + :param data: + Training dataset: RDD of LabeledPoint. Labels are real numbers. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param loss: + Loss function used for minimization during gradient boosting. + Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + (default: "logLoss") + :param numIterations: + Number of iterations of boosting. + (default: 100) + :param learningRate: + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 + means 1 internal node + 2 leaf nodes. + (default: 3) + :param maxBins: + Maximum number of bins used for splitting features. + DecisionTree requires maxBins >= max categories + (default: 32) + :return: GradientBoostedTreesModel that can be used for - prediction + prediction Example usage: From a5346e266c211995c0d0306c278d63b4bf9a8781 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Wed, 6 Jan 2016 11:41:45 +0100 Subject: [PATCH 2/9] Style Fixes - Update fill-column to 100 on parameter descriptions. 
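For readers following along, here is a minimal usage sketch of the DecisionTree API whose docstrings [PATCH 1/9] reworked. It is illustrative only and not part of any patch in this series: it assumes a running SparkContext bound to `sc`, and the tiny dataset is invented for the example.

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

# Invented two-class training set: small feature values map to label 0.0,
# larger ones to label 1.0.
data = sc.parallelize([
    LabeledPoint(0.0, [0.0]),
    LabeledPoint(0.0, [1.0]),
    LabeledPoint(1.0, [2.0]),
    LabeledPoint(1.0, [3.0]),
])

# Keyword arguments are left at the defaults the new docstrings document:
# impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
# minInfoGain=0.0.
model = DecisionTree.trainClassifier(data, numClasses=2,
                                     categoricalFeaturesInfo={})
print(model.predict([1.5]))   # predict a single feature vector
print(model.toDebugString())  # human-readable view of the learned tree

The same call shape carries through the RandomForest and GradientBoostedTrees wrappers whose docstrings the later patches touch.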
--- python/pyspark/mllib/tree.py | 96 +++++++++++++++++------------------- 1 file changed, 44 insertions(+), 52 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 954717289cacb..5e7d8a0e62ec5 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -155,31 +155,28 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, Train a DecisionTreeModel for classification. :param data: - Training data: RDD of LabeledPoint. Labels are integers - {0,1,...,numClasses}. + Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. - Any feature not in this map is treated as continuous. + Map from categorical feature index to number of categories. Any feature not in this map is + treated as continuous. :param impurity: Supported values: "entropy" or "gini". (default: "gini") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. - Depth 1 means 1 internal node + 2 leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 + leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. (default: 32) :param minInstancesPerNode: - Min number of instances required at child nodes to create - the parent split. + Min number of instances required at child nodes to create the parent split. (default: 1) :param minInfoGain: Min info gain required to create a split. (default: 0.0) - :return: DecisionTreeModel Example usage: @@ -227,14 +224,14 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any - feature not in this map is treated as continuous. + Map from categorical feature index to number of categories. Any feature not in this map is + treated as continuous. :param impurity: - Supported values: "variance" + Supported values: "variance". (default: "variance") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means - 1 internal node + 2 leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 + leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. @@ -325,33 +322,31 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, classification. :param data: Training dataset: - RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. + RDD of LabeledPoint. Labels should take values {0, 1, ..., numClasses-1}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + Number of features to consider for splits at each node. Supported: "auto" (default), "all", + "sqrt", "log2", "onethird". 
If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt". (default: "auto") :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" (recommended) or "entropy". + Criterion used for information gain calculation. Supported values: "gini" (recommended) or + "entropy". (default: "gini") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; - depth 1 means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 4) :param maxBins: - Maximum number of bins used for splitting features + Maximum number of bins used for splitting features. (default: 32) :param seed: Random seed for bootstrapping and choosing feature subsets. @@ -416,25 +411,23 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + Number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", + "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "onethird" for regression. (default: "auto") :param impurity: - Criterion used for information gain calculation. - Supported values: "variance". + Criterion used for information gain calculation. Supported values: "variance". (default: "variance") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 - means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -520,9 +513,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. @@ -531,15 +523,16 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. The learning rate should + be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. 
E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories + Maximum number of bins used for splitting features. DecisionTree requires + maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for @@ -587,9 +580,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that + feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. @@ -598,16 +590,16 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. The learning rate should + be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 - means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal + node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. - DecisionTree requires maxBins >= max categories + Maximum number of bins used for splitting features. DecisionTree requires + maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for From 9337098f54d5b015163c86c319c81aebe124797f Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Fri, 22 Jan 2016 15:16:24 +0100 Subject: [PATCH 3/9] Limit parameter descriptions to 74th column --- python/pyspark/mllib/tree.py | 104 +++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 46 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 5e7d8a0e62ec5..f1d16561ec81a 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -93,7 +93,8 @@ def predict(self, x): Call predict directly on the RDD instead. :param x: - Data point (feature vector), or an RDD of data points (feature vectors). + Data point (feature vector), or an RDD of data points (feature + vectors). """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -155,24 +156,26 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, Train a DecisionTreeModel for classification. :param data: - Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. + Training data: RDD of LabeledPoint. Labels are integers + {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any feature not in this map is - treated as continuous. + Map from categorical feature index to number of categories. 
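(As an illustrative aside, not part of the patch: such a map might look like the sketch below, where the feature indices are hypothetical.)

# Hypothetical map for the parameter described above: feature 0 is
# binary (2 categories) and feature 2 takes 10 categories.
categoricalFeaturesInfo = {0: 2, 2: 10}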
Any + feature not in this map is treated as continuous. :param impurity: Supported values: "entropy" or "gini". (default: "gini") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 - leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 + means 1 internal node + 2 leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. (default: 32) :param minInstancesPerNode: - Min number of instances required at child nodes to create the parent split. + Min number of instances required at child nodes to create the + parent split. (default: 1) :param minInfoGain: Min info gain required to create a split. @@ -224,14 +227,14 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any feature not in this map is - treated as continuous. + Map from categorical feature index to number of categories. Any + feature not in this map is treated as continuous. :param impurity: Supported values: "variance". (default: "variance") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 - leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 + means 1 internal node + 2 leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. @@ -322,28 +325,30 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, classification. :param data: Training dataset: - RDD of LabeledPoint. Labels should take values {0, 1, ..., numClasses-1}. + RDD of LabeledPoint. Labels should take values {0, 1, ..., + numClasses-1}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. Supported: "auto" (default), "all", - "sqrt", "log2", "onethird". + Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt". (default: "auto") :param impurity: - Criterion used for information gain calculation. Supported values: "gini" (recommended) or - "entropy". + Criterion used for information gain calculation. Supported + values: "gini" (recommended) or "entropy". (default: "gini") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + - 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -411,23 +416,25 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. 
E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", - "log2", "onethird". + Number of features to consider for splits at each node. + Supported: "auto", "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "onethird" for regression. (default: "auto") :param impurity: - Criterion used for information gain calculation. Supported values: "variance". + Criterion used for information gain calculation. Supported + values: "variance". (default: "variance") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + - 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -511,28 +518,31 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, classification. :param data: - Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1}. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. - Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + Supported values: {"logLoss", "leastSquaresError", + "leastAbsoluteError"}. (default: "logLoss") :param numIterations: Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. The learning rate should - be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node - + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. DecisionTree requires - maxBins >= max categories + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for @@ -580,26 +590,28 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that - feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. 
E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. - Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + Supported values: {"logLoss", "leastSquaresError", + "leastAbsoluteError"}. (default: "logLoss") :param numIterations: Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. The learning rate should - be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal - node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. DecisionTree requires - maxBins >= max categories + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for From 36be47c4986cd43d04053cc1c458bdd77dd918bc Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Sat, 23 Jan 2016 08:02:21 +0100 Subject: [PATCH 4/9] :return: formatting and minor style fixes --- python/pyspark/mllib/tree.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index f1d16561ec81a..d8d193335c385 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -180,7 +180,8 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, :param minInfoGain: Min info gain required to create a split. (default: 0.0) - :return: DecisionTreeModel + :return: + DecisionTreeModel. Example usage: @@ -246,7 +247,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param minInfoGain: Min info gain required to create a split. (default: 0.0) - :return: DecisionTreeModel + :return: + DecisionTreeModel. Example usage: @@ -324,11 +326,10 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Method to train a decision tree model for binary or multiclass classification. - :param data: Training dataset: - RDD of LabeledPoint. Labels should take values {0, 1, ..., - numClasses-1}. - :param numClasses: - Number of classes for classification. + :param data: + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}.:param numClasses: Number of classes + for classification. :param categoricalFeaturesInfo: Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories @@ -356,8 +357,8 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, :param seed: Random seed for bootstrapping and choosing feature subsets. (default: None) - - :return: RandomForestModel that can be used for prediction + :return: + RandomForestModel that can be used for prediction. Example usage: @@ -442,7 +443,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param seed: Random seed for bootstrapping and choosing feature subsets. 
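(An illustrative aside, not part of the patch: a seeded call might look like the sketch below, reusing the SparkContext `sc` and LabeledPoint RDD `data` assumed in the earlier example; the parameter's default value follows in the docstring.)

from pyspark.mllib.tree import RandomForest

# Fixing the seed makes the bootstrap samples and per-node feature
# subsets reproducible across runs.
model = RandomForest.trainClassifier(data, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=3, seed=42)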
(default: None) - :return: RandomForestModel that can be used for prediction + :return: + RandomForestModel that can be used for prediction. Example usage: @@ -544,9 +546,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Maximum number of bins used for splitting features. DecisionTree requires maxBins >= max categories (default: 32) - - :return: GradientBoostedTreesModel that can be used for - prediction + :return: + GradientBoostedTreesModel that can be used for prediction. Example usage: @@ -613,9 +614,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, Maximum number of bins used for splitting features. DecisionTree requires maxBins >= max categories (default: 32) - - :return: GradientBoostedTreesModel that can be used for - prediction + :return: + GradientBoostedTreesModel that can be used for prediction. Example usage: From 41d9d60ed6a85ea936b9b81cb3d78ced749cffcf Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 24 Feb 2016 14:15:07 -0800 Subject: [PATCH 5/9] [SPARK-12634] Fix Sphinx errors, cleanup, sync with Scala api --- docs/mllib-decision-tree.md | 6 +- .../spark/mllib/tree/DecisionTree.scala | 176 +++++++++--------- python/pyspark/mllib/tree.py | 119 ++++++------ 3 files changed, 157 insertions(+), 144 deletions(-) diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index a8612b6c84fe9..9af48357b3dfc 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -121,12 +121,12 @@ The parameters are listed below roughly in order of descending importance. New These parameters describe the problem you want to solve and your dataset. They should be specified and do not require tuning. -* **`algo`**: `Classification` or `Regression` +* **`algo`**: Type of decision tree, either `Classification` or `Regression`. -* **`numClasses`**: Number of classes (for `Classification` only) +* **`numClasses`**: Number of classes (for `Classification` only). * **`categoricalFeaturesInfo`**: Specifies which features are categorical and how many categorical values each of those features can take. This is given as a map from feature indices to feature arity (number of categories). Any features not in this map are treated as continuous. - * E.g., `Map(0 -> 2, 4 -> 10)` specifies that feature `0` is binary (taking values `0` or `1`) and that feature `4` has 10 categories (values `{0, 1, ..., 9}`). Note that feature indices are 0-based: features `0` and `4` are the 1st and 5th elements of an instance's feature vector. + * For example, `Map(0 -> 2, 4 -> 10)` specifies that feature `0` is binary (taking values `0` or `1`) and that feature `4` has 10 categories (values `{0, 1, ..., 9}`). Note that feature indices are 0-based: features `0` and `4` are the 1st and 5th elements of an instance's feature vector. * Note that you do not have to specify `categoricalFeaturesInfo`. The algorithm will still run and may get reasonable results. However, performance should be better if categorical features are properly designated. ### Stopping criteria diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 07ba0d8ccb2a8..72424f639ead2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -38,8 +38,9 @@ import org.apache.spark.util.random.XORShiftRandom /** * A class which implements a decision tree learning algorithm for classification and regression. 
* It supports both continuous and categorical features. - * @param strategy The configuration parameters for the tree algorithm which specify the type - * of algorithm (classification, regression, etc.), feature type (continuous, + * + * @param strategy The configuration parameters for the tree algorithm which specify the type of + * decision tree (classification, regression, etc.), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. */ @Since("1.0.0") @@ -50,8 +51,8 @@ class DecisionTree @Since("1.0.0") (private val strategy: Strategy) /** * Method to train a decision tree model over an RDD - * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] - * @return DecisionTreeModel that can be used for prediction + * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + * @return DecisionTreeModel that can be used for prediction. */ @Since("1.2.0") def run(input: RDD[LabeledPoint]): DecisionTreeModel = { @@ -76,10 +77,10 @@ object DecisionTree extends Serializable with Logging { * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * For classification, labels should take values {0, 1, ..., numClasses-1}. * For regression, labels are real numbers. - * @param strategy The configuration parameters for the tree algorithm which specify the type - * of algorithm (classification, regression, etc.), feature type (continuous, + * @param strategy The configuration parameters for the tree algorithm which specify the type of + * decision tree (classification, regression, etc.), feature type (continuous, * categorical), depth of the tree, quantile calculation strategy, etc. - * @return DecisionTreeModel that can be used for prediction + * @return DecisionTreeModel that can be used for prediction. */ @Since("1.0.0") def train(input: RDD[LabeledPoint], strategy: Strategy): DecisionTreeModel = { @@ -97,11 +98,11 @@ object DecisionTree extends Serializable with Logging { * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * For classification, labels should take values {0, 1, ..., numClasses-1}. * For regression, labels are real numbers. - * @param algo algorithm, classification or regression - * @param impurity impurity criterion used for information gain calculation - * @param maxDepth Maximum depth of the tree. - * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. - * @return DecisionTreeModel that can be used for prediction + * @param algo Type of decision tree, either classification or regression. + * @param impurity Criterion used for information gain calculation. + * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means + * 1 internal node + 2 leaf nodes). + * @return DecisionTreeModel that can be used for prediction. */ @Since("1.0.0") def train( @@ -124,12 +125,12 @@ object DecisionTree extends Serializable with Logging { * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * For classification, labels should take values {0, 1, ..., numClasses-1}. * For regression, labels are real numbers. - * @param algo algorithm, classification or regression - * @param impurity impurity criterion used for information gain calculation - * @param maxDepth Maximum depth of the tree. - * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. - * @param numClasses number of classes for classification. 
Default value of 2. - * @return DecisionTreeModel that can be used for prediction + * @param algo Type of decision tree, either classification or regression. + * @param impurity Criterion used for information gain calculation. + * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means + * 1 internal node + 2 leaf nodes). + * @param numClasses Number of classes for classification. Default value of 2. + * @return DecisionTreeModel that can be used for prediction. */ @Since("1.2.0") def train( @@ -153,17 +154,17 @@ object DecisionTree extends Serializable with Logging { * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * For classification, labels should take values {0, 1, ..., numClasses-1}. * For regression, labels are real numbers. - * @param algo classification or regression - * @param impurity criterion used for information gain calculation - * @param maxDepth Maximum depth of the tree. - * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. - * @param numClasses number of classes for classification. Default value of 2. - * @param maxBins maximum number of bins used for splitting features - * @param quantileCalculationStrategy algorithm for calculating quantiles - * @param categoricalFeaturesInfo Map storing arity of categorical features. - * E.g., an entry (n -> k) indicates that feature n is categorical - * with k categories indexed from 0: {0, 1, ..., k-1}. - * @return DecisionTreeModel that can be used for prediction + * @param algo Type of decision tree, either classification or regression. + * @param impurity Criterion used for information gain calculation. + * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means + * 1 internal node + 2 leaf nodes). + * @param numClasses Number of classes for classification. Default value of 2. + * @param maxBins Maximum number of bins used for splitting features. + * @param quantileCalculationStrategy Algorithm for calculating quantiles. + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * indicates that feature n is categorical with k categories + * indexed from 0: {0, 1, ..., k-1}. + * @return DecisionTreeModel that can be used for prediction. */ @Since("1.0.0") def train( @@ -185,18 +186,18 @@ object DecisionTree extends Serializable with Logging { * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels should take values {0, 1, ..., numClasses-1}. - * @param numClasses number of classes for classification. - * @param categoricalFeaturesInfo Map storing arity of categorical features. - * E.g., an entry (n -> k) indicates that feature n is categorical - * with k categories indexed from 0: {0, 1, ..., k-1}. + * @param numClasses Number of classes for classification. + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * indicates that feature n is categorical with k categories + * indexed from 0: {0, 1, ..., k-1}. * @param impurity Criterion used for information gain calculation. * Supported values: "gini" (recommended) or "entropy". - * @param maxDepth Maximum depth of the tree. - * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. 
- * (suggested value: 5) - * @param maxBins maximum number of bins used for splitting features - * (suggested value: 32) - * @return DecisionTreeModel that can be used for prediction + * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means + * 1 internal node + 2 leaf nodes). + * (suggested value: 5) + * @param maxBins Maximum number of bins used for splitting features. + * (suggested value: 32) + * @return DecisionTreeModel that can be used for prediction. */ @Since("1.1.0") def trainClassifier( @@ -232,17 +233,17 @@ object DecisionTree extends Serializable with Logging { * * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. * Labels are real numbers. - * @param categoricalFeaturesInfo Map storing arity of categorical features. - * E.g., an entry (n -> k) indicates that feature n is categorical - * with k categories indexed from 0: {0, 1, ..., k-1}. + * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k) + * indicates that feature n is categorical with k categories + * indexed from 0: {0, 1, ..., k-1}. * @param impurity Criterion used for information gain calculation. - * Supported values: "variance". - * @param maxDepth Maximum depth of the tree. - * E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. - * (suggested value: 5) - * @param maxBins maximum number of bins used for splitting features - * (suggested value: 32) - * @return DecisionTreeModel that can be used for prediction + * The only supported value for regression is "variance". + * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means + * 1 internal node + 2 leaf nodes). + * (suggested value: 5) + * @param maxBins Maximum number of bins used for splitting features. + * (suggested value: 32) + * @return DecisionTreeModel that can be used for prediction. */ @Since("1.1.0") def trainRegressor( @@ -277,7 +278,7 @@ object DecisionTree extends Serializable with Logging { * * @param node Node in tree from which to classify the given data point. * @param binnedFeatures Binned feature vector for data point. - * @param bins possible bins for all features, indexed (numFeatures)(numBins) + * @param bins Possible bins for all features, indexed (numFeatures)(numBins). * @param unorderedFeatures Set of indices of unordered features. * @return Leaf index if the data point reaches a leaf. * Otherwise, last node reachable in tree matching this example. @@ -333,12 +334,12 @@ object DecisionTree extends Serializable with Logging { * For unordered features, bins correspond to subsets of categories; either the left or right bin * for each subset is updated. * - * @param agg Array storing aggregate calculation, with a set of sufficient statistics for - * each (feature, bin). - * @param treePoint Data point being aggregated. - * @param splits possible splits indexed (numFeatures)(numSplits) - * @param unorderedFeatures Set of indices of unordered features. - * @param instanceWeight Weight (importance) of instance in dataset. + * @param agg Array storing aggregate calculation, with a set of sufficient statistics for + * each (feature, bin). + * @param treePoint Data point being aggregated. + * @param splits Possible splits indexed (numFeatures)(numSplits). + * @param unorderedFeatures Set of indices of unordered features. + * @param instanceWeight Weight (importance) of instance in dataset. 
*/ private def mixedBinSeqOp( agg: DTStatsAggregator, @@ -394,10 +395,10 @@ object DecisionTree extends Serializable with Logging { * * For each feature, the sufficient statistics of one bin are updated. * - * @param agg Array storing aggregate calculation, with a set of sufficient statistics for - * each (feature, bin). - * @param treePoint Data point being aggregated. - * @param instanceWeight Weight (importance) of instance in dataset. + * @param agg Array storing aggregate calculation, with a set of sufficient statistics for + * each (feature, bin). + * @param treePoint Data point being aggregated. + * @param instanceWeight Weight (importance) of instance in dataset. */ private def orderedBinSeqOp( agg: DTStatsAggregator, @@ -430,17 +431,17 @@ object DecisionTree extends Serializable with Logging { /** * Given a group of nodes, this finds the best split for each node. * - * @param input Training data: RDD of [[org.apache.spark.mllib.tree.impl.TreePoint]] - * @param metadata Learning and dataset metadata + * @param input Training data: RDD of [[org.apache.spark.mllib.tree.impl.TreePoint]]. + * @param metadata Learning and dataset metadata. * @param topNodes Root node for each tree. Used for matching instances with nodes. - * @param nodesForGroup Mapping: treeIndex --> nodes to be split in tree + * @param nodesForGroup Mapping: treeIndex --> nodes to be split in tree. * @param treeToNodeToIndexInfo Mapping: treeIndex --> nodeIndex --> nodeIndexInfo, * where nodeIndexInfo stores the index in the group and the * feature subsets (if using feature subsets). - * @param splits possible splits for all features, indexed (numFeatures)(numSplits) - * @param bins possible bins for all features, indexed (numFeatures)(numBins) - * @param nodeQueue Queue of nodes to split, with values (treeIndex, node). - * Updated with new non-leaf nodes which are created. + * @param splits Possible splits for all features, indexed (numFeatures)(numSplits). + * @param bins Possible bins for all features, indexed (numFeatures)(numBins). + * @param nodeQueue Queue of nodes to split, with values (treeIndex, node). + * Updated with new non-leaf nodes which are created. * @param nodeIdCache Node Id cache containing an RDD of Array[Int] where * each value in the array is the data point's node Id * for a corresponding tree. This is used to prevent the need @@ -527,10 +528,10 @@ object DecisionTree extends Serializable with Logging { * Each data point contributes to one node. For each feature, * the aggregate sufficient statistics are updated for the relevant bins. * - * @param agg Array storing aggregate calculation, with a set of sufficient statistics for - * each (node, feature, bin). - * @param baggedPoint Data point being aggregated. - * @return agg + * @param agg Array storing aggregate calculation, with a set of sufficient statistics for + * each (node, feature, bin). + * @param baggedPoint Data point being aggregated. + * @return Array of decision tree statistics. */ def binSeqOp( agg: Array[DTStatsAggregator], @@ -563,6 +564,7 @@ object DecisionTree extends Serializable with Logging { /** * Get node index in group --> features indices map, * which is a short cut to find feature indices for a node given node index in group + * * @param treeToNodeToIndexInfo * @return */ @@ -719,9 +721,10 @@ object DecisionTree extends Serializable with Logging { /** * Calculate the information gain for a given (feature, split) based upon left/right aggregates. 
- * @param leftImpurityCalculator left node aggregates for this (feature, split) - * @param rightImpurityCalculator right node aggregate for this (feature, split) - * @return information gain and statistics for split + * + * @param leftImpurityCalculator Left node aggregates for this (feature, split). + * @param rightImpurityCalculator Right node aggregate for this (feature, split). + * @return Information gain and statistics for split. */ private def calculateGainForSplit( leftImpurityCalculator: ImpurityCalculator, @@ -771,9 +774,10 @@ object DecisionTree extends Serializable with Logging { /** * Calculate predict value for current node, given stats of any split. * Note that this function is called only once for each node. - * @param leftImpurityCalculator left node aggregates for a split - * @param rightImpurityCalculator right node aggregates for a split - * @return predict value and impurity for current node + * + * @param leftImpurityCalculator Left node aggregates for a split. + * @param rightImpurityCalculator Right node aggregates for a split. + * @return Predict value and impurity for current node. */ private def calculatePredictImpurity( leftImpurityCalculator: ImpurityCalculator, @@ -788,8 +792,9 @@ object DecisionTree extends Serializable with Logging { /** * Find the best split for a node. + * * @param binAggregates Bin statistics. - * @return tuple for best split: (Split, information gain, prediction at node) + * @return Tuple for best split: (Split, information gain, prediction at node). */ private def binsToBestSplit( binAggregates: DTStatsAggregator, @@ -956,8 +961,8 @@ object DecisionTree extends Serializable with Logging { * and for multiclass classification with a high-arity feature, * there is one bin per category. * - * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]] - * @param metadata Learning and dataset metadata + * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]. + * @param metadata Learning and dataset metadata. * @return A tuple of (splits, bins). * Splits is an Array of [[org.apache.spark.mllib.tree.model.Split]] * of size (numFeatures, numSplits). @@ -1103,12 +1108,13 @@ object DecisionTree extends Serializable with Logging { * NOTE: Returned number of splits is set based on `featureSamples` and * could be different from the specified `numSplits`. * The `numSplits` attribute in the `DecisionTreeMetadata` class will be set accordingly. - * @param featureSamples feature values of each sample - * @param metadata decision tree metadata + * + * @param featureSamples Feature values of each sample. + * @param metadata Decision tree metadata. * NOTE: `metadata.numbins` will be changed accordingly - * if there are not enough splits to be found - * @param featureIndex feature index to find splits - * @return array of splits + * if there are not enough splits to be found. + * @param featureIndex Feature index to find splits. + * @return Array of splits. */ private[tree] def findSplitsForContinuousFeature( featureSamples: Array[Double], diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index d8d193335c385..4442acf664222 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -60,8 +60,7 @@ def numTrees(self): @since("1.3.0") def totalNumNodes(self): """ - Get total number of nodes, summed over all trees in the - ensemble. + Get total number of nodes, summed over all trees in the ensemble. 
""" return self.call("totalNumNodes") @@ -109,8 +108,9 @@ def numNodes(self): @since("1.1.0") def depth(self): - """Get depth of tree. - E.g.: Depth 0 means 1 leaf node. Depth 1 means 1 internal node and 2 leaf nodes. + """ + Get depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). """ return self._java_model.depth() @@ -156,29 +156,31 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, Train a DecisionTreeModel for classification. :param data: - Training data: RDD of LabeledPoint. Labels are integers - {0,1,...,numClasses}. + Training data: RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any - feature not in this map is treated as continuous. + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param impurity: - Supported values: "entropy" or "gini". + Criterion used for information gain calculation. + Supported values: "gini" or "entropy". (default: "gini") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 - means 1 internal node + 2 leaf nodes. + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). (default: 5) :param maxBins: Number of bins used for finding splits at each node. (default: 32) :param minInstancesPerNode: - Min number of instances required at child nodes to create the - parent split. + Minimum number of instances required at child nodes to create + the parent split. (default: 1) :param minInfoGain: - Min info gain required to create a split. + Minimum info gain required to create a split. (default: 0.0) :return: DecisionTreeModel. @@ -228,24 +230,26 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any - feature not in this map is treated as continuous. + Map storing arity of categorical features. An entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param impurity: - Supported values: "variance". + Criterion used for information gain calculation. + The only supported value for regression is "variance". (default: "variance") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 - means 1 internal node + 2 leaf nodes. + Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1 + means 1 internal node + 2 leaf nodes). (default: 5) :param maxBins: Number of bins used for finding splits at each node. (default: 32) :param minInstancesPerNode: - Min number of instances required at child nodes to create the - parent split. + Minimum number of instances required at child nodes to create + the parent split. (default: 1) :param minInfoGain: - Min info gain required to create a split. + Minimum info gain required to create a split. (default: 0.0) :return: DecisionTreeModel. @@ -328,34 +332,36 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, :param data: Training dataset: RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}.:param numClasses: Number of classes - for classification. + {0, 1, ..., numClasses-1}. 
+        :param numClasses:
+            Number of classes for classification.
         :param categoricalFeaturesInfo:
-            Map storing arity of categorical features. E.g., an entry (n ->
-            k) indicates that feature n is categorical with k categories
+            Map storing arity of categorical features. An entry (n -> k)
+            indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         :param numTrees:
             Number of trees in the random forest.
         :param featureSubsetStrategy:
             Number of features to consider for splits at each node.
-            Supported: "auto" (default), "all", "sqrt", "log2", "onethird".
+            Supported values: "auto", "all", "sqrt", "log2", "onethird".
             If "auto" is set, this parameter is set based on numTrees:
-            if numTrees == 1, set to "all";
-            if numTrees > 1 (forest) set to "sqrt".
+                if numTrees == 1, set to "all";
+                if numTrees > 1 (forest) set to "sqrt".
             (default: "auto")
         :param impurity:
-            Criterion used for information gain calculation. Supported
-            values: "gini" (recommended) or "entropy".
+            Criterion used for information gain calculation.
+            Supported values: "gini" or "entropy".
             (default: "gini")
         :param maxDepth:
-            Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
-            depth 1 means 1 internal node + 2 leaf nodes.
+            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+            means 1 internal node + 2 leaf nodes).
             (default: 4)
         :param maxBins:
             Maximum number of bins used for splitting features.
             (default: 32)
         :param seed:
             Random seed for bootstrapping and choosing feature subsets.
+            Set as None to generate seed based on system time.
             (default: None)
         :return: RandomForestModel that can be used for prediction.
@@ -417,31 +423,32 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt
         :param data:
             Training dataset: RDD of LabeledPoint. Labels are real numbers.
         :param categoricalFeaturesInfo:
-            Map storing arity of categorical features. E.g., an entry (n ->
-            k) indicates that feature n is categorical with k categories
+            Map storing arity of categorical features. An entry (n -> k)
+            indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         :param numTrees:
             Number of trees in the random forest.
         :param featureSubsetStrategy:
             Number of features to consider for splits at each node.
-            Supported: "auto", "all", "sqrt", "log2", "onethird".
+            Supported values: "auto", "all", "sqrt", "log2", "onethird".
             If "auto" is set, this parameter is set based on numTrees:
-            if numTrees == 1, set to "all";
-            if numTrees > 1 (forest) set to "onethird" for regression.
+                if numTrees == 1, set to "all";
+                if numTrees > 1 (forest) set to "onethird" for regression.
             (default: "auto")
         :param impurity:
-            Criterion used for information gain calculation. Supported
-            values: "variance".
+            Criterion used for information gain calculation.
+            The only supported value for regression is "variance".
             (default: "variance")
         :param maxDepth:
-            Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
-            depth 1 means 1 internal node + 2 leaf nodes.
+            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+            means 1 internal node + 2 leaf nodes).
             (default: 4)
         :param maxBins:
             Maximum number of bins used for splitting features.
             (default: 32)
         :param seed:
             Random seed for bootstrapping and choosing feature subsets.
+            Set as None to generate seed based on system time.
             (default: None)
         :return: RandomForestModel that can be used for prediction.
@@ -523,13 +530,13 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
             Training dataset: RDD of LabeledPoint.
             Labels should take values {0, 1}.
         :param categoricalFeaturesInfo:
-            Map storing arity of categorical features. E.g., an entry (n ->
-            k) indicates that feature n is categorical with k categories
+            Map storing arity of categorical features. An entry (n -> k)
+            indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         :param loss:
             Loss function used for minimization during gradient boosting.
-            Supported values: {"logLoss", "leastSquaresError",
-            "leastAbsoluteError"}.
+            Supported values: "logLoss", "leastSquaresError",
+            "leastAbsoluteError".
             (default: "logLoss")
         :param numIterations:
             Number of iterations of boosting.
@@ -539,12 +546,12 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
             The learning rate should be in the interval (0, 1].
             (default: 0.1)
         :param maxDepth:
-            Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
-            depth 1 means 1 internal node + 2 leaf nodes.
+            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+            means 1 internal node + 2 leaf nodes).
             (default: 3)
         :param maxBins:
             Maximum number of bins used for splitting features. DecisionTree
-            requires maxBins >= max categories
+            requires maxBins >= max categories.
             (default: 32)
         :return: GradientBoostedTreesModel that can be used for prediction.
@@ -591,13 +598,13 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
         :param data:
             Training dataset: RDD of LabeledPoint. Labels are real numbers.
         :param categoricalFeaturesInfo:
-            Map storing arity of categorical features. E.g., an entry (n ->
-            k) indicates that feature n is categorical with k categories
+            Map storing arity of categorical features. An entry (n -> k)
+            indicates that feature n is categorical with k categories
             indexed from 0: {0, 1, ..., k-1}.
         :param loss:
             Loss function used for minimization during gradient boosting.
-            Supported values: {"logLoss", "leastSquaresError",
-            "leastAbsoluteError"}.
+            Supported values: "logLoss", "leastSquaresError",
+            "leastAbsoluteError".
             (default: "logLoss")
         :param numIterations:
             Number of iterations of boosting.
@@ -607,12 +614,12 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
             The learning rate should be in the interval (0, 1].
             (default: 0.1)
         :param maxDepth:
-            Maximum depth of the tree. E.g., depth 0 means 1 leaf node;
-            depth 1 means 1 internal node + 2 leaf nodes.
+            Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1
+            means 1 internal node + 2 leaf nodes).
             (default: 3)
         :param maxBins:
             Maximum number of bins used for splitting features. DecisionTree
-            requires maxBins >= max categories
+            requires maxBins >= max categories.
             (default: 32)
         :return: GradientBoostedTreesModel that can be used for prediction.

From fcaa7c10679467d9d4694585177f13ece85ca6fa Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Thu, 25 Feb 2016 12:20:13 -0800
Subject: [PATCH 6/9] Fixed incorrect default value, cleaned up wording

---
 python/pyspark/mllib/tree.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index 4442acf664222..f7ea466b43291 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -153,7 +153,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo,
                         impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
                         minInfoGain=0.0):
         """
-        Train a DecisionTreeModel for classification.
+        Train a decision tree model for classification.

         :param data:
             Training data: RDD of LabeledPoint. Labels should take values
@@ -225,7 +225,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
                        impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1,
                        minInfoGain=0.0):
         """
-        Train a DecisionTreeModel for regression.
+        Train a decision tree model for regression.

         :param data:
             Training data: RDD of LabeledPoint. Labels are real numbers.
@@ -327,7 +327,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees,
                         featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32,
                         seed=None):
         """
-        Method to train a decision tree model for binary or multiclass
+        Train a random forest model for binary or multiclass
         classification.

         :param data:
@@ -418,7 +418,7 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees,
     def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto",
                        impurity="variance", maxDepth=4, maxBins=32, seed=None):
         """
-        Method to train a decision tree model for regression.
+        Train a random forest model for regression.

         :param data:
             Training dataset: RDD of LabeledPoint. Labels are real numbers.
@@ -523,8 +523,7 @@ def trainClassifier(cls, data, categoricalFeaturesInfo,
                         loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
                         maxBins=32):
         """
-        Method to train a gradient-boosted trees model for
-        classification.
+        Train a gradient-boosted trees model for classification.

         :param data:
             Training dataset: RDD of LabeledPoint. Labels should take values
@@ -593,7 +592,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
                        loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
                        maxBins=32):
         """
-        Method to train a gradient-boosted trees model for regression.
+        Train a gradient-boosted trees model for regression.

         :param data:
             Training dataset: RDD of LabeledPoint. Labels are real numbers.
@@ -605,7 +604,7 @@ def trainRegressor(cls, data, categoricalFeaturesInfo,
             Loss function used for minimization during gradient boosting.
             Supported values: "logLoss", "leastSquaresError",
             "leastAbsoluteError".
-            (default: "logLoss")
+            (default: "leastSquaresError")
         :param numIterations:
             Number of iterations of boosting.
             (default: 100)

From 28b24503820c64766920031a13f6769020982ba8 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Thu, 25 Feb 2016 12:36:56 -0800
Subject: [PATCH 7/9] synced param desc with RandomForest and GBTs

---
 .../mllib/tree/GradientBoostedTrees.scala     | 18 ++---
 .../spark/mllib/tree/RandomForest.scala       | 69 ++++++++++---------
 2 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index 1b71256c585bd..d131f5da6c7eb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -54,8 +54,9 @@ class GradientBoostedTrees @Since("1.2.0") (private val boostingStrategy: Boosti

   /**
    * Method to train a gradient boosting model
+   *
    * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
-   * @return a gradient boosted trees model that can be used for prediction
+   * @return GradientBoostedTreesModel that can be used for prediction.
   */
  @Since("1.2.0")
  def run(input: RDD[LabeledPoint]): GradientBoostedTreesModel = {
@@ -82,13 +83,14 @@ class GradientBoostedTrees @Since("1.2.0") (private val boostingStrategy: Boosti

   /**
    * Method to validate a gradient boosting model
+   *
    * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
    * @param validationInput Validation dataset.
    *                        This dataset should be different from the training dataset,
    *                        but it should follow the same distribution.
    *                        E.g., these two datasets could be created from an original dataset
    *                        by using [[org.apache.spark.rdd.RDD.randomSplit()]]
-   * @return a gradient boosted trees model that can be used for prediction
+   * @return GradientBoostedTreesModel that can be used for prediction.
    */
   @Since("1.4.0")
   def runWithValidation(
@@ -132,7 +134,7 @@ object GradientBoostedTrees extends Logging {
    *              For classification, labels should take values {0, 1, ..., numClasses-1}.
    *              For regression, labels are real numbers.
    * @param boostingStrategy Configuration options for the boosting algorithm.
-   * @return a gradient boosted trees model that can be used for prediction
+   * @return GradientBoostedTreesModel that can be used for prediction.
    */
   @Since("1.2.0")
   def train(
@@ -153,11 +155,11 @@ object GradientBoostedTrees extends Logging {

   /**
    * Internal method for performing regression using trees as base learners.
-   * @param input training dataset
-   * @param validationInput validation dataset, ignored if validate is set to false.
-   * @param boostingStrategy boosting parameters
-   * @param validate whether or not to use the validation dataset.
-   * @return a gradient boosted trees model that can be used for prediction
+   * @param input Training dataset.
+   * @param validationInput Validation dataset, ignored if validate is set to false.
+   * @param boostingStrategy Boosting parameters.
+   * @param validate Whether or not to use the validation dataset.
+   * @return GradientBoostedTreesModel that can be used for prediction.
    */
   private def boost(
       input: RDD[LabeledPoint],
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 570a76f960796..37635d3b12a6c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -53,12 +53,12 @@ import org.apache.spark.util.random.SamplingUtils
 *        random forests]]
 *
 * @param strategy The configuration parameters for the random forest algorithm which specify
-*                 the type of algorithm (classification, regression, etc.), feature type
+*                 the type of random forest (classification, regression, etc.), feature type
 *                 (continuous, categorical), depth of the tree, quantile calculation strategy,
 *                 etc.
 * @param numTrees If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
 * @param featureSubsetStrategy Number of features to consider for splits at each node.
-*                              Supported: "auto", "all", "sqrt", "log2", "onethird".
+*                              Supported values: "auto", "all", "sqrt", "log2", "onethird".
 *                              If "auto" is set, this parameter is set based on numTrees:
 *                                if numTrees == 1, set to "all";
 *                                if numTrees > 1 (forest) set to "sqrt" for classification and
@@ -121,8 +121,9 @@ private class RandomForest (

   /**
    * Method to train a decision tree model over an RDD
-   * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]]
-   * @return a random forest model that can be used for prediction
+   *
+   * @param input Training data: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
+   * @return RandomForestModel that can be used for prediction.
    */
   def run(input: RDD[LabeledPoint]): RandomForestModel = {
@@ -269,12 +270,12 @@ object RandomForest extends Serializable with Logging {
   * @param strategy Parameters for training each tree in the forest.
   * @param numTrees Number of trees in the random forest.
   * @param featureSubsetStrategy Number of features to consider for splits at each node.
-  *                              Supported: "auto", "all", "sqrt", "log2", "onethird".
+  *                              Supported values: "auto", "all", "sqrt", "log2", "onethird".
   *                              If "auto" is set, this parameter is set based on numTrees:
   *                                if numTrees == 1, set to "all";
   *                                if numTrees > 1 (forest) set to "sqrt".
-  * @param seed Random seed for bootstrapping and choosing feature subsets.
-  * @return a random forest model that can be used for prediction
+  * @param seed  Random seed for bootstrapping and choosing feature subsets.
+  * @return RandomForestModel that can be used for prediction.
   */
  @Since("1.2.0")
  def trainClassifier(
@@ -294,25 +295,25 @@ object RandomForest extends Serializable with Logging {
   *
   * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
   *              Labels should take values {0, 1, ..., numClasses-1}.
-  * @param numClasses number of classes for classification.
-  * @param categoricalFeaturesInfo Map storing arity of categorical features.
-  *                                E.g., an entry (n -> k) indicates that feature n is categorical
-  *                                with k categories indexed from 0: {0, 1, ..., k-1}.
+  * @param numClasses Number of classes for classification.
+  * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+  *                                indicates that feature n is categorical with k categories
+  *                                indexed from 0: {0, 1, ..., k-1}.
   * @param numTrees Number of trees in the random forest.
   * @param featureSubsetStrategy Number of features to consider for splits at each node.
-  *                              Supported: "auto", "all", "sqrt", "log2", "onethird".
+  *                              Supported values: "auto", "all", "sqrt", "log2", "onethird".
   *                              If "auto" is set, this parameter is set based on numTrees:
   *                                if numTrees == 1, set to "all";
   *                                if numTrees > 1 (forest) set to "sqrt".
   * @param impurity Criterion used for information gain calculation.
   *                 Supported values: "gini" (recommended) or "entropy".
-  * @param maxDepth Maximum depth of the tree.
-  *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
-  *                 (suggested value: 4)
-  * @param maxBins maximum number of bins used for splitting features
-  *                (suggested value: 100)
-  * @param seed Random seed for bootstrapping and choosing feature subsets.
-  * @return a random forest model that can be used for prediction
+  * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node; depth 1 means
+  *                 1 internal node + 2 leaf nodes).
+  *                 (suggested value: 4)
+  * @param maxBins Maximum number of bins used for splitting features
+  *                (suggested value: 100)
+  * @param seed Random seed for bootstrapping and choosing feature subsets.
+  * @return RandomForestModel that can be used for prediction.
   */
  @Since("1.2.0")
  def trainClassifier(
@@ -358,12 +359,12 @@ object RandomForest extends Serializable with Logging {
   * @param strategy Parameters for training each tree in the forest.
   * @param numTrees Number of trees in the random forest.
   * @param featureSubsetStrategy Number of features to consider for splits at each node.
-  *                              Supported: "auto", "all", "sqrt", "log2", "onethird".
+  *                              Supported values: "auto", "all", "sqrt", "log2", "onethird".
   *                              If "auto" is set, this parameter is set based on numTrees:
   *                                if numTrees == 1, set to "all";
   *                                if numTrees > 1 (forest) set to "onethird".
-  * @param seed Random seed for bootstrapping and choosing feature subsets.
-  * @return a random forest model that can be used for prediction
+  * @param seed  Random seed for bootstrapping and choosing feature subsets.
+  * @return RandomForestModel that can be used for prediction.
   */
  @Since("1.2.0")
  def trainRegressor(
@@ -383,24 +384,24 @@ object RandomForest extends Serializable with Logging {
   *
   * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
   *              Labels are real numbers.
-  * @param categoricalFeaturesInfo Map storing arity of categorical features.
-  *                                E.g., an entry (n -> k) indicates that feature n is categorical
-  *                                with k categories indexed from 0: {0, 1, ..., k-1}.
+  * @param categoricalFeaturesInfo Map storing arity of categorical features. An entry (n -> k)
+  *                                indicates that feature n is categorical with k categories
+  *                                indexed from 0: {0, 1, ..., k-1}.
   * @param numTrees Number of trees in the random forest.
   * @param featureSubsetStrategy Number of features to consider for splits at each node.
-  *                              Supported: "auto", "all", "sqrt", "log2", "onethird".
+  *                              Supported values: "auto", "all", "sqrt", "log2", "onethird".
   *                              If "auto" is set, this parameter is set based on numTrees:
   *                                if numTrees == 1, set to "all";
   *                                if numTrees > 1 (forest) set to "onethird".
   * @param impurity Criterion used for information gain calculation.
-  *                 Supported values: "variance".
-  * @param maxDepth Maximum depth of the tree.
-  *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
-  *                 (suggested value: 4)
-  * @param maxBins maximum number of bins used for splitting features
-  *                (suggested value: 100)
-  * @param seed Random seed for bootstrapping and choosing feature subsets.
-  * @return a random forest model that can be used for prediction
+  *                 The only supported value for regression is "variance".
+  * @param maxDepth Maximum depth of the tree. (e.g., depth 0 means 1 leaf node; depth 1 means
+  *                 1 internal node + 2 leaf nodes).
+  *                 (suggested value: 4)
+  * @param maxBins Maximum number of bins used for splitting features.
+  *                (suggested value: 100)
+  * @param seed Random seed for bootstrapping and choosing feature subsets.
+  * @return RandomForestModel that can be used for prediction.
   */
  @Since("1.2.0")
  def trainRegressor(

From 3f034698ed50ed325ee0a01037fdb461ea69ba81 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Thu, 25 Feb 2016 12:40:53 -0800
Subject: [PATCH 8/9] got rid of etc when specifying decision tree learner type

---
 .../scala/org/apache/spark/mllib/tree/DecisionTree.scala | 8 ++++----
 .../scala/org/apache/spark/mllib/tree/RandomForest.scala | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index 72424f639ead2..188d2e53d0111 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -39,8 +39,8 @@ import org.apache.spark.util.random.XORShiftRandom
 * A class which implements a decision tree learning algorithm for classification and regression.
 * It supports both continuous and categorical features.
 *
- * @param strategy The configuration parameters for the tree algorithm which specify the type of
- *                 decision tree (classification, regression, etc.), feature type (continuous,
+ * @param strategy The configuration parameters for the tree algorithm which specify the type
+ *                 of decision tree (classification or regression), feature type (continuous,
 *                 categorical), depth of the tree, quantile calculation strategy, etc.
 */
@Since("1.0.0")
@@ -77,8 +77,8 @@ object DecisionTree extends Serializable with Logging {
 * @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
 *              For classification, labels should take values {0, 1, ..., numClasses-1}.
 *              For regression, labels are real numbers.
- * @param strategy The configuration parameters for the tree algorithm which specify the type of
- *                 decision tree (classification, regression, etc.), feature type (continuous,
+ * @param strategy The configuration parameters for the tree algorithm which specify the type
+ *                 of decision tree (classification or regression), feature type (continuous,
 *                 categorical), depth of the tree, quantile calculation strategy, etc.
 * @return DecisionTreeModel that can be used for prediction.
 */
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 37635d3b12a6c..794f2c9074239 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -53,7 +53,7 @@ import org.apache.spark.util.random.SamplingUtils
 *        random forests]]
 *
 * @param strategy The configuration parameters for the random forest algorithm which specify
- *                 the type of random forest (classification, regression, etc.), feature type
+ *                 the type of random forest (classification or regression), feature type
 *                 (continuous, categorical), depth of the tree, quantile calculation strategy,
 *                 etc.
 * @param numTrees If 1, then no bootstrapping is used. If > 1, then bootstrapping is done.
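For reference, here is a minimal pyspark sketch of the three training entry points whose
parameter docs the patches above revise. The toy dataset, app name, and chosen seed are
illustrative only; the defaults noted in the comments are the ones the revised docstrings
document:

    from pyspark import SparkContext
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.tree import DecisionTree, GradientBoostedTrees, RandomForest

    sc = SparkContext(appName="TreeParamDocsExample")  # illustrative app name

    # Toy binary-classification data: label in {0, 1}, two continuous features.
    data = sc.parallelize([
        LabeledPoint(0.0, [0.0, 1.0]),
        LabeledPoint(0.0, [1.0, 1.0]),
        LabeledPoint(1.0, [2.0, 0.0]),
        LabeledPoint(1.0, [3.0, 0.0]),
    ])

    # Decision tree, relying on the documented defaults: impurity="gini",
    # maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0.
    dt = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={})

    # Random forest: featureSubsetStrategy="auto" resolves to "all" when
    # numTrees == 1 and to "sqrt" for a classification forest; seed=None
    # would instead generate a seed based on system time.
    rf = RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={},
                                      numTrees=3, seed=42)

    # Gradient-boosted trees: the default loss is "logLoss" for classification;
    # for regression it is "leastSquaresError", as PATCH 6 corrects.
    gbt = GradientBoostedTrees.trainClassifier(data, categoricalFeaturesInfo={},
                                               numIterations=10)

    print(dt.predict([2.5, 0.0]), rf.predict([2.5, 0.0]), gbt.predict([2.5, 0.0]))
    sc.stop()

Each call returns the model type named in the corresponding :return: description
(DecisionTreeModel, RandomForestModel, GradientBoostedTreesModel), and each model's
predict accepts either a single feature vector or an RDD of feature vectors.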
From 3926db507826bedd9bbb830b984459980fb68212 Mon Sep 17 00:00:00 2001
From: Bryan Cutler
Date: Thu, 25 Feb 2016 14:24:02 -0800
Subject: [PATCH 9/9] synced descriptions in decision tree Strategy params

---
 .../org/apache/spark/mllib/tree/RandomForest.scala |  4 ++--
 .../spark/mllib/tree/configuration/Strategy.scala  | 13 ++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
index 794f2c9074239..b7714b382a594 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/RandomForest.scala
@@ -307,7 +307,7 @@ object RandomForest extends Serializable with Logging {
 *                                  if numTrees > 1 (forest) set to "sqrt".
 * @param impurity Criterion used for information gain calculation.
 *                 Supported values: "gini" (recommended) or "entropy".
- * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node; depth 1 means
+ * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means
 *                 1 internal node + 2 leaf nodes).
 *                 (suggested value: 4)
 * @param maxBins Maximum number of bins used for splitting features
@@ -395,7 +395,7 @@ object RandomForest extends Serializable with Logging {
 *                                  if numTrees > 1 (forest) set to "onethird".
 * @param impurity Criterion used for information gain calculation.
 *                 The only supported value for regression is "variance".
- * @param maxDepth Maximum depth of the tree. (e.g., depth 0 means 1 leaf node; depth 1 means
+ * @param maxDepth Maximum depth of the tree. (e.g., depth 0 means 1 leaf node, depth 1 means
 *                 1 internal node + 2 leaf nodes).
 *                 (suggested value: 4)
 * @param maxBins Maximum number of bins used for splitting features.
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
index 6c04403f1ad75..9e3e50192d507 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
@@ -34,8 +34,8 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance}
 *                 Supported for Classification: [[org.apache.spark.mllib.tree.impurity.Gini]],
 *                  [[org.apache.spark.mllib.tree.impurity.Entropy]].
 *                 Supported for Regression: [[org.apache.spark.mllib.tree.impurity.Variance]].
- * @param maxDepth Maximum depth of the tree.
- *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
+ * @param maxDepth Maximum depth of the tree (e.g. depth 0 means 1 leaf node, depth 1 means
+ *                 1 internal node + 2 leaf nodes).
 * @param numClasses Number of classes for classification.
 *                                    (Ignored for regression.)
 *                                    Default value is 2 (binary classification).
@@ -45,10 +45,9 @@ import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity, Variance}
 * @param quantileCalculationStrategy Algorithm for calculating quantiles. Supported:
 *                                    [[org.apache.spark.mllib.tree.configuration.QuantileStrategy.Sort]]
 * @param categoricalFeaturesInfo A map storing information about the categorical variables and the
-*                                number of discrete values they take. For example, an entry (n ->
-*                                k) implies the feature n is categorical with k categories 0,
-*                                1, 2, ... , k-1. It's important to note that features are
-*                                zero-indexed.
+*                                number of discrete values they take. An entry (n -> k)
+*                                indicates that feature n is categorical with k categories
+*                                indexed from 0: {0, 1, ..., k-1}.
 * @param minInstancesPerNode Minimum number of instances each child must have after split.
 *                            Default value is 1. If a split causes left or right child
 *                            to have less than minInstancesPerNode,
 *                            this split will not be considered as a valid split.
 * @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. Default value is
 *                      256 MB.
 * @param subsamplingRate Fraction of the training data used for learning decision tree.
 * @param useNodeIdCache If this is true, instead of passing trees to executors, the algorithm will
-*                      maintain a separate RDD of node Id cache for each row.
+*                       maintain a separate RDD of node Id cache for each row.
 * @param checkpointInterval How often to checkpoint when the node Id cache gets updated.
 *                           E.g. 10 means that the cache will get checkpointed every 10 updates. If
 *                           the checkpoint directory is not set in