From 7daf4adf08460b2bfa4477a2ebcd75202fb0f288 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Tue, 5 Jan 2016 12:36:19 +0100 Subject: [PATCH 1/4] [SPARK-12634][DOC] Update param descriptions Updates the `param` descriptions to be consistent. See [SPARK-11219] for more details. --- python/pyspark/mllib/tree.py | 285 ++++++++++++++++++++--------------- 1 file changed, 165 insertions(+), 120 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 0001b60093a6..954717289cac 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -92,8 +92,8 @@ def predict(self, x): transformation or action. Call predict directly on the RDD instead. - :param x: Data point (feature vector), - or an RDD of data points (feature vectors). + :param x: + Data point (feature vector), or an RDD of data points (feature vectors). """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -154,21 +154,32 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, """ Train a DecisionTreeModel for classification. - :param data: Training data: RDD of LabeledPoint. - Labels are integers {0,1,...,numClasses}. - :param numClasses: Number of classes for classification. - :param categoricalFeaturesInfo: Map from categorical feature index - to number of categories. - Any feature not in this map - is treated as continuous. - :param impurity: Supported values: "entropy" or "gini" - :param maxDepth: Max depth of tree. - E.g., depth 0 means 1 leaf node. - Depth 1 means 1 internal node + 2 leaf nodes. - :param maxBins: Number of bins used for finding splits at each node. - :param minInstancesPerNode: Min number of instances required at child - nodes to create the parent split - :param minInfoGain: Min info gain required to create a split + :param data: + Training data: RDD of LabeledPoint. Labels are integers + {0,1,...,numClasses}. + :param numClasses: + Number of classes for classification. 
+ :param categoricalFeaturesInfo: + Map from categorical feature index to number of categories. + Any feature not in this map is treated as continuous. + :param impurity: + Supported values: "entropy" or "gini". + (default: "gini") + :param maxDepth: + Max depth of tree. E.g., depth 0 means 1 leaf node. + Depth 1 means 1 internal node + 2 leaf nodes. + (default: 5) + :param maxBins: + Number of bins used for finding splits at each node. + (default: 32) + :param minInstancesPerNode: + Min number of instances required at child nodes to create + the parent split. + (default: 1) + :param minInfoGain: + Min info gain required to create a split. + (default: 0.0) + :return: DecisionTreeModel Example usage: @@ -213,20 +224,28 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, """ Train a DecisionTreeModel for regression. - :param data: Training data: RDD of LabeledPoint. - Labels are real numbers. - :param categoricalFeaturesInfo: Map from categorical feature - index to number of categories. - Any feature not in this map is treated as continuous. - :param impurity: Supported values: "variance" - :param maxDepth: Max depth of tree. - E.g., depth 0 means 1 leaf node. - Depth 1 means 1 internal node + 2 leaf nodes. - :param maxBins: Number of bins used for finding splits at each - node. - :param minInstancesPerNode: Min number of instances required at - child nodes to create the parent split - :param minInfoGain: Min info gain required to create a split + :param data: + Training data: RDD of LabeledPoint. Labels are real numbers. + :param categoricalFeaturesInfo: + Map from categorical feature index to number of categories. Any + feature not in this map is treated as continuous. + :param impurity: + Supported values: "variance" + (default: "variance") + :param maxDepth: + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means + 1 internal node + 2 leaf nodes. + (default: 5) + :param maxBins: + Number of bins used for finding splits at each node. 
+ (default: 32) + :param minInstancesPerNode: + Min number of instances required at child nodes to create the + parent split. + (default: 1) + :param minInfoGain: + Min info gain required to create a split. + (default: 0.0) :return: DecisionTreeModel Example usage: @@ -305,30 +324,39 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Method to train a decision tree model for binary or multiclass classification. - :param data: Training dataset: RDD of LabeledPoint. Labels - should take values {0, 1, ..., numClasses-1}. - :param numClasses: number of classes for classification. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that - feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. - :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for - splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt". - :param impurity: Criterion used for information gain calculation. - Supported values: "gini" (recommended) or "entropy". - :param maxDepth: Maximum depth of the tree. - E.g., depth 0 means 1 leaf node; depth 1 means - 1 internal node + 2 leaf nodes. (default: 4) - :param maxBins: maximum number of bins used for splitting - features - (default: 32) - :param seed: Random seed for bootstrapping and choosing feature - subsets. + :param data: Training dataset: + RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + :param numClasses: + Number of classes for classification. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. 
+ :param numTrees: + Number of trees in the random forest. + :param featureSubsetStrategy: + Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt". + (default: "auto") + :param impurity: + Criterion used for information gain calculation. + Supported values: "gini" (recommended) or "entropy". + (default: "gini") + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. + (default: 4) + :param maxBins: + Maximum number of bins used for splitting features + (default: 32) + :param seed: + Random seed for bootstrapping and choosing feature subsets. + (default: None) + :return: RandomForestModel that can be used for prediction Example usage: @@ -385,29 +413,35 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt """ Method to train a decision tree model for regression. - :param data: Training dataset: RDD of LabeledPoint. Labels are - real numbers. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for - splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "onethird" for regression. - :param impurity: Criterion used for information gain - calculation. - Supported values: "variance". - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. 
(default: 4) - :param maxBins: maximum number of bins used for splitting - features (default: 32) - :param seed: Random seed for bootstrapping and choosing feature - subsets. + :param data: + Training dataset: RDD of LabeledPoint. Labels are real numbers. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param numTrees: + Number of trees in the random forest. + :param featureSubsetStrategy: + Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "onethird" for regression. + (default: "auto") + :param impurity: + Criterion used for information gain calculation. + Supported values: "variance". + (default: "variance") + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 + means 1 internal node + 2 leaf nodes. + (default: 4) + :param maxBins: + Maximum number of bins used for splitting features. + (default: 32) + :param seed: + Random seed for bootstrapping and choosing feature subsets. + (default: None) :return: RandomForestModel that can be used for prediction Example usage: @@ -483,28 +517,33 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Method to train a gradient-boosted trees model for classification. - :param data: Training dataset: RDD of LabeledPoint. - Labels should take values {0, 1}. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param loss: Loss function used for minimization during gradient - boosting. Supported: {"logLoss" (default), - "leastSquaresError", "leastAbsoluteError"}. 
- :param numIterations: Number of iterations of boosting. - (default: 100) - :param learningRate: Learning rate for shrinking the - contribution of each estimator. The learning rate - should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. (default: 3) - :param maxBins: maximum number of bins used for splitting - features (default: 32) DecisionTree requires maxBins >= max categories + :param data: + Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param loss: + Loss function used for minimization during gradient boosting. + Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + (default: "logLoss") + :param numIterations: + Number of iterations of boosting. + (default: 100) + :param learningRate: + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. + (default: 3) + :param maxBins: + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories + (default: 32) + :return: GradientBoostedTreesModel that can be used for - prediction + prediction Example usage: @@ -545,28 +584,34 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, """ Method to train a gradient-boosted trees model for regression. - :param data: Training dataset: RDD of LabeledPoint. Labels are - real numbers. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. 
E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param loss: Loss function used for minimization during gradient - boosting. Supported: {"logLoss" (default), - "leastSquaresError", "leastAbsoluteError"}. - :param numIterations: Number of iterations of boosting. - (default: 100) - :param learningRate: Learning rate for shrinking the - contribution of each estimator. The learning rate - should be between in the interval (0, 1]. - (default: 0.1) - :param maxBins: maximum number of bins used for splitting - features (default: 32) DecisionTree requires maxBins >= max categories - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. (default: 3) + :param data: + Training dataset: RDD of LabeledPoint. Labels are real numbers. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param loss: + Loss function used for minimization during gradient boosting. + Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + (default: "logLoss") + :param numIterations: + Number of iterations of boosting. + (default: 100) + :param learningRate: + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 + means 1 internal node + 2 leaf nodes. + (default: 3) + :param maxBins: + Maximum number of bins used for splitting features. 
+ DecisionTree requires maxBins >= max categories + (default: 32) + :return: GradientBoostedTreesModel that can be used for - prediction + prediction Example usage: From a5346e266c211995c0d0306c278d63b4bf9a8781 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Wed, 6 Jan 2016 11:41:45 +0100 Subject: [PATCH 2/4] Style Fixes - Update fill-column to 100 on parameter descriptions. --- python/pyspark/mllib/tree.py | 96 +++++++++++++++++------------------- 1 file changed, 44 insertions(+), 52 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 954717289cac..5e7d8a0e62ec 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -155,31 +155,28 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, Train a DecisionTreeModel for classification. :param data: - Training data: RDD of LabeledPoint. Labels are integers - {0,1,...,numClasses}. + Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. - Any feature not in this map is treated as continuous. + Map from categorical feature index to number of categories. Any feature not in this map is + treated as continuous. :param impurity: Supported values: "entropy" or "gini". (default: "gini") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. - Depth 1 means 1 internal node + 2 leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 + leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. (default: 32) :param minInstancesPerNode: - Min number of instances required at child nodes to create - the parent split. + Min number of instances required at child nodes to create the parent split. (default: 1) :param minInfoGain: Min info gain required to create a split. 
(default: 0.0) - :return: DecisionTreeModel Example usage: @@ -227,14 +224,14 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any - feature not in this map is treated as continuous. + Map from categorical feature index to number of categories. Any feature not in this map is + treated as continuous. :param impurity: - Supported values: "variance" + Supported values: "variance". (default: "variance") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means - 1 internal node + 2 leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 + leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. @@ -325,33 +322,31 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, classification. :param data: Training dataset: - RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. + RDD of LabeledPoint. Labels should take values {0, 1, ..., numClasses-1}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + Number of features to consider for splits at each node. Supported: "auto" (default), "all", + "sqrt", "log2", "onethird". 
If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt". (default: "auto") :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" (recommended) or "entropy". + Criterion used for information gain calculation. Supported values: "gini" (recommended) or + "entropy". (default: "gini") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; - depth 1 means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 4) :param maxBins: - Maximum number of bins used for splitting features + Maximum number of bins used for splitting features. (default: 32) :param seed: Random seed for bootstrapping and choosing feature subsets. @@ -416,25 +411,23 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + Number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", + "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "onethird" for regression. (default: "auto") :param impurity: - Criterion used for information gain calculation. - Supported values: "variance". 
+ Criterion used for information gain calculation. Supported values: "variance". (default: "variance") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 - means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -520,9 +513,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. @@ -531,15 +523,16 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. The learning rate should + be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories + Maximum number of bins used for splitting features. 
DecisionTree requires + maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for @@ -587,9 +580,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that + feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. @@ -598,16 +590,16 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. The learning rate should + be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 - means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal + node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. - DecisionTree requires maxBins >= max categories + Maximum number of bins used for splitting features. 
DecisionTree requires + maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for From 9337098f54d5b015163c86c319c81aebe124797f Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Fri, 22 Jan 2016 15:16:24 +0100 Subject: [PATCH 3/4] Limit parameter descriptions to 74th column --- python/pyspark/mllib/tree.py | 104 +++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 46 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 5e7d8a0e62ec..f1d16561ec81 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -93,7 +93,8 @@ def predict(self, x): Call predict directly on the RDD instead. :param x: - Data point (feature vector), or an RDD of data points (feature vectors). + Data point (feature vector), or an RDD of data points (feature + vectors). """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -155,24 +156,26 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, Train a DecisionTreeModel for classification. :param data: - Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. + Training data: RDD of LabeledPoint. Labels are integers + {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any feature not in this map is - treated as continuous. + Map from categorical feature index to number of categories. Any + feature not in this map is treated as continuous. :param impurity: Supported values: "entropy" or "gini". (default: "gini") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 - leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 + means 1 internal node + 2 leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. 
(default: 32) :param minInstancesPerNode: - Min number of instances required at child nodes to create the parent split. + Min number of instances required at child nodes to create the + parent split. (default: 1) :param minInfoGain: Min info gain required to create a split. @@ -224,14 +227,14 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any feature not in this map is - treated as continuous. + Map from categorical feature index to number of categories. Any + feature not in this map is treated as continuous. :param impurity: Supported values: "variance". (default: "variance") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 - leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 + means 1 internal node + 2 leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. @@ -322,28 +325,30 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, classification. :param data: Training dataset: - RDD of LabeledPoint. Labels should take values {0, 1, ..., numClasses-1}. + RDD of LabeledPoint. Labels should take values {0, 1, ..., + numClasses-1}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. Supported: "auto" (default), "all", - "sqrt", "log2", "onethird". 
+ Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt". (default: "auto") :param impurity: - Criterion used for information gain calculation. Supported values: "gini" (recommended) or - "entropy". + Criterion used for information gain calculation. Supported + values: "gini" (recommended) or "entropy". (default: "gini") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + - 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -411,23 +416,25 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", - "log2", "onethird". + Number of features to consider for splits at each node. + Supported: "auto", "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "onethird" for regression. (default: "auto") :param impurity: - Criterion used for information gain calculation. Supported values: "variance". 
+ Criterion used for information gain calculation. Supported + values: "variance". (default: "variance") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + - 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -511,28 +518,31 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, classification. :param data: - Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1}. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. - Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + Supported values: {"logLoss", "leastSquaresError", + "leastAbsoluteError"}. (default: "logLoss") :param numIterations: Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. The learning rate should - be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node - + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. 
DecisionTree requires - maxBins >= max categories + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for @@ -580,26 +590,28 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that - feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. - Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + Supported values: {"logLoss", "leastSquaresError", + "leastAbsoluteError"}. (default: "logLoss") :param numIterations: Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. The learning rate should - be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal - node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. DecisionTree requires - maxBins >= max categories + Maximum number of bins used for splitting features. 
DecisionTree + requires maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for From 36be47c4986cd43d04053cc1c458bdd77dd918bc Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Sat, 23 Jan 2016 08:02:21 +0100 Subject: [PATCH 4/4] :return: formatting and minor style fixes --- python/pyspark/mllib/tree.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index f1d16561ec81..d8d193335c38 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -180,7 +180,8 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, :param minInfoGain: Min info gain required to create a split. (default: 0.0) - :return: DecisionTreeModel + :return: + DecisionTreeModel. Example usage: @@ -246,7 +247,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param minInfoGain: Min info gain required to create a split. (default: 0.0) - :return: DecisionTreeModel + :return: + DecisionTreeModel. Example usage: @@ -324,11 +326,10 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Method to train a decision tree model for binary or multiclass classification. - :param data: Training dataset: - RDD of LabeledPoint. Labels should take values {0, 1, ..., - numClasses-1}. - :param numClasses: - Number of classes for classification. + :param data: + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. @@ -356,8 +357,8 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, :param seed: Random seed for bootstrapping and choosing feature subsets.
(default: None) - - :return: RandomForestModel that can be used for prediction + :return: + RandomForestModel that can be used for prediction. Example usage: @@ -442,7 +443,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param seed: Random seed for bootstrapping and choosing feature subsets. (default: None) - :return: RandomForestModel that can be used for prediction + :return: + RandomForestModel that can be used for prediction. Example usage: @@ -544,9 +546,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Maximum number of bins used for splitting features. DecisionTree requires maxBins >= max categories (default: 32) - - :return: GradientBoostedTreesModel that can be used for - prediction + :return: + GradientBoostedTreesModel that can be used for prediction. Example usage: @@ -613,9 +614,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, Maximum number of bins used for splitting features. DecisionTree requires maxBins >= max categories (default: 32) - - :return: GradientBoostedTreesModel that can be used for - prediction + :return: + GradientBoostedTreesModel that can be used for prediction. Example usage: