From 7daf4adf08460b2bfa4477a2ebcd75202fb0f288 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Tue, 5 Jan 2016 12:36:19 +0100 Subject: [PATCH 1/4] [SPARK-12634][DOC] Update param descriptions Updates the `param` descriptions to be consistent. See [SPARK-11219] for more details. --- python/pyspark/mllib/tree.py | 285 ++++++++++++++++++++--------------- 1 file changed, 165 insertions(+), 120 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 0001b60093a6..954717289cac 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -92,8 +92,8 @@ def predict(self, x): transformation or action. Call predict directly on the RDD instead. - :param x: Data point (feature vector), - or an RDD of data points (feature vectors). + :param x: + Data point (feature vector), or an RDD of data points (feature vectors). """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -154,21 +154,32 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, """ Train a DecisionTreeModel for classification. - :param data: Training data: RDD of LabeledPoint. - Labels are integers {0,1,...,numClasses}. - :param numClasses: Number of classes for classification. - :param categoricalFeaturesInfo: Map from categorical feature index - to number of categories. - Any feature not in this map - is treated as continuous. - :param impurity: Supported values: "entropy" or "gini" - :param maxDepth: Max depth of tree. - E.g., depth 0 means 1 leaf node. - Depth 1 means 1 internal node + 2 leaf nodes. - :param maxBins: Number of bins used for finding splits at each node. - :param minInstancesPerNode: Min number of instances required at child - nodes to create the parent split - :param minInfoGain: Min info gain required to create a split + :param data: + Training data: RDD of LabeledPoint. Labels are integers + {0,1,...,numClasses}. + :param numClasses: + Number of classes for classification. 
+ :param categoricalFeaturesInfo: + Map from categorical feature index to number of categories. + Any feature not in this map is treated as continuous. + :param impurity: + Supported values: "entropy" or "gini". + (default: "gini") + :param maxDepth: + Max depth of tree. E.g., depth 0 means 1 leaf node. + Depth 1 means 1 internal node + 2 leaf nodes. + (default: 5) + :param maxBins: + Number of bins used for finding splits at each node. + (default: 32) + :param minInstancesPerNode: + Min number of instances required at child nodes to create + the parent split. + (default: 1) + :param minInfoGain: + Min info gain required to create a split. + (default: 0.0) + :return: DecisionTreeModel Example usage: @@ -213,20 +224,28 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, """ Train a DecisionTreeModel for regression. - :param data: Training data: RDD of LabeledPoint. - Labels are real numbers. - :param categoricalFeaturesInfo: Map from categorical feature - index to number of categories. - Any feature not in this map is treated as continuous. - :param impurity: Supported values: "variance" - :param maxDepth: Max depth of tree. - E.g., depth 0 means 1 leaf node. - Depth 1 means 1 internal node + 2 leaf nodes. - :param maxBins: Number of bins used for finding splits at each - node. - :param minInstancesPerNode: Min number of instances required at - child nodes to create the parent split - :param minInfoGain: Min info gain required to create a split + :param data: + Training data: RDD of LabeledPoint. Labels are real numbers. + :param categoricalFeaturesInfo: + Map from categorical feature index to number of categories. Any + feature not in this map is treated as continuous. + :param impurity: + Supported values: "variance" + (default: "variance") + :param maxDepth: + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means + 1 internal node + 2 leaf nodes. + (default: 5) + :param maxBins: + Number of bins used for finding splits at each node. 
+ (default: 32) + :param minInstancesPerNode: + Min number of instances required at child nodes to create the + parent split. + (default: 1) + :param minInfoGain: + Min info gain required to create a split. + (default: 0.0) :return: DecisionTreeModel Example usage: @@ -305,30 +324,39 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Method to train a decision tree model for binary or multiclass classification. - :param data: Training dataset: RDD of LabeledPoint. Labels - should take values {0, 1, ..., numClasses-1}. - :param numClasses: number of classes for classification. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that - feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. - :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for - splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "sqrt". - :param impurity: Criterion used for information gain calculation. - Supported values: "gini" (recommended) or "entropy". - :param maxDepth: Maximum depth of the tree. - E.g., depth 0 means 1 leaf node; depth 1 means - 1 internal node + 2 leaf nodes. (default: 4) - :param maxBins: maximum number of bins used for splitting - features - (default: 32) - :param seed: Random seed for bootstrapping and choosing feature - subsets. + :param data: Training dataset: + RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + :param numClasses: + Number of classes for classification. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. 
+ :param numTrees: + Number of trees in the random forest. + :param featureSubsetStrategy: + Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "sqrt". + (default: "auto") + :param impurity: + Criterion used for information gain calculation. + Supported values: "gini" (recommended) or "entropy". + (default: "gini") + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. + (default: 4) + :param maxBins: + Maximum number of bins used for splitting features + (default: 32) + :param seed: + Random seed for bootstrapping and choosing feature subsets. + (default: None) + :return: RandomForestModel that can be used for prediction Example usage: @@ -385,29 +413,35 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt """ Method to train a decision tree model for regression. - :param data: Training dataset: RDD of LabeledPoint. Labels are - real numbers. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param numTrees: Number of trees in the random forest. - :param featureSubsetStrategy: Number of features to consider for - splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". - If "auto" is set, this parameter is set based on numTrees: - if numTrees == 1, set to "all"; - if numTrees > 1 (forest) set to "onethird" for regression. - :param impurity: Criterion used for information gain - calculation. - Supported values: "variance". - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. 
(default: 4) - :param maxBins: maximum number of bins used for splitting - features (default: 32) - :param seed: Random seed for bootstrapping and choosing feature - subsets. + :param data: + Training dataset: RDD of LabeledPoint. Labels are real numbers. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param numTrees: + Number of trees in the random forest. + :param featureSubsetStrategy: + Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + If "auto" is set, this parameter is set based on numTrees: + if numTrees == 1, set to "all"; + if numTrees > 1 (forest) set to "onethird" for regression. + (default: "auto") + :param impurity: + Criterion used for information gain calculation. + Supported values: "variance". + (default: "variance") + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 + means 1 internal node + 2 leaf nodes. + (default: 4) + :param maxBins: + Maximum number of bins used for splitting features. + (default: 32) + :param seed: + Random seed for bootstrapping and choosing feature subsets. + (default: None) :return: RandomForestModel that can be used for prediction Example usage: @@ -483,28 +517,33 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Method to train a gradient-boosted trees model for classification. - :param data: Training dataset: RDD of LabeledPoint. - Labels should take values {0, 1}. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param loss: Loss function used for minimization during gradient - boosting. Supported: {"logLoss" (default), - "leastSquaresError", "leastAbsoluteError"}. 
- :param numIterations: Number of iterations of boosting. - (default: 100) - :param learningRate: Learning rate for shrinking the - contribution of each estimator. The learning rate - should be between in the interval (0, 1]. - (default: 0.1) - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. (default: 3) - :param maxBins: maximum number of bins used for splitting - features (default: 32) DecisionTree requires maxBins >= max categories + :param data: + Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param loss: + Loss function used for minimization during gradient boosting. + Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + (default: "logLoss") + :param numIterations: + Number of iterations of boosting. + (default: 100) + :param learningRate: + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. + (default: 3) + :param maxBins: + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories + (default: 32) + :return: GradientBoostedTreesModel that can be used for - prediction + prediction Example usage: @@ -545,28 +584,34 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, """ Method to train a gradient-boosted trees model for regression. - :param data: Training dataset: RDD of LabeledPoint. Labels are - real numbers. - :param categoricalFeaturesInfo: Map storing arity of categorical - features. 
E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: - {0, 1, ..., k-1}. - :param loss: Loss function used for minimization during gradient - boosting. Supported: {"logLoss" (default), - "leastSquaresError", "leastAbsoluteError"}. - :param numIterations: Number of iterations of boosting. - (default: 100) - :param learningRate: Learning rate for shrinking the - contribution of each estimator. The learning rate - should be between in the interval (0, 1]. - (default: 0.1) - :param maxBins: maximum number of bins used for splitting - features (default: 32) DecisionTree requires maxBins >= max categories - :param maxDepth: Maximum depth of the tree. E.g., depth 0 means - 1 leaf node; depth 1 means 1 internal node + 2 leaf - nodes. (default: 3) + :param data: + Training dataset: RDD of LabeledPoint. Labels are real numbers. + :param categoricalFeaturesInfo: + Map storing arity of categorical features. E.g., an entry (n -> k) + indicates that feature n is categorical with k categories indexed + from 0: {0, 1, ..., k-1}. + :param loss: + Loss function used for minimization during gradient boosting. + Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + (default: "logLoss") + :param numIterations: + Number of iterations of boosting. + (default: 100) + :param learningRate: + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. + (default: 0.1) + :param maxDepth: + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 + means 1 internal node + 2 leaf nodes. + (default: 3) + :param maxBins: + Maximum number of bins used for splitting features. 
+ DecisionTree requires maxBins >= max categories + (default: 32) + :return: GradientBoostedTreesModel that can be used for - prediction + prediction Example usage: From a5346e266c211995c0d0306c278d63b4bf9a8781 Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Wed, 6 Jan 2016 11:41:45 +0100 Subject: [PATCH 2/4] Style Fixes - Update fill-column to 100 on parameter descriptions. --- python/pyspark/mllib/tree.py | 96 +++++++++++++++++------------------- 1 file changed, 44 insertions(+), 52 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 954717289cac..5e7d8a0e62ec 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -155,31 +155,28 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, Train a DecisionTreeModel for classification. :param data: - Training data: RDD of LabeledPoint. Labels are integers - {0,1,...,numClasses}. + Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. - Any feature not in this map is treated as continuous. + Map from categorical feature index to number of categories. Any feature not in this map is + treated as continuous. :param impurity: Supported values: "entropy" or "gini". (default: "gini") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. - Depth 1 means 1 internal node + 2 leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 + leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. (default: 32) :param minInstancesPerNode: - Min number of instances required at child nodes to create - the parent split. + Min number of instances required at child nodes to create the parent split. (default: 1) :param minInfoGain: Min info gain required to create a split. 
(default: 0.0) - :return: DecisionTreeModel Example usage: @@ -227,14 +224,14 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any - feature not in this map is treated as continuous. + Map from categorical feature index to number of categories. Any feature not in this map is + treated as continuous. :param impurity: - Supported values: "variance" + Supported values: "variance". (default: "variance") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means - 1 internal node + 2 leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 + leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. @@ -325,33 +322,31 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, classification. :param data: Training dataset: - RDD of LabeledPoint. Labels should take values - {0, 1, ..., numClasses-1}. + RDD of LabeledPoint. Labels should take values {0, 1, ..., numClasses-1}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories - indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + Number of features to consider for splits at each node. Supported: "auto" (default), "all", + "sqrt", "log2", "onethird". 
If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt". (default: "auto") :param impurity: - Criterion used for information gain calculation. - Supported values: "gini" (recommended) or "entropy". + Criterion used for information gain calculation. Supported values: "gini" (recommended) or + "entropy". (default: "gini") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; - depth 1 means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 4) :param maxBins: - Maximum number of bins used for splitting features + Maximum number of bins used for splitting features. (default: 32) :param seed: Random seed for bootstrapping and choosing feature subsets. @@ -416,25 +411,23 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. - Supported: "auto" (default), "all", "sqrt", "log2", "onethird". + Number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", + "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "onethird" for regression. (default: "auto") :param impurity: - Criterion used for information gain calculation. - Supported values: "variance". 
+ Criterion used for information gain calculation. Supported values: "variance". (default: "variance") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 - means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -520,9 +513,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature + n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. @@ -531,15 +523,16 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. The learning rate should + be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. DecisionTree - requires maxBins >= max categories + Maximum number of bins used for splitting features. 
DecisionTree requires + maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for @@ -587,9 +580,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) - indicates that feature n is categorical with k categories indexed - from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> k) indicates that + feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. @@ -598,16 +590,16 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. - The learning rate should be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. The learning rate should + be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 - means 1 internal node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal + node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. - DecisionTree requires maxBins >= max categories + Maximum number of bins used for splitting features. 
DecisionTree requires + maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for From 9337098f54d5b015163c86c319c81aebe124797f Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Fri, 22 Jan 2016 15:16:24 +0100 Subject: [PATCH 3/4] Limit parameter descriptions to 74th column --- python/pyspark/mllib/tree.py | 104 +++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 46 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index 5e7d8a0e62ec..f1d16561ec81 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -93,7 +93,8 @@ def predict(self, x): Call predict directly on the RDD instead. :param x: - Data point (feature vector), or an RDD of data points (feature vectors). + Data point (feature vector), or an RDD of data points (feature + vectors). """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) @@ -155,24 +156,26 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, Train a DecisionTreeModel for classification. :param data: - Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. + Training data: RDD of LabeledPoint. Labels are integers + {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any feature not in this map is - treated as continuous. + Map from categorical feature index to number of categories. Any + feature not in this map is treated as continuous. :param impurity: Supported values: "entropy" or "gini". (default: "gini") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 - leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 + means 1 internal node + 2 leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. 
(default: 32) :param minInstancesPerNode: - Min number of instances required at child nodes to create the parent split. + Min number of instances required at child nodes to create the + parent split. (default: 1) :param minInfoGain: Min info gain required to create a split. @@ -224,14 +227,14 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map from categorical feature index to number of categories. Any feature not in this map is - treated as continuous. + Map from categorical feature index to number of categories. Any + feature not in this map is treated as continuous. :param impurity: Supported values: "variance". (default: "variance") :param maxDepth: - Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 - leaf nodes. + Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 + means 1 internal node + 2 leaf nodes. (default: 5) :param maxBins: Number of bins used for finding splits at each node. @@ -322,28 +325,30 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, classification. :param data: Training dataset: - RDD of LabeledPoint. Labels should take values {0, 1, ..., numClasses-1}. + RDD of LabeledPoint. Labels should take values {0, 1, ..., + numClasses-1}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. Supported: "auto" (default), "all", - "sqrt", "log2", "onethird". 
+ Number of features to consider for splits at each node. + Supported: "auto" (default), "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt". (default: "auto") :param impurity: - Criterion used for information gain calculation. Supported values: "gini" (recommended) or - "entropy". + Criterion used for information gain calculation. Supported + values: "gini" (recommended) or "entropy". (default: "gini") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + - 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -411,23 +416,25 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: - Number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", - "log2", "onethird". + Number of features to consider for splits at each node. + Supported: "auto", "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "onethird" for regression. (default: "auto") :param impurity: - Criterion used for information gain calculation. Supported values: "variance". 
+ Criterion used for information gain calculation. Supported + values: "variance". (default: "variance") :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + - 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 4) :param maxBins: Maximum number of bins used for splitting features. @@ -511,28 +518,31 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, classification. :param data: - Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1}. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature - n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. - Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + Supported values: {"logLoss", "leastSquaresError", + "leastAbsoluteError"}. (default: "logLoss") :param numIterations: Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. The learning rate should - be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node - + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. 
DecisionTree requires - maxBins >= max categories + Maximum number of bins used for splitting features. DecisionTree + requires maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for @@ -580,26 +590,28 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: - Map storing arity of categorical features. E.g., an entry (n -> k) indicates that - feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. + Map storing arity of categorical features. E.g., an entry (n -> + k) indicates that feature n is categorical with k categories + indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. - Supported values: {"logLoss", "leastSquaresError", "leastAbsoluteError"}. + Supported values: {"logLoss", "leastSquaresError", + "leastAbsoluteError"}. (default: "logLoss") :param numIterations: Number of iterations of boosting. (default: 100) :param learningRate: - Learning rate for shrinking the contribution of each estimator. The learning rate should - be between in the interval (0, 1]. + Learning rate for shrinking the contribution of each estimator. + The learning rate should be between in the interval (0, 1]. (default: 0.1) :param maxDepth: - Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal - node + 2 leaf nodes. + Maximum depth of the tree. E.g., depth 0 means 1 leaf node; + depth 1 means 1 internal node + 2 leaf nodes. (default: 3) :param maxBins: - Maximum number of bins used for splitting features. DecisionTree requires - maxBins >= max categories + Maximum number of bins used for splitting features. 
DecisionTree + requires maxBins >= max categories (default: 32) :return: GradientBoostedTreesModel that can be used for From 36be47c4986cd43d04053cc1c458bdd77dd918bc Mon Sep 17 00:00:00 2001 From: vijaykiran Date: Sat, 23 Jan 2016 08:02:21 +0100 Subject: [PATCH 4/4] :return: formatting and minor style fixes --- python/pyspark/mllib/tree.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py index f1d16561ec81..d8d193335c38 100644 --- a/python/pyspark/mllib/tree.py +++ b/python/pyspark/mllib/tree.py @@ -180,7 +180,8 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, :param minInfoGain: Min info gain required to create a split. (default: 0.0) - :return: DecisionTreeModel + :return: + DecisionTreeModel. Example usage: @@ -246,7 +247,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, :param minInfoGain: Min info gain required to create a split. (default: 0.0) - :return: DecisionTreeModel + :return: + DecisionTreeModel. Example usage: @@ -324,11 +326,10 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, Method to train a decision tree model for binary or multiclass classification. - :param data: Training dataset: - RDD of LabeledPoint. Labels should take values {0, 1, ..., - numClasses-1}. - :param numClasses: - Number of classes for classification. + :param data: + Training dataset: RDD of LabeledPoint. Labels should take values + {0, 1, ..., numClasses-1}. + :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. @@ -356,8 +357,8 @@ def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, :param seed: Random seed for bootstrapping and choosing feature subsets.
(default: None) - - :return: RandomForestModel that can be used for prediction + :return: + RandomForestModel that can be used for prediction. Example usage: @@ -442,7 +443,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetSt :param seed: Random seed for bootstrapping and choosing feature subsets. (default: None) - :return: RandomForestModel that can be used for prediction + :return: + RandomForestModel that can be used for prediction. Example usage: @@ -544,9 +546,8 @@ def trainClassifier(cls, data, categoricalFeaturesInfo, Maximum number of bins used for splitting features. DecisionTree requires maxBins >= max categories (default: 32) - - :return: GradientBoostedTreesModel that can be used for - prediction + :return: + GradientBoostedTreesModel that can be used for prediction. Example usage: @@ -613,9 +614,8 @@ def trainRegressor(cls, data, categoricalFeaturesInfo, Maximum number of bins used for splitting features. DecisionTree requires maxBins >= max categories (default: 32) - - :return: GradientBoostedTreesModel that can be used for - prediction + :return: + GradientBoostedTreesModel that can be used for prediction. Example usage: