[SPARK-10509][PYSPARK] Reduce excessive param boiler plate code #10216

Closed
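The diffs below delete the per-instance Param(...) copies that each estimator's __init__ used to rebuild by hand; the class-level Params._dummy() placeholders stay. The shared helper that makes the instance copies is not part of this excerpt, so the following is only a rough sketch of the idea, with illustrative names (_copy_params here is an assumption, not necessarily the PR's exact API):

```python
# Rough sketch of the boilerplate-reduction idea, with illustrative names.
# The real pyspark.ml.param.Params class is more involved; this only shows how a
# class-level placeholder Param can be re-bound per instance so that subclasses
# no longer repeat the Param(...) text inside __init__.

class Param(object):
    def __init__(self, parent, name, doc):
        self.parent = parent
        self.name = name
        self.doc = doc


class Params(object):
    def __init__(self):
        self._copy_params()  # assumed helper name, not necessarily the PR's

    def _copy_params(self):
        # Re-bind every class-level placeholder Param to this instance.
        for name in dir(type(self)):
            attr = getattr(type(self), name, None)
            if isinstance(attr, Param):
                setattr(self, name, Param(self, attr.name, attr.doc))


class LogisticRegression(Params):
    # Declared once at class level; no duplicate definition in __init__.
    threshold = Param(None, "threshold",
                      "Threshold in binary classification prediction, in range [0, 1].")


lr = LogisticRegression()
assert lr.threshold.parent is lr  # instance-bound copy, doc string intact
```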
32 changes: 0 additions & 32 deletions python/pyspark/ml/classification.py
@@ -72,7 +72,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
.. versionadded:: 1.3.0
"""

# a placeholder to make it appear in the generated doc
threshold = Param(Params._dummy(), "threshold",
"Threshold in binary classification prediction, in range [0, 1]." +
" If threshold and thresholds are both set, they must match.")

Review comment (Member) on the class-level threshold Param above: This can hopefully remain the same.
@@ -92,10 +91,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
super(LogisticRegression, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.LogisticRegression", self.uid)
#: param for threshold in binary classification, in range [0, 1].
self.threshold = Param(self, "threshold",
"Threshold in binary classification prediction, in range [0, 1]." +
" If threshold and thresholds are both set, they must match.")
self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -232,18 +227,13 @@ class TreeClassifierParams(object):
"""
supportedImpurities = ["entropy", "gini"]

# a placeholder to make it appear in the generated doc
impurity = Param(Params._dummy(), "impurity",
"Criterion used for information gain calculation (case-insensitive). " +
"Supported options: " +
", ".join(supportedImpurities))

def __init__(self):
super(TreeClassifierParams, self).__init__()
#: param for Criterion used for information gain calculation (case-insensitive).
self.impurity = Param(self, "impurity", "Criterion used for information " +
"gain calculation (case-insensitive). Supported options: " +
", ".join(self.supportedImpurities))

@since("1.6.0")
def setImpurity(self, value):
@@ -485,7 +475,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
.. versionadded:: 1.4.0
"""

# a placeholder to make it appear in the generated doc
lossType = Param(Params._dummy(), "lossType",
"Loss function which GBT tries to minimize (case-insensitive). " +
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
@@ -504,10 +493,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
super(GBTClassifier, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.GBTClassifier", self.uid)
#: param for Loss function which GBT tries to minimize (case-insensitive).
self.lossType = Param(self, "lossType",
"Loss function which GBT tries to minimize (case-insensitive). " +
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
lossType="logistic", maxIter=20, stepSize=0.1)
@@ -597,7 +582,6 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
.. versionadded:: 1.5.0
"""

# a placeholder to make it appear in the generated doc
smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " +
"default is 1.0")
modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
@@ -615,13 +599,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
super(NaiveBayes, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.NaiveBayes", self.uid)
#: param for the smoothing parameter.
self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " +
"default is 1.0")
#: param for the model type.
self.modelType = Param(self, "modelType", "The model type which is a string " +
"(case-sensitive). Supported options: multinomial (default) " +
"and bernoulli.")
self._setDefault(smoothing=1.0, modelType="multinomial")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
@@ -734,7 +711,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
.. versionadded:: 1.6.0
"""

# a placeholder to make it appear in the generated doc
layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
"E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
"neurons and output layer of 10 neurons, default is [1, 1].")
@@ -753,14 +729,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
super(MultilayerPerceptronClassifier, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " +
"E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " +
"100 neurons and output layer of 10 neurons, default is [1, 1].")
self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " +
"matrices. Data is stacked within partitions. If block size is " +
"more than remaining data in a partition then it is adjusted to " +
"the size of this data. Recommended size is between 10 and 1000, " +
"default is 128.")
self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
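From the user's side nothing about these classifiers changes: the same keyword arguments, getters, and generated doc strings still work after the duplicate Param definitions are dropped. A minimal check, assuming an active SparkContext (the Python estimator immediately wraps a JVM object, so a bare interpreter is not enough):

```python
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression

# Constructing the estimator needs an active SparkContext, since the Python
# class wraps org.apache.spark.ml.classification.LogisticRegression in the JVM.
sc = SparkContext("local[1]", "param-check")

lr = LogisticRegression(maxIter=10, regParam=0.01, threshold=0.4)
print(lr.getThreshold())              # 0.4
print(lr.explainParam("threshold"))   # doc text still comes from the Param declaration

sc.stop()
```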
7 changes: 0 additions & 7 deletions python/pyspark/ml/clustering.py
@@ -73,7 +73,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
.. versionadded:: 1.5.0
"""

# a placeholder to make it appear in the generated doc
k = Param(Params._dummy(), "k", "number of clusters to create")
initMode = Param(Params._dummy(), "initMode",
"the initialization algorithm. This can be either \"random\" to " +
@@ -90,12 +89,6 @@ def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
"""
super(KMeans, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
self.k = Param(self, "k", "number of clusters to create")
self.initMode = Param(self, "initMode",
"the initialization algorithm. This can be either \"random\" to " +
"choose random points as initial cluster centers, or \"k-means||\" " +
"to use a parallel variant of k-means++")
self.initSteps = Param(self, "initSteps", "steps for k-means initialization mode")
self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)
12 changes: 0 additions & 12 deletions python/pyspark/ml/evaluation.py
@@ -124,7 +124,6 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
.. versionadded:: 1.4.0
"""

# a placeholder to make it appear in the generated doc
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation (areaUnderROC|areaUnderPR)")

@@ -138,9 +137,6 @@ def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
super(BinaryClassificationEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
#: param for metric name in evaluation (areaUnderROC|areaUnderPR)
self.metricName = Param(self, "metricName",
"metric name in evaluation (areaUnderROC|areaUnderPR)")
self._setDefault(rawPredictionCol="rawPrediction", labelCol="label",
metricName="areaUnderROC")
kwargs = self.__init__._input_kwargs
@@ -210,9 +206,6 @@ def __init__(self, predictionCol="prediction", labelCol="label",
super(RegressionEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
#: param for metric name in evaluation (mse|rmse|r2|mae)
self.metricName = Param(self, "metricName",
"metric name in evaluation (mse|rmse|r2|mae)")
self._setDefault(predictionCol="prediction", labelCol="label",
metricName="rmse")
kwargs = self.__init__._input_kwargs
@@ -265,7 +258,6 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio

.. versionadded:: 1.5.0
"""
# a placeholder to make it appear in the generated doc
metricName = Param(Params._dummy(), "metricName",
"metric name in evaluation "
"(f1|precision|recall|weightedPrecision|weightedRecall)")
@@ -280,10 +272,6 @@ def __init__(self, predictionCol="prediction", labelCol="label",
super(MulticlassClassificationEvaluator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid)
# param for metric name in evaluation (f1|precision|recall|weightedPrecision|weightedRecall)
self.metricName = Param(self, "metricName",
"metric name in evaluation"
" (f1|precision|recall|weightedPrecision|weightedRecall)")
self._setDefault(predictionCol="prediction", labelCol="label",
metricName="f1")
kwargs = self.__init__._input_kwargs