From 564339bb54635df084fadb813eb8a26e9fe6c8b9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Dec 2015 20:24:34 -0800 Subject: [PATCH 01/12] Add a copy to new parent function to make it easier to handle custom params --- python/pyspark/ml/param/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 35c9b776a3d5e..fc36d56f9cd74 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -39,6 +39,15 @@ def __init__(self, parent, name, doc): self.name = str(name) self.doc = str(doc) + def _copy_new_parent(self, parent): + """Copy the current param to a new parent, must be a dummy param.""" + if self.parent == "undefined": + param = copy.copy(self) + param.parent = parent.uid + return param + else: + raise ValueError("Cannot copy from non-dummy parent %s." % parent) + def __str__(self): return str(self.parent) + "__" + self.name From 429172b65e7a193d86286ee05a7f797cee634050 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Dec 2015 20:24:42 -0800 Subject: [PATCH 02/12] Use the copy to new parent function for custom params --- python/pyspark/ml/classification.py | 29 +++-------- python/pyspark/ml/clustering.py | 9 ++-- python/pyspark/ml/evaluation.py | 10 ++-- python/pyspark/ml/feature.py | 81 ++++++++++------------------- python/pyspark/ml/pipeline.py | 5 +- python/pyspark/ml/recommendation.py | 19 ++++--- python/pyspark/ml/regression.py | 41 ++++----------- python/pyspark/ml/tuning.py | 10 ++-- 8 files changed, 68 insertions(+), 136 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5599b8f3ecd88..814d673fd9213 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -93,9 +93,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.LogisticRegression", self.uid) #: param for threshold in binary classification, in range [0, 1]. - self.threshold = Param(self, "threshold", - "Threshold in binary classification prediction, in range [0, 1]." + - " If threshold and thresholds are both set, they must match.") + self.threshold = LogisticRegression.threshold._copy_new_parent(self) self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -241,9 +239,7 @@ class TreeClassifierParams(object): def __init__(self): super(TreeClassifierParams, self).__init__() #: param for Criterion used for information gain calculation (case-insensitive). - self.impurity = Param(self, "impurity", "Criterion used for information " + - "gain calculation (case-insensitive). Supported options: " + - ", ".join(self.supportedImpurities)) + self.impurity = TreeClassifierParams.impurity._copy_new_parent(self) @since("1.6.0") def setImpurity(self, value): @@ -502,9 +498,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.GBTClassifier", self.uid) #: param for Loss function which GBT tries to minimize (case-insensitive). - self.lossType = Param(self, "lossType", - "Loss function which GBT tries to minimize (case-insensitive). 
" + - "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) + self.lossType = GBTClassifier.lossType._copy_new_parent(self) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1) @@ -613,12 +607,9 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.NaiveBayes", self.uid) #: param for the smoothing parameter. - self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " + - "default is 1.0") + self.smoothing = NaiveBayes.smoothing._copy_new_parent(self) #: param for the model type. - self.modelType = Param(self, "modelType", "The model type which is a string " + - "(case-sensitive). Supported options: multinomial (default) " + - "and bernoulli.") + self.modelType = NaiveBayes.modelType._copy_new_parent(self) self._setDefault(smoothing=1.0, modelType="multinomial") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -750,14 +741,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(MultilayerPerceptronClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) - self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " + - "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " + - "100 neurons and output layer of 10 neurons, default is [1, 1].") - self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " + - "matrices. Data is stacked within partitions. If block size is " + - "more than remaining data in a partition then it is adjusted to " + - "the size of this data. Recommended size is between 10 and 1000, " + - "default is 128.") + self.layers = MultilayerPerceptronClassifier.layers._copy_new_parent(self) + self.blockSize = MultilayerPerceptronClassifier.blockSize._copy_new_parent(self) self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 7bb8ab94e17df..35bac03780ddb 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -80,12 +80,9 @@ def __init__(self, featuresCol="features", predictionCol="prediction", k=2, """ super(KMeans, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid) - self.k = Param(self, "k", "number of clusters to create") - self.initMode = Param(self, "initMode", - "the initialization algorithm. 
This can be either \"random\" to " + - "choose random points as initial cluster centers, or \"k-means||\" " + - "to use a parallel variant of k-means++") - self.initSteps = Param(self, "initSteps", "steps for k-means initialization mode") + self.k = KMeans.k._copy_new_parent(self) + self.initMode = KMeans.initMode._copy_new_parent(self) + self.initSteps = KMeans.initSteps._copy_new_parent(self) self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index dcc1738ec518b..999384faf9c96 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -138,8 +138,7 @@ def __init__(self, rawPredictionCol="rawPrediction", labelCol="label", self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid) #: param for metric name in evaluation (areaUnderROC|areaUnderPR) - self.metricName = Param(self, "metricName", - "metric name in evaluation (areaUnderROC|areaUnderPR)") + self.metricName = BinaryClassificationEvaluator.metricName._copy_new_parent(self) self._setDefault(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC") kwargs = self.__init__._input_kwargs @@ -210,8 +209,7 @@ def __init__(self, predictionCol="prediction", labelCol="label", self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid) #: param for metric name in evaluation (mse|rmse|r2|mae) - self.metricName = Param(self, "metricName", - "metric name in evaluation (mse|rmse|r2|mae)") + self.metricName = RegressionEvaluator.metricName._copy_new_parent(self) self._setDefault(predictionCol="prediction", labelCol="label", metricName="rmse") kwargs = self.__init__._input_kwargs @@ -280,9 +278,7 @@ def __init__(self, predictionCol="prediction", labelCol="label", self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid) # param for metric name in evaluation (f1|precision|recall|weightedPrecision|weightedRecall) - self.metricName = Param(self, "metricName", - "metric name in evaluation" - " (f1|precision|recall|weightedPrecision|weightedRecall)") + self.metricName = MulticlassClassificationEvaluator.metricName._copy_new_parent(self) self._setDefault(predictionCol="prediction", labelCol="label", metricName="f1") kwargs = self.__init__._input_kwargs diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index b02d41b52ab25..c8a4147236f62 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -67,8 +67,7 @@ def __init__(self, threshold=0.0, inputCol=None, outputCol=None): """ super(Binarizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid) - self.threshold = Param(self, "threshold", - "threshold in binary classification prediction, in range [0, 1]") + self.threshold = Binarizer.threshold._copy_new_parent(self) self._setDefault(threshold=0.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -146,14 +145,7 @@ def __init__(self, splits=None, inputCol=None, outputCol=None): # except the last bucket, which also includes y. The splits should be strictly increasing. # Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, # values outside the splits specified will be treated as errors. 
- self.splits = \ - Param(self, "splits", - "Split points for mapping continuous features into buckets. With n+1 splits, " + - "there are n buckets. A bucket defined by splits x,y holds values in the " + - "range [x,y) except the last bucket, which also includes y. The splits " + - "should be strictly increasing. Values at -inf, inf must be explicitly " + - "provided to cover all Double values; otherwise, values outside the splits " + - "specified will be treated as errors.") + self.splits = Bucketizer.splits._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -369,8 +361,7 @@ def __init__(self, inverse=False, inputCol=None, outputCol=None): """ super(DCT, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid) - self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " + - "default False.") + self.inverse = DCT.inverse._copy_new_parent(self) self._setDefault(inverse=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -434,8 +425,7 @@ def __init__(self, scalingVec=None, inputCol=None, outputCol=None): super(ElementwiseProduct, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct", self.uid) - self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " + - "it must be MLlib Vector type.") + self.scalingVec = ElementwiseProduct.scalingVec._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -541,8 +531,7 @@ def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): """ super(IDF, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid) - self.minDocFreq = Param(self, "minDocFreq", - "minimum of documents in which a term should appear for filtering") + self.minDocFreq = IDF.minDocFreq._copy_new_parent(self) self._setDefault(minDocFreq=0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -629,8 +618,8 @@ def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): """ super(MinMaxScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) - self.min = Param(self, "min", "Lower bound of the output feature range") - self.max = Param(self, "max", "Upper bound of the output feature range") + self.min = MinMaxScaler.min._copy_new_parent(self) + self.max = MinMaxScaler.max._copy_new_parent(self) self._setDefault(min=0.0, max=1.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -734,7 +723,7 @@ def __init__(self, n=2, inputCol=None, outputCol=None): """ super(NGram, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid) - self.n = Param(self, "n", "number of elements per n-gram (>=1)") + self.n = NGram.n._copy_new_parent(self) self._setDefault(n=2) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -797,7 +786,7 @@ def __init__(self, p=2.0, inputCol=None, outputCol=None): """ super(Normalizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid) - self.p = Param(self, "p", "the p norm value.") + self.p = Normalizer.p._copy_new_parent(self) self._setDefault(p=2.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -876,7 +865,7 @@ def __init__(self, dropLast=True, inputCol=None, outputCol=None): """ super(OneHotEncoder, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", 
self.uid) - self.dropLast = Param(self, "dropLast", "whether to drop the last category") + self.dropLast = OneHotEncoder.dropLast._copy_new_parent(self) self._setDefault(dropLast=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -940,7 +929,7 @@ def __init__(self, degree=2, inputCol=None, outputCol=None): super(PolynomialExpansion, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.feature.PolynomialExpansion", self.uid) - self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)") + self.degree = PolynomialExpansion.degree._copy_new_parent(self) self._setDefault(degree=2) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1017,9 +1006,9 @@ def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, o """ super(RegexTokenizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid) - self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)") - self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens") - self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing") + self.minTokenLength = RegexTokenizer.minTokenLength._copy_new_parent(self) + self.gaps = RegexTokenizer.gaps._copy_new_parent(self) + self.pattern = RegexTokenizer.pattern._copy_new_parent(self) self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1108,7 +1097,7 @@ def __init__(self, statement=None): """ super(SQLTransformer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid) - self.statement = Param(self, "statement", "SQL statement") + self.statement = SQLTransformer.statement._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1171,8 +1160,8 @@ def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): """ super(StandardScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid) - self.withMean = Param(self, "withMean", "Center data with mean") - self.withStd = Param(self, "withStd", "Scale to unit standard deviation") + self.withMean = StandardScaler.withMean._copy_new_parent(self) + self.withStd = StandardScaler.withStd._copy_new_parent(self) self._setDefault(withMean=False, withStd=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1341,9 +1330,7 @@ def __init__(self, inputCol=None, outputCol=None, labels=None): super(IndexToString, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) - self.labels = Param(self, "labels", - "Optional array of labels specifying index-string mapping. 
If not" + - " provided or if empty, then metadata from inputCol is used instead.") + self.labels = IndexToString.labels._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1397,9 +1384,8 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - self.stopWords = Param(self, "stopWords", "The words to be filtered out") - self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " + - "sensitive comparison over the stop words") + self.stopWords = StopWordsRemover.stopWords._copy_new_parent(self) + self.caseSensitive = StopWordsRemover.caseSensitive._copy_new_parent(self) stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords defaultStopWords = stopWordsObj.English() self._setDefault(stopWords=defaultStopWords) @@ -1615,10 +1601,7 @@ def __init__(self, maxCategories=20, inputCol=None, outputCol=None): """ super(VectorIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid) - self.maxCategories = Param(self, "maxCategories", - "Threshold for the number of values a categorical feature " + - "can take (>= 2). If a feature is found to have " + - "> maxCategories values, then it is declared continuous.") + self.maxCategories = VectorIndexer.maxCategories._copy_new_parent(self) self._setDefault(maxCategories=20) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1722,12 +1705,8 @@ def __init__(self, inputCol=None, outputCol=None, indices=None, names=None): """ super(VectorSlicer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) - self.indices = Param(self, "indices", "An array of indices to select features from " + - "a vector column. There can be no overlap with names.") - self.names = Param(self, "names", "An array of feature names to select features from " + - "a vector column. These names must be specified by ML " + - "org.apache.spark.ml.attribute.Attribute. 
There can be no overlap " + - "with indices.") + self.indices = VectorSlicer.indices._copy_new_parent(self) + self.names = VectorSlicer.names._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1825,13 +1804,9 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) - self.vectorSize = Param(self, "vectorSize", - "the dimension of codes after transforming from words") - self.numPartitions = Param(self, "numPartitions", - "number of partitions for sentences of words") - self.minCount = Param(self, "minCount", - "the minimum number of times a token must appear to be included " + - "in the word2vec model's vocabulary") + self.vectorSize = Word2Vec.vectorSize._copy_new_parent(self) + self.numPartitions = Word2Vec.numPartitions._copy_new_parent(self) + self.minCount = Word2Vec.minCount._copy_new_parent(self) self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None) kwargs = self.__init__._input_kwargs @@ -1958,7 +1933,7 @@ def __init__(self, k=None, inputCol=None, outputCol=None): """ super(PCA, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid) - self.k = Param(self, "k", "the number of principal components") + self.k = PCA.k._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -2050,7 +2025,7 @@ def __init__(self, formula=None, featuresCol="features", labelCol="label"): """ super(RFormula, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) - self.formula = Param(self, "formula", "R model formula") + self.formula = RFormula.formula._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 4475451edb781..30959d74dbaa9 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -149,6 +149,9 @@ class Pipeline(Estimator): .. versionadded:: 1.3.0 """ + # a placeholder to make it appear in the generated doc + stages = Param(Params._dummy(), "stages", "pipeline stages") + @keyword_only def __init__(self, stages=None): """ @@ -158,7 +161,7 @@ def __init__(self, stages=None): stages = [] super(Pipeline, self).__init__() #: Param for pipeline stages. 
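# Illustrative sketch (not part of the patch series): the rebinding just below follows
# the pattern this series applies everywhere -- declare the Param once at class level
# with Params._dummy() as its parent, then have __init__ rebind a per-instance copy via
# _copy_new_parent(). A hypothetical custom pipeline stage would take the same shape;
# MyShiftTransformer and its "shift" param are invented names, and the Transformer base
# class is assumed to be the 1.6-era pyspark.ml.pipeline.Transformer.
from pyspark.ml.param import Param, Params
from pyspark.ml.pipeline import Transformer

class MyShiftTransformer(Transformer):
    # class-level dummy Param: shared across instances, parent uid is the literal "undefined"
    shift = Param(Params._dummy(), "shift", "constant added to the input column")

    def __init__(self):
        super(MyShiftTransformer, self).__init__()
        # rebind a copy of the dummy Param so its parent becomes this instance's uid
        self.shift = MyShiftTransformer.shift._copy_new_parent(self)

    def _transform(self, dataset):
        # no-op transform; only the Param handling matters for this sketch
        return dataset

t = MyShiftTransformer()
assert t.shift.parent == t.uid  # the copy is owned by this instance, not by the dummy
# end of sketch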
- self.stages = Param(self, "stages", "pipeline stages") + self.stages = Pipeline.stages._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index b44c66f73cc49..e19073b9e75e3 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -108,16 +108,15 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB """ super(ALS, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid) - self.rank = Param(self, "rank", "rank of the factorization") - self.numUserBlocks = Param(self, "numUserBlocks", "number of user blocks") - self.numItemBlocks = Param(self, "numItemBlocks", "number of item blocks") - self.implicitPrefs = Param(self, "implicitPrefs", "whether to use implicit preference") - self.alpha = Param(self, "alpha", "alpha for implicit preference") - self.userCol = Param(self, "userCol", "column name for user ids") - self.itemCol = Param(self, "itemCol", "column name for item ids") - self.ratingCol = Param(self, "ratingCol", "column name for ratings") - self.nonnegative = Param(self, "nonnegative", - "whether to use nonnegative constraint for least squares") + self.rank = ALS.rank._copy_new_parent(self) + self.numUserBlocks = ALS.numUserBlocks._copy_new_parent(self) + self.numItemBlocks = ALS.numItemBlocks._copy_new_parent(self) + self.implicitPrefs = ALS.implicitPrefs._copy_new_parent(self) + self.alpha = ALS.alpha._copy_new_parent(self) + self.userCol = ALS.userCol._copy_new_parent(self) + self.itemCol = ALS.itemCol._copy_new_parent(self) + self.ratingCol = ALS.ratingCol._copy_new_parent(self) + self.nonnegative = ALS.nonnegative._copy_new_parent(self) self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, ratingCol="rating", nonnegative=False, checkpointInterval=10) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index a0bb8ceed8861..9d65e36bb8bea 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -181,14 +181,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(IsotonicRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.IsotonicRegression", self.uid) - self.isotonic = \ - Param(self, "isotonic", - "whether the output sequence should be isotonic/increasing (true) or" + - "antitonic/decreasing (false).") - self.featureIndex = \ - Param(self, "featureIndex", - "The index of the feature if featuresCol is a vector column, no effect " + - "otherwise.") + self.isotonic = IsotonicRegression.isotonic._copy_new_parent(self) + self.featureIndex = IsotonicRegression.featureIndex._copy_new_parent(self) self._setDefault(isotonic=True, featureIndex=0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -269,8 +263,7 @@ class TreeEnsembleParams(DecisionTreeParams): def __init__(self): super(TreeEnsembleParams, self).__init__() #: param for Fraction of the training data, in range (0, 1]. 
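# Illustrative sketch (not part of the patch series): rebindings like the one below only
# succeed because the class-level Param is still a dummy (its parent uid is "undefined");
# copying a Param that is already bound to an instance raises ValueError. This roughly
# mirrors what the new test_copy_new_parent in tests.py asserts; ToyParams is an invented
# stand-in for the TestParams helper used there.
from pyspark.ml.param.shared import HasMaxIter

class ToyParams(HasMaxIter):
    """Plain Params holder; constructing it needs no SparkContext."""
    pass

tp = ToyParams()
copied = ToyParams.maxIter._copy_new_parent(tp)  # class-level dummy -> fresh copy
assert copied.parent == tp.uid
try:
    copied._copy_new_parent(tp)                  # already bound to tp -> rejected
except ValueError:
    pass
# end of sketch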
- self.subsamplingRate = Param(self, "subsamplingRate", "Fraction of the training data " + - "used for learning each decision tree, in range (0, 1].") + self.subsamplingRate = TreeEnsembleParams.subsamplingRate._copy_new_parent(self) @since("1.4.0") def setSubsamplingRate(self, value): @@ -303,9 +296,7 @@ class TreeRegressorParams(Params): def __init__(self): super(TreeRegressorParams, self).__init__() #: param for Criterion used for information gain calculation (case-insensitive). - self.impurity = Param(self, "impurity", "Criterion used for information " + - "gain calculation (case-insensitive). Supported options: " + - ", ".join(self.supportedImpurities)) + self.impurity = TreeRegressorParams.impurity._copy_new_parent(self) @since("1.4.0") def setImpurity(self, value): @@ -339,12 +330,9 @@ class RandomForestParams(TreeEnsembleParams): def __init__(self): super(RandomForestParams, self).__init__() #: param for Number of trees to train (>= 1). - self.numTrees = Param(self, "numTrees", "Number of trees to train (>= 1).") + self.numTrees = RandomForestParams.numTrees._copy_new_parent(self) #: param for The number of features to consider for splits at each tree node. - self.featureSubsetStrategy = \ - Param(self, "featureSubsetStrategy", - "The number of features to consider for splits at each tree node. Supported " + - "options: " + ", ".join(self.supportedFeatureSubsetStrategies)) + self.featureSubsetStrategy = RandomForestParams.featureSubsetStrategy._copy_new_parent(self) @since("1.4.0") def setNumTrees(self, value): @@ -624,9 +612,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) #: param for Loss function which GBT tries to minimize (case-insensitive). - self.lossType = Param(self, "lossType", - "Loss function which GBT tries to minimize (case-insensitive). " + - "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) + self.lossType = GBTRegressor.lossType._copy_new_parent(self) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) @@ -736,19 +722,12 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.AFTSurvivalRegression", self.uid) #: Param for censor column name - self.censorCol = Param(self, "censorCol", - "censor column name. The value of this column could be 0 or 1. " + - "If the value is 1, it means the event has occurred i.e. " + - "uncensored; otherwise censored.") + self.censorCol = AFTSurvivalRegression.censorCol._copy_new_parent(self) #: Param for quantile probabilities array self.quantileProbabilities = \ - Param(self, "quantileProbabilities", - "quantile probabilities array. Values of the quantile probabilities array " + - "should be in the range (0, 1) and the array should be non-empty.") + AFTSurvivalRegression.quantileProbabilities._copy_new_parent(self) #: Param for quantiles column name - self.quantilesCol = Param(self, "quantilesCol", - "quantiles column name. 
This column will output quantiles of " + - "corresponding quantileProbabilities if it is set.") + self.quantilesCol = AFTSurvivalRegression.quantilesCol._copy_new_parent(self) self._setDefault(censorCol="censor", quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]) kwargs = self.__init__._input_kwargs diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 705ee53685752..3984668b2d889 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -135,16 +135,14 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF """ super(CrossValidator, self).__init__() #: param for estimator to be cross-validated - self.estimator = Param(self, "estimator", "estimator to be cross-validated") + self.estimator = CrossValidator.estimator._copy_new_parent(self) #: param for estimator param maps - self.estimatorParamMaps = Param(self, "estimatorParamMaps", "estimator param maps") + self.estimatorParamMaps = CrossValidator.estimatorParamMaps._copy_new_parent(self) #: param for the evaluator used to select hyper-parameters that #: maximize the cross-validated metric - self.evaluator = Param( - self, "evaluator", - "evaluator used to select hyper-parameters that maximize the cross-validated metric") + self.evaluator = CrossValidator.evaluator._copy_new_parent(self) #: param for number of folds for cross validation - self.numFolds = Param(self, "numFolds", "number of folds for cross validation") + self.numFolds = CrossValidator.numFolds._copy_new_parent(self) self._setDefault(numFolds=3) kwargs = self.__init__._input_kwargs self._set(**kwargs) From a976b78969fb5392df82c8b486126606857ac3da Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 8 Dec 2015 20:34:08 -0800 Subject: [PATCH 03/12] Add an explicit test for the only copying new parent --- python/pyspark/ml/tests.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 7a16cf52cccb2..a78e0583bf22a 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -163,6 +163,14 @@ def setParams(self, seed=None): class ParamTests(PySparkTestCase): + def test_copy_new_parent(self): + testParams = TestParams() + # Copying an instantiated param should fail + with self.assertRaises(ValueError): + testParams.maxIter._copy_new_parent(testParams) + # Copying a dummy param should succeed + TestParams.maxIter._copy_new_parent(testParams) + def test_param(self): testParams = TestParams() maxIter = testParams.maxIter From 79642e09a85de3a393107c83ca8bf1b2bf89afb8 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 8 Jan 2016 15:02:31 -0800 Subject: [PATCH 04/12] Remove the comments about place holders since now we use them on copy --- python/pyspark/ml/classification.py | 5 ----- python/pyspark/ml/clustering.py | 1 - python/pyspark/ml/evaluation.py | 2 -- 3 files changed, 8 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 814d673fd9213..24e66b14b24c9 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -72,7 +72,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti .. versionadded:: 1.3.0 """ - # a placeholder to make it appear in the generated doc threshold = Param(Params._dummy(), "threshold", "Threshold in binary classification prediction, in range [0, 1]." 
+ " If threshold and thresholds are both set, they must match.") @@ -230,7 +229,6 @@ class TreeClassifierParams(object): """ supportedImpurities = ["entropy", "gini"] - # a placeholder to make it appear in the generated doc impurity = Param(Params._dummy(), "impurity", "Criterion used for information gain calculation (case-insensitive). " + "Supported options: " + @@ -478,7 +476,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc lossType = Param(Params._dummy(), "lossType", "Loss function which GBT tries to minimize (case-insensitive). " + "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) @@ -588,7 +585,6 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " + "default is 1.0") modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + @@ -722,7 +718,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " + "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " + "neurons and output layer of 10 neurons, default is [1, 1].") diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 35bac03780ddb..039e735b1b55a 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -63,7 +63,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc k = Param(Params._dummy(), "k", "number of clusters to create") initMode = Param(Params._dummy(), "initMode", "the initialization algorithm. This can be either \"random\" to " + diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 999384faf9c96..d1d4e652d58c9 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -123,7 +123,6 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc metricName = Param(Params._dummy(), "metricName", "metric name in evaluation (areaUnderROC|areaUnderPR)") @@ -262,7 +261,6 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio .. 
versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc metricName = Param(Params._dummy(), "metricName", "metric name in evaluation " "(f1|precision|recall|weightedPrecision|weightedRecall)") From e0f3f00d761b0b53860dd0f06de320c9fdc84958 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Fri, 8 Jan 2016 15:11:10 -0800 Subject: [PATCH 05/12] Add a test that the name/doc/parent got set correctly --- python/pyspark/ml/tests.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a78e0583bf22a..4b1a5078a79b3 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -170,6 +170,10 @@ def test_copy_new_parent(self): testParams.maxIter._copy_new_parent(testParams) # Copying a dummy param should succeed TestParams.maxIter._copy_new_parent(testParams) + maxIter = testParams.maxIter + self.assertEqual(maxIter.name, "maxIter") + self.assertEqual(maxIter.doc, "max number of iterations (>= 0).") + self.assertTrue(maxIter.parent == testParams.uid) def test_param(self): testParams = TestParams() From 53edd3d23c2c746171e0403b9f238646a849ec19 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 25 Jan 2016 13:11:39 -0800 Subject: [PATCH 06/12] Remove a whole bunch of no longer relevant a placeholder... statements and update a new param to also use the new copy syntax instead of duplicated text --- python/pyspark/ml/feature.py | 41 +++-------------------------- python/pyspark/ml/pipeline.py | 1 - python/pyspark/ml/recommendation.py | 1 - python/pyspark/ml/regression.py | 6 ----- python/pyspark/ml/tuning.py | 7 ----- 5 files changed, 4 insertions(+), 52 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 1027654947c42..20428ce2d1678 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -56,7 +56,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc threshold = Param(Params._dummy(), "threshold", "threshold in binary classification prediction, in range [0, 1]") @@ -123,7 +122,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.3.0 """ - # a placeholder to make it appear in the generated doc splits = \ Param(Params._dummy(), "splits", "Split points for mapping continuous features into buckets. With n+1 splits, " + @@ -201,7 +199,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc minTF = Param( Params._dummy(), "minTF", "Filter to ignore rare words in" + " a document. For each document, terms with frequency/count less than the given" + @@ -226,22 +223,9 @@ def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outpu super(CountVectorizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer", self.uid) - self.minTF = Param( - self, "minTF", "Filter to ignore rare words in" + - " a document. For each document, terms with frequency/count less than the given" + - " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" + - " times the term must appear in the document); if this is a double in [0,1), then " + - "this specifies a fraction (out of the document's token count). Note that the " + - "parameter is only used in transform of CountVectorizerModel and does not affect" + - "fitting. 
Default 1.0") - self.minDF = Param( - self, "minDF", "Specifies the minimum number of" + - " different documents a term must appear in to be included in the vocabulary." + - " If this is an integer >= 1, this specifies the number of documents the term must" + - " appear in; if this is a double in [0,1), then this specifies the fraction of " + - "documents. Default 1.0") - self.vocabSize = Param( - self, "vocabSize", "max size of the vocabulary. Default 1 << 18.") + self.minTF = CountVectorizer.minTF._copy_new_parent(self) + self.minDF = CountVectorizer.minDF._copy_new_parent(self) + self.vocabSize = CountVectorizer.vocabSize._copy_new_parent(self) self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -350,7 +334,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + "default False.") @@ -413,7 +396,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " + "it must be MLlib Vector type.") @@ -520,7 +502,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc minDocFreq = Param(Params._dummy(), "minDocFreq", "minimum of documents in which a term should appear for filtering") @@ -611,7 +592,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc min = Param(Params._dummy(), "min", "Lower bound of the output feature range") max = Param(Params._dummy(), "max", "Upper bound of the output feature range") @@ -733,7 +713,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)") @keyword_only @@ -796,7 +775,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc p = Param(Params._dummy(), "p", "the p norm value.") @keyword_only @@ -875,7 +853,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category") @keyword_only @@ -938,7 +915,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)") @keyword_only @@ -1014,7 +990,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)") gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens") pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing") @@ -1129,7 +1104,6 @@ class SQLTransformer(JavaTransformer): .. 
versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc statement = Param(Params._dummy(), "statement", "SQL statement") @keyword_only @@ -1191,7 +1165,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc withMean = Param(Params._dummy(), "withMean", "Center data with mean") withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation") @@ -1359,7 +1332,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make the labels show up in generated doc labels = Param(Params._dummy(), "labels", "Optional array of labels specifying index-string mapping." + " If not provided or if empty, then metadata from inputCol is used instead.") @@ -1411,7 +1383,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make the stopwords show up in generated doc + stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out") caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + "comparison over the stop words") @@ -1630,7 +1602,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc maxCategories = Param(Params._dummy(), "maxCategories", "Threshold for the number of values a categorical feature can take " + "(>= 2). If a feature is found to have > maxCategories values, then " + @@ -1732,7 +1703,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc indices = Param(Params._dummy(), "indices", "An array of indices to select features from " + "a vector column. There can be no overlap with names.") names = Param(Params._dummy(), "names", "An array of feature names to select features from " + @@ -1828,7 +1798,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc vectorSize = Param(Params._dummy(), "vectorSize", "the dimension of codes after transforming from words") numPartitions = Param(Params._dummy(), "numPartitions", @@ -1965,7 +1934,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc k = Param(Params._dummy(), "k", "the number of principal components") @keyword_only @@ -2066,7 +2034,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol): .. versionadded:: 1.5.0 """ - # a placeholder to make it appear in the generated doc formula = Param(Params._dummy(), "formula", "R model formula") @keyword_only diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 42a50505d3512..54e59e3733227 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -149,7 +149,6 @@ class Pipeline(Estimator): .. 
versionadded:: 1.3.0 """ - # a placeholder to make it appear in the generated doc stages = Param(Params._dummy(), "stages", "pipeline stages") @keyword_only diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index e19073b9e75e3..f9febad3f3935 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -85,7 +85,6 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc rank = Param(Params._dummy(), "rank", "rank of the factorization") numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user blocks") numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item blocks") diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 55a9596ca9bfd..d0fb065146a70 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -162,7 +162,6 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti DenseVector([0.0, 1.0]) """ - # a placeholder to make it appear in the generated doc isotonic = \ Param(Params._dummy(), "isotonic", "whether the output sequence should be isotonic/increasing (true) or" + @@ -256,7 +255,6 @@ class TreeEnsembleParams(DecisionTreeParams): Mixin for Decision Tree-based ensemble algorithms parameters. """ - # a placeholder to make it appear in the generated doc subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " + "used for learning each decision tree, in range (0, 1].") @@ -287,7 +285,6 @@ class TreeRegressorParams(Params): """ supportedImpurities = ["variance"] - # a placeholder to make it appear in the generated doc impurity = Param(Params._dummy(), "impurity", "Criterion used for information gain calculation (case-insensitive). " + "Supported options: " + @@ -320,7 +317,6 @@ class RandomForestParams(TreeEnsembleParams): """ supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"] - # a placeholder to make it appear in the generated doc numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).") featureSubsetStrategy = \ Param(Params._dummy(), "featureSubsetStrategy", @@ -597,7 +593,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, .. versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc lossType = Param(Params._dummy(), "lossType", "Loss function which GBT tries to minimize (case-insensitive). " + "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) @@ -699,7 +694,6 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi .. versionadded:: 1.6.0 """ - # a placeholder to make it appear in the generated doc censorCol = Param(Params._dummy(), "censorCol", "censor column name. The value of this column could be 0 or 1. " + "If the value is 1, it means the event has occurred i.e. " + diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 4158131fdb1cd..900f785c6cebb 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -115,18 +115,11 @@ class CrossValidator(Estimator, HasSeed): .. 
versionadded:: 1.4.0 """ - # a placeholder to make it appear in the generated doc estimator = Param(Params._dummy(), "estimator", "estimator to be cross-validated") - - # a placeholder to make it appear in the generated doc estimatorParamMaps = Param(Params._dummy(), "estimatorParamMaps", "estimator param maps") - - # a placeholder to make it appear in the generated doc evaluator = Param( Params._dummy(), "evaluator", "evaluator used to select hyper-parameters that maximize the cross-validated metric") - - # a placeholder to make it appear in the generated doc numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation") @keyword_only From 69025f188094e1da64417a9bb758a884ab15f5ba Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 26 Jan 2016 10:52:41 -0800 Subject: [PATCH 07/12] Copy the params in the init on param base instead --- python/pyspark/ml/param/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index abca63999e04c..ba1b8a185cc87 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -86,6 +86,18 @@ def __init__(self): #: value returned by :py:func:`params` self._params = None + self._copy_params() + + def _copy_params(self): + """ + Copy all params defined on the class to current object. + """ + cls = self.__class__ + src_params = list(filter(lambda attr: isinstance(attr, Param), + [getattr(cls, x) for x in dir(cls)])) + for param in src_params: + param._copy_new_parent(self) + @property @since("1.3.0") def params(self): From a39bea52aab0a258f1d99530b17623647482131f Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 26 Jan 2016 10:56:04 -0800 Subject: [PATCH 08/12] Update shared param codegen to use the param copy logic as well --- .../ml/param/_shared_params_code_gen.py | 16 ++-- python/pyspark/ml/param/shared.py | 80 ------------------- 2 files changed, 5 insertions(+), 91 deletions(-) diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 82855bc4c75ba..5d0183f183ef0 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -50,13 +50,11 @@ def _gen_param_header(name, doc, defaultValueStr, expectedType): Mixin for param $name: $doc """ - # a placeholder to make it appear in the generated doc $name = Param(Params._dummy(), "$name", "$doc", $expectedType) def __init__(self): - super(Has$Name, self).__init__() - #: param for $doc - self.$name = Param(self, "$name", "$doc", $expectedType)''' + super(Has$Name, self).__init__()''' + if defaultValueStr is not None: template += ''' self._setDefault($name=$defaultValueStr)''' @@ -171,12 +169,10 @@ def get$Name(self): Mixin for Decision Tree parameters. """ - # a placeholder to make it appear in the generated doc $dummyPlaceHolders def __init__(self): - super(DecisionTreeParams, self).__init__() - $realParams''' + super(DecisionTreeParams, self).__init__()''' dtParamMethods = "" dummyPlaceholders = "" realParams = "" @@ -184,9 +180,7 @@ def __init__(self): for name, doc in decisionTreeParams: variable = paramTemplate.replace("$name", name).replace("$doc", doc) dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n " - realParams += "#: param for " + doc + "\n " - realParams += "self." 
+ variable.replace("$owner", "self") + "\n " dtParamMethods += _gen_param_code(name, doc, None) + "\n" - code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) - .replace("$realParams", realParams) + dtParamMethods) + code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + + dtParamMethods) print("\n\n\n".join(code)) diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 23f94314844f6..b74214255156b 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -25,13 +25,10 @@ class HasMaxIter(Params): Mixin for param maxIter: max number of iterations (>= 0). """ - # a placeholder to make it appear in the generated doc maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 0).", int) def __init__(self): super(HasMaxIter, self).__init__() - #: param for max number of iterations (>= 0). - self.maxIter = Param(self, "maxIter", "max number of iterations (>= 0).", int) def setMaxIter(self, value): """ @@ -52,13 +49,10 @@ class HasRegParam(Params): Mixin for param regParam: regularization parameter (>= 0). """ - # a placeholder to make it appear in the generated doc regParam = Param(Params._dummy(), "regParam", "regularization parameter (>= 0).", float) def __init__(self): super(HasRegParam, self).__init__() - #: param for regularization parameter (>= 0). - self.regParam = Param(self, "regParam", "regularization parameter (>= 0).", float) def setRegParam(self, value): """ @@ -79,13 +73,10 @@ class HasFeaturesCol(Params): Mixin for param featuresCol: features column name. """ - # a placeholder to make it appear in the generated doc featuresCol = Param(Params._dummy(), "featuresCol", "features column name.", str) def __init__(self): super(HasFeaturesCol, self).__init__() - #: param for features column name. - self.featuresCol = Param(self, "featuresCol", "features column name.", str) self._setDefault(featuresCol='features') def setFeaturesCol(self, value): @@ -107,13 +98,10 @@ class HasLabelCol(Params): Mixin for param labelCol: label column name. """ - # a placeholder to make it appear in the generated doc labelCol = Param(Params._dummy(), "labelCol", "label column name.", str) def __init__(self): super(HasLabelCol, self).__init__() - #: param for label column name. - self.labelCol = Param(self, "labelCol", "label column name.", str) self._setDefault(labelCol='label') def setLabelCol(self, value): @@ -135,13 +123,10 @@ class HasPredictionCol(Params): Mixin for param predictionCol: prediction column name. """ - # a placeholder to make it appear in the generated doc predictionCol = Param(Params._dummy(), "predictionCol", "prediction column name.", str) def __init__(self): super(HasPredictionCol, self).__init__() - #: param for prediction column name. - self.predictionCol = Param(self, "predictionCol", "prediction column name.", str) self._setDefault(predictionCol='prediction') def setPredictionCol(self, value): @@ -163,13 +148,10 @@ class HasProbabilityCol(Params): Mixin for param probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. """ - # a placeholder to make it appear in the generated doc probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! 
These probabilities should be treated as confidences, not precise probabilities.", str) def __init__(self): super(HasProbabilityCol, self).__init__() - #: param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. - self.probabilityCol = Param(self, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", str) self._setDefault(probabilityCol='probability') def setProbabilityCol(self, value): @@ -191,13 +173,10 @@ class HasRawPredictionCol(Params): Mixin for param rawPredictionCol: raw prediction (a.k.a. confidence) column name. """ - # a placeholder to make it appear in the generated doc rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", str) def __init__(self): super(HasRawPredictionCol, self).__init__() - #: param for raw prediction (a.k.a. confidence) column name. - self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", str) self._setDefault(rawPredictionCol='rawPrediction') def setRawPredictionCol(self, value): @@ -219,13 +198,10 @@ class HasInputCol(Params): Mixin for param inputCol: input column name. """ - # a placeholder to make it appear in the generated doc inputCol = Param(Params._dummy(), "inputCol", "input column name.", str) def __init__(self): super(HasInputCol, self).__init__() - #: param for input column name. - self.inputCol = Param(self, "inputCol", "input column name.", str) def setInputCol(self, value): """ @@ -246,13 +222,10 @@ class HasInputCols(Params): Mixin for param inputCols: input column names. """ - # a placeholder to make it appear in the generated doc inputCols = Param(Params._dummy(), "inputCols", "input column names.", None) def __init__(self): super(HasInputCols, self).__init__() - #: param for input column names. - self.inputCols = Param(self, "inputCols", "input column names.", None) def setInputCols(self, value): """ @@ -273,13 +246,10 @@ class HasOutputCol(Params): Mixin for param outputCol: output column name. """ - # a placeholder to make it appear in the generated doc outputCol = Param(Params._dummy(), "outputCol", "output column name.", str) def __init__(self): super(HasOutputCol, self).__init__() - #: param for output column name. - self.outputCol = Param(self, "outputCol", "output column name.", str) self._setDefault(outputCol=self.uid + '__output') def setOutputCol(self, value): @@ -301,13 +271,10 @@ class HasNumFeatures(Params): Mixin for param numFeatures: number of features. """ - # a placeholder to make it appear in the generated doc numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", int) def __init__(self): super(HasNumFeatures, self).__init__() - #: param for number of features. - self.numFeatures = Param(self, "numFeatures", "number of features.", int) def setNumFeatures(self, value): """ @@ -328,13 +295,10 @@ class HasCheckpointInterval(Params): Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. 
""" - # a placeholder to make it appear in the generated doc checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", int) def __init__(self): super(HasCheckpointInterval, self).__init__() - #: param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. - self.checkpointInterval = Param(self, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", int) def setCheckpointInterval(self, value): """ @@ -355,13 +319,10 @@ class HasSeed(Params): Mixin for param seed: random seed. """ - # a placeholder to make it appear in the generated doc seed = Param(Params._dummy(), "seed", "random seed.", int) def __init__(self): super(HasSeed, self).__init__() - #: param for random seed. - self.seed = Param(self, "seed", "random seed.", int) self._setDefault(seed=hash(type(self).__name__)) def setSeed(self, value): @@ -383,13 +344,10 @@ class HasTol(Params): Mixin for param tol: the convergence tolerance for iterative algorithms. """ - # a placeholder to make it appear in the generated doc tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", float) def __init__(self): super(HasTol, self).__init__() - #: param for the convergence tolerance for iterative algorithms. - self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms.", float) def setTol(self, value): """ @@ -410,13 +368,10 @@ class HasStepSize(Params): Mixin for param stepSize: Step size to be used for each iteration of optimization. """ - # a placeholder to make it appear in the generated doc stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for each iteration of optimization.", float) def __init__(self): super(HasStepSize, self).__init__() - #: param for Step size to be used for each iteration of optimization. - self.stepSize = Param(self, "stepSize", "Step size to be used for each iteration of optimization.", float) def setStepSize(self, value): """ @@ -437,13 +392,10 @@ class HasHandleInvalid(Params): Mixin for param handleInvalid: how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later. """ - # a placeholder to make it appear in the generated doc handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", str) def __init__(self): super(HasHandleInvalid, self).__init__() - #: param for how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later. - self.handleInvalid = Param(self, "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", str) def setHandleInvalid(self, value): """ @@ -464,13 +416,10 @@ class HasElasticNetParam(Params): Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. 
""" - # a placeholder to make it appear in the generated doc elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", float) def __init__(self): super(HasElasticNetParam, self).__init__() - #: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. - self.elasticNetParam = Param(self, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", float) self._setDefault(elasticNetParam=0.0) def setElasticNetParam(self, value): @@ -492,13 +441,10 @@ class HasFitIntercept(Params): Mixin for param fitIntercept: whether to fit an intercept term. """ - # a placeholder to make it appear in the generated doc fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.", bool) def __init__(self): super(HasFitIntercept, self).__init__() - #: param for whether to fit an intercept term. - self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.", bool) self._setDefault(fitIntercept=True) def setFitIntercept(self, value): @@ -520,13 +466,10 @@ class HasStandardization(Params): Mixin for param standardization: whether to standardize the training features before fitting the model. """ - # a placeholder to make it appear in the generated doc standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.", bool) def __init__(self): super(HasStandardization, self).__init__() - #: param for whether to standardize the training features before fitting the model. - self.standardization = Param(self, "standardization", "whether to standardize the training features before fitting the model.", bool) self._setDefault(standardization=True) def setStandardization(self, value): @@ -548,13 +491,10 @@ class HasThresholds(Params): Mixin for param thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold. """ - # a placeholder to make it appear in the generated doc thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", None) def __init__(self): super(HasThresholds, self).__init__() - #: param for Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold. - self.thresholds = Param(self, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. 
The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", None) def setThresholds(self, value): """ @@ -575,13 +515,10 @@ class HasWeightCol(Params): Mixin for param weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. """ - # a placeholder to make it appear in the generated doc weightCol = Param(Params._dummy(), "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", str) def __init__(self): super(HasWeightCol, self).__init__() - #: param for weight column name. If this is not set or empty, we treat all instance weights as 1.0. - self.weightCol = Param(self, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", str) def setWeightCol(self, value): """ @@ -602,13 +539,10 @@ class HasSolver(Params): Mixin for param solver: the solver algorithm for optimization. If this is not set or empty, default value is 'auto'. """ - # a placeholder to make it appear in the generated doc solver = Param(Params._dummy(), "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", str) def __init__(self): super(HasSolver, self).__init__() - #: param for the solver algorithm for optimization. If this is not set or empty, default value is 'auto'. - self.solver = Param(self, "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", str) self._setDefault(solver='auto') def setSolver(self, value): @@ -630,7 +564,6 @@ class DecisionTreeParams(Params): Mixin for Decision Tree parameters. """ - # a placeholder to make it appear in the generated doc maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.") maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.") minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.") @@ -641,19 +574,6 @@ class DecisionTreeParams(Params): def __init__(self): super(DecisionTreeParams, self).__init__() - #: param for Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. - self.maxDepth = Param(self, "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.") - #: param for Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature. - self.maxBins = Param(self, "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.") - #: param for Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1. - self.minInstancesPerNode = Param(self, "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. 
Should be >= 1.") - #: param for Minimum information gain for a split to be considered at a tree node. - self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.") - #: param for Maximum memory in MB allocated to histogram aggregation. - self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.") - #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. - self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.") - def setMaxDepth(self, value): """ Sets the value of :py:attr:`maxDepth`. From 10ed8da017ef3a6439982c82fff2f64cfb6e8431 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 26 Jan 2016 11:16:01 -0800 Subject: [PATCH 09/12] Explicitly call _copy_params in the child object so the class resolves correctly --- python/pyspark/ml/param/__init__.py | 2 -- .../ml/param/_shared_params_code_gen.py | 6 +++-- python/pyspark/ml/param/shared.py | 23 +++++++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index ba1b8a185cc87..c0c7863fde518 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -86,8 +86,6 @@ def __init__(self): #: value returned by :py:func:`params` self._params = None - self._copy_params() - def _copy_params(self): """ Copy all params defined on the class to current object. 
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 5d0183f183ef0..0648bcc82fdea 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -53,7 +53,8 @@ def _gen_param_header(name, doc, defaultValueStr, expectedType): $name = Param(Params._dummy(), "$name", "$doc", $expectedType) def __init__(self): - super(Has$Name, self).__init__()''' + super(Has$Name, self).__init__() + self._copy_params()''' if defaultValueStr is not None: template += ''' @@ -172,7 +173,8 @@ def get$Name(self): $dummyPlaceHolders def __init__(self): - super(DecisionTreeParams, self).__init__()''' + super(DecisionTreeParams, self).__init__() + self._copy_params()''' dtParamMethods = "" dummyPlaceholders = "" realParams = "" diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index b74214255156b..039e3b49aeecd 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -29,6 +29,7 @@ class HasMaxIter(Params): def __init__(self): super(HasMaxIter, self).__init__() + self._copy_params() def setMaxIter(self, value): """ @@ -53,6 +54,7 @@ class HasRegParam(Params): def __init__(self): super(HasRegParam, self).__init__() + self._copy_params() def setRegParam(self, value): """ @@ -77,6 +79,7 @@ class HasFeaturesCol(Params): def __init__(self): super(HasFeaturesCol, self).__init__() + self._copy_params() self._setDefault(featuresCol='features') def setFeaturesCol(self, value): @@ -102,6 +105,7 @@ class HasLabelCol(Params): def __init__(self): super(HasLabelCol, self).__init__() + self._copy_params() self._setDefault(labelCol='label') def setLabelCol(self, value): @@ -127,6 +131,7 @@ class HasPredictionCol(Params): def __init__(self): super(HasPredictionCol, self).__init__() + self._copy_params() self._setDefault(predictionCol='prediction') def setPredictionCol(self, value): @@ -152,6 +157,7 @@ class HasProbabilityCol(Params): def __init__(self): super(HasProbabilityCol, self).__init__() + self._copy_params() self._setDefault(probabilityCol='probability') def setProbabilityCol(self, value): @@ -177,6 +183,7 @@ class HasRawPredictionCol(Params): def __init__(self): super(HasRawPredictionCol, self).__init__() + self._copy_params() self._setDefault(rawPredictionCol='rawPrediction') def setRawPredictionCol(self, value): @@ -202,6 +209,7 @@ class HasInputCol(Params): def __init__(self): super(HasInputCol, self).__init__() + self._copy_params() def setInputCol(self, value): """ @@ -226,6 +234,7 @@ class HasInputCols(Params): def __init__(self): super(HasInputCols, self).__init__() + self._copy_params() def setInputCols(self, value): """ @@ -250,6 +259,7 @@ class HasOutputCol(Params): def __init__(self): super(HasOutputCol, self).__init__() + self._copy_params() self._setDefault(outputCol=self.uid + '__output') def setOutputCol(self, value): @@ -275,6 +285,7 @@ class HasNumFeatures(Params): def __init__(self): super(HasNumFeatures, self).__init__() + self._copy_params() def setNumFeatures(self, value): """ @@ -299,6 +310,7 @@ class HasCheckpointInterval(Params): def __init__(self): super(HasCheckpointInterval, self).__init__() + self._copy_params() def setCheckpointInterval(self, value): """ @@ -323,6 +335,7 @@ class HasSeed(Params): def __init__(self): super(HasSeed, self).__init__() + self._copy_params() self._setDefault(seed=hash(type(self).__name__)) def setSeed(self, value): @@ -348,6 +361,7 @@ class HasTol(Params): def __init__(self): 
super(HasTol, self).__init__() + self._copy_params() def setTol(self, value): """ @@ -372,6 +386,7 @@ class HasStepSize(Params): def __init__(self): super(HasStepSize, self).__init__() + self._copy_params() def setStepSize(self, value): """ @@ -396,6 +411,7 @@ class HasHandleInvalid(Params): def __init__(self): super(HasHandleInvalid, self).__init__() + self._copy_params() def setHandleInvalid(self, value): """ @@ -420,6 +436,7 @@ class HasElasticNetParam(Params): def __init__(self): super(HasElasticNetParam, self).__init__() + self._copy_params() self._setDefault(elasticNetParam=0.0) def setElasticNetParam(self, value): @@ -445,6 +462,7 @@ class HasFitIntercept(Params): def __init__(self): super(HasFitIntercept, self).__init__() + self._copy_params() self._setDefault(fitIntercept=True) def setFitIntercept(self, value): @@ -470,6 +488,7 @@ class HasStandardization(Params): def __init__(self): super(HasStandardization, self).__init__() + self._copy_params() self._setDefault(standardization=True) def setStandardization(self, value): @@ -495,6 +514,7 @@ class HasThresholds(Params): def __init__(self): super(HasThresholds, self).__init__() + self._copy_params() def setThresholds(self, value): """ @@ -519,6 +539,7 @@ class HasWeightCol(Params): def __init__(self): super(HasWeightCol, self).__init__() + self._copy_params() def setWeightCol(self, value): """ @@ -543,6 +564,7 @@ class HasSolver(Params): def __init__(self): super(HasSolver, self).__init__() + self._copy_params() self._setDefault(solver='auto') def setSolver(self, value): @@ -574,6 +596,7 @@ class DecisionTreeParams(Params): def __init__(self): super(DecisionTreeParams, self).__init__() + self._copy_params() def setMaxDepth(self, value): """ Sets the value of :py:attr:`maxDepth`. From b755008bbea72584f6ace648646d8859c33a4a8e Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 26 Jan 2016 12:18:08 -0800 Subject: [PATCH 10/12] wait that was unecessary --- python/pyspark/ml/classification.py | 12 ------ python/pyspark/ml/clustering.py | 3 -- python/pyspark/ml/evaluation.py | 6 --- python/pyspark/ml/feature.py | 37 ------------------- python/pyspark/ml/param/__init__.py | 13 ++++--- .../ml/param/_shared_params_code_gen.py | 6 +-- python/pyspark/ml/param/shared.py | 23 ------------ python/pyspark/ml/pipeline.py | 2 - python/pyspark/ml/recommendation.py | 9 ----- python/pyspark/ml/regression.py | 22 ----------- python/pyspark/ml/tuning.py | 9 ----- 11 files changed, 10 insertions(+), 132 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b6fa750c54fe7..3179fb30ab4d7 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -91,8 +91,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(LogisticRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.LogisticRegression", self.uid) - #: param for threshold in binary classification, in range [0, 1]. - self.threshold = LogisticRegression.threshold._copy_new_parent(self) self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -236,8 +234,6 @@ class TreeClassifierParams(object): def __init__(self): super(TreeClassifierParams, self).__init__() - #: param for Criterion used for information gain calculation (case-insensitive). 
- self.impurity = TreeClassifierParams.impurity._copy_new_parent(self) @since("1.6.0") def setImpurity(self, value): @@ -497,8 +493,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.GBTClassifier", self.uid) - #: param for Loss function which GBT tries to minimize (case-insensitive). - self.lossType = GBTClassifier.lossType._copy_new_parent(self) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", maxIter=20, stepSize=0.1) @@ -605,10 +599,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(NaiveBayes, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.NaiveBayes", self.uid) - #: param for the smoothing parameter. - self.smoothing = NaiveBayes.smoothing._copy_new_parent(self) - #: param for the model type. - self.modelType = NaiveBayes.modelType._copy_new_parent(self) self._setDefault(smoothing=1.0, modelType="multinomial") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -739,8 +729,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(MultilayerPerceptronClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) - self.layers = MultilayerPerceptronClassifier.layers._copy_new_parent(self) - self.blockSize = MultilayerPerceptronClassifier.blockSize._copy_new_parent(self) self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index d44b53c97110d..60d1c9aaec988 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -89,9 +89,6 @@ def __init__(self, featuresCol="features", predictionCol="prediction", k=2, """ super(KMeans, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid) - self.k = KMeans.k._copy_new_parent(self) - self.initMode = KMeans.initMode._copy_new_parent(self) - self.initSteps = KMeans.initSteps._copy_new_parent(self) self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 596f8f725856d..c9b95b3bf45d9 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -137,8 +137,6 @@ def __init__(self, rawPredictionCol="rawPrediction", labelCol="label", super(BinaryClassificationEvaluator, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid) - #: param for metric name in evaluation (areaUnderROC|areaUnderPR) - self.metricName = BinaryClassificationEvaluator.metricName._copy_new_parent(self) self._setDefault(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC") kwargs = self.__init__._input_kwargs @@ -208,8 +206,6 @@ def __init__(self, predictionCol="prediction", labelCol="label", super(RegressionEvaluator, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid) - #: param for metric name in evaluation (mse|rmse|r2|mae) - 
self.metricName = RegressionEvaluator.metricName._copy_new_parent(self) self._setDefault(predictionCol="prediction", labelCol="label", metricName="rmse") kwargs = self.__init__._input_kwargs @@ -276,8 +272,6 @@ def __init__(self, predictionCol="prediction", labelCol="label", super(MulticlassClassificationEvaluator, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid) - # param for metric name in evaluation (f1|precision|recall|weightedPrecision|weightedRecall) - self.metricName = MulticlassClassificationEvaluator.metricName._copy_new_parent(self) self._setDefault(predictionCol="prediction", labelCol="label", metricName="f1") kwargs = self.__init__._input_kwargs diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 20428ce2d1678..c65c870bce194 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -66,7 +66,6 @@ def __init__(self, threshold=0.0, inputCol=None, outputCol=None): """ super(Binarizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid) - self.threshold = Binarizer.threshold._copy_new_parent(self) self._setDefault(threshold=0.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -138,12 +137,6 @@ def __init__(self, splits=None, inputCol=None, outputCol=None): """ super(Bucketizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid) - #: param for Splitting points for mapping continuous features into buckets. With n+1 splits, - # there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) - # except the last bucket, which also includes y. The splits should be strictly increasing. - # Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, - # values outside the splits specified will be treated as errors. 
- self.splits = Bucketizer.splits._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -223,9 +216,6 @@ def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outpu super(CountVectorizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer", self.uid) - self.minTF = CountVectorizer.minTF._copy_new_parent(self) - self.minDF = CountVectorizer.minDF._copy_new_parent(self) - self.vocabSize = CountVectorizer.vocabSize._copy_new_parent(self) self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -344,7 +334,6 @@ def __init__(self, inverse=False, inputCol=None, outputCol=None): """ super(DCT, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid) - self.inverse = DCT.inverse._copy_new_parent(self) self._setDefault(inverse=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -407,7 +396,6 @@ def __init__(self, scalingVec=None, inputCol=None, outputCol=None): super(ElementwiseProduct, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct", self.uid) - self.scalingVec = ElementwiseProduct.scalingVec._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -512,7 +500,6 @@ def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): """ super(IDF, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid) - self.minDocFreq = IDF.minDocFreq._copy_new_parent(self) self._setDefault(minDocFreq=0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -602,8 +589,6 @@ def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): """ super(MinMaxScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid) - self.min = MinMaxScaler.min._copy_new_parent(self) - self.max = MinMaxScaler.max._copy_new_parent(self) self._setDefault(min=0.0, max=1.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -722,7 +707,6 @@ def __init__(self, n=2, inputCol=None, outputCol=None): """ super(NGram, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid) - self.n = NGram.n._copy_new_parent(self) self._setDefault(n=2) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -784,7 +768,6 @@ def __init__(self, p=2.0, inputCol=None, outputCol=None): """ super(Normalizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid) - self.p = Normalizer.p._copy_new_parent(self) self._setDefault(p=2.0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -862,7 +845,6 @@ def __init__(self, dropLast=True, inputCol=None, outputCol=None): """ super(OneHotEncoder, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid) - self.dropLast = OneHotEncoder.dropLast._copy_new_parent(self) self._setDefault(dropLast=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -925,7 +907,6 @@ def __init__(self, degree=2, inputCol=None, outputCol=None): super(PolynomialExpansion, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.feature.PolynomialExpansion", self.uid) - self.degree = PolynomialExpansion.degree._copy_new_parent(self) self._setDefault(degree=2) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ 
-1005,10 +986,6 @@ def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, """ super(RegexTokenizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid) - self.minTokenLength = RegexTokenizer.minTokenLength._copy_new_parent(self) - self.gaps = RegexTokenizer.gaps._copy_new_parent(self) - self.pattern = RegexTokenizer.pattern._copy_new_parent(self) - self.toLowercase = RegexTokenizer.toLowercase._copy_new_parent(self) self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1113,7 +1090,6 @@ def __init__(self, statement=None): """ super(SQLTransformer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid) - self.statement = SQLTransformer.statement._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1175,8 +1151,6 @@ def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): """ super(StandardScaler, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid) - self.withMean = StandardScaler.withMean._copy_new_parent(self) - self.withStd = StandardScaler.withStd._copy_new_parent(self) self._setDefault(withMean=False, withStd=True) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1344,7 +1318,6 @@ def __init__(self, inputCol=None, outputCol=None, labels=None): super(IndexToString, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", self.uid) - self.labels = IndexToString.labels._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1398,8 +1371,6 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - self.stopWords = StopWordsRemover.stopWords._copy_new_parent(self) - self.caseSensitive = StopWordsRemover.caseSensitive._copy_new_parent(self) stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords defaultStopWords = stopWordsObj.English() self._setDefault(stopWords=defaultStopWords) @@ -1614,7 +1585,6 @@ def __init__(self, maxCategories=20, inputCol=None, outputCol=None): """ super(VectorIndexer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid) - self.maxCategories = VectorIndexer.maxCategories._copy_new_parent(self) self._setDefault(maxCategories=20) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1717,8 +1687,6 @@ def __init__(self, inputCol=None, outputCol=None, indices=None, names=None): """ super(VectorSlicer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid) - self.indices = VectorSlicer.indices._copy_new_parent(self) - self.names = VectorSlicer.names._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1815,9 +1783,6 @@ def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) - self.vectorSize = Word2Vec.vectorSize._copy_new_parent(self) - self.numPartitions = Word2Vec.numPartitions._copy_new_parent(self) - self.minCount = Word2Vec.minCount._copy_new_parent(self) self._setDefault(vectorSize=100, 
minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None) kwargs = self.__init__._input_kwargs @@ -1943,7 +1908,6 @@ def __init__(self, k=None, inputCol=None, outputCol=None): """ super(PCA, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid) - self.k = PCA.k._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -2043,7 +2007,6 @@ def __init__(self, formula=None, featuresCol="features", labelCol="label"): """ super(RFormula, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid) - self.formula = RFormula.formula._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index c0c7863fde518..b5b3bb1564cb6 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -86,15 +86,18 @@ def __init__(self): #: value returned by :py:func:`params` self._params = None + print "setting up "+str(self) + self._copy_params() + def _copy_params(self): """ Copy all params defined on the class to current object. """ - cls = self.__class__ - src_params = list(filter(lambda attr: isinstance(attr, Param), - [getattr(cls, x) for x in dir(cls)])) - for param in src_params: - param._copy_new_parent(self) + cls = type(self) + src_name_attrs = [(x, getattr(cls, x)) for x in dir(cls)] + src_params = list(filter(lambda nameAttr: isinstance(nameAttr[1], Param), src_name_attrs)) + for name, param in src_params: + setattr(self, name, param._copy_new_parent(self)) @property @since("1.3.0") diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 0648bcc82fdea..5d0183f183ef0 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -53,8 +53,7 @@ def _gen_param_header(name, doc, defaultValueStr, expectedType): $name = Param(Params._dummy(), "$name", "$doc", $expectedType) def __init__(self): - super(Has$Name, self).__init__() - self._copy_params()''' + super(Has$Name, self).__init__()''' if defaultValueStr is not None: template += ''' @@ -173,8 +172,7 @@ def get$Name(self): $dummyPlaceHolders def __init__(self): - super(DecisionTreeParams, self).__init__() - self._copy_params()''' + super(DecisionTreeParams, self).__init__()''' dtParamMethods = "" dummyPlaceholders = "" realParams = "" diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 039e3b49aeecd..b74214255156b 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -29,7 +29,6 @@ class HasMaxIter(Params): def __init__(self): super(HasMaxIter, self).__init__() - self._copy_params() def setMaxIter(self, value): """ @@ -54,7 +53,6 @@ class HasRegParam(Params): def __init__(self): super(HasRegParam, self).__init__() - self._copy_params() def setRegParam(self, value): """ @@ -79,7 +77,6 @@ class HasFeaturesCol(Params): def __init__(self): super(HasFeaturesCol, self).__init__() - self._copy_params() self._setDefault(featuresCol='features') def setFeaturesCol(self, value): @@ -105,7 +102,6 @@ class HasLabelCol(Params): def __init__(self): super(HasLabelCol, self).__init__() - self._copy_params() self._setDefault(labelCol='label') def setLabelCol(self, value): @@ -131,7 +127,6 @@ class HasPredictionCol(Params): def __init__(self): super(HasPredictionCol, self).__init__() - self._copy_params() 
self._setDefault(predictionCol='prediction') def setPredictionCol(self, value): @@ -157,7 +152,6 @@ class HasProbabilityCol(Params): def __init__(self): super(HasProbabilityCol, self).__init__() - self._copy_params() self._setDefault(probabilityCol='probability') def setProbabilityCol(self, value): @@ -183,7 +177,6 @@ class HasRawPredictionCol(Params): def __init__(self): super(HasRawPredictionCol, self).__init__() - self._copy_params() self._setDefault(rawPredictionCol='rawPrediction') def setRawPredictionCol(self, value): @@ -209,7 +202,6 @@ class HasInputCol(Params): def __init__(self): super(HasInputCol, self).__init__() - self._copy_params() def setInputCol(self, value): """ @@ -234,7 +226,6 @@ class HasInputCols(Params): def __init__(self): super(HasInputCols, self).__init__() - self._copy_params() def setInputCols(self, value): """ @@ -259,7 +250,6 @@ class HasOutputCol(Params): def __init__(self): super(HasOutputCol, self).__init__() - self._copy_params() self._setDefault(outputCol=self.uid + '__output') def setOutputCol(self, value): @@ -285,7 +275,6 @@ class HasNumFeatures(Params): def __init__(self): super(HasNumFeatures, self).__init__() - self._copy_params() def setNumFeatures(self, value): """ @@ -310,7 +299,6 @@ class HasCheckpointInterval(Params): def __init__(self): super(HasCheckpointInterval, self).__init__() - self._copy_params() def setCheckpointInterval(self, value): """ @@ -335,7 +323,6 @@ class HasSeed(Params): def __init__(self): super(HasSeed, self).__init__() - self._copy_params() self._setDefault(seed=hash(type(self).__name__)) def setSeed(self, value): @@ -361,7 +348,6 @@ class HasTol(Params): def __init__(self): super(HasTol, self).__init__() - self._copy_params() def setTol(self, value): """ @@ -386,7 +372,6 @@ class HasStepSize(Params): def __init__(self): super(HasStepSize, self).__init__() - self._copy_params() def setStepSize(self, value): """ @@ -411,7 +396,6 @@ class HasHandleInvalid(Params): def __init__(self): super(HasHandleInvalid, self).__init__() - self._copy_params() def setHandleInvalid(self, value): """ @@ -436,7 +420,6 @@ class HasElasticNetParam(Params): def __init__(self): super(HasElasticNetParam, self).__init__() - self._copy_params() self._setDefault(elasticNetParam=0.0) def setElasticNetParam(self, value): @@ -462,7 +445,6 @@ class HasFitIntercept(Params): def __init__(self): super(HasFitIntercept, self).__init__() - self._copy_params() self._setDefault(fitIntercept=True) def setFitIntercept(self, value): @@ -488,7 +470,6 @@ class HasStandardization(Params): def __init__(self): super(HasStandardization, self).__init__() - self._copy_params() self._setDefault(standardization=True) def setStandardization(self, value): @@ -514,7 +495,6 @@ class HasThresholds(Params): def __init__(self): super(HasThresholds, self).__init__() - self._copy_params() def setThresholds(self, value): """ @@ -539,7 +519,6 @@ class HasWeightCol(Params): def __init__(self): super(HasWeightCol, self).__init__() - self._copy_params() def setWeightCol(self, value): """ @@ -564,7 +543,6 @@ class HasSolver(Params): def __init__(self): super(HasSolver, self).__init__() - self._copy_params() self._setDefault(solver='auto') def setSolver(self, value): @@ -596,7 +574,6 @@ class DecisionTreeParams(Params): def __init__(self): super(DecisionTreeParams, self).__init__() - self._copy_params() def setMaxDepth(self, value): """ Sets the value of :py:attr:`maxDepth`. 
diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 54e59e3733227..661074ca96212 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -159,8 +159,6 @@ def __init__(self, stages=None): if stages is None: stages = [] super(Pipeline, self).__init__() - #: Param for pipeline stages. - self.stages = Pipeline.stages._copy_new_parent(self) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index f9febad3f3935..08180a2f25eb9 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -107,15 +107,6 @@ def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemB """ super(ALS, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid) - self.rank = ALS.rank._copy_new_parent(self) - self.numUserBlocks = ALS.numUserBlocks._copy_new_parent(self) - self.numItemBlocks = ALS.numItemBlocks._copy_new_parent(self) - self.implicitPrefs = ALS.implicitPrefs._copy_new_parent(self) - self.alpha = ALS.alpha._copy_new_parent(self) - self.userCol = ALS.userCol._copy_new_parent(self) - self.itemCol = ALS.itemCol._copy_new_parent(self) - self.ratingCol = ALS.ratingCol._copy_new_parent(self) - self.nonnegative = ALS.nonnegative._copy_new_parent(self) self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None, ratingCol="rating", nonnegative=False, checkpointInterval=10) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index d0fb065146a70..ba0d182e2ac5a 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -180,8 +180,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(IsotonicRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.IsotonicRegression", self.uid) - self.isotonic = IsotonicRegression.isotonic._copy_new_parent(self) - self.featureIndex = IsotonicRegression.featureIndex._copy_new_parent(self) self._setDefault(isotonic=True, featureIndex=0) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -260,8 +258,6 @@ class TreeEnsembleParams(DecisionTreeParams): def __init__(self): super(TreeEnsembleParams, self).__init__() - #: param for Fraction of the training data, in range (0, 1]. - self.subsamplingRate = TreeEnsembleParams.subsamplingRate._copy_new_parent(self) @since("1.4.0") def setSubsamplingRate(self, value): @@ -292,8 +288,6 @@ class TreeRegressorParams(Params): def __init__(self): super(TreeRegressorParams, self).__init__() - #: param for Criterion used for information gain calculation (case-insensitive). - self.impurity = TreeRegressorParams.impurity._copy_new_parent(self) @since("1.4.0") def setImpurity(self, value): @@ -325,10 +319,6 @@ class RandomForestParams(TreeEnsembleParams): def __init__(self): super(RandomForestParams, self).__init__() - #: param for Number of trees to train (>= 1). - self.numTrees = RandomForestParams.numTrees._copy_new_parent(self) - #: param for The number of features to consider for splits at each tree node. 
- self.featureSubsetStrategy = RandomForestParams.featureSubsetStrategy._copy_new_parent(self) @since("1.4.0") def setNumTrees(self, value): @@ -610,11 +600,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) - #: param for Loss function which GBT tries to minimize (case-insensitive). - self.lossType = GBTRegressor.lossType._copy_new_parent(self) - self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -719,13 +704,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(AFTSurvivalRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.AFTSurvivalRegression", self.uid) - #: Param for censor column name - self.censorCol = AFTSurvivalRegression.censorCol._copy_new_parent(self) - #: Param for quantile probabilities array - self.quantileProbabilities = \ - AFTSurvivalRegression.quantileProbabilities._copy_new_parent(self) - #: Param for quantiles column name - self.quantilesCol = AFTSurvivalRegression.quantilesCol._copy_new_parent(self) self._setDefault(censorCol="censor", quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]) kwargs = self.__init__._input_kwargs diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 900f785c6cebb..0cbe97f1d839f 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -130,15 +130,6 @@ def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numF seed=None) """ super(CrossValidator, self).__init__() - #: param for estimator to be cross-validated - self.estimator = CrossValidator.estimator._copy_new_parent(self) - #: param for estimator param maps - self.estimatorParamMaps = CrossValidator.estimatorParamMaps._copy_new_parent(self) - #: param for the evaluator used to select hyper-parameters that - #: maximize the cross-validated metric - self.evaluator = CrossValidator.evaluator._copy_new_parent(self) - #: param for number of folds for cross validation - self.numFolds = CrossValidator.numFolds._copy_new_parent(self) self._setDefault(numFolds=3) kwargs = self.__init__._input_kwargs self._set(**kwargs) From 0d2892200700ae2821e4ee2468ea5b159798c612 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 26 Jan 2016 12:18:53 -0800 Subject: [PATCH 11/12] remove print and add a comment --- python/pyspark/ml/param/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index b5b3bb1564cb6..3da36d32c5af0 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -86,7 +86,7 @@ def __init__(self): #: value returned by :py:func:`params` self._params = None - print "setting up "+str(self) + # Copy the params from the class to the object self._copy_params() def _copy_params(self): From 8396aef996a2f7817e68a886b90253e46e3c7cd3 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 26 Jan 2016 15:01:44 -0800 Subject: [PATCH 12/12] Restore accidentatly removed setDefault, add back newline between init and setters in tree params codegen, regen shared params --- 
python/pyspark/ml/param/_shared_params_code_gen.py | 3 +-- python/pyspark/ml/param/shared.py | 1 + python/pyspark/ml/regression.py | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 5d0183f183ef0..5e297b8214823 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -175,12 +175,11 @@ def __init__(self): super(DecisionTreeParams, self).__init__()''' dtParamMethods = "" dummyPlaceholders = "" - realParams = "" paramTemplate = """$name = Param($owner, "$name", "$doc")""" for name, doc in decisionTreeParams: variable = paramTemplate.replace("$name", name).replace("$doc", doc) dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n " dtParamMethods += _gen_param_code(name, doc, None) + "\n" - code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + + code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" + dtParamMethods) print("\n\n\n".join(code)) diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index b74214255156b..db4a8a54d4956 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -574,6 +574,7 @@ class DecisionTreeParams(Params): def __init__(self): super(DecisionTreeParams, self).__init__() + def setMaxDepth(self, value): """ Sets the value of :py:attr:`maxDepth`. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index ba0d182e2ac5a..74a2248ed07c8 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -600,6 +600,9 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) + self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, + maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) kwargs = self.__init__._input_kwargs self.setParams(**kwargs)
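Taken together, the series ends up declaring each shared param once as a class-level attribute owned by a dummy parent, with Params.__init__ copying those declarations onto every instance through _copy_params() and Param._copy_new_parent(), instead of re-declaring each Param by hand in every subclass __init__. The standalone sketch below is a simplified re-implementation for illustration only: it mirrors the Param/Params mechanism exercised in these patches but does not import pyspark, and the uid handling is abbreviated.

import copy


class Param(object):
    """A param with self-contained documentation, owned by a Params instance."""

    def __init__(self, parent, name, doc):
        self.parent = parent.uid
        self.name = str(name)
        self.doc = str(doc)

    def _copy_new_parent(self, parent):
        """Copy this dummy-owned param and bind the copy to a concrete parent."""
        if self.parent == "undefined":
            param = copy.copy(self)
            param.parent = parent.uid
            return param
        raise ValueError("Cannot copy from non-dummy parent %s." % parent)


class Params(object):
    """Base class whose __init__ copies class-level dummy params onto the instance."""

    def __init__(self):
        # Abbreviated uid; the real classes derive this from Identifiable.
        self.uid = "%s_%x" % (type(self).__name__, id(self))
        # Copy the params from the class to the object.
        self._copy_params()

    @staticmethod
    def _dummy():
        """Stand-in owner used when declaring params at class level."""
        dummy = Params()
        dummy.uid = "undefined"
        return dummy

    def _copy_params(self):
        """Copy every Param declared on the class to the current object."""
        cls = type(self)
        src_name_attrs = [(x, getattr(cls, x)) for x in dir(cls)]
        src_params = [(n, a) for n, a in src_name_attrs if isinstance(a, Param)]
        for name, param in src_params:
            setattr(self, name, param._copy_new_parent(self))


class HasMaxIter(Params):
    # Class-level placeholder; each instance receives its own bound copy.
    maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 0).")


est = HasMaxIter()
assert HasMaxIter.maxIter.parent == "undefined"   # still the dummy placeholder
assert est.maxIter.parent == est.uid              # instance copy bound to this uid

The guard in _copy_new_parent refuses to copy a param that is already bound to a concrete owner, so each estimator instance ends up with params whose parent matches its own uid while the class-level attribute remains the documented placeholder.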