From d4706d57e3ce1e704d1484d36c8eaeef8f3c8d23 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 4 Feb 2016 14:58:19 -0800 Subject: [PATCH 01/10] python param subclasses and validation functions --- python/pyspark/ml/classification.py | 45 ++-- python/pyspark/ml/clustering.py | 16 +- python/pyspark/ml/feature.py | 97 +++---- python/pyspark/ml/param/__init__.py | 241 ++++++++++++++++-- .../ml/param/_shared_params_code_gen.py | 85 +++--- python/pyspark/ml/param/shared.py | 46 ++-- python/pyspark/ml/recommendation.py | 22 +- python/pyspark/ml/regression.py | 20 +- python/pyspark/ml/tuning.py | 5 +- 9 files changed, 404 insertions(+), 173 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 253af15cb5cd9..243e2d53172e9 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -20,6 +20,7 @@ from pyspark import since from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel +from pyspark.ml.param import * from pyspark.ml.param.shared import * from pyspark.ml.regression import ( RandomForestParams, TreeEnsembleParams, DecisionTreeModel, TreeEnsembleModels) @@ -73,9 +74,10 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti .. versionadded:: 1.3.0 """ - threshold = Param(Params._dummy(), "threshold", - "Threshold in binary classification prediction, in range [0, 1]." + - " If threshold and thresholds are both set, they must match.") + threshold = FloatParam(Params._dummy(), "threshold", + "Threshold in binary classification prediction, in range [0, 1]." + + " If threshold and thresholds are both set, they must match.", + ParamValidators.inRange(0, 1)) @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", @@ -228,10 +230,10 @@ class TreeClassifierParams(object): """ supportedImpurities = ["entropy", "gini"] - impurity = Param(Params._dummy(), "impurity", + impurity = StringParam(Params._dummy(), "impurity", "Criterion used for information gain calculation (case-insensitive). " + "Supported options: " + - ", ".join(supportedImpurities)) + ", ".join(supportedImpurities), ParamValidators.inList(supportedImpurities)) def __init__(self): super(TreeClassifierParams, self).__init__() @@ -476,9 +478,10 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol .. versionadded:: 1.4.0 """ - lossType = Param(Params._dummy(), "lossType", - "Loss function which GBT tries to minimize (case-insensitive). " + - "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) + lossType = StringParam(Params._dummy(), "lossType", + "Loss function which GBT tries to minimize (case-insensitive). " + + "Supported options: " + ", ".join(GBTParams.supportedLossTypes), + ParamValidators.inList(GBTParams.supportedLossTypes)) @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", @@ -583,10 +586,12 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H .. versionadded:: 1.5.0 """ - smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " + - "default is 1.0") - modelType = Param(Params._dummy(), "modelType", "The model type which is a string " + - "(case-sensitive). 
Supported options: multinomial (default) and bernoulli.") + supportedModelTypes = ['multinomial', 'bernoulli'] + smoothing = FloatParam(Params._dummy(), "smoothing", "The smoothing parameter, should be " + + ">= 0, default is 1.0", ParamValidators.gtEq(0)) + modelType = StringParam(Params._dummy(), "modelType", "The model type which is a string " + + "(case-sensitive). Supported options: multinomial (default) and " + + "bernoulli.", ParamValidators.inList(supportedModelTypes)) @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", @@ -712,13 +717,15 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, .. versionadded:: 1.6.0 """ - layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " + - "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " + - "neurons and output layer of 10 neurons, default is [1, 1].") - blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " + - "matrices. Data is stacked within partitions. If block size is more than " + - "remaining data in a partition then it is adjusted to the size of this " + - "data. Recommended size is between 10 and 1000, default is 128.") + layers = ListIntParam(Params._dummy(), "layers", "Sizes of layers from input layer to output " + + "layer E.g., Array(780, 100, 10) means 780 inputs, one hidden layer " + + "with 100 neurons and output layer of 10 neurons, default is [1, 1].", + ParamValidators.listLengthGt(1)) + blockSize = IntParam(Params._dummy(), "blockSize", "Block size for stacking input data in " + + "matrices. Data is stacked within partitions. If block size is more " + + "than remaining data in a partition then it is adjusted to the size of " + + "this data. Recommended size is between 10 and 1000, default is 128.", + ParamValidators.gt(0)) @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 1cea477acb47d..81f14589b1ff9 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -19,6 +19,7 @@ from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * +from pyspark.ml.param import * from pyspark.mllib.common import inherit_doc __all__ = ['BisectingKMeans', 'BisectingKMeansModel', @@ -87,12 +88,15 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol .. versionadded:: 1.5.0 """ - k = Param(Params._dummy(), "k", "number of clusters to create") - initMode = Param(Params._dummy(), "initMode", - "the initialization algorithm. This can be either \"random\" to " + - "choose random points as initial cluster centers, or \"k-means||\" " + - "to use a parallel variant of k-means++") - initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode") + initModes = ["random", "k-means||"] + k = IntParam(Params._dummy(), "k", "number of clusters to create", ParamValidators.gt(1)) + initMode = StringParam(Params._dummy(), "initMode", + "the initialization algorithm. 
This can be either \"random\" to " + + "choose random points as initial cluster centers, or \"k-means||\" " + + "to use a parallel variant of k-means++", + ParamValidators.inList(initModes)) + initSteps = IntParam(Params._dummy(), "initSteps", "steps for k-means initialization mode", + ParamValidators.gt(0)) @keyword_only def __init__(self, featuresCol="features", predictionCol="prediction", k=2, diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index fb31c7310c0a8..31fe39deba00c 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -22,6 +22,7 @@ from pyspark import since from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.param.shared import * +from pyspark.ml.param import ParamValidators from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm from pyspark.mllib.common import inherit_doc @@ -77,8 +78,8 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - threshold = Param(Params._dummy(), "threshold", - "threshold in binary classification prediction, in range [0, 1]") + threshold = FloatParam(Params._dummy(), "threshold", + "threshold in binary classification prediction") @keyword_only def __init__(self, threshold=0.0, inputCol=None, outputCol=None): @@ -141,9 +142,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.3.0 """ - + # TODO: add a checksplits validation function? splits = \ - Param(Params._dummy(), "splits", + ListFloatParam(Params._dummy(), "splits", "Split points for mapping continuous features into buckets. With n+1 splits, " + "there are n buckets. A bucket defined by splits x,y holds values in the " + "range [x,y) except the last bucket, which also includes y. The splits " + @@ -213,21 +214,23 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - minTF = Param( + minTF = IntParam( Params._dummy(), "minTF", "Filter to ignore rare words in" + " a document. For each document, terms with frequency/count less than the given" + " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" + " times the term must appear in the document); if this is a double in [0,1), then this " + "specifies a fraction (out of the document's token count). Note that the parameter is " + - "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0") - minDF = Param( + "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0", + ParamValidators.gtEq(0)) + minDF = IntParam( Params._dummy(), "minDF", "Specifies the minimum number of" + " different documents a term must appear in to be included in the vocabulary." + " If this is an integer >= 1, this specifies the number of documents the term must" + " appear in; if this is a double in [0,1), then this specifies the fraction of documents." + - " Default 1.0") - vocabSize = Param( - Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.") + " Default 1.0", ParamValidators.gtEq(0)) + vocabSize = IntParam( + Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.", + ParamValidators.gt(0)) @keyword_only def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None): @@ -345,7 +348,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol): .. 
versionadded:: 1.6.0 """ - inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + + inverse = BooleanParam(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + "default False.") @keyword_only @@ -406,7 +409,7 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " + + scalingVec = VectorParam(Params._dummy(), "scalingVec", "vector for hadamard product, " + "it must be MLlib Vector type.") @keyword_only @@ -511,7 +514,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - minDocFreq = Param(Params._dummy(), "minDocFreq", + minDocFreq = IntParam(Params._dummy(), "minDocFreq", "minimum of documents in which a term should appear for filtering") @keyword_only @@ -660,8 +663,8 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - min = Param(Params._dummy(), "min", "Lower bound of the output feature range") - max = Param(Params._dummy(), "max", "Upper bound of the output feature range") + min = IntParam(Params._dummy(), "min", "Lower bound of the output feature range") + max = IntParam(Params._dummy(), "max", "Upper bound of the output feature range") @keyword_only def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None): @@ -779,7 +782,8 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)") + n = IntParam(Params._dummy(), "n", "number of elements per n-gram (>=1)", + ParamValidators.gtEq(1)) @keyword_only def __init__(self, n=2, inputCol=None, outputCol=None): @@ -840,7 +844,7 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - p = Param(Params._dummy(), "p", "the p norm value.") + p = FloatParam(Params._dummy(), "p", "the p norm value.", ParamValidators.gtEq(1)) @keyword_only def __init__(self, p=2.0, inputCol=None, outputCol=None): @@ -917,7 +921,7 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category") + dropLast = BooleanParam(Params._dummy(), "dropLast", "whether to drop the last category") @keyword_only def __init__(self, dropLast=True, inputCol=None, outputCol=None): @@ -978,7 +982,8 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)") + degree = IntParam(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)", + ParamValidators.gtEq(1)) @keyword_only def __init__(self, degree=2, inputCol=None, outputCol=None): @@ -1048,9 +1053,10 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed): """ # a placeholder to make it appear in the generated doc - numBuckets = Param(Params._dummy(), "numBuckets", + numBuckets = IntParam(Params._dummy(), "numBuckets", "Maximum number of buckets (quantiles, or " + - "categories) into which data points are grouped. Must be >= 2. Default 2.") + "categories) into which data points are grouped. Must be >= 2. Default 2.", + ParamValidators.gtEq(2)) @keyword_only def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None): @@ -1135,11 +1141,14 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): .. 
versionadded:: 1.4.0 """ - minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)") - gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens") - pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing") - toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " + - "lowercase before tokenizing") + minTokenLength = IntParam(Params._dummy(), "minTokenLength", "minimum token length (>= 0)", + ParamValidators.gtEq(0)) + gaps = BooleanParam(Params._dummy(), "gaps", + "whether regex splits on gaps (True) or matches tokens") + pattern = StringParam(Params._dummy(), "pattern", + "regex pattern (Java dialect) used for tokenizing") + toLowercase = BooleanParam(Params._dummy(), "toLowercase", + "whether to convert all characters to lowercase before tokenizing") @keyword_only def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, @@ -1305,8 +1314,8 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - withMean = Param(Params._dummy(), "withMean", "Center data with mean") - withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation") + withMean = BooleanParam(Params._dummy(), "withMean", "Center data with mean") + withStd = BooleanParam(Params._dummy(), "withStd", "Scale to unit standard deviation") @keyword_only def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None): @@ -1471,7 +1480,7 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - labels = Param(Params._dummy(), "labels", + labels = ListStringParam(Params._dummy(), "labels", "Optional array of labels specifying index-string mapping." + " If not provided or if empty, then metadata from inputCol is used instead.") @@ -1522,9 +1531,9 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out") - caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + - "comparison over the stop words") + stopWords = ListStringParam(Params._dummy(), "stopWords", "The words to be filtered out") + caseSensitive = BooleanParam(Params._dummy(), "caseSensitive", + "whether to do a case sensitive comparison over the stop words") @keyword_only def __init__(self, inputCol=None, outputCol=None, stopWords=None, @@ -1738,10 +1747,10 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.4.0 """ - maxCategories = Param(Params._dummy(), "maxCategories", + maxCategories = IntParam(Params._dummy(), "maxCategories", "Threshold for the number of values a categorical feature can take " + "(>= 2). If a feature is found to have > maxCategories values, then " + - "it is declared continuous.") + "it is declared continuous.", ParamValidators.gtEq(2)) @keyword_only def __init__(self, maxCategories=20, inputCol=None, outputCol=None): @@ -1837,10 +1846,10 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ - - indices = Param(Params._dummy(), "indices", "An array of indices to select features from " + + # TODO: define custom isValid functions? + indices = ListIntParam(Params._dummy(), "indices", "An array of indices to select features from " + "a vector column. 
There can be no overlap with names.") - names = Param(Params._dummy(), "names", "An array of feature names to select features from " + + names = ListStringParam(Params._dummy(), "names", "An array of feature names to select features from " + "a vector column. These names must be specified by ML " + "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " + "indices.") @@ -1931,11 +1940,11 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has .. versionadded:: 1.4.0 """ - vectorSize = Param(Params._dummy(), "vectorSize", + vectorSize = IntParam(Params._dummy(), "vectorSize", "the dimension of codes after transforming from words") - numPartitions = Param(Params._dummy(), "numPartitions", + numPartitions = IntParam(Params._dummy(), "numPartitions", "number of partitions for sentences of words") - minCount = Param(Params._dummy(), "minCount", + minCount = IntParam(Params._dummy(), "minCount", "the minimum number of times a token must appear to be included in the " + "word2vec model's vocabulary") @@ -2066,7 +2075,7 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol): .. versionadded:: 1.5.0 """ - k = Param(Params._dummy(), "k", "the number of principal components") + k = IntParam(Params._dummy(), "k", "the number of principal components") @keyword_only def __init__(self, k=None, inputCol=None, outputCol=None): @@ -2174,7 +2183,7 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol): .. versionadded:: 1.5.0 """ - formula = Param(Params._dummy(), "formula", "R model formula") + formula = StringParam(Params._dummy(), "formula", "R model formula") @keyword_only def __init__(self, formula=None, featuresCol="features", labelCol="label"): @@ -2251,10 +2260,10 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): # a placeholder to make it appear in the generated doc numTopFeatures = \ - Param(Params._dummy(), "numTopFeatures", + IntParam(Params._dummy(), "numTopFeatures", "Number of features that selector will select, ordered by statistics value " + "descending. If the number of features is < numTopFeatures, then this will select " + - "all features.") + "all features.", ParamValidators.gtEq(1)) @keyword_only def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label"): diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index bbf83f0310dd7..539117b1856eb 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -17,12 +17,16 @@ from abc import ABCMeta import copy +import numpy as np +from pyspark.mllib.linalg import DenseVector, Vector from pyspark import since from pyspark.ml.util import Identifiable -__all__ = ['Param', 'Params'] +__all__ = ['IntParam', 'FloatParam', 'StringParam', 'BooleanParam', + 'VectorParam', 'ListIntParam', 'ListFloatParam', 'ListStringParam', + 'Param', 'Params', 'ParamValidators'] class Param(object): @@ -32,13 +36,14 @@ class Param(object): .. versionadded:: 1.3.0 """ - def __init__(self, parent, name, doc, expectedType=None): + def __init__(self, parent, name, doc, isValid=None): if not isinstance(parent, Identifiable): raise TypeError("Parent must be an Identifiable but got type %s." 
% type(parent)) self.parent = parent.uid self.name = str(name) self.doc = str(doc) - self.expectedType = expectedType + # self.expectedType = expectedType + self.isValid = isValid if isValid is not None else ParamValidators.alwaysTrue() def _copy_new_parent(self, parent): """Copy the current param to a new parent, must be a dummy param.""" @@ -49,6 +54,14 @@ def _copy_new_parent(self, parent): else: raise ValueError("Cannot copy from non-dummy parent %s." % parent) + def _validate(self, value): + if not self.isValid(value): + raise ValueError("Invalid value") + + def _convertAndValidate(self, value): + self._validate(value) + return value + def __str__(self): return str(self.parent) + "__" + self.name @@ -65,6 +78,192 @@ def __eq__(self, other): return False +class IntParam(Param): + """ + Specialized `Param` for integers. + + .. versionadded:: 2.0.0 + """ + + def _convertAndValidate(self, value): + value = ParamValidators.primitiveConvert(value, int) + self._validate(value) + return value + + +class FloatParam(Param): + """ + Specialized `Param` for floats. + + .. versionadded:: 2.0.0 + """ + + def _convertAndValidate(self, value): + value = ParamValidators.primitiveConvert(value, float) + self._validate(value) + return value + + +class StringParam(Param): + """ + Specialized `Param` for strings. + + .. versionadded:: 2.0.0 + """ + + def _convertAndValidate(self, value): + value = ParamValidators.primitiveConvert(value, str) + self._validate(value) + return value + + +class BooleanParam(Param): + """ + Specialized `Param` for Booleans. + + .. versionadded:: 2.0.0 + """ + + def _convertAndValidate(self, value): + value = ParamValidators.primitiveConvert(value, bool) + self._validate(value) + return value + + +class ListIntParam(Param): + """ + Specialized `Param` for lists of integers. + + .. versionadded:: 2.0.0 + """ + + def _convertAndValidate(self, value): + if type(value) != list: + value = ParamValidators.convertToList(value) + + if not all(map(lambda v: type(v) == int, value)): + try: + value = map(lambda v: int(v), value) + except ValueError: + raise TypeError("Could not convert %s to a list of integers" % value) + self._validate(value) + return value + + +class ListFloatParam(Param): + """ + Specialized `Param` for lists of floats. + + .. versionadded:: 2.0.0 + """ + + def _convertAndValidate(self, value): + if type(value) != list: + value = ParamValidators.convertToList(value) + + if not all(map(lambda v: type(v) == float, value)): + try: + value = map(lambda v: float(v), value) + except ValueError: + raise TypeError("Could not convert %s to a list of floats" % value) + self._validate(value) + return value + + +class ListStringParam(Param): + """ + Specialized `Param` for lists of strings. + + .. versionadded:: 2.0.0 + """ + + def _convertAndValidate(self, value): + if type(value) != list: + value = ParamValidators.convertToList(value) + + if not all(map(lambda v: type(v) == str, value)): + try: + value = map(lambda v: str(v), value) + except ValueError: + raise TypeError("Could not convert %s to a list of strings" % value) + self._validate(value) + return value + + +class VectorParam(Param): + """ + Specialized `Param` for Vector types. + + .. 
versionadded:: 2.0.0 + """ + + def _convertAndValidate(self, value): + if not isinstance(value, Vector): + try: + value = DenseVector(value) + except: + raise TypeError("Could not convert %s to a Vector" % value) + self._validate(value) + return value + + +class ParamValidators(object): + + @staticmethod + def alwaysTrue(): + return lambda value: True + + @staticmethod + def primitiveConvert(value, primitiveType): + if type(value) != primitiveType: + try: + value = primitiveType(value) + except ValueError: + raise TypeError("Could not convert %s to a %s" % (value, primitiveType.__name__)) + return value + + @staticmethod + def convertToList(value): + if type(value) == np.ndarray: + return list(value) + elif isinstance(value, Vector): + return value.toArray() + else: + raise TypeError("Could not convert %s to list" % value) + + @staticmethod + def gt(lowerBound): + return lambda value: value > lowerBound + + @staticmethod + def gtEq(lowerBound): + return lambda value: value >= lowerBound + + @staticmethod + def lt(lowerBound): + return lambda value: value < lowerBound + + @staticmethod + def ltEq(lowerBound): + return lambda value: value <= lowerBound + + @staticmethod + def inRange(lowerBound, upperBound, lowerInclusive=True, upperInclusive=True): + def inRangeFunction(x): + lowerValid = (x >= lowerBound) if lowerInclusive else (x > lowerBound) + upperValid = (x <= upperBound) if upperInclusive else (x < upperBound) + return lowerValid and upperValid + return inRangeFunction + + @staticmethod + def inList(allowed): + return lambda value: value in allowed + + @staticmethod + def listLengthGt(lowerBound): + return lambda lst: len(lst) > lowerBound + + class Params(Identifiable): """ Components that take parameters. This also provides an internal @@ -274,23 +473,25 @@ def _set(self, **kwargs): """ for param, value in kwargs.items(): p = getattr(self, param) - if p.expectedType is None or type(value) == p.expectedType or value is None: - self._paramMap[getattr(self, param)] = value - else: - try: - # Try and do "safe" conversions that don't lose information - if p.expectedType == float: - self._paramMap[getattr(self, param)] = float(value) - # Python 3 unified long & int - elif p.expectedType == int and type(value).__name__ == 'long': - self._paramMap[getattr(self, param)] = value - else: - raise Exception( - "Provided type {0} incompatible with type {1} for param {2}" - .format(type(value), p.expectedType, p)) - except ValueError: - raise Exception(("Failed to convert {0} to type {1} for param {2}" - .format(type(value), p.expectedType, p))) + value = p._convertAndValidate(value) + self._paramMap[getattr(self, param)] = value + # if p.expectedType is None or type(value) == p.expectedType or value is None: + # self._paramMap[getattr(self, param)] = value + # else: + # try: + # # Try and do "safe" conversions that don't lose information + # if p.expectedType == float: + # self._paramMap[getattr(self, param)] = float(value) + # # Python 3 unified long & int + # elif p.expectedType == int and type(value).__name__ == 'long': + # self._paramMap[getattr(self, param)] = value + # else: + # raise Exception( + # "Provided type {0} incompatible with type {1} for param {2}" + # .format(type(value), p.expectedType, p)) + # except ValueError: + # raise Exception(("Failed to convert {0} to type {1} for param {2}" + # .format(type(value), p.expectedType, p))) return self def _setDefault(self, **kwargs): diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py 
b/python/pyspark/ml/param/_shared_params_code_gen.py index 5e297b8214823..40ec4de91cfae 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -38,7 +38,7 @@ # python _shared_params_code_gen.py > shared.py -def _gen_param_header(name, doc, defaultValueStr, expectedType): +def _gen_param_header(paramTypeStr, name, doc, defaultValueStr, isValidFunctionStr): """ Generates the header part for shared variables @@ -50,7 +50,7 @@ def _gen_param_header(name, doc, defaultValueStr, expectedType): Mixin for param $name: $doc """ - $name = Param(Params._dummy(), "$name", "$doc", $expectedType) + $name = $paramType(Params._dummy(), "$name", "$doc", $isValid) def __init__(self): super(Has$Name, self).__init__()''' @@ -60,15 +60,18 @@ def __init__(self): self._setDefault($name=$defaultValueStr)''' Name = name[0].upper() + name[1:] - expectedTypeName = str(expectedType) - if expectedType is not None: - expectedTypeName = expectedType.__name__ + # expectedTypeName = str(expectedType) + # if expectedType is not None: + # expectedTypeName = expectedType.__name__ + if isValidFunctionStr is None: + isValidFunctionStr = str(None) return template \ + .replace("$paramType", paramTypeStr) \ .replace("$name", name) \ .replace("$Name", Name) \ .replace("$doc", doc) \ .replace("$defaultValueStr", str(defaultValueStr)) \ - .replace("$expectedType", expectedTypeName) + .replace("$isValid", isValidFunctionStr) def _gen_param_code(name, doc, defaultValueStr): @@ -105,47 +108,49 @@ def get$Name(self): if __name__ == "__main__": print(header) print("\n# DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.\n") - print("from pyspark.ml.param import Param, Params\n\n") + print("from pyspark.ml.param import *\n\n") shared = [ - ("maxIter", "max number of iterations (>= 0).", None, int), - ("regParam", "regularization parameter (>= 0).", None, float), - ("featuresCol", "features column name.", "'features'", str), - ("labelCol", "label column name.", "'label'", str), - ("predictionCol", "prediction column name.", "'prediction'", str), - ("probabilityCol", "Column name for predicted class conditional probabilities. " + + ("IntParam", "maxIter", "max number of iterations (>= 0).", None, "ParamValidators.gtEq(0)"), + ("FloatParam", "regParam", "regularization parameter (>= 0).", None, "ParamValidators.gtEq(0)"), + ("StringParam", "featuresCol", "features column name.", "'features'", None), + ("StringParam", "labelCol", "label column name.", "'label'", None), + ("StringParam", "predictionCol", "prediction column name.", "'prediction'", None), + ("StringParam", "probabilityCol", "Column name for predicted class conditional probabilities. " + "Note: Not all models output well-calibrated probability estimates! These probabilities " + - "should be treated as confidences, not precise probabilities.", "'probability'", str), - ("rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", "'rawPrediction'", - str), - ("inputCol", "input column name.", None, str), - ("inputCols", "input column names.", None, None), - ("outputCol", "output column name.", "self.uid + '__output'", str), - ("numFeatures", "number of features.", None, int), - ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + - "E.g. 
10 means that the cache will get checkpointed every 10 iterations.", None, int), - ("seed", "random seed.", "hash(type(self).__name__)", int), - ("tol", "the convergence tolerance for iterative algorithms.", None, float), - ("stepSize", "Step size to be used for each iteration of optimization.", None, float), - ("handleInvalid", "how to handle invalid entries. Options are skip (which will filter " + + "should be treated as confidences, not precise probabilities.", "'probability'", None), + ("StringParam", "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", "'rawPrediction'", None), + ("StringParam", "inputCol", "input column name.", None, None), + ("ListStringParam", "inputCols", "input column names.", None, None), + ("StringParam", "outputCol", "output column name.", "self.uid + '__output'", None), + ("IntParam", "numFeatures", "number of features.", None, "ParamValidators.gtEq(0)"), + ("IntParam", "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + + "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None, + "lambda interval: (interval == -1) or (interval >= 1)"), + ("IntParam", "seed", "random seed.", "hash(type(self).__name__)", None), + ("BooleanParam", "tol", "the convergence tolerance for iterative algorithms.", None, None), + ("FloatParam", "stepSize", "Step size to be used for each iteration of optimization.", None, None), + ("StringParam", "handleInvalid", "how to handle invalid entries. Options are skip (which will filter " + "out rows with bad values), or error (which will throw an errror). More options may be " + - "added later.", None, str), - ("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + - "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0", float), - ("fitIntercept", "whether to fit an intercept term.", "True", bool), - ("standardization", "whether to standardize the training features before fitting the " + - "model.", "True", bool), - ("thresholds", "Thresholds in multi-class classification to adjust the probability of " + + "added later.", None, "ParamValidators.inList(['skip', 'error'])"), + ("FloatParam", "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + + "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0", + "ParamValidators.inRange(0, 1)"), + ("BooleanParam", "fitIntercept", "whether to fit an intercept term.", "True", None), + ("BooleanParam", "standardization", "whether to standardize the training features before fitting the " + + "model.", "True", None), + ("ListFloatParam", "thresholds", "Thresholds in multi-class classification to adjust the probability of " + "predicting each class. Array must have length equal to the number of classes, with " + "values >= 0. The class with largest value p/t is predicted, where p is the original " + - "probability of that class and t is the class' threshold.", None, None), - ("weightCol", "weight column name. If this is not set or empty, we treat " + - "all instance weights as 1.0.", None, str), - ("solver", "the solver algorithm for optimization. If this is not set or empty, " + - "default value is 'auto'.", "'auto'", str)] + "probability of that class and t is the class' threshold.", None, + "lambda lst: all(map(lambda t: t >= 0, lst))"), + ("StringParam", "weightCol", "weight column name. 
If this is not set or empty, we treat " + + "all instance weights as 1.0.", None, None), + ("StringParam", "solver", "the solver algorithm for optimization. If this is not set or empty, " + + "default value is 'auto'.", "'auto'", None)] code = [] - for name, doc, defaultValueStr, expectedType in shared: - param_code = _gen_param_header(name, doc, defaultValueStr, expectedType) + for paramClassStr, name, doc, defaultValueStr, isValidFunctionStr in shared: + param_code = _gen_param_header(paramClassStr, name, doc, defaultValueStr, isValidFunctionStr) code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr)) decisionTreeParams = [ diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index db4a8a54d4956..2d81cb0e50ecf 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -17,7 +17,7 @@ # DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py. -from pyspark.ml.param import Param, Params +from pyspark.ml.param import * class HasMaxIter(Params): @@ -25,7 +25,7 @@ class HasMaxIter(Params): Mixin for param maxIter: max number of iterations (>= 0). """ - maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 0).", int) + maxIter = IntParam(Params._dummy(), "maxIter", "max number of iterations (>= 0).", ParamValidators.gtEq(0)) def __init__(self): super(HasMaxIter, self).__init__() @@ -49,7 +49,7 @@ class HasRegParam(Params): Mixin for param regParam: regularization parameter (>= 0). """ - regParam = Param(Params._dummy(), "regParam", "regularization parameter (>= 0).", float) + regParam = FloatParam(Params._dummy(), "regParam", "regularization parameter (>= 0).", ParamValidators.gtEq(0)) def __init__(self): super(HasRegParam, self).__init__() @@ -73,7 +73,7 @@ class HasFeaturesCol(Params): Mixin for param featuresCol: features column name. """ - featuresCol = Param(Params._dummy(), "featuresCol", "features column name.", str) + featuresCol = StringParam(Params._dummy(), "featuresCol", "features column name.", None) def __init__(self): super(HasFeaturesCol, self).__init__() @@ -98,7 +98,7 @@ class HasLabelCol(Params): Mixin for param labelCol: label column name. """ - labelCol = Param(Params._dummy(), "labelCol", "label column name.", str) + labelCol = StringParam(Params._dummy(), "labelCol", "label column name.", None) def __init__(self): super(HasLabelCol, self).__init__() @@ -123,7 +123,7 @@ class HasPredictionCol(Params): Mixin for param predictionCol: prediction column name. """ - predictionCol = Param(Params._dummy(), "predictionCol", "prediction column name.", str) + predictionCol = StringParam(Params._dummy(), "predictionCol", "prediction column name.", None) def __init__(self): super(HasPredictionCol, self).__init__() @@ -148,7 +148,7 @@ class HasProbabilityCol(Params): Mixin for param probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. """ - probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", str) + probabilityCol = StringParam(Params._dummy(), "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! 
These probabilities should be treated as confidences, not precise probabilities.", None) def __init__(self): super(HasProbabilityCol, self).__init__() @@ -173,7 +173,7 @@ class HasRawPredictionCol(Params): Mixin for param rawPredictionCol: raw prediction (a.k.a. confidence) column name. """ - rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", str) + rawPredictionCol = StringParam(Params._dummy(), "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", None) def __init__(self): super(HasRawPredictionCol, self).__init__() @@ -198,7 +198,7 @@ class HasInputCol(Params): Mixin for param inputCol: input column name. """ - inputCol = Param(Params._dummy(), "inputCol", "input column name.", str) + inputCol = StringParam(Params._dummy(), "inputCol", "input column name.", None) def __init__(self): super(HasInputCol, self).__init__() @@ -222,7 +222,7 @@ class HasInputCols(Params): Mixin for param inputCols: input column names. """ - inputCols = Param(Params._dummy(), "inputCols", "input column names.", None) + inputCols = ListStringParam(Params._dummy(), "inputCols", "input column names.", None) def __init__(self): super(HasInputCols, self).__init__() @@ -246,7 +246,7 @@ class HasOutputCol(Params): Mixin for param outputCol: output column name. """ - outputCol = Param(Params._dummy(), "outputCol", "output column name.", str) + outputCol = StringParam(Params._dummy(), "outputCol", "output column name.", None) def __init__(self): super(HasOutputCol, self).__init__() @@ -271,7 +271,7 @@ class HasNumFeatures(Params): Mixin for param numFeatures: number of features. """ - numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", int) + numFeatures = IntParam(Params._dummy(), "numFeatures", "number of features.", ParamValidators.gtEq(0)) def __init__(self): super(HasNumFeatures, self).__init__() @@ -295,7 +295,7 @@ class HasCheckpointInterval(Params): Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. """ - checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", int) + checkpointInterval = IntParam(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", lambda interval: (interval == -1) or (interval >= 1)) def __init__(self): super(HasCheckpointInterval, self).__init__() @@ -319,7 +319,7 @@ class HasSeed(Params): Mixin for param seed: random seed. """ - seed = Param(Params._dummy(), "seed", "random seed.", int) + seed = IntParam(Params._dummy(), "seed", "random seed.", None) def __init__(self): super(HasSeed, self).__init__() @@ -344,7 +344,7 @@ class HasTol(Params): Mixin for param tol: the convergence tolerance for iterative algorithms. """ - tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", float) + tol = BooleanParam(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", None) def __init__(self): super(HasTol, self).__init__() @@ -368,7 +368,7 @@ class HasStepSize(Params): Mixin for param stepSize: Step size to be used for each iteration of optimization. 
""" - stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for each iteration of optimization.", float) + stepSize = FloatParam(Params._dummy(), "stepSize", "Step size to be used for each iteration of optimization.", None) def __init__(self): super(HasStepSize, self).__init__() @@ -392,7 +392,7 @@ class HasHandleInvalid(Params): Mixin for param handleInvalid: how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later. """ - handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", str) + handleInvalid = StringParam(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", ParamValidators.inList(['skip', 'error'])) def __init__(self): super(HasHandleInvalid, self).__init__() @@ -416,7 +416,7 @@ class HasElasticNetParam(Params): Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. """ - elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", float) + elasticNetParam = FloatParam(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", ParamValidators.inRange(0, 1)) def __init__(self): super(HasElasticNetParam, self).__init__() @@ -441,7 +441,7 @@ class HasFitIntercept(Params): Mixin for param fitIntercept: whether to fit an intercept term. """ - fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.", bool) + fitIntercept = BooleanParam(Params._dummy(), "fitIntercept", "whether to fit an intercept term.", None) def __init__(self): super(HasFitIntercept, self).__init__() @@ -466,7 +466,7 @@ class HasStandardization(Params): Mixin for param standardization: whether to standardize the training features before fitting the model. """ - standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.", bool) + standardization = BooleanParam(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.", None) def __init__(self): super(HasStandardization, self).__init__() @@ -491,7 +491,7 @@ class HasThresholds(Params): Mixin for param thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold. """ - thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. 
The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", None) + thresholds = ListFloatParam(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", lambda lst: all(map(lambda t: t >= 0, lst))) def __init__(self): super(HasThresholds, self).__init__() @@ -515,7 +515,7 @@ class HasWeightCol(Params): Mixin for param weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. """ - weightCol = Param(Params._dummy(), "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", str) + weightCol = StringParam(Params._dummy(), "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", None) def __init__(self): super(HasWeightCol, self).__init__() @@ -539,7 +539,7 @@ class HasSolver(Params): Mixin for param solver: the solver algorithm for optimization. If this is not set or empty, default value is 'auto'. """ - solver = Param(Params._dummy(), "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", str) + solver = StringParam(Params._dummy(), "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", None) def __init__(self): super(HasSolver, self).__init__() diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 2b605e5c5078b..3eef481112b3a 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -18,6 +18,7 @@ from pyspark import since from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel +from pyspark.ml.param import * from pyspark.ml.param.shared import * from pyspark.mllib.common import inherit_doc @@ -100,15 +101,18 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha .. 
versionadded:: 1.4.0 """ - rank = Param(Params._dummy(), "rank", "rank of the factorization") - numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user blocks") - numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item blocks") - implicitPrefs = Param(Params._dummy(), "implicitPrefs", "whether to use implicit preference") - alpha = Param(Params._dummy(), "alpha", "alpha for implicit preference") - userCol = Param(Params._dummy(), "userCol", "column name for user ids") - itemCol = Param(Params._dummy(), "itemCol", "column name for item ids") - ratingCol = Param(Params._dummy(), "ratingCol", "column name for ratings") - nonnegative = Param(Params._dummy(), "nonnegative", + rank = IntParam(Params._dummy(), "rank", "rank of the factorization", ParamValidators.gtEq(1)) + numUserBlocks = IntParam(Params._dummy(), "numUserBlocks", "number of user blocks", + ParamValidators.gtEq(1)) + numItemBlocks = IntParam(Params._dummy(), "numItemBlocks", "number of item blocks", + ParamValidators.gtEq(1)) + implicitPrefs = BooleanParam(Params._dummy(), "implicitPrefs", + "whether to use implicit preference") + alpha = FloatParam(Params._dummy(), "alpha", "alpha for implicit preference") + userCol = StringParam(Params._dummy(), "userCol", "column name for user ids") + itemCol = StringParam(Params._dummy(), "itemCol", "column name for item ids") + ratingCol = StringParam(Params._dummy(), "ratingCol", "column name for ratings") + nonnegative = BooleanParam(Params._dummy(), "nonnegative", "whether to use nonnegative constraint for least squares") @keyword_only diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 6b994fe9f93b4..b113a2885daf9 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -187,11 +187,11 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti """ isotonic = \ - Param(Params._dummy(), "isotonic", + BooleanParam(Params._dummy(), "isotonic", "whether the output sequence should be isotonic/increasing (true) or" + "antitonic/decreasing (false).") featureIndex = \ - Param(Params._dummy(), "featureIndex", + IntParam(Params._dummy(), "featureIndex", "The index of the feature if featuresCol is a vector column, no effect otherwise.") @keyword_only @@ -277,7 +277,7 @@ class TreeEnsembleParams(DecisionTreeParams): Mixin for Decision Tree-based ensemble algorithms parameters. """ - subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " + + subsamplingRate = FloatParam(Params._dummy(), "subsamplingRate", "Fraction of the training data " + "used for learning each decision tree, in range (0, 1].") def __init__(self): @@ -305,7 +305,7 @@ class TreeRegressorParams(Params): """ supportedImpurities = ["variance"] - impurity = Param(Params._dummy(), "impurity", + impurity = StringParam(Params._dummy(), "impurity", "Criterion used for information gain calculation (case-insensitive). " + "Supported options: " + ", ".join(supportedImpurities)) @@ -335,9 +335,9 @@ class RandomForestParams(TreeEnsembleParams): """ supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"] - numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).") + numTrees = IntParam(Params._dummy(), "numTrees", "Number of trees to train (>= 1).") featureSubsetStrategy = \ - Param(Params._dummy(), "featureSubsetStrategy", + StringParam(Params._dummy(), "featureSubsetStrategy", "The number of features to consider for splits at each tree node. 
Supported " + "options: " + ", ".join(supportedFeatureSubsetStrategies)) @@ -607,7 +607,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, .. versionadded:: 1.4.0 """ - lossType = Param(Params._dummy(), "lossType", + lossType = StringParam(Params._dummy(), "lossType", "Loss function which GBT tries to minimize (case-insensitive). " + "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) @@ -720,15 +720,15 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi .. versionadded:: 1.6.0 """ - censorCol = Param(Params._dummy(), "censorCol", + censorCol = StringParam(Params._dummy(), "censorCol", "censor column name. The value of this column could be 0 or 1. " + "If the value is 1, it means the event has occurred i.e. " + "uncensored; otherwise censored.") quantileProbabilities = \ - Param(Params._dummy(), "quantileProbabilities", + ListFloatParam(Params._dummy(), "quantileProbabilities", "quantile probabilities array. Values of the quantile probabilities array " + "should be in the range (0, 1) and the array should be non-empty.") - quantilesCol = Param(Params._dummy(), "quantilesCol", + quantilesCol = StringParam(Params._dummy(), "quantilesCol", "quantiles column name. This column will output quantiles of " + "corresponding quantileProbabilities if it is set.") diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 0cbe97f1d839f..142f7e7e9845e 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -20,7 +20,7 @@ from pyspark import since from pyspark.ml import Estimator, Model -from pyspark.ml.param import Params, Param +from pyspark.ml.param import Params, Param, IntParam, ParamValidators from pyspark.ml.param.shared import HasSeed from pyspark.ml.util import keyword_only from pyspark.sql.functions import rand @@ -120,7 +120,8 @@ class CrossValidator(Estimator, HasSeed): evaluator = Param( Params._dummy(), "evaluator", "evaluator used to select hyper-parameters that maximize the cross-validated metric") - numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation") + numFolds = IntParam(Params._dummy(), "numFolds", "number of folds for cross validation", + ParamValidators.gtEq(2)) @keyword_only def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3, From 0592fcb1aabde90bd42bbd2af9e14894c63e5d49 Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 4 Feb 2016 15:55:02 -0800 Subject: [PATCH 02/10] cleaning up shared param codegen --- python/pyspark/ml/classification.py | 8 +- python/pyspark/ml/clustering.py | 1 - python/pyspark/ml/feature.py | 60 ++++----- python/pyspark/ml/param/__init__.py | 22 +--- .../ml/param/_shared_params_code_gen.py | 116 ++++++++++-------- python/pyspark/ml/param/shared.py | 12 +- python/pyspark/ml/pipeline.py | 4 +- python/pyspark/ml/recommendation.py | 1 - python/pyspark/ml/regression.py | 34 ++--- 9 files changed, 125 insertions(+), 133 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 243e2d53172e9..3b16c78cdc7fc 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -20,7 +20,6 @@ from pyspark import since from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel -from pyspark.ml.param import * from pyspark.ml.param.shared import * from pyspark.ml.regression import ( RandomForestParams, TreeEnsembleParams, DecisionTreeModel, TreeEnsembleModels) @@ -231,9 +230,10 
@@ class TreeClassifierParams(object): supportedImpurities = ["entropy", "gini"] impurity = StringParam(Params._dummy(), "impurity", - "Criterion used for information gain calculation (case-insensitive). " + - "Supported options: " + - ", ".join(supportedImpurities), ParamValidators.inList(supportedImpurities)) + "Criterion used for information gain calculation (case-insensitive). " + + "Supported options: " + + ", ".join(supportedImpurities), + ParamValidators.inList(supportedImpurities)) def __init__(self): super(TreeClassifierParams, self).__init__() diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 81f14589b1ff9..5070485ec0b6a 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -19,7 +19,6 @@ from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * -from pyspark.ml.param import * from pyspark.mllib.common import inherit_doc __all__ = ['BisectingKMeans', 'BisectingKMeansModel', diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 31fe39deba00c..99006dc24a122 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -145,12 +145,12 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): # TODO: add a checksplits validation function? splits = \ ListFloatParam(Params._dummy(), "splits", - "Split points for mapping continuous features into buckets. With n+1 splits, " + - "there are n buckets. A bucket defined by splits x,y holds values in the " + - "range [x,y) except the last bucket, which also includes y. The splits " + - "should be strictly increasing. Values at -inf, inf must be explicitly " + - "provided to cover all Double values; otherwise, values outside the splits " + - "specified will be treated as errors.") + "Split points for mapping continuous features into buckets. With n+1 " + + "splits, there are n buckets. A bucket defined by splits x,y holds values " + + "in the range [x,y) except the last bucket, which also includes y. The " + + "splits should be strictly increasing. Values at -inf, inf must be " + + "explicitly provided to cover all Double values; otherwise, values outside" + + " the splits specified will be treated as errors.") @keyword_only def __init__(self, splits=None, inputCol=None, outputCol=None): @@ -230,7 +230,7 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol): " Default 1.0", ParamValidators.gtEq(0)) vocabSize = IntParam( Params._dummy(), "vocabSize", "max size of the vocabulary. 
Default 1 << 18.", - ParamValidators.gt(0)) + ParamValidators.gt(0)) @keyword_only def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None): @@ -349,7 +349,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol): """ inverse = BooleanParam(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " + - "default False.") + "default False.") @keyword_only def __init__(self, inverse=False, inputCol=None, outputCol=None): @@ -410,7 +410,7 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): """ scalingVec = VectorParam(Params._dummy(), "scalingVec", "vector for hadamard product, " + - "it must be MLlib Vector type.") + "it must be MLlib Vector type.") @keyword_only def __init__(self, scalingVec=None, inputCol=None, outputCol=None): @@ -515,7 +515,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol): """ minDocFreq = IntParam(Params._dummy(), "minDocFreq", - "minimum of documents in which a term should appear for filtering") + "minimum of documents in which a term should appear for filtering") @keyword_only def __init__(self, minDocFreq=0, inputCol=None, outputCol=None): @@ -1054,8 +1054,8 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed): # a placeholder to make it appear in the generated doc numBuckets = IntParam(Params._dummy(), "numBuckets", - "Maximum number of buckets (quantiles, or " + - "categories) into which data points are grouped. Must be >= 2. Default 2.", + "Maximum number of buckets (quantiles, or categories) into which data " + + "points are grouped. Must be >= 2. Default 2.", ParamValidators.gtEq(2)) @keyword_only @@ -1481,8 +1481,8 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): """ labels = ListStringParam(Params._dummy(), "labels", - "Optional array of labels specifying index-string mapping." + - " If not provided or if empty, then metadata from inputCol is used instead.") + "Optional array of labels specifying index-string mapping. If not " + + "provided or if empty, then metadata from inputCol is used instead.") @keyword_only def __init__(self, inputCol=None, outputCol=None, labels=None): @@ -1748,9 +1748,9 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol): """ maxCategories = IntParam(Params._dummy(), "maxCategories", - "Threshold for the number of values a categorical feature can take " + - "(>= 2). If a feature is found to have > maxCategories values, then " + - "it is declared continuous.", ParamValidators.gtEq(2)) + "Threshold for the number of values a categorical feature can take " + + "(>= 2). If a feature is found to have > maxCategories values, then " + + "it is declared continuous.", ParamValidators.gtEq(2)) @keyword_only def __init__(self, maxCategories=20, inputCol=None, outputCol=None): @@ -1847,12 +1847,12 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.6.0 """ # TODO: define custom isValid functions? - indices = ListIntParam(Params._dummy(), "indices", "An array of indices to select features from " + - "a vector column. There can be no overlap with names.") - names = ListStringParam(Params._dummy(), "names", "An array of feature names to select features from " + - "a vector column. These names must be specified by ML " + - "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " + - "indices.") + indices = ListIntParam(Params._dummy(), "indices", "An array of indices to select features " + + "from a vector column. 
There can be no overlap with names.") + names = ListStringParam(Params._dummy(), "names", "An array of feature names to select " + + "features from a vector column. These names must be specified by ML " + + "org.apache.spark.ml.attribute.Attribute. There can be no overlap " + "with indices.") @keyword_only def __init__(self, inputCol=None, outputCol=None, indices=None, names=None): @@ -1941,12 +1941,12 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has """ vectorSize = IntParam(Params._dummy(), "vectorSize", - "the dimension of codes after transforming from words") + "the dimension of codes after transforming from words") numPartitions = IntParam(Params._dummy(), "numPartitions", - "number of partitions for sentences of words") + "number of partitions for sentences of words") minCount = IntParam(Params._dummy(), "minCount", - "the minimum number of times a token must appear to be included in the " + - "word2vec model's vocabulary") + "the minimum number of times a token must appear to be included in the " + + "word2vec model's vocabulary") @keyword_only def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, @@ -2261,9 +2261,9 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol): # a placeholder to make it appear in the generated doc numTopFeatures = \ IntParam(Params._dummy(), "numTopFeatures", - "Number of features that selector will select, ordered by statistics value " + - "descending. If the number of features is < numTopFeatures, then this will select " + - "all features.", ParamValidators.gtEq(1)) + "Number of features that selector will select, ordered by statistics value " + + "descending. If the number of features is < numTopFeatures, then this will " + + "select all features.", ParamValidators.gtEq(1)) @keyword_only def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label"): diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 539117b1856eb..98dc185230750 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -42,7 +42,6 @@ def __init__(self, parent, name, doc, isValid=None): self.parent = parent.uid self.name = str(name) self.doc = str(doc) - # self.expectedType = expectedType self.isValid = isValid if isValid is not None else ParamValidators.alwaysTrue() def _copy_new_parent(self, parent): @@ -56,7 +55,8 @@ def _copy_new_parent(self, parent): def _validate(self, value): if not self.isValid(value): - raise ValueError("Invalid value") + raise ValueError("{parent} parameter {name} given invalid value {value}" + .format(parent=self.parent, name=self.name, value=str(value))) def _convertAndValidate(self, value): self._validate(value) @@ -475,23 +475,7 @@ def _set(self, **kwargs): p = getattr(self, param) value = p._convertAndValidate(value) self._paramMap[getattr(self, param)] = value - # if p.expectedType is None or type(value) == p.expectedType or value is None: - # self._paramMap[getattr(self, param)] = value - # else: - # try: - # # Try and do "safe" conversions that don't lose information - # if p.expectedType == float: - # self._paramMap[getattr(self, param)] = float(value) - # # Python 3 unified long & int - # elif p.expectedType == int and type(value).__name__ == 'long': - # self._paramMap[getattr(self, param)] = value - # else: - # raise Exception( - # "Provided type {0} incompatible with type {1} for param {2}" - # .format(type(value), p.expectedType, p)) - # 
except ValueError: - # raise Exception(("Failed to convert {0} to type {1} for param {2}" - # .format(type(value), p.expectedType, p))) + return self def _setDefault(self, **kwargs): diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 40ec4de91cfae..90ef3eb4a9378 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -60,9 +60,6 @@ def __init__(self): self._setDefault($name=$defaultValueStr)''' Name = name[0].upper() + name[1:] - # expectedTypeName = str(expectedType) - # if expectedType is not None: - # expectedTypeName = expectedType.__name__ if isValidFunctionStr is None: isValidFunctionStr = str(None) return template \ @@ -110,64 +107,72 @@ def get$Name(self): print("\n# DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.\n") print("from pyspark.ml.param import *\n\n") shared = [ - ("IntParam", "maxIter", "max number of iterations (>= 0).", None, "ParamValidators.gtEq(0)"), - ("FloatParam", "regParam", "regularization parameter (>= 0).", None, "ParamValidators.gtEq(0)"), - ("StringParam", "featuresCol", "features column name.", "'features'", None), - ("StringParam", "labelCol", "label column name.", "'label'", None), - ("StringParam", "predictionCol", "prediction column name.", "'prediction'", None), - ("StringParam", "probabilityCol", "Column name for predicted class conditional probabilities. " + - "Note: Not all models output well-calibrated probability estimates! These probabilities " + - "should be treated as confidences, not precise probabilities.", "'probability'", None), - ("StringParam", "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", "'rawPrediction'", None), - ("StringParam", "inputCol", "input column name.", None, None), - ("ListStringParam", "inputCols", "input column names.", None, None), - ("StringParam", "outputCol", "output column name.", "self.uid + '__output'", None), - ("IntParam", "numFeatures", "number of features.", None, "ParamValidators.gtEq(0)"), - ("IntParam", "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + - "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None, + ("maxIter", "IntParam", "max number of iterations (>= 0).", None, + "ParamValidators.gtEq(0)"), + ("regParam", "FloatParam", "regularization parameter (>= 0).", None, + "ParamValidators.gtEq(0)"), + ("featuresCol", "StringParam", "features column name.", "'features'", None), + ("labelCol", "StringParam", "label column name.", "'label'", None), + ("predictionCol", "StringParam", "prediction column name.", "'prediction'", None), + ("probabilityCol", "StringParam", "Column name for predicted class conditional " + + "probabilities. Note: Not all models output well-calibrated probability estimates! These" + + " probabilities should be treated as confidences, not precise probabilities.", + "'probability'", None), + ("rawPredictionCol", "StringParam", "raw prediction (a.k.a. confidence) column name.", + "'rawPrediction'", None), + ("inputCol", "StringParam", "input column name.", None, None), + ("inputCols", "ListStringParam", "input column names.", None, None), + ("outputCol", "StringParam", "output column name.", "self.uid + '__output'", None), + ("numFeatures", "IntParam", "number of features.", None, "ParamValidators.gtEq(0)"), + ("checkpointInterval", "IntParam", "set checkpoint interval (>= 1) or disable checkpoint" + + " (-1). E.g. 
10 means that the cache will get checkpointed every 10 iterations.", None, "lambda interval: (interval == -1) or (interval >= 1)"), - ("IntParam", "seed", "random seed.", "hash(type(self).__name__)", None), - ("BooleanParam", "tol", "the convergence tolerance for iterative algorithms.", None, None), - ("FloatParam", "stepSize", "Step size to be used for each iteration of optimization.", None, None), - ("StringParam", "handleInvalid", "how to handle invalid entries. Options are skip (which will filter " + - "out rows with bad values), or error (which will throw an errror). More options may be " + - "added later.", None, "ParamValidators.inList(['skip', 'error'])"), - ("FloatParam", "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + - "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0", + ("seed", "IntParam", "random seed.", "hash(type(self).__name__)", None), + ("tol", "BooleanParam", "the convergence tolerance for iterative algorithms.", None, None), + ("stepSize", "FloatParam", "Step size to be used for each iteration of optimization.", None, + None), + ("handleInvalid", "StringParam", "how to handle invalid entries. Options are skip (which " + + "will filter out rows with bad values), or error (which will throw an errror). More " + + "options may be added later.", None, "ParamValidators.inList(['skip', 'error'])"), + ("elasticNetParam", "FloatParam", "the ElasticNet mixing parameter, in range [0, 1]. For " + + "alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0", "ParamValidators.inRange(0, 1)"), - ("BooleanParam", "fitIntercept", "whether to fit an intercept term.", "True", None), - ("BooleanParam", "standardization", "whether to standardize the training features before fitting the " + - "model.", "True", None), - ("ListFloatParam", "thresholds", "Thresholds in multi-class classification to adjust the probability of " + - "predicting each class. Array must have length equal to the number of classes, with " + - "values >= 0. The class with largest value p/t is predicted, where p is the original " + - "probability of that class and t is the class' threshold.", None, + ("fitIntercept", "BooleanParam", "whether to fit an intercept term.", "True", None), + ("standardization", "BooleanParam", "whether to standardize the training features before " + + "fitting the model.", "True", None), + ("thresholds", "ListFloatParam", "Thresholds in multi-class classification to adjust the " + + "probability of predicting each class. Array must have length equal to the number of " + + "classes, with values >= 0. The class with largest value p/t is predicted, where p is " + + "the original probability of that class and t is the class' threshold.", None, "lambda lst: all(map(lambda t: t >= 0, lst))"), - ("StringParam", "weightCol", "weight column name. If this is not set or empty, we treat " + + ("weightCol", "StringParam", "weight column name. If this is not set or empty, we treat " + "all instance weights as 1.0.", None, None), - ("StringParam", "solver", "the solver algorithm for optimization. If this is not set or empty, " + - "default value is 'auto'.", "'auto'", None)] + ("solver", "StringParam", "the solver algorithm for optimization. 
If this is not set or " + + "empty, default value is 'auto'.", "'auto'", None)] code = [] - for paramClassStr, name, doc, defaultValueStr, isValidFunctionStr in shared: - param_code = _gen_param_header(paramClassStr, name, doc, defaultValueStr, isValidFunctionStr) + for name, paramClassStr, doc, defaultValueStr, isValidStr in shared: + param_code = _gen_param_header(paramClassStr, name, doc, defaultValueStr, isValidStr) code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr)) decisionTreeParams = [ - ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " + - "depth 1 means 1 internal node + 2 leaf nodes."), - ("maxBins", "Max number of bins for" + + ("maxDepth", "IntParam", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf " + + "node; depth 1 means 1 internal node + 2 leaf nodes.", "ParamValidators.gtEq(0)"), + ("maxBins", "IntParam", "Max number of bins for" + " discretizing continuous features. Must be >=2 and >= number of categories for any" + - " categorical feature."), - ("minInstancesPerNode", "Minimum number of instances each child must have after split. " + - "If a split causes the left or right child to have fewer than minInstancesPerNode, the " + - "split will be discarded as invalid. Should be >= 1."), - ("minInfoGain", "Minimum information gain for a split to be considered at a tree node."), - ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."), - ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " + - "instances with nodes. If true, the algorithm will cache node IDs for each instance. " + - "Caching can speed up training of deeper trees. Users can set how often should the " + - "cache be checkpointed or disable it by setting checkpointInterval.")] + " categorical feature.", "ParamValidators.gtEq(2)"), + ("minInstancesPerNode", "IntParam", "Minimum number of instances each child must have " + + "after split. If a split causes the left or right child to have fewer than " + + "minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.", + "ParamValidators.gtEq(1)"), + ("minInfoGain", "FloatParam", "Minimum information gain for a split to be considered at a " + "tree node.", None), + ("maxMemoryInMB", "IntParam", "Maximum memory in MB allocated to histogram aggregation.", + "ParamValidators.gtEq(0)"), + ("cacheNodeIds", "BooleanParam", "If false, the algorithm will pass trees to executors " + + "to match instances with nodes. If true, the algorithm will cache node IDs for each " + + "instance. Caching can speed up training of deeper trees. 
Users can set how often " + + "should the cache be checkpointed or disable it by setting checkpointInterval.", None)] decisionTreeCode = '''class DecisionTreeParams(Params): """ @@ -180,9 +185,12 @@ def __init__(self): super(DecisionTreeParams, self).__init__()''' dtParamMethods = "" dummyPlaceholders = "" - paramTemplate = """$name = Param($owner, "$name", "$doc")""" - for name, doc in decisionTreeParams: - variable = paramTemplate.replace("$name", name).replace("$doc", doc) + paramTemplate = """$name = $paramClass($owner, "$name", "$doc", $isValid)""" + for name, paramClassStr, doc, isValidStr in decisionTreeParams: + if isValidStr is None: + isValidStr = str(None) + variable = paramTemplate.replace("$name", name).replace("$doc", doc)\ + .replace("$paramClass", paramClassStr).replace("$isValid", isValidStr) dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n " dtParamMethods += _gen_param_code(name, doc, None) + "\n" code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" + diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 2d81cb0e50ecf..72ec8b0bd5e46 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -564,12 +564,12 @@ class DecisionTreeParams(Params): Mixin for Decision Tree parameters. """ - maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.") - maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.") - minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.") - minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.") - maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.") - cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.") + maxDepth = IntParam(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", ParamValidators.gtEq(0)) + maxBins = IntParam(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.", ParamValidators.gtEq(2)) + minInstancesPerNode = IntParam(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. 
Should be >= 1.", ParamValidators.gtEq(1)) + minInfoGain = FloatParam(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.", None) + maxMemoryInMB = IntParam(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.", ParamValidators.gtEq(0)) + cacheNodeIds = BooleanParam(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.", None) def __init__(self): diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 661074ca96212..6305b7dbd900a 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -149,7 +149,9 @@ class Pipeline(Estimator): .. versionadded:: 1.3.0 """ - stages = Param(Params._dummy(), "stages", "pipeline stages") + stages = Param(Params._dummy(), "stages", "pipeline stages", + isValid=lambda lst: all(map(lambda s: isinstance(s, Estimator) or + isinstance(s, Transformer), lst))) @keyword_only def __init__(self, stages=None): diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 3eef481112b3a..ab2a759ab5155 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -18,7 +18,6 @@ from pyspark import since from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel -from pyspark.ml.param import * from pyspark.ml.param.shared import * from pyspark.mllib.common import inherit_doc diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index b113a2885daf9..b670ca5b0ec8b 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -188,11 +188,11 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti isotonic = \ BooleanParam(Params._dummy(), "isotonic", - "whether the output sequence should be isotonic/increasing (true) or" + - "antitonic/decreasing (false).") + "whether the output sequence should be isotonic/increasing (true) or" + + "antitonic/decreasing (false).") featureIndex = \ IntParam(Params._dummy(), "featureIndex", - "The index of the feature if featuresCol is a vector column, no effect otherwise.") + "The index of the feature if featuresCol is a vector column, no effect otherwise.") @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", @@ -277,8 +277,8 @@ class TreeEnsembleParams(DecisionTreeParams): Mixin for Decision Tree-based ensemble algorithms parameters. """ - subsamplingRate = FloatParam(Params._dummy(), "subsamplingRate", "Fraction of the training data " + - "used for learning each decision tree, in range (0, 1].") + subsamplingRate = FloatParam(Params._dummy(), "subsamplingRate", "Fraction of the training " + + "data used for learning each decision tree, in range (0, 1].") def __init__(self): super(TreeEnsembleParams, self).__init__() @@ -306,9 +306,9 @@ class TreeRegressorParams(Params): supportedImpurities = ["variance"] impurity = StringParam(Params._dummy(), "impurity", - "Criterion used for information gain calculation (case-insensitive). " + - "Supported options: " + - ", ".join(supportedImpurities)) + "Criterion used for information gain calculation (case-insensitive). 
" + + "Supported options: " + + ", ".join(supportedImpurities)) def __init__(self): super(TreeRegressorParams, self).__init__() @@ -608,8 +608,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, """ lossType = StringParam(Params._dummy(), "lossType", - "Loss function which GBT tries to minimize (case-insensitive). " + - "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) + "Loss function which GBT tries to minimize (case-insensitive). " + + "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", @@ -721,16 +721,16 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi """ censorCol = StringParam(Params._dummy(), "censorCol", - "censor column name. The value of this column could be 0 or 1. " + - "If the value is 1, it means the event has occurred i.e. " + - "uncensored; otherwise censored.") + "censor column name. The value of this column could be 0 or 1. " + + "If the value is 1, it means the event has occurred i.e. " + + "uncensored; otherwise censored.") quantileProbabilities = \ ListFloatParam(Params._dummy(), "quantileProbabilities", - "quantile probabilities array. Values of the quantile probabilities array " + - "should be in the range (0, 1) and the array should be non-empty.") + "quantile probabilities array. Values of the quantile probabilities array " + + "should be in the range (0, 1) and the array should be non-empty.") quantilesCol = StringParam(Params._dummy(), "quantilesCol", - "quantiles column name. This column will output quantiles of " + - "corresponding quantileProbabilities if it is set.") + "quantiles column name. This column will output quantiles of " + + "corresponding quantileProbabilities if it is set.") @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", From 4b76febcf3a9cf532098135f065df3503166aead Mon Sep 17 00:00:00 2001 From: sethah Date: Thu, 4 Feb 2016 16:52:01 -0800 Subject: [PATCH 03/10] some doc strings and cleanup --- python/pyspark/ml/feature.py | 2 +- python/pyspark/ml/param/__init__.py | 38 +++++++++++++++++++++++++++++ python/pyspark/ml/recommendation.py | 2 +- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 99006dc24a122..2b8e4a84d23a3 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -21,8 +21,8 @@ from pyspark import since from pyspark.rdd import ignore_unicode_prefix -from pyspark.ml.param.shared import * from pyspark.ml.param import ParamValidators +from pyspark.ml.param.shared import * from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm from pyspark.mllib.common import inherit_doc diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 98dc185230750..a82a6c255b23a 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -208,13 +208,27 @@ def _convertAndValidate(self, value): class ParamValidators(object): + """ + .. note:: DeveloperApi + + Factory methods for common validation functions for `Param.isValid`. + + .. versionadded:: 2.0.0 + + """ @staticmethod def alwaysTrue(): + """ + Returns a validation function which is always `True`. 
+ """ return lambda value: True @staticmethod def primitiveConvert(value, primitiveType): + """ + Convert a value to a primitive, if possible. + """ if type(value) != primitiveType: try: value = primitiveType(value) @@ -224,6 +238,9 @@ def primitiveConvert(value, primitiveType): @staticmethod def convertToList(value): + """ + Convert a value to a list, if possible. + """ if type(value) == np.ndarray: return list(value) elif isinstance(value, Vector): @@ -233,22 +250,37 @@ def convertToList(value): @staticmethod def gt(lowerBound): + """ + Check if value > lowerBound. + """ return lambda value: value > lowerBound @staticmethod def gtEq(lowerBound): + """ + Check if value >= lowerBound. + """ return lambda value: value >= lowerBound @staticmethod def lt(lowerBound): + """ + Check if value < lowerBound. + """ return lambda value: value < lowerBound @staticmethod def ltEq(lowerBound): + """ + Check if value <= lowerBound. + """ return lambda value: value <= lowerBound @staticmethod def inRange(lowerBound, upperBound, lowerInclusive=True, upperInclusive=True): + """ + Check if value is in range lowerBound to upperBound. + """ def inRangeFunction(x): lowerValid = (x >= lowerBound) if lowerInclusive else (x > lowerBound) upperValid = (x <= upperBound) if upperInclusive else (x < upperBound) @@ -257,10 +289,16 @@ def inRangeFunction(x): @staticmethod def inList(allowed): + """ + Check for a value in an allowed set of values. + """ return lambda value: value in allowed @staticmethod def listLengthGt(lowerBound): + """ + Check that the list length is greater than `lowerBound`. + """ return lambda lst: len(lst) > lowerBound diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index ab2a759ab5155..f1683abc9ccc0 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -112,7 +112,7 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha itemCol = StringParam(Params._dummy(), "itemCol", "column name for item ids") ratingCol = StringParam(Params._dummy(), "ratingCol", "column name for ratings") nonnegative = BooleanParam(Params._dummy(), "nonnegative", - "whether to use nonnegative constraint for least squares") + "whether to use nonnegative constraint for least squares") @keyword_only def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, From 9aa7c027c1580b79ea8291f1453fa14415136bd5 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 5 Feb 2016 11:31:04 -0800 Subject: [PATCH 04/10] adding tests --- python/pyspark/ml/param/__init__.py | 9 ++- .../ml/param/_shared_params_code_gen.py | 2 +- python/pyspark/ml/param/shared.py | 2 +- python/pyspark/ml/tests.py | 66 +++++++++++++++---- 4 files changed, 61 insertions(+), 18 deletions(-) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index a82a6c255b23a..176215a382f10 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -66,7 +66,8 @@ def __str__(self): return str(self.parent) + "__" + self.name def __repr__(self): - return "Param(parent=%r, name=%r, doc=%r)" % (self.parent, self.name, self.doc) + class_name = self.__class__.__name__ + return "%s(parent=%r, name=%r, doc=%r)" % (class_name, self.parent, self.name, self.doc) def __hash__(self): return hash(str(self)) @@ -125,7 +126,11 @@ class BooleanParam(Param): """ def _convertAndValidate(self, value): - value = ParamValidators.primitiveConvert(value, bool) + if type(value) != bool: + if value in {0, 
1}: + value = bool(value) + else: + raise TypeError("Could not convert %s to a Boolean" % value) self._validate(value) return value diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 90ef3eb4a9378..3deba8e2cf580 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -128,7 +128,7 @@ def get$Name(self): " (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", None, "lambda interval: (interval == -1) or (interval >= 1)"), ("seed", "IntParam", "random seed.", "hash(type(self).__name__)", None), - ("tol", "BooleanParam", "the convergence tolerance for iterative algorithms.", None, None), + ("tol", "FloatParam", "the convergence tolerance for iterative algorithms.", None, None), ("stepSize", "FloatParam", "Step size to be used for each iteration of optimization.", None, None), ("handleInvalid", "StringParam", "how to handle invalid entries. Options are skip (which " + diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 72ec8b0bd5e46..e004d949e1418 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -344,7 +344,7 @@ class HasTol(Params): Mixin for param tol: the convergence tolerance for iterative algorithms. """ - tol = BooleanParam(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", None) + tol = FloatParam(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", None) def __init__(self): super(HasTol, self).__init__() diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 5fcfa9e61f6da..e2b5a852d63cd 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -20,6 +20,7 @@ """ import sys +import numpy as np try: import xmlrunner except ImportError: @@ -103,20 +104,57 @@ class ParamTypeConversionTests(PySparkTestCase): Test that param type conversion happens. 
""" - def test_int_to_float(self): - from pyspark.mllib.linalg import Vectors - df = self.sc.parallelize([ - Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF() - lr = LogisticRegression(elasticNetParam=0) - lr.fit(df) - lr.setElasticNetParam(0) - lr.fit(df) - - def test_invalid_to_float(self): - from pyspark.mllib.linalg import Vectors - self.assertRaises(Exception, lambda: LogisticRegression(elasticNetParam="happy")) - lr = LogisticRegression(elasticNetParam=0) - self.assertRaises(Exception, lambda: lr.setElasticNetParam("panda")) + def test_list_int(self): + vs = VectorSlicer(indices=[1.0, 4.0]) + self.assertEqual(vs.getIndices(), [1, 4]) + vs = VectorSlicer(indices=np.array([1.0, 4.0])) + self.assertEqual(vs.getIndices(), [1, 4]) + vs = VectorSlicer(indices=DenseVector([1.0, 4.0])) + self.assertEqual(vs.getIndices(), [1, 4]) + self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"])) + + def test_list_float(self): + b = Bucketizer(splits=[1, 4]) + self.assertEqual(b.getSplits(), [1.0, 4.0]) + self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", "b"])) + + def test_list_string(self): + idx_to_string = IndexToString(labels=[1, 4]) + self.assertEqual(idx_to_string.getLabels(), ['1', '4']) + self.assertRaises(TypeError, lambda: IndexToString(labels="a")) + + def test_int(self): + lr = LogisticRegression(maxIter=5.0) + self.assertEqual(lr.getMaxIter(), 5) + lr.setMaxIter(5L) + self.assertEqual(lr.getMaxIter(), 5) + self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt")) + + def test_float(self): + lr = LogisticRegression(tol=1) + self.assertEqual(lr.getTol(), 1.0) + lr.setTol(1L) + self.assertEqual(lr.getTol(), 1.0) + self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat")) + + def test_string(self): + lr = LogisticRegression(featuresCol=1) + self.assertEqual(lr.getFeaturesCol(), '1') + + def test_bool(self): + lr = LogisticRegression(fitIntercept=1) + self.assertEqual(lr.getFitIntercept(), True) + lr.setFitIntercept(0.0) + self.assertEqual(lr.getFitIntercept(), False) + self.assertRaises(TypeError, lambda: lr.setFitIntercept(1.2)) + self.assertRaises(TypeError, lambda: lr.setFitIntercept("false")) + + def test_vector(self): + ewp = ElementwiseProduct(scalingVec=[1, 3]) + self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0])) + ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4])) + self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4])) + self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"])) class PipelineTests(PySparkTestCase): From 05045ca060dcc214e99cc5fcc24d1f5cb1b86d5c Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 5 Feb 2016 13:56:37 -0800 Subject: [PATCH 05/10] minor cleanup --- python/pyspark/ml/feature.py | 2 -- python/pyspark/ml/pipeline.py | 4 ++-- python/pyspark/ml/regression.py | 4 ++-- python/pyspark/ml/tests.py | 38 ++++++++++++++++----------------- 4 files changed, 23 insertions(+), 25 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 2b8e4a84d23a3..7926168469deb 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -142,7 +142,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): .. versionadded:: 1.3.0 """ - # TODO: add a checksplits validation function? splits = \ ListFloatParam(Params._dummy(), "splits", "Split points for mapping continuous features into buckets. With n+1 " + @@ -1846,7 +1845,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol): .. 
versionadded:: 1.6.0 """ - # TODO: define custom isValid functions? indices = ListIntParam(Params._dummy(), "indices", "An array of indices to select features " + "from a vector column. There can be no overlap with names.") names = ListStringParam(Params._dummy(), "names", "An array of feature names to select " + diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py index 6305b7dbd900a..176df0c28c9cc 100644 --- a/python/pyspark/ml/pipeline.py +++ b/python/pyspark/ml/pipeline.py @@ -150,8 +150,8 @@ class Pipeline(Estimator): """ stages = Param(Params._dummy(), "stages", "pipeline stages", - isValid=lambda lst: all(map(lambda s: isinstance(s, Estimator) or - isinstance(s, Transformer), lst))) + isValid=lambda lst: + all(map(lambda s: isinstance(s, Estimator) or isinstance(s, Transformer), lst))) @keyword_only def __init__(self, stages=None): diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index b670ca5b0ec8b..09d0e5a55a730 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -338,8 +338,8 @@ class RandomForestParams(TreeEnsembleParams): numTrees = IntParam(Params._dummy(), "numTrees", "Number of trees to train (>= 1).") featureSubsetStrategy = \ StringParam(Params._dummy(), "featureSubsetStrategy", - "The number of features to consider for splits at each tree node. Supported " + - "options: " + ", ".join(supportedFeatureSubsetStrategies)) + "The number of features to consider for splits at each tree node. Supported " + + "options: " + ", ".join(supportedFeatureSubsetStrategies)) def __init__(self): super(RandomForestParams, self).__init__() diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index e2b5a852d63cd..6ac4f63a32e1d 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -104,25 +104,6 @@ class ParamTypeConversionTests(PySparkTestCase): Test that param type conversion happens. 
""" - def test_list_int(self): - vs = VectorSlicer(indices=[1.0, 4.0]) - self.assertEqual(vs.getIndices(), [1, 4]) - vs = VectorSlicer(indices=np.array([1.0, 4.0])) - self.assertEqual(vs.getIndices(), [1, 4]) - vs = VectorSlicer(indices=DenseVector([1.0, 4.0])) - self.assertEqual(vs.getIndices(), [1, 4]) - self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"])) - - def test_list_float(self): - b = Bucketizer(splits=[1, 4]) - self.assertEqual(b.getSplits(), [1.0, 4.0]) - self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", "b"])) - - def test_list_string(self): - idx_to_string = IndexToString(labels=[1, 4]) - self.assertEqual(idx_to_string.getLabels(), ['1', '4']) - self.assertRaises(TypeError, lambda: IndexToString(labels="a")) - def test_int(self): lr = LogisticRegression(maxIter=5.0) self.assertEqual(lr.getMaxIter(), 5) @@ -156,6 +137,25 @@ def test_vector(self): self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4])) self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"])) + def test_list_int(self): + vs = VectorSlicer(indices=[1.0, 4.0]) + self.assertEqual(vs.getIndices(), [1, 4]) + vs = VectorSlicer(indices=np.array([1.0, 4.0])) + self.assertEqual(vs.getIndices(), [1, 4]) + vs = VectorSlicer(indices=DenseVector([1.0, 4.0])) + self.assertEqual(vs.getIndices(), [1, 4]) + self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"])) + + def test_list_float(self): + b = Bucketizer(splits=[1, 4]) + self.assertEqual(b.getSplits(), [1.0, 4.0]) + self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", "b"])) + + def test_list_string(self): + idx_to_string = IndexToString(labels=[1, 4]) + self.assertEqual(idx_to_string.getLabels(), ['1', '4']) + self.assertRaises(TypeError, lambda: IndexToString(labels="a")) + class PipelineTests(PySparkTestCase): From f02f5895bdefefc829e0524b3a09d025dfc77430 Mon Sep 17 00:00:00 2001 From: sethah Date: Fri, 5 Feb 2016 22:40:37 -0800 Subject: [PATCH 06/10] python version compatibility --- python/pyspark/ml/param/__init__.py | 8 ++++---- python/pyspark/ml/tests.py | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 176215a382f10..a1be9a854a89e 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -127,7 +127,7 @@ class BooleanParam(Param): def _convertAndValidate(self, value): if type(value) != bool: - if value in {0, 1}: + if value in [0, 1]: value = bool(value) else: raise TypeError("Could not convert %s to a Boolean" % value) @@ -148,7 +148,7 @@ def _convertAndValidate(self, value): if not all(map(lambda v: type(v) == int, value)): try: - value = map(lambda v: int(v), value) + value = list(map(lambda v: int(v), value)) except ValueError: raise TypeError("Could not convert %s to a list of integers" % value) self._validate(value) @@ -168,7 +168,7 @@ def _convertAndValidate(self, value): if not all(map(lambda v: type(v) == float, value)): try: - value = map(lambda v: float(v), value) + value = list(map(lambda v: float(v), value)) except ValueError: raise TypeError("Could not convert %s to a list of floats" % value) self._validate(value) @@ -188,7 +188,7 @@ def _convertAndValidate(self, value): if not all(map(lambda v: type(v) == str, value)): try: - value = map(lambda v: str(v), value) + value = list(map(lambda v: str(v), value)) except ValueError: raise TypeError("Could not convert %s to a list of strings" % value) self._validate(value) diff --git 
a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 6ac4f63a32e1d..f30a767704b14 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -107,15 +107,11 @@ class ParamTypeConversionTests(PySparkTestCase): def test_int(self): lr = LogisticRegression(maxIter=5.0) self.assertEqual(lr.getMaxIter(), 5) - lr.setMaxIter(5L) - self.assertEqual(lr.getMaxIter(), 5) self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt")) def test_float(self): lr = LogisticRegression(tol=1) self.assertEqual(lr.getTol(), 1.0) - lr.setTol(1L) - self.assertEqual(lr.getTol(), 1.0) self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat")) def test_string(self): From d90503266849eae4dabf2085ce704f655ba085c0 Mon Sep 17 00:00:00 2001 From: sethah Date: Mon, 8 Feb 2016 13:11:58 -0800 Subject: [PATCH 07/10] adding docstring to new privat param methods --- python/pyspark/ml/param/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index a1be9a854a89e..e9d7d4cc0877f 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -54,11 +54,17 @@ def _copy_new_parent(self, parent): raise ValueError("Cannot copy from non-dummy parent %s." % parent) def _validate(self, value): + """Check if `value` is valid and raise `ValueError` if not.""" if not self.isValid(value): raise ValueError("{parent} parameter {name} given invalid value {value}" .format(parent=self.parent, name=self.name, value=str(value))) def _convertAndValidate(self, value): + """ + Check that `value` is the expected type for this param and convert + to the correct type if not. Additionally check that value meets validation + requirements a specific `Param` instance. 
+ """ self._validate(value) return value From 77aad43437a466760d972bc13d749ea8c598be16 Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 9 Feb 2016 16:31:59 -0800 Subject: [PATCH 08/10] correct binarizer threshold description --- python/pyspark/ml/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 7926168469deb..61e04827e993e 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -79,7 +79,7 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol): """ threshold = FloatParam(Params._dummy(), "threshold", - "threshold in binary classification prediction") + "threshold used to binarize continuous features") @keyword_only def __init__(self, threshold=0.0, inputCol=None, outputCol=None): From 1adda2dbf6054860d183c1f7eeb2656973e836a6 Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 2 Mar 2016 08:55:16 -0800 Subject: [PATCH 09/10] undoing polynomial expansion change --- .../scala/org/apache/spark/ml/feature/PolynomialExpansion.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index 0a9b9719c15d3..42b26c8ee836c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -46,7 +46,7 @@ class PolynomialExpansion(override val uid: String) * @group param */ val degree = new IntParam(this, "degree", "the polynomial degree to expand (>= 1)", - ParamValidators.gtEq(1)) + ParamValidators.gt(1)) setDefault(degree -> 2) From 16e9233e78cb0360a94c92780017a38fdf9ca04c Mon Sep 17 00:00:00 2001 From: sethah Date: Wed, 2 Mar 2016 09:47:32 -0800 Subject: [PATCH 10/10] revert --- .../scala/org/apache/spark/ml/feature/PolynomialExpansion.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala index 42b26c8ee836c..0a9b9719c15d3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala @@ -46,7 +46,7 @@ class PolynomialExpansion(override val uid: String) * @group param */ val degree = new IntParam(this, "degree", "the polynomial degree to expand (>= 1)", - ParamValidators.gt(1)) + ParamValidators.gtEq(1)) setDefault(degree -> 2)