[SPARK-23008][ML] OnehotEncoderEstimator python API
## What changes were proposed in this pull request?

Add a Python API for OneHotEncoderEstimator (and the accompanying OneHotEncoderModel).

## How was this patch tested?

Doctests added to the new class docstring.

Author: WeichenXu <weichen.xu@databricks.com>

Closes #20209 from WeichenXu123/ohe_py.

(cherry picked from commit b5042d7)
Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
WeichenXu123 authored and jkbradley committed Jan 12, 2018
1 parent 6bb2296 commit 55695c7
Showing 3 changed files with 137 additions and 0 deletions.
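For orientation, here is a minimal usage sketch of the API this patch adds, mirroring the doctest in the feature.py diff below. It assumes Spark 2.3 (where OneHotEncoderEstimator lives in pyspark.ml.feature) and a local SparkSession; it is an illustration, not part of the commit.

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator

spark = SparkSession.builder.master("local[2]").appName("ohe-sketch").getOrCreate()

# Three rows holding category indices 0.0, 1.0, 2.0 (e.g. produced by StringIndexer).
df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"])

ohe = OneHotEncoderEstimator(inputCols=["input"], outputCols=["output"])
model = ohe.fit(df)

# With dropLast=True (the default), 3 categories encode into length-2 vectors;
# index 0.0 becomes SparseVector(2, {0: 1.0}), matching the doctest.
print(model.transform(df).head().output)
print(model.categorySizes)  # [3]
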
113 changes: 113 additions & 0 deletions python/pyspark/ml/feature.py
@@ -45,6 +45,7 @@
'NGram',
'Normalizer',
'OneHotEncoder',
'OneHotEncoderEstimator', 'OneHotEncoderModel',
'PCA', 'PCAModel',
'PolynomialExpansion',
'QuantileDiscretizer',
@@ -1641,6 +1642,118 @@ def getDropLast(self):
return self.getOrDefault(self.dropLast)


@inherit_doc
class OneHotEncoderEstimator(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid,
JavaMLReadable, JavaMLWritable):
"""
A one-hot encoder that maps a column of category indices to a column of binary vectors, with
at most a single one-value per row that indicates the input category index.
For example with 5 categories, an input value of 2.0 would map to an output vector of
`[0.0, 0.0, 1.0, 0.0]`.
The last category is not included by default (configurable via `dropLast`),
because it makes the vector entries sum up to one, and hence linearly dependent.
So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
Note: This is different from scikit-learn's OneHotEncoder, which keeps all categories.
The output vectors are sparse.
When `handleInvalid` is configured to 'keep', an extra "category" indicating invalid values is
added as the last category. So when `dropLast` is true, invalid values are encoded as an
all-zeros vector.
Note: When encoding multiple columns via the `inputCols` and `outputCols` params, input/output
columns come in pairs, specified by their order in the arrays, and each pair is treated
independently.
See `StringIndexer` for converting categorical values into category indices.

>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"])
>>> ohe = OneHotEncoderEstimator(inputCols=["input"], outputCols=["output"])
>>> model = ohe.fit(df)
>>> model.transform(df).head().output
SparseVector(2, {0: 1.0})
>>> ohePath = temp_path + "/oheEstimator"
>>> ohe.save(ohePath)
>>> loadedOHE = OneHotEncoderEstimator.load(ohePath)
>>> loadedOHE.getInputCols() == ohe.getInputCols()
True
>>> modelPath = temp_path + "/ohe-model"
>>> model.save(modelPath)
>>> loadedModel = OneHotEncoderModel.load(modelPath)
>>> loadedModel.categorySizes == model.categorySizes
True

.. versionadded:: 2.3.0
"""

handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data during " +
"transform(). Options are 'keep' (invalid data presented as an extra " +
"categorical feature) or error (throw an error). Note that this Param " +
"is only used during transform; during fitting, invalid data will " +
"result in an error.",
typeConverter=TypeConverters.toString)

dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category",
typeConverter=TypeConverters.toBoolean)

@keyword_only
def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True):
"""
__init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True)
"""
super(OneHotEncoderEstimator, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.feature.OneHotEncoderEstimator", self.uid)
self._setDefault(handleInvalid="error", dropLast=True)
kwargs = self._input_kwargs
self.setParams(**kwargs)

@keyword_only
@since("2.3.0")
def setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True):
"""
setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True)
Sets params for this OneHotEncoderEstimator.
"""
kwargs = self._input_kwargs
return self._set(**kwargs)

@since("2.3.0")
def setDropLast(self, value):
"""
Sets the value of :py:attr:`dropLast`.
"""
return self._set(dropLast=value)

@since("2.3.0")
def getDropLast(self):
"""
Gets the value of dropLast or its default value.
"""
return self.getOrDefault(self.dropLast)

def _create_model(self, java_model):
return OneHotEncoderModel(java_model)


class OneHotEncoderModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
Model fitted by :py:class:`OneHotEncoderEstimator`.

.. versionadded:: 2.3.0
"""

@property
@since("2.3.0")
def categorySizes(self):
"""
Original number of categories for each feature being encoded.
The array contains one value for each input column, in order.
"""
return self._call_java("categorySizes")


@inherit_doc
class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
JavaMLWritable):
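A second, hedged sketch (not part of the diff) showing the multi-column pairing and the handleInvalid='keep' / dropLast interaction described in the class docstring above. The column names x1, x2, v1, v2 are invented for illustration; the calls match the class added in this file.

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator

spark = SparkSession.builder.master("local[2]").appName("ohe-multicol-sketch").getOrCreate()

# Input and output columns are paired by position: x1 -> v1, x2 -> v2.
train = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0)], ["x1", "x2"])
ohe = OneHotEncoderEstimator(inputCols=["x1", "x2"], outputCols=["v1", "v2"],
                             handleInvalid="keep", dropLast=True)
# handleInvalid only applies at transform time; the fitting data must be valid.
model = ohe.fit(train)

# 3.0 was never seen in x1, so with handleInvalid="keep" it falls into the extra last
# "invalid" category; dropLast=True drops that last slot, so v1 comes out all zeros.
test = spark.createDataFrame([(3.0, 1.0)], ["x1", "x2"])
model.transform(test).select("v1", "v2").show(truncate=False)
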
1 change: 1 addition & 0 deletions python/pyspark/ml/param/_shared_params_code_gen.py
@@ -119,6 +119,7 @@ def get$Name(self):
("inputCol", "input column name.", None, "TypeConverters.toString"),
("inputCols", "input column names.", None, "TypeConverters.toListString"),
("outputCol", "output column name.", "self.uid + '__output'", "TypeConverters.toString"),
("outputCols", "output column names.", None, "TypeConverters.toListString"),
("numFeatures", "number of features.", None, "TypeConverters.toInt"),
("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " +
"E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: " +
23 changes: 23 additions & 0 deletions python/pyspark/ml/param/shared.py
@@ -256,6 +256,29 @@ def getOutputCol(self):
return self.getOrDefault(self.outputCol)


class HasOutputCols(Params):
"""
Mixin for param outputCols: output column names.
"""

outputCols = Param(Params._dummy(), "outputCols", "output column names.", typeConverter=TypeConverters.toListString)

def __init__(self):
super(HasOutputCols, self).__init__()

def setOutputCols(self, value):
"""
Sets the value of :py:attr:`outputCols`.
"""
return self._set(outputCols=value)

def getOutputCols(self):
"""
Gets the value of outputCols or its default value.
"""
return self.getOrDefault(self.outputCols)


class HasNumFeatures(Params):
"""
Mixin for param numFeatures: number of features.
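Finally, a small sketch of how the generated HasOutputCols mixin (produced from the new entry in _shared_params_code_gen.py and shown above) surfaces on the new estimator. The column names are placeholders, and an active SparkSession is assumed because constructing the estimator needs a running JVM.

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator

spark = SparkSession.builder.master("local[2]").appName("shared-params-sketch").getOrCreate()

ohe = OneHotEncoderEstimator(inputCols=["a", "b"])
ohe.setOutputCols(["a_vec", "b_vec"])  # setter provided by the HasOutputCols mixin
print(ohe.getOutputCols())             # ['a_vec', 'b_vec']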
