Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-8472] [ML] [PySpark] Python API for DCT #8485

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 64 additions & 1 deletion python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from pyspark.mllib.common import inherit_doc
from pyspark.mllib.linalg import _convert_to_vector

__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
__all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer',
'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
Expand Down Expand Up @@ -166,6 +166,69 @@ def getSplits(self):
return self.getOrDefault(self.splits)


@inherit_doc
class DCT(JavaTransformer, HasInputCol, HasOutputCol):
"""
A feature transformer that takes the 1D discrete cosine transform
of a real vector. No zero padding is performed on the input vector.
It returns a real vector of the same length representing the DCT.
The return vector is scaled such that the transform matrix is
unitary (aka scaled DCT-II).

More information on
`https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II Wikipedia`.

>>> from pyspark.mllib.linalg import Vectors
>>> df1 = sqlContext.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]),)], ["vec"])
>>> dct = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
>>> df2 = dct.transform(df1)
>>> df2.head().resultVec
DenseVector([10.969..., -0.707..., -2.041...])
>>> df3 = DCT(inverse=True, inputCol="resultVec", outputCol="origVec").transform(df2)
>>> df3.head().origVec
DenseVector([5.0, 8.0, 6.0])
"""

# a placeholder to make it appear in the generated doc
inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " +
"default False.")

@keyword_only
def __init__(self, inverse=False, inputCol=None, outputCol=None):
"""
__init__(self, inverse=False, inputCol=None, outputCol=None)
"""
super(DCT, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid)
self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " +
"default False.")
self._setDefault(inverse=False)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
def setParams(self, inverse=False, inputCol=None, outputCol=None):
"""
setParams(self, inverse=False, inputCol=None, outputCol=None)
Sets params for this DCT.
"""
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)

def setInverse(self, value):
"""
Sets the value of :py:attr:`inverse`.
"""
self._paramMap[self.inverse] = value
return self

def getInverse(self):
"""
Gets the value of inverse or its default value.
"""
return self.getOrDefault(self.inverse)


@inherit_doc
class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
"""
Expand Down