-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-9654][ML][PYSPARK] Add IndexToString to PySpark #7976
Changes from 9 commits
1dc4579
0445fcc
af2f869
510bce5
c6da160
9f5af3a
7b3b5ca
244e083
e95b61b
b1795aa
ab90dcd
43ae197
c400e16
64de5c9
2316a90
15390bb
28afcfd
f19445d
51ae7ee
ed0ca91
8fca8b3
3ef852f
41d0d27
cd5d418
4f56b17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,8 +26,8 @@ | |
from pyspark.mllib.common import inherit_doc | ||
from pyspark.mllib.linalg import _convert_to_vector | ||
|
||
__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', | ||
'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', | ||
__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'NGram', | ||
'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', | ||
'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', | ||
'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA', | ||
'PCAModel', 'RFormula', 'RFormulaModel'] | ||
|
@@ -731,6 +731,11 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol): | |
>>> sorted(set([(i[0], i[1]) for i in td.select(td.id, td.indexed).collect()]), | ||
... key=lambda x: x[0]) | ||
[(0, 0.0), (1, 2.0), (2, 1.0), (3, 0.0), (4, 0.0), (5, 1.0)] | ||
>>> inverter = IndexToString(inputCol="indexed", outputCol="label2", labels=model.labels()) | ||
>>> itd = inverter.transform(td) | ||
>>> sorted(set([(i[0], str(i[1])) for i in itd.select(itd.id, itd.label2).collect()]), | ||
... key=lambda x: x[0]) | ||
[(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')] | ||
""" | ||
|
||
@keyword_only | ||
|
@@ -760,6 +765,60 @@ class StringIndexerModel(JavaModel): | |
""" | ||
Model fitted by StringIndexer. | ||
""" | ||
@property | ||
def labels(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Copy the Scala doc: "Ordered list of labels, corresponding to indices to be assigned". |
||
return self._java_obj.labels | ||
|
||
|
||
class IndexToString(JavaTransformer, HasInputCol, HasOutputCol): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use the inherit_doc tag. |
||
""" | ||
Convert provided indexes back to strings using either the metadata on the input column | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please copy the updated Scala doc here. Also, please mark this as Experimental (as in, e.g., RFormula). |
||
or user provided labels. | ||
Note: By default we keep the original columns during StringIndexerModel's transformation, | ||
so the inverse should only be used on new columns such as predicted labels. | ||
""" | ||
# a placeholder to make the labels show up in generated doc | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Insert a newline above. |
||
labels = Param(Params._dummy(), "lables", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Typo: "lables" should be "labels". |
||
"Optional labels to be provided by the user, if not supplied column " + | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: "if not supplied" -> "if equal to the empty array then". There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That makes less sense; if it isn't supplied, then it uses the column metadata. |
||
"metadata is read for labels. The default value is an empty array, " + | ||
"but the empty array is ignored and column metadata used instead.") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After the above nit, this becomes redundant IMO. Since this is a matter of taste, feel free to keep or cut it. |
||
|
||
@keyword_only | ||
def __init__(self, inputCol=None, outputCol=None, labels=[]): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should avoid using mutable default values. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My concern is that the underlying Scala code uses an empty array as the default. |
||
""" | ||
Initialize this instace of the IndexToString using the provided java_obj. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The first line of the docstring should be the `__init__(self, ...)` signature, as in other transformers (see VectorAssembler). Typo: "instace" should be "instance". |
||
""" | ||
super(IndexToString, self).__init__() | ||
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString", | ||
self.uid) | ||
self.labels = Param(self, "labels", | ||
"Optional labels to be provided by the user, if not supplied column " + | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment as L957. |
||
"metadata is read for labels. The default value is an empty array, " + | ||
"but the empty array is ignored and column metadata used instead.") | ||
kwargs = self.__init__._input_kwargs | ||
self.setParams(**kwargs) | ||
|
||
@keyword_only | ||
def setParams(self, inputCol=None, outputCol=None, labels=[]): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here, using [inline code missing from the extracted page]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My concern is that the underlying Scala code uses an empty array as the default. |
||
""" | ||
setParams(self, inputCol="input", outputCol="output", labels=[]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Correct the column defaults: they are None. |
||
Sets params for this IndexToString | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: add "." at the end of the line. |
||
""" | ||
kwargs = self.setParams._input_kwargs | ||
return self._set(**kwargs) | ||
|
||
def setLabels(self, value): | ||
""" | ||
Specify the labels to be used. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
""" | ||
self._paramMap[self.labels] = value | ||
return self | ||
|
||
def getLabels(self): | ||
""" | ||
Get the labels. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
""" | ||
return self.getOrDefault(self.labels) | ||
|
||
|
||
@inherit_doc | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no longer needed since "label" is a public val