Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-10273] Add @since annotation to pyspark.mllib.feature #8633

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 57 additions & 1 deletion python/pyspark/mllib/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

from py4j.protocol import Py4JJavaError

from pyspark import SparkContext
from pyspark import SparkContext, since
from pyspark.rdd import RDD, ignore_unicode_prefix
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import (
Expand Down Expand Up @@ -84,11 +84,14 @@ class Normalizer(VectorTransformer):
>>> nor2 = Normalizer(float("inf"))
>>> nor2.transform(v)
DenseVector([0.0, 0.5, 1.0])

.. versionadded:: 1.2.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we won't introduce new APIs in micro releases, I think it's better to use 1.2 for short. cc @mengxr

Also, we could just use a float as the version; see pyspark/sql for examples.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think matching the overall project versioning scheme makes it clearer, but I'm happy to implement it either way.

One thing to watch for when using floats is that you can't differentiate between 1.1 and 1.10 (though, judging from the versioning history, that's unlikely to be a problem).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For 1.10 or 1.1.1, we would still need to use a string. This change is not necessary; it's just a minor comment.

"""
def __init__(self, p=2.0):
assert p >= 1.0, "p should be greater than 1.0"
self.p = float(p)

@since('1.2.0')
def transform(self, vector):
"""
Applies unit length normalization on a vector.
Expand Down Expand Up @@ -133,7 +136,11 @@ class StandardScalerModel(JavaVectorTransformer):
.. note:: Experimental

Represents a StandardScaler model that can transform vectors.

.. versionadded:: 1.2.0
"""

@since('1.2.0')
def transform(self, vector):
"""
Applies standardization transformation on a vector.
Expand All @@ -149,6 +156,7 @@ def transform(self, vector):
"""
return JavaVectorTransformer.transform(self, vector)

@since('1.4.0')
def setWithMean(self, withMean):
"""
Setter of the boolean which decides
Expand All @@ -157,6 +165,7 @@ def setWithMean(self, withMean):
self.call("setWithMean", withMean)
return self

@since('1.4.0')
def setWithStd(self, withStd):
"""
Setter of the boolean which decides
Expand Down Expand Up @@ -189,13 +198,16 @@ class StandardScaler(object):
>>> for r in result.collect(): r
DenseVector([-0.7071, 0.7071, -0.7071])
DenseVector([0.7071, -0.7071, 0.7071])

.. versionadded:: 1.2.0
"""
def __init__(self, withMean=False, withStd=True):
if not (withMean or withStd):
warnings.warn("Both withMean and withStd are false. The model does nothing.")
self.withMean = withMean
self.withStd = withStd

@since('1.2.0')
def fit(self, dataset):
"""
Computes the mean and variance and stores as a model to be used
Expand All @@ -215,7 +227,11 @@ class ChiSqSelectorModel(JavaVectorTransformer):
.. note:: Experimental

Represents a Chi Squared selector model.

.. versionadded:: 1.4.0
"""

@since('1.4.0')
def transform(self, vector):
"""
Applies transformation on a vector.
Expand Down Expand Up @@ -245,10 +261,13 @@ class ChiSqSelector(object):
SparseVector(1, {0: 6.0})
>>> model.transform(DenseVector([8.0, 9.0, 5.0]))
DenseVector([5.0])

.. versionadded:: 1.4.0
"""
def __init__(self, numTopFeatures):
self.numTopFeatures = int(numTopFeatures)

@since('1.4.0')
def fit(self, data):
"""
Returns a ChiSquared feature selector.
Expand All @@ -265,6 +284,8 @@ def fit(self, data):
class PCAModel(JavaVectorTransformer):
"""
Model fitted by [[PCA]] that can project vectors to a low-dimensional space using PCA.

.. versionadded:: 1.5.0
"""


Expand All @@ -281,13 +302,16 @@ class PCA(object):
1.648...
>>> pcArray[1]
-4.013...

.. versionadded:: 1.5.0
"""
def __init__(self, k):
"""
:param k: number of principal components.
"""
self.k = int(k)

@since('1.5.0')
def fit(self, data):
"""
Computes a [[PCAModel]] that contains the principal components of the input vectors.
Expand All @@ -312,14 +336,18 @@ class HashingTF(object):
>>> doc = "a a b b c d".split(" ")
>>> htf.transform(doc)
SparseVector(100, {...})

.. versionadded:: 1.2.0
"""
def __init__(self, numFeatures=1 << 20):
self.numFeatures = numFeatures

@since('1.2.0')
def indexOf(self, term):
""" Returns the index of the input term. """
return hash(term) % self.numFeatures

@since('1.2.0')
def transform(self, document):
"""
Transforms the input document (list of terms) to term frequency
Expand All @@ -339,7 +367,10 @@ def transform(self, document):
class IDFModel(JavaVectorTransformer):
"""
Represents an IDF model that can transform term frequency vectors.

.. versionadded:: 1.2.0
"""
@since('1.2.0')
def transform(self, x):
"""
Transforms term frequency (TF) vectors to TF-IDF vectors.
Expand All @@ -358,6 +389,7 @@ def transform(self, x):
"""
return JavaVectorTransformer.transform(self, x)

@since('1.4.0')
def idf(self):
"""
Returns the current IDF vector.
Expand Down Expand Up @@ -401,10 +433,13 @@ class IDF(object):
DenseVector([0.0, 0.0, 1.3863, 0.863])
>>> model.transform(Vectors.sparse(n, (1, 3), (1.0, 2.0)))
SparseVector(4, {1: 0.0, 3: 0.5754})

.. versionadded:: 1.2.0
"""
def __init__(self, minDocFreq=0):
self.minDocFreq = minDocFreq

@since('1.2.0')
def fit(self, dataset):
"""
Computes the inverse document frequency.
Expand All @@ -420,7 +455,10 @@ def fit(self, dataset):
class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
"""
class for Word2Vec model

.. versionadded:: 1.2.0
"""
@since('1.2.0')
def transform(self, word):
"""
Transforms a word to its vector representation
Expand All @@ -435,6 +473,7 @@ def transform(self, word):
except Py4JJavaError:
raise ValueError("%s not found" % word)

@since('1.2.0')
def findSynonyms(self, word, num):
"""
Find synonyms of a word
Expand All @@ -450,14 +489,19 @@ def findSynonyms(self, word, num):
words, similarity = self.call("findSynonyms", word, num)
return zip(words, similarity)

@since('1.4.0')
def getVectors(self):
"""
Returns a map of words to their vector representations.
"""
return self.call("getVectors")

@classmethod
@since('1.5.0')
def load(cls, sc, path):
"""
Load a model from the given path.
"""
jmodel = sc._jvm.org.apache.spark.mllib.feature \
.Word2VecModel.load(sc._jsc.sc(), path)
return Word2VecModel(jmodel)
Expand Down Expand Up @@ -507,6 +551,8 @@ class Word2Vec(object):
... rmtree(path)
... except OSError:
... pass

.. versionadded:: 1.2.0
"""
def __init__(self):
"""
Expand All @@ -519,20 +565,23 @@ def __init__(self):
self.seed = random.randint(0, sys.maxsize)
self.minCount = 5

@since('1.2.0')
def setVectorSize(self, vectorSize):
"""
Sets vector size (default: 100).
"""
self.vectorSize = vectorSize
return self

@since('1.2.0')
def setLearningRate(self, learningRate):
"""
Sets initial learning rate (default: 0.025).
"""
self.learningRate = learningRate
return self

@since('1.2.0')
def setNumPartitions(self, numPartitions):
"""
Sets number of partitions (default: 1). Use a small number for
Expand All @@ -541,6 +590,7 @@ def setNumPartitions(self, numPartitions):
self.numPartitions = numPartitions
return self

@since('1.2.0')
def setNumIterations(self, numIterations):
"""
Sets number of iterations (default: 1), which should be smaller
Expand All @@ -549,13 +599,15 @@ def setNumIterations(self, numIterations):
self.numIterations = numIterations
return self

@since('1.2.0')
def setSeed(self, seed):
"""
Sets random seed.
"""
self.seed = seed
return self

@since('1.4.0')
def setMinCount(self, minCount):
"""
Sets minCount, the minimum number of times a token must appear
Expand All @@ -564,6 +616,7 @@ def setMinCount(self, minCount):
self.minCount = minCount
return self

@since('1.2.0')
def fit(self, data):
"""
Computes the vector representation of each word in vocabulary.
Expand Down Expand Up @@ -596,10 +649,13 @@ class ElementwiseProduct(VectorTransformer):
>>> rdd = sc.parallelize([a, b])
>>> eprod.transform(rdd).collect()
[DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])]

.. versionadded:: 1.5.0
"""
def __init__(self, scalingVector):
self.scalingVector = _convert_to_vector(scalingVector)

@since('1.5.0')
def transform(self, vector):
"""
Computes the Hadamard product of the vector.
Expand Down