Skip to content

Commit

Permalink
[SPARK-10273] Add @since annotation to pyspark.mllib.feature
Browse files Browse the repository at this point in the history
Duplicated the since decorator from pyspark.sql into pyspark (also tweaked to handle functions without docstrings).

Added since to methods + "versionadded::" to classes (derived from the git file history in pyspark).

Author: noelsmith <mail@noelsmith.com>

Closes #8633 from noel-smith/SPARK-10273-since-mllib-feature.
  • Loading branch information
noel-smith authored and mengxr committed Sep 15, 2015
1 parent 4ae4d54 commit 610971e
Showing 1 changed file with 57 additions and 1 deletion.
58 changes: 57 additions & 1 deletion python/pyspark/mllib/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

from py4j.protocol import Py4JJavaError

from pyspark import SparkContext
from pyspark import SparkContext, since
from pyspark.rdd import RDD, ignore_unicode_prefix
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg import (
Expand Down Expand Up @@ -84,11 +84,14 @@ class Normalizer(VectorTransformer):
>>> nor2 = Normalizer(float("inf"))
>>> nor2.transform(v)
DenseVector([0.0, 0.5, 1.0])
.. versionadded:: 1.2.0
"""
def __init__(self, p=2.0):
assert p >= 1.0, "p should be greater than 1.0"
self.p = float(p)

@since('1.2.0')
def transform(self, vector):
"""
Applies unit length normalization on a vector.
Expand Down Expand Up @@ -133,7 +136,11 @@ class StandardScalerModel(JavaVectorTransformer):
.. note:: Experimental
Represents a StandardScaler model that can transform vectors.
.. versionadded:: 1.2.0
"""

@since('1.2.0')
def transform(self, vector):
"""
Applies standardization transformation on a vector.
Expand All @@ -149,6 +156,7 @@ def transform(self, vector):
"""
return JavaVectorTransformer.transform(self, vector)

@since('1.4.0')
def setWithMean(self, withMean):
"""
Setter of the boolean which decides
Expand All @@ -157,6 +165,7 @@ def setWithMean(self, withMean):
self.call("setWithMean", withMean)
return self

@since('1.4.0')
def setWithStd(self, withStd):
"""
Setter of the boolean which decides
Expand Down Expand Up @@ -189,13 +198,16 @@ class StandardScaler(object):
>>> for r in result.collect(): r
DenseVector([-0.7071, 0.7071, -0.7071])
DenseVector([0.7071, -0.7071, 0.7071])
.. versionadded:: 1.2.0
"""
def __init__(self, withMean=False, withStd=True):
if not (withMean or withStd):
warnings.warn("Both withMean and withStd are false. The model does nothing.")
self.withMean = withMean
self.withStd = withStd

@since('1.2.0')
def fit(self, dataset):
"""
Computes the mean and variance and stores as a model to be used
Expand All @@ -215,7 +227,11 @@ class ChiSqSelectorModel(JavaVectorTransformer):
.. note:: Experimental
Represents a Chi Squared selector model.
.. versionadded:: 1.4.0
"""

@since('1.4.0')
def transform(self, vector):
"""
Applies transformation on a vector.
Expand Down Expand Up @@ -245,10 +261,13 @@ class ChiSqSelector(object):
SparseVector(1, {0: 6.0})
>>> model.transform(DenseVector([8.0, 9.0, 5.0]))
DenseVector([5.0])
.. versionadded:: 1.4.0
"""
def __init__(self, numTopFeatures):
self.numTopFeatures = int(numTopFeatures)

@since('1.4.0')
def fit(self, data):
"""
Returns a ChiSquared feature selector.
Expand All @@ -265,6 +284,8 @@ def fit(self, data):
class PCAModel(JavaVectorTransformer):
"""
Model fitted by [[PCA]] that can project vectors to a low-dimensional space using PCA.
.. versionadded:: 1.5.0
"""


Expand All @@ -281,13 +302,16 @@ class PCA(object):
1.648...
>>> pcArray[1]
-4.013...
.. versionadded:: 1.5.0
"""
def __init__(self, k):
"""
:param k: number of principal components.
"""
self.k = int(k)

@since('1.5.0')
def fit(self, data):
"""
Computes a [[PCAModel]] that contains the principal components of the input vectors.
Expand All @@ -312,14 +336,18 @@ class HashingTF(object):
>>> doc = "a a b b c d".split(" ")
>>> htf.transform(doc)
SparseVector(100, {...})
.. versionadded:: 1.2.0
"""
def __init__(self, numFeatures=1 << 20):
self.numFeatures = numFeatures

@since('1.2.0')
def indexOf(self, term):
    """
    Returns the index of the input term.

    The index is the builtin ``hash`` of `term` reduced modulo
    ``self.numFeatures``.

    .. note:: the result is only as stable as ``hash(term)`` is; on
        interpreters with randomized string hashing it can differ
        between processes unless the hash seed is fixed.
    """
    term_hash = hash(term)
    return term_hash % self.numFeatures

@since('1.2.0')
def transform(self, document):
"""
Transforms the input document (list of terms) to term frequency
Expand All @@ -339,7 +367,10 @@ def transform(self, document):
class IDFModel(JavaVectorTransformer):
"""
Represents an IDF model that can transform term frequency vectors.
.. versionadded:: 1.2.0
"""
@since('1.2.0')
def transform(self, x):
"""
Transforms term frequency (TF) vectors to TF-IDF vectors.
Expand All @@ -358,6 +389,7 @@ def transform(self, x):
"""
return JavaVectorTransformer.transform(self, x)

@since('1.4.0')
def idf(self):
"""
Returns the current IDF vector.
Expand Down Expand Up @@ -401,10 +433,13 @@ class IDF(object):
DenseVector([0.0, 0.0, 1.3863, 0.863])
>>> model.transform(Vectors.sparse(n, (1, 3), (1.0, 2.0)))
SparseVector(4, {1: 0.0, 3: 0.5754})
.. versionadded:: 1.2.0
"""
def __init__(self, minDocFreq=0):
self.minDocFreq = minDocFreq

@since('1.2.0')
def fit(self, dataset):
"""
Computes the inverse document frequency.
Expand All @@ -420,7 +455,10 @@ def fit(self, dataset):
class Word2VecModel(JavaVectorTransformer, JavaSaveable, JavaLoader):
"""
class for Word2Vec model
.. versionadded:: 1.2.0
"""
@since('1.2.0')
def transform(self, word):
"""
Transforms a word to its vector representation
Expand All @@ -435,6 +473,7 @@ def transform(self, word):
except Py4JJavaError:
raise ValueError("%s not found" % word)

@since('1.2.0')
def findSynonyms(self, word, num):
"""
Find synonyms of a word
Expand All @@ -450,14 +489,19 @@ def findSynonyms(self, word, num):
words, similarity = self.call("findSynonyms", word, num)
return zip(words, similarity)

@since('1.4.0')
def getVectors(self):
    """
    Returns a map of words to their vector representations.

    The result is whatever the wrapped model's ``getVectors`` call
    yields; it is passed through unchanged.
    """
    word_vectors = self.call("getVectors")
    return word_vectors

@classmethod
@since('1.5.0')
def load(cls, sc, path):
    """
    Load a model from the given path.

    :param sc: context object exposing ``_jvm`` and ``_jsc``, used to
               reach the JVM-side ``Word2VecModel.load``.
    :param path: location handed to the JVM loader.
    :return: a :class:`Word2VecModel` wrapping the loaded JVM model
             (always this class, regardless of `cls`).
    """
    java_feature_pkg = sc._jvm.org.apache.spark.mllib.feature
    java_model = java_feature_pkg.Word2VecModel.load(sc._jsc.sc(), path)
    return Word2VecModel(java_model)
Expand Down Expand Up @@ -507,6 +551,8 @@ class Word2Vec(object):
... rmtree(path)
... except OSError:
... pass
.. versionadded:: 1.2.0
"""
def __init__(self):
"""
Expand All @@ -519,20 +565,23 @@ def __init__(self):
self.seed = random.randint(0, sys.maxsize)
self.minCount = 5

@since('1.2.0')
def setVectorSize(self, vectorSize):
    """
    Sets vector size (default: 100).

    :param vectorSize: value stored in ``self.vectorSize``.
    :return: this instance, so setter calls can be chained.
    """
    self.vectorSize = vectorSize
    return self

@since('1.2.0')
def setLearningRate(self, learningRate):
    """
    Sets initial learning rate (default: 0.025).

    :param learningRate: value stored in ``self.learningRate``.
    :return: this instance, so setter calls can be chained.
    """
    self.learningRate = learningRate
    return self

@since('1.2.0')
def setNumPartitions(self, numPartitions):
"""
Sets number of partitions (default: 1). Use a small number for
Expand All @@ -541,6 +590,7 @@ def setNumPartitions(self, numPartitions):
self.numPartitions = numPartitions
return self

@since('1.2.0')
def setNumIterations(self, numIterations):
"""
Sets number of iterations (default: 1), which should be smaller
Expand All @@ -549,13 +599,15 @@ def setNumIterations(self, numIterations):
self.numIterations = numIterations
return self

@since('1.2.0')
def setSeed(self, seed):
    """
    Sets random seed.

    :param seed: value stored in ``self.seed``.
    :return: this instance, so setter calls can be chained.
    """
    self.seed = seed
    return self

@since('1.4.0')
def setMinCount(self, minCount):
"""
Sets minCount, the minimum number of times a token must appear
Expand All @@ -564,6 +616,7 @@ def setMinCount(self, minCount):
self.minCount = minCount
return self

@since('1.2.0')
def fit(self, data):
"""
Computes the vector representation of each word in vocabulary.
Expand Down Expand Up @@ -596,10 +649,13 @@ class ElementwiseProduct(VectorTransformer):
>>> rdd = sc.parallelize([a, b])
>>> eprod.transform(rdd).collect()
[DenseVector([2.0, 2.0, 9.0]), DenseVector([9.0, 6.0, 12.0])]
.. versionadded:: 1.5.0
"""
def __init__(self, scalingVector):
    """
    Convert and store the scaling vector.

    :param scalingVector: input passed through ``_convert_to_vector``;
                          the converted result is kept in
                          ``self.scalingVector``.
    """
    converted = _convert_to_vector(scalingVector)
    self.scalingVector = converted

@since('1.5.0')
def transform(self, vector):
"""
Computes the Hadamard product of the vector.
Expand Down

0 comments on commit 610971e

Please sign in to comment.