Skip to content

Commit

Permalink
[SPARK-14564][ML][MLLIB][PYSPARK] Python Word2Vec missing setWindowSize method
Browse files Browse the repository at this point in the history

## What changes were proposed in this pull request?
Added windowSize getter/setter to ML/MLlib

## How was this patch tested?
Added test cases in tests.py under both ML and MLlib

Author: Jason Lee <cjlee@us.ibm.com>

Closes #12428 from jasoncl/SPARK-14564.
  • Loading branch information
jasoncl authored and jkbradley committed Apr 18, 2016
1 parent d280d1d commit 3d66a2c
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,7 @@ private[python] class PythonMLLibAPI extends Serializable {
* @param numPartitions number of partitions
* @param numIterations number of iterations
* @param seed initial seed for random generator
* @param windowSize size of window
* @return A handle to java Word2VecModelWrapper instance at python side
*/
def trainWord2VecModel(
Expand All @@ -680,14 +681,16 @@ private[python] class PythonMLLibAPI extends Serializable {
numPartitions: Int,
numIterations: Int,
seed: Long,
minCount: Int): Word2VecModelWrapper = {
minCount: Int,
windowSize: Int): Word2VecModelWrapper = {
val word2vec = new Word2Vec()
.setVectorSize(vectorSize)
.setLearningRate(learningRate)
.setNumPartitions(numPartitions)
.setNumIterations(numIterations)
.setSeed(seed)
.setMinCount(minCount)
.setWindowSize(windowSize)
try {
val model = word2vec.fit(dataJRDD.rdd.persist(StorageLevel.MEMORY_AND_DISK_SER))
new Word2VecModelWrapper(model)
Expand Down
28 changes: 23 additions & 5 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -2219,28 +2219,31 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
minCount = Param(Params._dummy(), "minCount",
"the minimum number of times a token must appear to be included in the " +
"word2vec model's vocabulary", typeConverter=TypeConverters.toInt)
windowSize = Param(Params._dummy(), "windowSize",
"the window size (context words from [-window, window]). Default value is 5",
typeConverter=TypeConverters.toInt)

@keyword_only
def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
seed=None, inputCol=None, outputCol=None):
seed=None, inputCol=None, outputCol=None, windowSize=5):
"""
__init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \
seed=None, inputCol=None, outputCol=None)
seed=None, inputCol=None, outputCol=None, windowSize=5)
"""
super(Word2Vec, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
seed=None)
seed=None, windowSize=5)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
@since("1.4.0")
def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
seed=None, inputCol=None, outputCol=None):
seed=None, inputCol=None, outputCol=None, windowSize=5):
"""
setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \
seed=None, inputCol=None, outputCol=None)
seed=None, inputCol=None, outputCol=None, windowSize=5)
Sets params for this Word2Vec.
"""
kwargs = self.setParams._input_kwargs
Expand Down Expand Up @@ -2291,6 +2294,21 @@ def getMinCount(self):
"""
return self.getOrDefault(self.minCount)

@since("2.0.0")
def setWindowSize(self, value):
"""
Sets the value of :py:attr:`windowSize`.

:param value: new window size; stored through ``self._set`` under the
``windowSize`` param name.
:return: this instance, so setter calls can be chained.
"""
self._set(windowSize=value)
return self

@since("2.0.0")
def getWindowSize(self):
    """
    Looks up :py:attr:`windowSize` through ``getOrDefault`` and returns
    its value or its default value.
    """
    window_param = self.windowSize
    return self.getOrDefault(window_param)

def _create_model(self, java_model):
    """
    Wraps ``java_model`` in a :py:class:`Word2VecModel` and returns the wrapper.
    """
    wrapped = Word2VecModel(java_model)
    return wrapped

Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/ml/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,11 @@ def test_param_property_error(self):
params = param_store.params # should not invoke the property 'test_property'
self.assertEqual(len(params), 1)

def test_word2vec_param(self):
    # setWindowSize returns the estimator itself, so the getter can be
    # queried directly on the chained result.
    expected_window = 6
    estimator = Word2Vec().setWindowSize(expected_window)
    self.assertEqual(estimator.getWindowSize(), expected_window)


class FeatureTests(PySparkTestCase):

Expand Down
11 changes: 10 additions & 1 deletion python/pyspark/mllib/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,7 @@ def __init__(self):
self.numIterations = 1
self.seed = random.randint(0, sys.maxsize)
self.minCount = 5
self.windowSize = 5

@since('1.2.0')
def setVectorSize(self, vectorSize):
Expand Down Expand Up @@ -669,6 +670,14 @@ def setMinCount(self, minCount):
self.minCount = minCount
return self

@since('2.0.0')
def setWindowSize(self, windowSize):
"""
Sets window size (default: 5).

:param windowSize: window size stored on this instance as
``self.windowSize``; ``fit`` passes it on as ``int(self.windowSize)``.
:return: this instance, so setter calls can be chained.
"""
self.windowSize = windowSize
return self

@since('1.2.0')
def fit(self, data):
"""
Expand All @@ -682,7 +691,7 @@ def fit(self, data):
jmodel = callMLlibFunc("trainWord2VecModel", data, int(self.vectorSize),
float(self.learningRate), int(self.numPartitions),
int(self.numIterations), int(self.seed),
int(self.minCount))
int(self.minCount), int(self.windowSize))
return Word2VecModel(jmodel)


Expand Down
4 changes: 3 additions & 1 deletion python/pyspark/mllib/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,13 +1027,15 @@ def test_word2vec_setters(self):
.setNumPartitions(2) \
.setNumIterations(10) \
.setSeed(1024) \
.setMinCount(3)
.setMinCount(3) \
.setWindowSize(6)
self.assertEqual(model.vectorSize, 2)
self.assertTrue(model.learningRate < 0.02)
self.assertEqual(model.numPartitions, 2)
self.assertEqual(model.numIterations, 10)
self.assertEqual(model.seed, 1024)
self.assertEqual(model.minCount, 3)
self.assertEqual(model.windowSize, 6)

def test_word2vec_get_vectors(self):
data = [
Expand Down

0 comments on commit 3d66a2c

Please sign in to comment.