From 5a7cf31e1c76c2fe4e172c2889fa23deb2152af9 Mon Sep 17 00:00:00 2001 From: William Benton Date: Wed, 14 Sep 2016 12:07:14 -0500 Subject: [PATCH 1/5] test for spurious rejection of similar vectors in findSynonyms --- .../spark/mllib/feature/Word2VecSuite.scala | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index 22de4c4ac40e6..fe843b4a0e444 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils @@ -68,6 +69,21 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { assert(syms(1)._1 == "japan") } + test("findSynonyms doesn't reject similar word vectors when called with a vector") { + val num = 2 + val word2VecMap = Map( + ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)), + ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)), + ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)), + ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)) + ) + val model = new Word2VecModel(word2VecMap) + val syms = model.findSynonyms(Vectors.dense(Array(0.52d, 0.50d, 0.50d, 0.50d)), num) + assert(syms.length == num) + assert(syms(0)._1 == "china") + assert(syms(1)._1 == "taiwan") + } + test("model load / save") { val word2VecMap = Map( From 832ed41f717e278d9dfbd3c4cfabc30fb4b7beaf Mon Sep 17 00:00:00 2001 From: William Benton Date: Wed, 14 Sep 2016 16:35:29 -0500 Subject: [PATCH 2/5] `Word2VecModel.findSynonyms` no longer spuriously rejects the best match Previously, the `findSynonyms` method in `Word2VecModel` rejected the closest-matching vector. 
This was typically correct in cases where we were searching for synonyms of a word, but was incorrect in cases where we were searching for words most similar to a given vector, since the given vector might not correspond to a word in the model's vocabulary. With this commit, `findSynonyms` will not discard the best matching term unless the best matching word is also the query word (or is maximally similar to the query vector). --- .../apache/spark/ml/feature/Word2Vec.scala | 9 ++++++-- .../apache/spark/mllib/feature/Word2Vec.scala | 22 ++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index c2b434c3d5cb1..e584f3a7b8e4a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -227,7 +227,7 @@ class Word2VecModel private[ml] ( */ @Since("1.5.0") def findSynonyms(word: String, num: Int): DataFrame = { - findSynonyms(wordVectors.transform(word), num) + findSynonyms(wordVectors.transform(word), num, Some(word)) } /** @@ -237,8 +237,13 @@ class Word2VecModel private[ml] ( */ @Since("2.0.0") def findSynonyms(word: Vector, num: Int): DataFrame = { + findSynonyms(word, num, None) + } + + private[spark] def findSynonyms( + word: Vector, num: Int, wordOpt: Option[String]): DataFrame = { val spark = SparkSession.builder().getOrCreate() - spark.createDataFrame(wordVectors.findSynonyms(word, num)).toDF("word", "similarity") + spark.createDataFrame(wordVectors.findSynonyms(word, num, wordOpt)).toDF("word", "similarity") } /** @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 908198740b501..da9ba631dcf07 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -526,7 +526,7 @@ class Word2VecModel private[spark] ( @Since("1.1.0") def findSynonyms(word: String, num: Int): Array[(String, Double)] = { val vector = transform(word) - findSynonyms(vector, num) + findSynonyms(vector, num, Some(word)) } /** @@ -537,6 +537,22 @@ class Word2VecModel private[spark] ( */ @Since("1.1.0") def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = { + findSynonyms(vector, num, None) + } + + /** + * Find synonyms of the vector representation of a word, rejecting + * vectors identical to vector and words identical to the + * value of wordOpt, if one is supplied. + * @param vector vector representation of a word + * @param num number of synonyms to find + * @param wordOpt optionally, a word to reject from the results list + * @return array of (word, cosineSimilarity) + */ + private[spark] def findSynonyms( + vector: Vector, + num: Int, + wordOpt: Option[String]): Array[(String, Double)] = { require(num > 0, "Number of similar words should > 0") // TODO: optimize top-k val fVector = vector.toArray.map(_.toFloat) @@ -566,8 +582,8 @@ class Word2VecModel private[spark] ( wordList.zip(cosVec) .toSeq .sortBy(-_._2) - .take(num + 1) - .tail + .filter(tup => wordOpt.map(w => !w.equals(tup._1)).getOrElse(true) && tup._2 != 1.0d) + .take(num) .toArray } From 08424f4988cb06c98750440715df7d4312806cd9 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 15 Sep 2016 09:52:40 -0500 Subject: [PATCH 3/5] Incorporated feedback from review. Update PySpark docstring and scaladocs to reflect fixed behavior. 
--- .../mllib/api/python/Word2VecModelWrapper.scala | 10 +++++++--- .../apache/spark/mllib/feature/Word2Vec.scala | 16 +++++++++++----- .../spark/mllib/feature/Word2VecSuite.scala | 2 +- python/pyspark/mllib/feature.py | 12 +++++++++--- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala index 4b4ed2291d139..6e26d36284feb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala @@ -45,16 +45,20 @@ private[python] class Word2VecModelWrapper(model: Word2VecModel) { def findSynonyms(word: String, num: Int): JList[Object] = { val vec = transform(word) - findSynonyms(vec, num) + findSynonyms(vec, num, Some(word)) } def findSynonyms(vector: Vector, num: Int): JList[Object] = { - val result = model.findSynonyms(vector, num) + findSynonyms(vector, num, None) + } + + def findSynonyms( + vector: Vector, num: Int, wordOpt: Option[String]): JList[Object] = { + val result = model.findSynonyms(vector, num, wordOpt) val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } - def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index da9ba631dcf07..12dc08e5d144a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -518,7 +518,7 @@ class Word2VecModel private[spark] ( } /** - * Find synonyms of a word + * Find synonyms of a word; do not include the word itself in results. 
* @param word a word * @param num number of synonyms to find * @return array of (word, cosineSimilarity) @@ -530,7 +530,9 @@ } /** - * Find synonyms of the vector representation of a word + * Find synonyms of the vector representation of a word, possibly + * including any words in the model vocabulary whose vector representation + * is the supplied vector. * @param vector vector representation of a word * @param num number of synonyms to find * @return array of (word, cosineSimilarity) @@ -542,8 +544,7 @@ /** * Find synonyms of the vector representation of a word, rejecting - * vectors identical to vector and words identical to the - * value of wordOpt, if one is supplied. + * words identical to the value of wordOpt, if one is supplied. * @param vector vector representation of a word * @param num number of synonyms to find * @param wordOpt optionally, a word to reject from the results list * @return array of (word, cosineSimilarity) @@ -579,10 +580,15 @@ ind += 1 } + // NB: This code (and the documented behavior of findSynonyms + // elsewhere) assumes that distinct words do not have identical + // vector representations + wordList.zip(cosVec) .toSeq .sortBy(-_._2) - .filter(tup => wordOpt.map(w => !w.equals(tup._1)).getOrElse(true) && tup._2 != 1.0d) + .take(num + 1) + .dropWhile(p => wordOpt.map(w => w.equals(p._1)).getOrElse(false)) + .take(num) .toArray } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index fe843b4a0e444..f4fa216b8eba0 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -78,7 +78,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)) ) val model = new Word2VecModel(word2VecMap) - val syms =
model.findSynonyms(Vectors.dense(Array(0.52d, 0.50d, 0.50d, 0.50d)), num) + val syms = model.findSynonyms(Vectors.dense(Array(0.52, 0.5, 0.5, 0.5)), num) assert(syms.length == num) assert(syms(0)._1 == "china") assert(syms(1)._1 == "taiwan") diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index b32d0c70ec6a7..5d99644fca254 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -544,8 +544,7 @@ def load(cls, sc, path): @ignore_unicode_prefix class Word2Vec(object): - """ - Word2Vec creates vector representation of words in a text corpus. + """Word2Vec creates vector representation of words in a text corpus. The algorithm first constructs a vocabulary from the corpus and then learns vector representation of words in the vocabulary. The vector representation can be used as features in @@ -567,13 +566,19 @@ class Word2Vec(object): >>> doc = sc.parallelize(localDoc).map(lambda line: line.split(" ")) >>> model = Word2Vec().setVectorSize(10).setSeed(42).fit(doc) + Querying for synonyms of a word will not return that word: + >>> syms = model.findSynonyms("a", 2) >>> [s[0] for s in syms] [u'b', u'c'] + + But querying for synonyms of a vector may return the word whose + representation is that vector: + >>> vec = model.transform("a") >>> syms = model.findSynonyms(vec, 2) >>> [s[0] for s in syms] - [u'b', u'c'] + [u'a', u'b'] >>> import os, tempfile >>> path = tempfile.mkdtemp() @@ -591,6 +596,7 @@ class Word2Vec(object): ... pass .. 
versionadded:: 1.2.0 + """ def __init__(self): """ From 18e6bfeb3736b62802c78bebe0296b77f2e5865c Mon Sep 17 00:00:00 2001 From: William Benton Date: Fri, 16 Sep 2016 10:07:44 -0500 Subject: [PATCH 4/5] Incorporated further review feedback * changed filtering code path * made `Word2Vec.findSynonyms(Vector, Int, Option[String])` private * refactored ML pipeline and Python Word2Vec model wrappers to use public APIs --- .../apache/spark/ml/feature/Word2Vec.scala | 25 ++++++++----------- .../api/python/Word2VecModelWrapper.scala | 24 +++++++++++++----- .../apache/spark/mllib/feature/Word2Vec.scala | 17 +++++++------ 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index e584f3a7b8e4a..14c05123c62ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -221,29 +221,26 @@ class Word2VecModel private[ml] ( } /** - * Find "num" number of words closest in similarity to the given word. - * Returns a dataframe with the words and the cosine similarities between the - * synonyms and the given word. + * Find "num" number of words closest in similarity to the given word, not + * including the word itself. Returns a dataframe with the words and the + * cosine similarities between the synonyms and the given word. */ @Since("1.5.0") def findSynonyms(word: String, num: Int): DataFrame = { - findSynonyms(wordVectors.transform(word), num, Some(word)) + val spark = SparkSession.builder().getOrCreate() + spark.createDataFrame(wordVectors.findSynonyms(word, num)).toDF("word", "similarity") } /** - * Find "num" number of words closest to similarity to the given vector representation - * of the word. Returns a dataframe with the words and the cosine similarities between the - * synonyms and the given word vector. 
+ * Find "num" number of words whose vector representation is most similar to the supplied vector. + * If the supplied vector is the vector representation of a word in the model's vocabulary, + * that word will be in the results. Returns a dataframe with the words and the cosine + * similarities between the synonyms and the given word vector. */ @Since("2.0.0") - def findSynonyms(word: Vector, num: Int): DataFrame = { - findSynonyms(word, num, None) - } - - private[spark] def findSynonyms( - word: Vector, num: Int, wordOpt: Option[String]): DataFrame = { + def findSynonyms(vec: Vector, num: Int): DataFrame = { val spark = SparkSession.builder().getOrCreate() - spark.createDataFrame(wordVectors.findSynonyms(word, num, wordOpt)).toDF("word", "similarity") + spark.createDataFrame(wordVectors.findSynonyms(vec, num)).toDF("word", "similarity") } /** @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala index 6e26d36284feb..5cbfbff3e4a62 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala @@ -43,22 +43,34 @@ private[python] class Word2VecModelWrapper(model: Word2VecModel) { rdd.rdd.map(model.transform) } + /** + * Finds synonyms of a word; do not include the word itself in results. + * @param word a word + * @param num number of synonyms to find + * @return a list consisting of a list of words and a vector of cosine similarities + */ def findSynonyms(word: String, num: Int): JList[Object] = { - val vec = transform(word) - findSynonyms(vec, num, Some(word)) + prepareResult(model.findSynonyms(word, num)) } + /** + * Finds words similar to the vector representation of a word without + * filtering results. 
+ * @param vector a vector + * @param num number of synonyms to find + * @return a list consisting of a list of words and a vector of cosine similarities + */ def findSynonyms(vector: Vector, num: Int): JList[Object] = { - findSynonyms(vector, num, None) + prepareResult(model.findSynonyms(vector, num)) } - def findSynonyms( - vector: Vector, num: Int, wordOpt: Option[String]): JList[Object] = { - val result = model.findSynonyms(vector, num, wordOpt) + private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } + + def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 12dc08e5d144a..573945f5e75ae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -550,7 +550,7 @@ class Word2VecModel private[spark] ( * @param wordOpt optionally, a word to reject from the results list * @return array of (word, cosineSimilarity) */ - private[spark] def findSynonyms( + private def findSynonyms( vector: Vector, num: Int, wordOpt: Option[String]): Array[(String, Double)] = { @@ -584,13 +584,14 @@ class Word2VecModel private[spark] ( // elsewhere) assumes that distinct words do not have identical // vector representations - wordList.zip(cosVec) - .toSeq - .sortBy(-_._2) - .take(num + 1) - .dropWhile(p => wordOpt.map(w => w.equals(p._1)).getOrElse(false)) - .take(num) - .toArray + val scored = wordList.zip(cosVec).toSeq.sortBy(-_._2) + + val filtered = wordOpt match { + case Some(w) => scored.take(num + 1).filter(tup => w != tup._1) + case None => scored + } + + filtered.take(num).toArray } /** From 624c0f8b1ad78f98cabcc51a1ed7a94f5f20a25a Mon 
Sep 17 00:00:00 2001 From: William Benton Date: Fri, 16 Sep 2016 10:56:58 -0500 Subject: [PATCH 5/5] Remove comment about old findSynonyms behavior --- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 573945f5e75ae..42ca9665e5843 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -580,10 +580,6 @@ class Word2VecModel private[spark] ( ind += 1 } - // NB: This code (and the documented behavior of findSynonyms - // elsewhere) assumes that distinct words do not have identical - // vector representations - val scored = wordList.zip(cosVec).toSeq.sortBy(-_._2) val filtered = wordOpt match {