From 5a7cf31e1c76c2fe4e172c2889fa23deb2152af9 Mon Sep 17 00:00:00 2001 From: William Benton Date: Wed, 14 Sep 2016 12:07:14 -0500 Subject: [PATCH 1/5] test for spurious rejection of similar vectors in findSynonyms --- .../spark/mllib/feature/Word2VecSuite.scala | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index 22de4c4ac40e6..fe843b4a0e444 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils @@ -68,6 +69,21 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { assert(syms(1)._1 == "japan") } + test("findSynonyms doesn't reject similar word vectors when called with a vector") { + val num = 2 + val word2VecMap = Map( + ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)), + ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)), + ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)), + ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)) + ) + val model = new Word2VecModel(word2VecMap) + val syms = model.findSynonyms(Vectors.dense(Array(0.52d, 0.50d, 0.50d, 0.50d)), num) + assert(syms.length == num) + assert(syms(0)._1 == "china") + assert(syms(1)._1 == "taiwan") + } + test("model load / save") { val word2VecMap = Map( From 832ed41f717e278d9dfbd3c4cfabc30fb4b7beaf Mon Sep 17 00:00:00 2001 From: William Benton Date: Wed, 14 Sep 2016 16:35:29 -0500 Subject: [PATCH 2/5] `Word2VecModel.findSynonyms` no longer spuriously rejects the best match Previously, the `findSynonyms` method in `Word2VecModel` rejected the closest-matching vector. 
This was typically correct in cases where we were searching for synonyms of a word, but was incorrect in cases where we were searching for words most similar to a given vector, since the given vector might not correspond to a word in the model's vocabulary. With this commit, `findSynonyms` will not discard the best matching term unless the best matching word is also the query word (or is maximally similar to the query vector). --- .../apache/spark/ml/feature/Word2Vec.scala | 9 ++++++-- .../apache/spark/mllib/feature/Word2Vec.scala | 22 ++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index c2b434c3d5cb1..e584f3a7b8e4a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -227,7 +227,7 @@ class Word2VecModel private[ml] ( */ @Since("1.5.0") def findSynonyms(word: String, num: Int): DataFrame = { - findSynonyms(wordVectors.transform(word), num) + findSynonyms(wordVectors.transform(word), num, Some(word)) } /** @@ -237,8 +237,13 @@ class Word2VecModel private[ml] ( */ @Since("2.0.0") def findSynonyms(word: Vector, num: Int): DataFrame = { + findSynonyms(word, num, None) + } + + private[spark] def findSynonyms( + word: Vector, num: Int, wordOpt: Option[String]): DataFrame = { val spark = SparkSession.builder().getOrCreate() - spark.createDataFrame(wordVectors.findSynonyms(word, num)).toDF("word", "similarity") + spark.createDataFrame(wordVectors.findSynonyms(word, num, wordOpt)).toDF("word", "similarity") } /** @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 908198740b501..da9ba631dcf07 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -526,7 +526,7 @@ class Word2VecModel private[spark] ( @Since("1.1.0") def findSynonyms(word: String, num: Int): Array[(String, Double)] = { val vector = transform(word) - findSynonyms(vector, num) + findSynonyms(vector, num, Some(word)) } /** @@ -537,6 +537,22 @@ class Word2VecModel private[spark] ( */ @Since("1.1.0") def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = { + findSynonyms(vector, num, None) + } + + /** + * Find synonyms of the vector representation of a word, rejecting + * vectors identical to vector and words identical to the + * value of wordOpt, if one is supplied. + * @param vector vector representation of a word + * @param num number of synonyms to find + * @param wordOpt optionally, a word to reject from the results list + * @return array of (word, cosineSimilarity) + */ + private[spark] def findSynonyms( + vector: Vector, + num: Int, + wordOpt: Option[String]): Array[(String, Double)] = { require(num > 0, "Number of similar words should > 0") // TODO: optimize top-k val fVector = vector.toArray.map(_.toFloat) @@ -566,8 +582,8 @@ class Word2VecModel private[spark] ( wordList.zip(cosVec) .toSeq .sortBy(-_._2) - .take(num + 1) - .tail + .filter(tup => wordOpt.map(w => !w.equals(tup._1)).getOrElse(true) && tup._2 != 1.0d) + .take(num) .toArray } From 08424f4988cb06c98750440715df7d4312806cd9 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 15 Sep 2016 09:52:40 -0500 Subject: [PATCH 3/5] Incorporated feedback from review. Update PySpark docstring and scaladocs to reflect fixed behavior. 
--- .../mllib/api/python/Word2VecModelWrapper.scala | 10 +++++++--- .../apache/spark/mllib/feature/Word2Vec.scala | 16 +++++++++++----- .../spark/mllib/feature/Word2VecSuite.scala | 2 +- python/pyspark/mllib/feature.py | 12 +++++++++--- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala index 4b4ed2291d139..6e26d36284feb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala @@ -45,16 +45,20 @@ private[python] class Word2VecModelWrapper(model: Word2VecModel) { def findSynonyms(word: String, num: Int): JList[Object] = { val vec = transform(word) - findSynonyms(vec, num) + findSynonyms(vec, num, Some(word)) } def findSynonyms(vector: Vector, num: Int): JList[Object] = { - val result = model.findSynonyms(vector, num) + findSynonyms(vector, num, None) + } + + def findSynonyms( + vector: Vector, num: Int, wordOpt: Option[String]): JList[Object] = { + val result = model.findSynonyms(vector, num, wordOpt) val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } - def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index da9ba631dcf07..12dc08e5d144a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -518,7 +518,7 @@ class Word2VecModel private[spark] ( } /** - * Find synonyms of a word + * Find synonyms of a word; do not include the word itself in results. 
* @param word a word * @param num number of synonyms to find * @return array of (word, cosineSimilarity) @@ -530,7 +530,9 @@ } /** - * Find synonyms of the vector representation of a word + * Find synonyms of the vector representation of a word, possibly + * including any words in the model vocabulary whose vector representation + * is the supplied vector. * @param vector vector representation of a word * @param num number of synonyms to find * @return array of (word, cosineSimilarity) @@ -542,8 +544,7 @@ /** * Find synonyms of the vector representation of a word, rejecting - * vectors identical to vector and words identical to the - * value of wordOpt, if one is supplied. + * words identical to the value of wordOpt, if one is supplied. * @param vector vector representation of a word * @param num number of synonyms to find * @param wordOpt optionally, a word to reject from the results list * @return array of (word, cosineSimilarity) @@ -579,10 +580,15 @@ ind += 1 } + // NB: This code (and the documented behavior of findSynonyms + // elsewhere) assumes that distinct words do not have identical + // vector representations + wordList.zip(cosVec) .toSeq .sortBy(-_._2) - .filter(tup => wordOpt.map(w => !w.equals(tup._1)).getOrElse(true) && tup._2 != 1.0d) + .take(num + 1) + .dropWhile(p => wordOpt.map(w => w.equals(p._1)).getOrElse(false)) + .take(num) .toArray } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index fe843b4a0e444..f4fa216b8eba0 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -78,7 +78,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)) ) val model = new Word2VecModel(word2VecMap) - val syms =
model.findSynonyms(Vectors.dense(Array(0.52d, 0.50d, 0.50d, 0.50d)), num) + val syms = model.findSynonyms(Vectors.dense(Array(0.52, 0.5, 0.5, 0.5)), num) assert(syms.length == num) assert(syms(0)._1 == "china") assert(syms(1)._1 == "taiwan") diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index b32d0c70ec6a7..5d99644fca254 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -544,8 +544,7 @@ def load(cls, sc, path): @ignore_unicode_prefix class Word2Vec(object): - """ - Word2Vec creates vector representation of words in a text corpus. + """Word2Vec creates vector representation of words in a text corpus. The algorithm first constructs a vocabulary from the corpus and then learns vector representation of words in the vocabulary. The vector representation can be used as features in @@ -567,13 +566,19 @@ class Word2Vec(object): >>> doc = sc.parallelize(localDoc).map(lambda line: line.split(" ")) >>> model = Word2Vec().setVectorSize(10).setSeed(42).fit(doc) + Querying for synonyms of a word will not return that word: + >>> syms = model.findSynonyms("a", 2) >>> [s[0] for s in syms] [u'b', u'c'] + + But querying for synonyms of a vector may return the word whose + representation is that vector: + >>> vec = model.transform("a") >>> syms = model.findSynonyms(vec, 2) >>> [s[0] for s in syms] - [u'b', u'c'] + [u'a', u'b'] >>> import os, tempfile >>> path = tempfile.mkdtemp() @@ -591,6 +596,7 @@ class Word2Vec(object): ... pass .. 
versionadded:: 1.2.0 + """ def __init__(self): """ From 18e6bfeb3736b62802c78bebe0296b77f2e5865c Mon Sep 17 00:00:00 2001 From: William Benton Date: Fri, 16 Sep 2016 10:07:44 -0500 Subject: [PATCH 4/5] Incorporated further review feedback * changed filtering code path * made `Word2Vec.findSynonyms(Vector, Int, Option[String])` private * refactored ML pipeline and Python Word2Vec model wrappers to use public APIs --- .../apache/spark/ml/feature/Word2Vec.scala | 25 ++++++++----------- .../api/python/Word2VecModelWrapper.scala | 24 +++++++++++++----- .../apache/spark/mllib/feature/Word2Vec.scala | 17 +++++++------ 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index e584f3a7b8e4a..14c05123c62ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -221,29 +221,26 @@ class Word2VecModel private[ml] ( } /** - * Find "num" number of words closest in similarity to the given word. - * Returns a dataframe with the words and the cosine similarities between the - * synonyms and the given word. + * Find "num" number of words closest in similarity to the given word, not + * including the word itself. Returns a dataframe with the words and the + * cosine similarities between the synonyms and the given word. */ @Since("1.5.0") def findSynonyms(word: String, num: Int): DataFrame = { - findSynonyms(wordVectors.transform(word), num, Some(word)) + val spark = SparkSession.builder().getOrCreate() + spark.createDataFrame(wordVectors.findSynonyms(word, num)).toDF("word", "similarity") } /** - * Find "num" number of words closest to similarity to the given vector representation - * of the word. Returns a dataframe with the words and the cosine similarities between the - * synonyms and the given word vector. 
+ * Find "num" number of words whose vector representation is most similar to the supplied vector. + * If the supplied vector is the vector representation of a word in the model's vocabulary, + * that word will be in the results. Returns a dataframe with the words and the cosine + * similarities between the synonyms and the given word vector. */ @Since("2.0.0") - def findSynonyms(word: Vector, num: Int): DataFrame = { - findSynonyms(word, num, None) - } - - private[spark] def findSynonyms( - word: Vector, num: Int, wordOpt: Option[String]): DataFrame = { + def findSynonyms(vec: Vector, num: Int): DataFrame = { val spark = SparkSession.builder().getOrCreate() - spark.createDataFrame(wordVectors.findSynonyms(word, num, wordOpt)).toDF("word", "similarity") + spark.createDataFrame(wordVectors.findSynonyms(vec, num)).toDF("word", "similarity") } /** @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala index 6e26d36284feb..5cbfbff3e4a62 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/Word2VecModelWrapper.scala @@ -43,22 +43,34 @@ private[python] class Word2VecModelWrapper(model: Word2VecModel) { rdd.rdd.map(model.transform) } + /** + * Finds synonyms of a word; do not include the word itself in results. + * @param word a word + * @param num number of synonyms to find + * @return a list consisting of a list of words and a vector of cosine similarities + */ def findSynonyms(word: String, num: Int): JList[Object] = { - val vec = transform(word) - findSynonyms(vec, num, Some(word)) + prepareResult(model.findSynonyms(word, num)) } + /** + * Finds words similar to the vector representation of a word without + * filtering results. 
+ * @param vector a vector + * @param num number of synonyms to find + * @return a list consisting of a list of words and a vector of cosine similarities + */ def findSynonyms(vector: Vector, num: Int): JList[Object] = { - findSynonyms(vector, num, None) + prepareResult(model.findSynonyms(vector, num)) } - def findSynonyms( - vector: Vector, num: Int, wordOpt: Option[String]): JList[Object] = { - val result = model.findSynonyms(vector, num, wordOpt) + private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } + + def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 12dc08e5d144a..573945f5e75ae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -550,7 +550,7 @@ class Word2VecModel private[spark] ( * @param wordOpt optionally, a word to reject from the results list * @return array of (word, cosineSimilarity) */ - private[spark] def findSynonyms( + private def findSynonyms( vector: Vector, num: Int, wordOpt: Option[String]): Array[(String, Double)] = { @@ -584,13 +584,14 @@ class Word2VecModel private[spark] ( // elsewhere) assumes that distinct words do not have identical // vector representations - wordList.zip(cosVec) - .toSeq - .sortBy(-_._2) - .take(num + 1) - .dropWhile(p => wordOpt.map(w => w.equals(p._1)).getOrElse(false)) - .take(num) - .toArray + val scored = wordList.zip(cosVec).toSeq.sortBy(-_._2) + + val filtered = wordOpt match { + case Some(w) => scored.take(num + 1).filter(tup => w != tup._1) + case None => scored + } + + filtered.take(num).toArray } /** From 624c0f8b1ad78f98cabcc51a1ed7a94f5f20a25a Mon 
Sep 17 00:00:00 2001 From: William Benton Date: Fri, 16 Sep 2016 10:56:58 -0500 Subject: [PATCH 5/5] Remove comment about old findSynonyms behavior --- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 573945f5e75ae..42ca9665e5843 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -580,10 +580,6 @@ class Word2VecModel private[spark] ( ind += 1 } - // NB: This code (and the documented behavior of findSynonyms - // elsewhere) assumes that distinct words do not have identical - // vector representations - val scored = wordList.zip(cosVec).toSeq.sortBy(-_._2) val filtered = wordOpt match {