From 05eaf59fa34d6f8d721e48b3300271d347aae699 Mon Sep 17 00:00:00 2001
From: Asher Krim <akrim@hubspot.com>
Date: Sun, 5 Feb 2017 15:05:32 -0500
Subject: [PATCH 1/7] provide methods to return synonyms directly, without
 wrapping them in a dataframe

In performance sensitive applications (such as user facing apis) the roundtrip to and from dataframes is costly and unnecessary
---
 .../apache/spark/ml/feature/Word2Vec.scala    | 27 ++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 3ed08c983d561..861b5ee6e100c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -232,11 +232,11 @@ class Word2VecModel private[ml] (
   @Since("1.5.0")
   def findSynonyms(word: String, num: Int): DataFrame = {
     val spark = SparkSession.builder().getOrCreate()
-    spark.createDataFrame(wordVectors.findSynonyms(word, num)).toDF("word", "similarity")
+    spark.createDataFrame(findSynonymsLocal(word, num)).toDF("word", "similarity")
   }
 
   /**
-   * Find "num" number of words whose vector representation most similar to the supplied vector.
+   * Find "num" number of words whose vector representation is most similar to the supplied vector.
    * If the supplied vector is the vector representation of a word in the model's vocabulary,
    * that word will be in the results.  Returns a dataframe with the words and the cosine
    * similarities between the synonyms and the given word vector.
@@ -244,7 +244,28 @@ class Word2VecModel private[ml] (
   @Since("2.0.0")
   def findSynonyms(vec: Vector, num: Int): DataFrame = {
     val spark = SparkSession.builder().getOrCreate()
-    spark.createDataFrame(wordVectors.findSynonyms(vec, num)).toDF("word", "similarity")
+    spark.createDataFrame(findSynonymsLocal(vec, num)).toDF("word", "similarity")
+  }
+
+  /**
+   * Find "num" number of words whose vector representation is most similar to the supplied vector.
+   * If the supplied vector is the vector representation of a word in the model's vocabulary,
+   * that word will be in the results. Returns an array of the words and the cosine
+   * similarities between the synonyms and the given word vector.
+   */
+  @Since("2.2.0")
+  def findSynonymsLocal(vec: Vector, num: Int): Array[(String, Double)] = {
+    wordVectors.findSynonyms(vec, num)
+  }
+
+  /**
+   * Find "num" number of words closest in similarity to the given word, not
+   * including the word itself. Returns a dataframe with the words and the
+   * cosine similarities between the synonyms and the given word.
+   */
+  @Since("2.2.0")
+  def findSynonymsLocal(word: String, num: Int): Array[(String, Double)] = {
+    wordVectors.findSynonyms(word, num)
   }
 
   /** @group setParam */

From edc806039518950760404d85878dbd04d12291f8 Mon Sep 17 00:00:00 2001
From: Asher Krim <akrim@hubspot.com>
Date: Mon, 6 Feb 2017 10:39:57 -0500
Subject: [PATCH 2/7] update the javadoc

use @return to specify return values. Correct the return type for `findSynonymsLocal`
---
 .../org/apache/spark/ml/feature/Word2Vec.scala | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 861b5ee6e100c..65c801124e729 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -226,8 +226,9 @@ class Word2VecModel private[ml] (
 
   /**
    * Find "num" number of words closest in similarity to the given word, not
-   * including the word itself. Returns a dataframe with the words and the
-   * cosine similarities between the synonyms and the given word.
+   * including the word itself.
+   * @return a dataframe with columns "word" and "similarity" of the word and the cosine
+   * similarities between the synonyms and the given word vector.
    */
   @Since("1.5.0")
   def findSynonyms(word: String, num: Int): DataFrame = {
@@ -238,7 +239,8 @@ class Word2VecModel private[ml] (
   /**
    * Find "num" number of words whose vector representation is most similar to the supplied vector.
    * If the supplied vector is the vector representation of a word in the model's vocabulary,
-   * that word will be in the results.  Returns a dataframe with the words and the cosine
+   * that word will be in the results.
+   * @return a dataframe with columns "word" and "similarity" of the word and the cosine
    * similarities between the synonyms and the given word vector.
    */
   @Since("2.0.0")
@@ -250,8 +252,9 @@ class Word2VecModel private[ml] (
   /**
    * Find "num" number of words whose vector representation is most similar to the supplied vector.
    * If the supplied vector is the vector representation of a word in the model's vocabulary,
-   * that word will be in the results. Returns an array of the words and the cosine
-   * similarities between the synonyms and the given word vector.
+   * that word will be in the results.
+   * @return an array of the words and the cosine similarities between the synonyms given
+   * word vector.
    */
   @Since("2.2.0")
   def findSynonymsLocal(vec: Vector, num: Int): Array[(String, Double)] = {
@@ -260,8 +263,9 @@ class Word2VecModel private[ml] (
 
   /**
    * Find "num" number of words closest in similarity to the given word, not
-   * including the word itself. Returns a dataframe with the words and the
-   * cosine similarities between the synonyms and the given word.
+   * including the word itself.
+   * @return an array of the words and the cosine similarities between the synonyms given
+   * word vector.
    */
   @Since("2.2.0")
   def findSynonymsLocal(word: String, num: Int): Array[(String, Double)] = {

From 7988385e8412a0176d1595f9d59dda843d0e4e23 Mon Sep 17 00:00:00 2001
From: Asher Krim <akrim@hubspot.com>
Date: Wed, 15 Feb 2017 08:21:50 -0500
Subject: [PATCH 3/7] rename to findSynonymsArray

is clearer both about the return type and that it runs locally
also - add a explicit test
---
 .../apache/spark/ml/feature/Word2Vec.scala    |  8 +++---
 .../spark/ml/feature/Word2VecSuite.scala      | 25 +++++++++++++++++++
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 65c801124e729..3894183b956af 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -233,7 +233,7 @@ class Word2VecModel private[ml] (
   @Since("1.5.0")
   def findSynonyms(word: String, num: Int): DataFrame = {
     val spark = SparkSession.builder().getOrCreate()
-    spark.createDataFrame(findSynonymsLocal(word, num)).toDF("word", "similarity")
+    spark.createDataFrame(findSynonymsArray(word, num)).toDF("word", "similarity")
   }
 
   /**
@@ -246,7 +246,7 @@ class Word2VecModel private[ml] (
   @Since("2.0.0")
   def findSynonyms(vec: Vector, num: Int): DataFrame = {
     val spark = SparkSession.builder().getOrCreate()
-    spark.createDataFrame(findSynonymsLocal(vec, num)).toDF("word", "similarity")
+    spark.createDataFrame(findSynonymsArray(vec, num)).toDF("word", "similarity")
   }
 
   /**
@@ -257,7 +257,7 @@ class Word2VecModel private[ml] (
    * word vector.
    */
   @Since("2.2.0")
-  def findSynonymsLocal(vec: Vector, num: Int): Array[(String, Double)] = {
+  def findSynonymsArray(vec: Vector, num: Int): Array[(String, Double)] = {
     wordVectors.findSynonyms(vec, num)
   }
 
@@ -268,7 +268,7 @@ class Word2VecModel private[ml] (
    * word vector.
    */
   @Since("2.2.0")
-  def findSynonymsLocal(word: String, num: Int): Array[(String, Double)] = {
+  def findSynonymsArray(word: String, num: Int): Array[(String, Double)] = {
     wordVectors.findSynonyms(word, num)
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 613cc3d60b227..179e3a4fd89da 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -144,6 +144,31 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
     }
   }
 
+  test("findSynonymsArray") {
+
+    val spark = this.spark
+    import spark.implicits._
+
+    val sentence = "a b " * 100 + "a c " * 10
+    val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" "))
+    val docDF = doc.zip(doc).toDF("text", "alsotext")
+
+    val model = new Word2Vec()
+      .setVectorSize(3)
+      .setInputCol("text")
+      .setOutputCol("result")
+      .setSeed(42L)
+      .fit(docDF)
+
+    val expectedSimilarity = Array(0.2608488929093532, -0.8271274846926078)
+    val (synonyms, similarity) = model.findSynonymsArray("a", 2).unzip
+
+    assert(synonyms === Array("b", "c"))
+    expectedSimilarity.zip(similarity).foreach {
+      case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5)
+    }
+  }
+
   test("window size") {
 
     val spark = this.spark

From a3c60250789ca11e0a49d1c6be01497d5f83f838 Mon Sep 17 00:00:00 2001
From: Asher Krim <akrim@hubspot.com>
Date: Sun, 5 Mar 2017 08:42:05 -0500
Subject: [PATCH 4/7] combine into one test

avoid training a new model. Instead just assert that both methods return the same results
---
 .../spark/ml/feature/Word2VecSuite.scala      | 34 +++++--------------
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 179e3a4fd89da..8766ad01bd202 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -134,38 +134,22 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
       .fit(docDF)
 
     val expectedSimilarity = Array(0.2608488929093532, -0.8271274846926078)
-    val (synonyms, similarity) = model.findSynonyms("a", 2).rdd.map {
+    val result = model.findSynonyms("a", 2).rdd.map {
       case Row(w: String, sim: Double) => (w, sim)
-    }.collect().unzip
+    }.collect()
+    val (synonyms, similarity) = result.unzip
 
     assert(synonyms === Array("b", "c"))
     expectedSimilarity.zip(similarity).foreach {
       case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5)
     }
-  }
-
-  test("findSynonymsArray") {
-
-    val spark = this.spark
-    import spark.implicits._
-
-    val sentence = "a b " * 100 + "a c " * 10
-    val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" "))
-    val docDF = doc.zip(doc).toDF("text", "alsotext")
 
-    val model = new Word2Vec()
-      .setVectorSize(3)
-      .setInputCol("text")
-      .setOutputCol("result")
-      .setSeed(42L)
-      .fit(docDF)
-
-    val expectedSimilarity = Array(0.2608488929093532, -0.8271274846926078)
-    val (synonyms, similarity) = model.findSynonymsArray("a", 2).unzip
-
-    assert(synonyms === Array("b", "c"))
-    expectedSimilarity.zip(similarity).foreach {
-      case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5)
+    result.zip(model.findSynonymsArray("a", 2)).foreach {
+      case (expectedSynonymAndSimilarity, actualSynonymAndSimilarity) =>
+        assert(expectedSynonymAndSimilarity._1 === actualSynonymAndSimilarity._1)
+        assert(
+          math.abs((expectedSynonymAndSimilarity._2 - actualSynonymAndSimilarity._2)
+            / expectedSynonymAndSimilarity._2) < 1E-5)
     }
   }
 

From 2cca29a9b2c4e779ab6d3282d4025786868d9dbb Mon Sep 17 00:00:00 2001
From: Asher Krim <akrim@hubspot.com>
Date: Sun, 5 Mar 2017 08:58:01 -0500
Subject: [PATCH 5/7] use existing fuzzyEquals implementation

rather than rolling a new implementation for each test
---
 .../scala/org/apache/spark/ml/feature/Word2VecSuite.scala   | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 8766ad01bd202..aa2f5622646b9 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -141,15 +141,13 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
 
     assert(synonyms === Array("b", "c"))
     expectedSimilarity.zip(similarity).foreach {
-      case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5)
+      case (expected, actual) => assert(expected ~== actual absTol 1E-5)
     }
 
     result.zip(model.findSynonymsArray("a", 2)).foreach {
       case (expectedSynonymAndSimilarity, actualSynonymAndSimilarity) =>
         assert(expectedSynonymAndSimilarity._1 === actualSynonymAndSimilarity._1)
-        assert(
-          math.abs((expectedSynonymAndSimilarity._2 - actualSynonymAndSimilarity._2)
-            / expectedSynonymAndSimilarity._2) < 1E-5)
+        assert(expectedSynonymAndSimilarity._2 ~== actualSynonymAndSimilarity._2 absTol 1E-5)
     }
   }
 

From 881353d7d104f3bff9246418c6149af6513c858b Mon Sep 17 00:00:00 2001
From: Asher Krim <akrim@hubspot.com>
Date: Sun, 5 Mar 2017 09:05:06 -0500
Subject: [PATCH 6/7] simplify case class

This is more readable. Use the nice features of scala!
---
 .../scala/org/apache/spark/ml/feature/Word2VecSuite.scala   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index aa2f5622646b9..4574d3ccb9610 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -145,9 +145,9 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
     }
 
     result.zip(model.findSynonymsArray("a", 2)).foreach {
-      case (expectedSynonymAndSimilarity, actualSynonymAndSimilarity) =>
-        assert(expectedSynonymAndSimilarity._1 === actualSynonymAndSimilarity._1)
-        assert(expectedSynonymAndSimilarity._2 ~== actualSynonymAndSimilarity._2 absTol 1E-5)
+      case ((expectedSynonym, expectedSimilarity), (actualSynonym, actualSimilarity)) =>
+        assert(expectedSynonym === actualSynonym)
+        assert(expectedSimilarity ~== actualSimilarity absTol 1E-5)
     }
   }
 

From 3a02800dbb3bf03f5ecb58f15691848d8df77cf0 Mon Sep 17 00:00:00 2001
From: Asher Krim <akrim@hubspot.com>
Date: Mon, 6 Mar 2017 07:23:25 -0500
Subject: [PATCH 7/7] use map for result assertions

results may not be in the same order
---
 .../spark/ml/feature/Word2VecSuite.scala      | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 4574d3ccb9610..2043a16c15f1a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -133,21 +133,22 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
       .setSeed(42L)
       .fit(docDF)
 
-    val expectedSimilarity = Array(0.2608488929093532, -0.8271274846926078)
-    val result = model.findSynonyms("a", 2).rdd.map {
+    val expected = Map(("b", 0.2608488929093532), ("c", -0.8271274846926078))
+    val findSynonymsResult = model.findSynonyms("a", 2).rdd.map {
       case Row(w: String, sim: Double) => (w, sim)
-    }.collect()
-    val (synonyms, similarity) = result.unzip
+    }.collectAsMap()
 
-    assert(synonyms === Array("b", "c"))
-    expectedSimilarity.zip(similarity).foreach {
-      case (expected, actual) => assert(expected ~== actual absTol 1E-5)
+    expected.foreach {
+      case (expectedSynonym, expectedSimilarity) =>
+        assert(findSynonymsResult.contains(expectedSynonym))
+        assert(expectedSimilarity ~== findSynonymsResult.get(expectedSynonym).get absTol 1E-5)
     }
 
-    result.zip(model.findSynonymsArray("a", 2)).foreach {
-      case ((expectedSynonym, expectedSimilarity), (actualSynonym, actualSimilarity)) =>
-        assert(expectedSynonym === actualSynonym)
-        assert(expectedSimilarity ~== actualSimilarity absTol 1E-5)
+    val findSynonymsArrayResult = model.findSynonymsArray("a", 2).toMap
+    findSynonymsResult.foreach {
+      case (expectedSynonym, expectedSimilarity) =>
+        assert(findSynonymsArrayResult.contains(expectedSynonym))
+        assert(expectedSimilarity ~== findSynonymsArrayResult.get(expectedSynonym).get absTol 1E-5)
     }
   }