From 05eaf59fa34d6f8d721e48b3300271d347aae699 Mon Sep 17 00:00:00 2001 From: Asher Krim Date: Sun, 5 Feb 2017 15:05:32 -0500 Subject: [PATCH 1/7] provide methods to return synonyms directly, without wrapping them in a dataframe In performance sensitive applications (such as user facing apis) the roundtrip to and from dataframes is costly and unnecessary --- .../apache/spark/ml/feature/Word2Vec.scala | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 3ed08c983d561..861b5ee6e100c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -232,11 +232,11 @@ class Word2VecModel private[ml] ( @Since("1.5.0") def findSynonyms(word: String, num: Int): DataFrame = { val spark = SparkSession.builder().getOrCreate() - spark.createDataFrame(wordVectors.findSynonyms(word, num)).toDF("word", "similarity") + spark.createDataFrame(findSynonymsLocal(word, num)).toDF("word", "similarity") } /** - * Find "num" number of words whose vector representation most similar to the supplied vector. + * Find "num" number of words whose vector representation is most similar to the supplied vector. * If the supplied vector is the vector representation of a word in the model's vocabulary, * that word will be in the results. Returns a dataframe with the words and the cosine * similarities between the synonyms and the given word vector. @@ -244,7 +244,28 @@ class Word2VecModel private[ml] ( @Since("2.0.0") def findSynonyms(vec: Vector, num: Int): DataFrame = { val spark = SparkSession.builder().getOrCreate() - spark.createDataFrame(wordVectors.findSynonyms(vec, num)).toDF("word", "similarity") + spark.createDataFrame(findSynonymsLocal(vec, num)).toDF("word", "similarity") + } + + /** + * Find "num" number of words whose vector representation is most similar to the supplied vector. + * If the supplied vector is the vector representation of a word in the model's vocabulary, + * that word will be in the results. Returns an array of the words and the cosine + * similarities between the synonyms and the given word vector. + */ + @Since("2.2.0") + def findSynonymsLocal(vec: Vector, num: Int): Array[(String, Double)] = { + wordVectors.findSynonyms(vec, num) + } + + /** + * Find "num" number of words closest in similarity to the given word, not + * including the word itself. Returns a dataframe with the words and the + * cosine similarities between the synonyms and the given word. + */ + @Since("2.2.0") + def findSynonymsLocal(word: String, num: Int): Array[(String, Double)] = { + wordVectors.findSynonyms(word, num) } /** @group setParam */ From edc806039518950760404d85878dbd04d12291f8 Mon Sep 17 00:00:00 2001 From: Asher Krim Date: Mon, 6 Feb 2017 10:39:57 -0500 Subject: [PATCH 2/7] update the javadoc use @return to specify return values. Correct the return type for `findSynonymsLocal` --- .../org/apache/spark/ml/feature/Word2Vec.scala | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 861b5ee6e100c..65c801124e729 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -226,8 +226,9 @@ class Word2VecModel private[ml] ( /** * Find "num" number of words closest in similarity to the given word, not - * including the word itself. Returns a dataframe with the words and the - * cosine similarities between the synonyms and the given word. + * including the word itself. + * @return a dataframe with columns "word" and "similarity" of the word and the cosine + * similarities between the synonyms and the given word vector. */ @Since("1.5.0") def findSynonyms(word: String, num: Int): DataFrame = { @@ -238,7 +239,8 @@ class Word2VecModel private[ml] ( /** * Find "num" number of words whose vector representation is most similar to the supplied vector. * If the supplied vector is the vector representation of a word in the model's vocabulary, - * that word will be in the results. Returns a dataframe with the words and the cosine + * that word will be in the results. + * @return a dataframe with columns "word" and "similarity" of the word and the cosine * similarities between the synonyms and the given word vector. */ @Since("2.0.0") @@ -250,8 +252,9 @@ class Word2VecModel private[ml] ( /** * Find "num" number of words whose vector representation is most similar to the supplied vector. * If the supplied vector is the vector representation of a word in the model's vocabulary, - * that word will be in the results. Returns an array of the words and the cosine - * similarities between the synonyms and the given word vector. + * that word will be in the results. + * @return an array of the words and the cosine similarities between the synonyms given + * word vector. */ @Since("2.2.0") def findSynonymsLocal(vec: Vector, num: Int): Array[(String, Double)] = { @@ -260,8 +263,9 @@ class Word2VecModel private[ml] ( /** * Find "num" number of words closest in similarity to the given word, not - * including the word itself. Returns a dataframe with the words and the - * cosine similarities between the synonyms and the given word. + * including the word itself. + * @return an array of the words and the cosine similarities between the synonyms given + * word vector. */ @Since("2.2.0") def findSynonymsLocal(word: String, num: Int): Array[(String, Double)] = { From 7988385e8412a0176d1595f9d59dda843d0e4e23 Mon Sep 17 00:00:00 2001 From: Asher Krim Date: Wed, 15 Feb 2017 08:21:50 -0500 Subject: [PATCH 3/7] rename to findSynonymsArray is clearer both about the return type and that it runs locally also - add a explicit test --- .../apache/spark/ml/feature/Word2Vec.scala | 8 +++--- .../spark/ml/feature/Word2VecSuite.scala | 25 +++++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 65c801124e729..3894183b956af 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -233,7 +233,7 @@ class Word2VecModel private[ml] ( @Since("1.5.0") def findSynonyms(word: String, num: Int): DataFrame = { val spark = SparkSession.builder().getOrCreate() - spark.createDataFrame(findSynonymsLocal(word, num)).toDF("word", "similarity") + spark.createDataFrame(findSynonymsArray(word, num)).toDF("word", "similarity") } /** @@ -246,7 +246,7 @@ class Word2VecModel private[ml] ( @Since("2.0.0") def findSynonyms(vec: Vector, num: Int): DataFrame = { val spark = SparkSession.builder().getOrCreate() - spark.createDataFrame(findSynonymsLocal(vec, num)).toDF("word", "similarity") + spark.createDataFrame(findSynonymsArray(vec, num)).toDF("word", "similarity") } /** @@ -257,7 +257,7 @@ class Word2VecModel private[ml] ( * word vector. */ @Since("2.2.0") - def findSynonymsLocal(vec: Vector, num: Int): Array[(String, Double)] = { + def findSynonymsArray(vec: Vector, num: Int): Array[(String, Double)] = { wordVectors.findSynonyms(vec, num) } @@ -268,7 +268,7 @@ class Word2VecModel private[ml] ( * word vector. */ @Since("2.2.0") - def findSynonymsLocal(word: String, num: Int): Array[(String, Double)] = { + def findSynonymsArray(word: String, num: Int): Array[(String, Double)] = { wordVectors.findSynonyms(word, num) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index 613cc3d60b227..179e3a4fd89da 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -144,6 +144,31 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul } } + test("findSynonymsArray") { + + val spark = this.spark + import spark.implicits._ + + val sentence = "a b " * 100 + "a c " * 10 + val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) + val docDF = doc.zip(doc).toDF("text", "alsotext") + + val model = new Word2Vec() + .setVectorSize(3) + .setInputCol("text") + .setOutputCol("result") + .setSeed(42L) + .fit(docDF) + + val expectedSimilarity = Array(0.2608488929093532, -0.8271274846926078) + val (synonyms, similarity) = model.findSynonymsArray("a", 2).unzip + + assert(synonyms === Array("b", "c")) + expectedSimilarity.zip(similarity).foreach { + case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5) + } + } + test("window size") { val spark = this.spark From a3c60250789ca11e0a49d1c6be01497d5f83f838 Mon Sep 17 00:00:00 2001 From: Asher Krim Date: Sun, 5 Mar 2017 08:42:05 -0500 Subject: [PATCH 4/7] combine into one test avoid training a new model. Instead just assert that both methods return the same results --- .../spark/ml/feature/Word2VecSuite.scala | 34 +++++-------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index 179e3a4fd89da..8766ad01bd202 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -134,38 +134,22 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul .fit(docDF) val expectedSimilarity = Array(0.2608488929093532, -0.8271274846926078) - val (synonyms, similarity) = model.findSynonyms("a", 2).rdd.map { + val result = model.findSynonyms("a", 2).rdd.map { case Row(w: String, sim: Double) => (w, sim) - }.collect().unzip + }.collect() + val (synonyms, similarity) = result.unzip assert(synonyms === Array("b", "c")) expectedSimilarity.zip(similarity).foreach { case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5) } - } - - test("findSynonymsArray") { - - val spark = this.spark - import spark.implicits._ - - val sentence = "a b " * 100 + "a c " * 10 - val doc = sc.parallelize(Seq(sentence, sentence)).map(line => line.split(" ")) - val docDF = doc.zip(doc).toDF("text", "alsotext") - val model = new Word2Vec() - .setVectorSize(3) - .setInputCol("text") - .setOutputCol("result") - .setSeed(42L) - .fit(docDF) - - val expectedSimilarity = Array(0.2608488929093532, -0.8271274846926078) - val (synonyms, similarity) = model.findSynonymsArray("a", 2).unzip - - assert(synonyms === Array("b", "c")) - expectedSimilarity.zip(similarity).foreach { - case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5) + result.zip(model.findSynonymsArray("a", 2)).foreach { + case (expectedSynonymAndSimilarity, actualSynonymAndSimilarity) => + assert(expectedSynonymAndSimilarity._1 === actualSynonymAndSimilarity._1) + assert( + math.abs((expectedSynonymAndSimilarity._2 - actualSynonymAndSimilarity._2) + / expectedSynonymAndSimilarity._2) < 1E-5) } } From 2cca29a9b2c4e779ab6d3282d4025786868d9dbb Mon Sep 17 00:00:00 2001 From: Asher Krim Date: Sun, 5 Mar 2017 08:58:01 -0500 Subject: [PATCH 5/7] use existing fuzzyEquals implementation rather than rolling a new implementation for each test --- .../scala/org/apache/spark/ml/feature/Word2VecSuite.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index 8766ad01bd202..aa2f5622646b9 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -141,15 +141,13 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul assert(synonyms === Array("b", "c")) expectedSimilarity.zip(similarity).foreach { - case (expected, actual) => assert(math.abs((expected - actual) / expected) < 1E-5) + case (expected, actual) => assert(expected ~== actual absTol 1E-5) } result.zip(model.findSynonymsArray("a", 2)).foreach { case (expectedSynonymAndSimilarity, actualSynonymAndSimilarity) => assert(expectedSynonymAndSimilarity._1 === actualSynonymAndSimilarity._1) - assert( - math.abs((expectedSynonymAndSimilarity._2 - actualSynonymAndSimilarity._2) - / expectedSynonymAndSimilarity._2) < 1E-5) + assert(expectedSynonymAndSimilarity._2 ~== actualSynonymAndSimilarity._2 absTol 1E-5) } } From 881353d7d104f3bff9246418c6149af6513c858b Mon Sep 17 00:00:00 2001 From: Asher Krim Date: Sun, 5 Mar 2017 09:05:06 -0500 Subject: [PATCH 6/7] simplify case class This is more readable. Use the nice features of scala! --- .../scala/org/apache/spark/ml/feature/Word2VecSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index aa2f5622646b9..4574d3ccb9610 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -145,9 +145,9 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul } result.zip(model.findSynonymsArray("a", 2)).foreach { - case (expectedSynonymAndSimilarity, actualSynonymAndSimilarity) => - assert(expectedSynonymAndSimilarity._1 === actualSynonymAndSimilarity._1) - assert(expectedSynonymAndSimilarity._2 ~== actualSynonymAndSimilarity._2 absTol 1E-5) + case ((expectedSynonym, expectedSimilarity), (actualSynonym, actualSimilarity)) => + assert(expectedSynonym === actualSynonym) + assert(expectedSimilarity ~== actualSimilarity absTol 1E-5) } } From 3a02800dbb3bf03f5ecb58f15691848d8df77cf0 Mon Sep 17 00:00:00 2001 From: Asher Krim Date: Mon, 6 Mar 2017 07:23:25 -0500 Subject: [PATCH 7/7] use map for result assertions results may not be in the same order --- .../spark/ml/feature/Word2VecSuite.scala | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index 4574d3ccb9610..2043a16c15f1a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -133,21 +133,22 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul .setSeed(42L) .fit(docDF) - val expectedSimilarity = Array(0.2608488929093532, -0.8271274846926078) - val result = model.findSynonyms("a", 2).rdd.map { + val expected = Map(("b", 0.2608488929093532), ("c", -0.8271274846926078)) + val findSynonymsResult = model.findSynonyms("a", 2).rdd.map { case Row(w: String, sim: Double) => (w, sim) - }.collect() - val (synonyms, similarity) = result.unzip + }.collectAsMap() - assert(synonyms === Array("b", "c")) - expectedSimilarity.zip(similarity).foreach { - case (expected, actual) => assert(expected ~== actual absTol 1E-5) + expected.foreach { + case (expectedSynonym, expectedSimilarity) => + assert(findSynonymsResult.contains(expectedSynonym)) + assert(expectedSimilarity ~== findSynonymsResult.get(expectedSynonym).get absTol 1E-5) } - result.zip(model.findSynonymsArray("a", 2)).foreach { - case ((expectedSynonym, expectedSimilarity), (actualSynonym, actualSimilarity)) => - assert(expectedSynonym === actualSynonym) - assert(expectedSimilarity ~== actualSimilarity absTol 1E-5) + val findSynonymsArrayResult = model.findSynonymsArray("a", 2).toMap + findSynonymsResult.foreach { + case (expectedSynonym, expectedSimilarity) => + assert(findSynonymsArrayResult.contains(expectedSynonym)) + assert(expectedSimilarity ~== findSynonymsArrayResult.get(expectedSynonym).get absTol 1E-5) } }