From 8c23dddd9ca7d0f767457a79ad3ab8f93655b257 Mon Sep 17 00:00:00 2001
From: Eric Li <eli@targetedvictory.com>
Date: Mon, 18 May 2015 13:33:33 -0400
Subject: [PATCH 1/5] Fix wordVectors divided by norm = 0

---
 .../apache/spark/mllib/feature/Word2Vec.scala   |  2 +-
 .../spark/mllib/feature/Word2VecSuite.scala     | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 98e83112f52ae..5c6fa12a16d67 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -521,7 +521,7 @@ class Word2VecModel private[mllib] (
     val updatedCosines = new Array[Double](numWords)
     var ind = 0
     while (ind < numWords) {
-      updatedCosines(ind) = cosineVec(ind) / wordVecNorms(ind)
+      updatedCosines(ind) = if (wordVecNorms(ind) == 0) 0.0 else cosineVec(ind) / wordVecNorms(ind)
       ind += 1
     }
     wordList.zip(updatedCosines)
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
index 98a98a7599bcb..404eefcb2d17a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
@@ -55,8 +55,23 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
     assert(syms(1)._1 == "japan")
   }
 
-  test("model load / save") {
+  test("Word2VecModel for norm equals to 0") {
+    val num = 4
+    val word2VecMap = Map(
+      ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),
+      ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),
+      ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)),
+      ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)),
+      ("us", Array(0.00f, 0.00f, 0.00f, 0.00f))
+    )
+    val model = new Word2VecModel(word2VecMap)
+    val syms = model.findSynonyms("china", num)
+    assert(syms.length == num)
+    assert(!syms.last._2.isNaN)
+    assert(syms.last._2 == 0)
+  }
 
+  test("model load / save") {
     val word2VecMap = Map(
       ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),
       ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),

From a941a3d8e1434cbf694754059e6aec9da88a3fd0 Mon Sep 17 00:00:00 2001
From: Eric Li <eli@targetedvictory.com>
Date: Mon, 18 May 2015 13:36:25 -0400
Subject: [PATCH 2/5] SPARK-7617: normalize fVector

---
 .../apache/spark/mllib/feature/Word2Vec.scala  | 18 +++++++++++++++++-
 .../spark/mllib/feature/Word2VecSuite.scala    | 14 ++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 5c6fa12a16d67..b7cfbe504f6ba 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -509,7 +509,7 @@ class Word2VecModel private[mllib] (
   def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = {
     require(num > 0, "Number of similar words should > 0")
 
-    val fVector = vector.toArray.map(_.toFloat)
+    val fVector = euclideanNormalize(vector.toArray.map(_.toFloat))
     val cosineVec = Array.fill[Float](numWords)(0)
     val alpha: Float = 1
     val beta: Float = 0
@@ -540,6 +540,22 @@ class Word2VecModel private[mllib] (
       (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize))
     }
   }
+
+  /**
+   * Euclidean Normalization for a vector
+   * @param vector An array to be normalized
+   * @return a new normalized array
+   */
+  def euclideanNormalize(vector: Array[Float]):Array[Float] = {
+    val norm = blas.snrm2(vector.size, vector, 1)
+
+    if (norm == 0) {
+      Array.fill[Float](vector.size)(0)
+    } else {
+      val scale = 1 / norm // fresh array: sscal would scale the caller's data in place
+      vector.map(_ * scale)
+    }
+  }
 }
 
 @Experimental
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
index 404eefcb2d17a..4dce7f062808c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala
@@ -71,6 +71,20 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext {
     assert(syms.last._2 == 0)
   }
 
+  test("Word2VecModel should normalize fVector") {
+    val num = 2
+    val word2VecMap = Map(
+      ("china", Array(0.49f, 0.50f, 0.50f, 0.50f)),
+      ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)),
+      ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)),
+      ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f))
+    )
+    val model = new Word2VecModel(word2VecMap)
+    val syms = model.findSynonyms("china", num)
+    assert(syms.length == num)
+    assert(syms(0)._1 == "japan")
+  }
+
   test("model load / save") {
     val word2VecMap = Map(
       ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)),

From 8642ff26e69ea341ac319c5540e0964f9324e97a Mon Sep 17 00:00:00 2001
From: Eric Li <eli@targetedvictory.com>
Date: Mon, 18 May 2015 16:41:45 -0400
Subject: [PATCH 3/5] SPARK-7618: Cache normalized wordVectors; Lazy loading
 wordVectors and wordVectorsNormalized.

---
 .../apache/spark/mllib/feature/Word2Vec.scala | 38 +++++++++----------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index b7cfbe504f6ba..35a0294cab069 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -445,19 +445,22 @@ class Word2VecModel private[mllib] (
   // wordVectors: Array of length numWords * vectorSize, vector corresponding to the word
   //              mapped with index i can be retrieved by the slice
   //              (ind * vectorSize, ind * vectorSize + vectorSize)
-  // wordVecNorms: Array of length numWords, each value being the Euclidean norm
-  //               of the wordVector.
-  private val (wordVectors: Array[Float], wordVecNorms: Array[Double]) = {
+  lazy private val wordVectors: Array[Float] = {
     val wordVectors = new Array[Float](vectorSize * numWords)
-    val wordVecNorms = new Array[Double](numWords)
-    var i = 0
-    while (i < numWords) {
-      val vec = model.get(wordList(i)).get
-      Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize)
-      wordVecNorms(i) = blas.snrm2(vectorSize, vec, 1)
-      i += 1
+    for (i <- 0 until numWords) {
+      Array.copy(model.get(wordList(i)).get, 0, wordVectors, i * vectorSize, vectorSize)
     }
-    (wordVectors, wordVecNorms)
+    wordVectors
+  }
+
+  // wordVectorsNormalized: Array of length numWords * vectorSize, wordVectors after
+  //               Euclidean normalization.
+  lazy private val wordVectorsNormalized: Array[Float] = {
+    val wordVectorsNormalized = new Array[Float](vectorSize * numWords)
+    for (i <- 0 until numWords) {
+      Array.copy(euclideanNormalize(model.get(wordList(i)).get), 0, wordVectorsNormalized, i * vectorSize, vectorSize)
+    }
+    wordVectorsNormalized
   }
 
   private def cosineSimilarity(v1: Array[Float], v2: Array[Float]): Double = {
@@ -515,16 +518,9 @@ class Word2VecModel private[mllib] (
     val beta: Float = 0
 
     blas.sgemv(
-      "T", vectorSize, numWords, alpha, wordVectors, vectorSize, fVector, 1, beta, cosineVec, 1)
-
-    // Need not divide with the norm of the given vector since it is constant.
-    val updatedCosines = new Array[Double](numWords)
-    var ind = 0
-    while (ind < numWords) {
-      updatedCosines(ind) = if (wordVecNorms(ind) == 0) 0.0 else cosineVec(ind) / wordVecNorms(ind)
-      ind += 1
-    }
-    wordList.zip(updatedCosines)
+      "T", vectorSize, numWords, alpha, wordVectorsNormalized, vectorSize, fVector, 1, beta, cosineVec, 1)
+
+    wordList.zip(cosineVec.map(_.toDouble))
       .toSeq
       .sortBy(- _._2)
       .take(num + 1)

From dba6731d5c8ebc62656b690f4594bf1ccd834b28 Mon Sep 17 00:00:00 2001
From: Eric Li <eli@targetedvictory.com>
Date: Wed, 20 May 2015 15:33:47 -0400
Subject: [PATCH 4/5] Fix Scala style: line wrapping

---
 .../scala/org/apache/spark/mllib/feature/Word2Vec.scala    | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index e43e5dda7765e..aa70291f60b08 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -458,7 +458,8 @@ class Word2VecModel private[mllib] (
   lazy private val wordVectorsNormalized: Array[Float] = {
     val wordVectorsNormalized = new Array[Float](vectorSize * numWords)
     for (i <- 0 until numWords) {
-      Array.copy(euclideanNormalize(model.get(wordList(i)).get), 0, wordVectorsNormalized, i * vectorSize, vectorSize)
+      Array.copy(euclideanNormalize(model.get(wordList(i)).get), 0, wordVectorsNormalized,
+        i * vectorSize, vectorSize)
     }
     wordVectorsNormalized
   }
@@ -517,8 +518,8 @@ class Word2VecModel private[mllib] (
     val alpha: Float = 1
     val beta: Float = 0
 
-    blas.sgemv(
-      "T", vectorSize, numWords, alpha, wordVectorsNormalized, vectorSize, fVector, 1, beta, cosineVec, 1)
+    blas.sgemv("T", vectorSize, numWords, alpha, wordVectorsNormalized, vectorSize,
+      fVector, 1, beta, cosineVec, 1)
 
     wordList.zip(cosineVec.map(_.toDouble))
       .toSeq

From 1d732d07fb52ba3d19f658fec3ca78074f0e621a Mon Sep 17 00:00:00 2001
From: Eric Li <eli@targetedvictory.com>
Date: Mon, 20 Jul 2015 12:41:54 -0400
Subject: [PATCH 5/5] Fix scala style

---
 .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index 48352c31fd800..60bb0732eacd0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -546,7 +546,7 @@ class Word2VecModel private[spark] (
    * @param vector An array to be normalized
    * @return a new normalized array
    */
-  def euclideanNormalize(vector: Array[Float]):Array[Float] = {
+  def euclideanNormalize(vector: Array[Float]): Array[Float] = {
     val norm = blas.snrm2(vector.size, vector, 1)
 
     if (norm == 0) {