From 8c23dddd9ca7d0f767457a79ad3ab8f93655b257 Mon Sep 17 00:00:00 2001 From: Eric Li Date: Mon, 18 May 2015 13:33:33 -0400 Subject: [PATCH 1/5] Fix wordVectors divided by norm = 0 --- .../apache/spark/mllib/feature/Word2Vec.scala | 2 +- .../spark/mllib/feature/Word2VecSuite.scala | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 98e83112f52ae..5c6fa12a16d67 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -521,7 +521,7 @@ class Word2VecModel private[mllib] ( val updatedCosines = new Array[Double](numWords) var ind = 0 while (ind < numWords) { - updatedCosines(ind) = cosineVec(ind) / wordVecNorms(ind) + updatedCosines(ind) = if (wordVecNorms(ind) == 0) 0.0 else cosineVec(ind) / wordVecNorms(ind) ind += 1 } wordList.zip(updatedCosines) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index 98a98a7599bcb..404eefcb2d17a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -55,8 +55,23 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext { assert(syms(1)._1 == "japan") } - test("model load / save") { + test("Word2VecModel for norm equals to 0") { + val num = 4 + val word2VecMap = Map( + ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)), + ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)), + ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)), + ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)), + ("us", Array(0.00f, 0.00f, 0.00f, 0.00f)) + ) + val model = new Word2VecModel(word2VecMap) + val syms = model.findSynonyms("china", num) + assert(syms.length == num) + assert(!syms.last._2.isNaN) + assert(syms.last._2 == 0) + } + test("model load / save") { val word2VecMap = Map( ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)), ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)), From a941a3d8e1434cbf694754059e6aec9da88a3fd0 Mon Sep 17 00:00:00 2001 From: Eric Li Date: Mon, 18 May 2015 13:36:25 -0400 Subject: [PATCH 2/5] SPARK-7617: normalize fVector --- .../apache/spark/mllib/feature/Word2Vec.scala | 18 +++++++++++++++++- .../spark/mllib/feature/Word2VecSuite.scala | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 5c6fa12a16d67..b7cfbe504f6ba 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -509,7 +509,7 @@ class Word2VecModel private[mllib] ( def findSynonyms(vector: Vector, num: Int): Array[(String, Double)] = { require(num > 0, "Number of similar words should > 0") - val fVector = vector.toArray.map(_.toFloat) + val fVector = euclideanNormalize(vector.toArray.map(_.toFloat)) val cosineVec = Array.fill[Float](numWords)(0) val alpha: Float = 1 val beta: Float = 0 @@ -540,6 +540,22 @@ class Word2VecModel private[mllib] ( (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) } } + + /** + * Euclidean Normalization for a vector + * @param vector An array to be normalized + * @return a new normalized array + */ + def euclideanNormalize(vector: Array[Float]):Array[Float] = { + val norm = blas.snrm2(vector.size, vector, 1) + + if (norm == 0) { + Array.fill[Float](vector.size)(0) + } else { + blas.sscal(vector.size, 1/norm, vector, 1) + vector + } + } } @Experimental diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala index 404eefcb2d17a..4dce7f062808c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/Word2VecSuite.scala @@ -71,6 +71,20 @@ class Word2VecSuite extends FunSuite with MLlibTestSparkContext { assert(syms.last._2 == 0) } + test("Word2VecModel should normalize fVector") { + val num = 2 + val word2VecMap = Map( + ("china", Array(0.49f, 0.50f, 0.50f, 0.50f)), + ("japan", Array(0.40f, 0.50f, 0.50f, 0.50f)), + ("taiwan", Array(0.60f, 0.50f, 0.50f, 0.50f)), + ("korea", Array(0.45f, 0.60f, 0.60f, 0.60f)) + ) + val model = new Word2VecModel(word2VecMap) + val syms = model.findSynonyms("china", num) + assert(syms.length == num) + assert(syms(0)._1 == "japan") + } + test("model load / save") { val word2VecMap = Map( ("china", Array(0.50f, 0.50f, 0.50f, 0.50f)), From 8642ff26e69ea341ac319c5540e0964f9324e97a Mon Sep 17 00:00:00 2001 From: Eric Li Date: Mon, 18 May 2015 16:41:45 -0400 Subject: [PATCH 3/5] SPARK-7618: Cache normalized wordVectors; Lazy loading wordVectors and wordVectorsNormalized. --- .../apache/spark/mllib/feature/Word2Vec.scala | 38 +++++++++---------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index b7cfbe504f6ba..35a0294cab069 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -445,19 +445,22 @@ class Word2VecModel private[mllib] ( // wordVectors: Array of length numWords * vectorSize, vector corresponding to the word // mapped with index i can be retrieved by the slice // (ind * vectorSize, ind * vectorSize + vectorSize) - // wordVecNorms: Array of length numWords, each value being the Euclidean norm - // of the wordVector. - private val (wordVectors: Array[Float], wordVecNorms: Array[Double]) = { + lazy private val wordVectors: Array[Float] = { val wordVectors = new Array[Float](vectorSize * numWords) - val wordVecNorms = new Array[Double](numWords) - var i = 0 - while (i < numWords) { - val vec = model.get(wordList(i)).get - Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize) - wordVecNorms(i) = blas.snrm2(vectorSize, vec, 1) - i += 1 + for (i <- 0 until numWords) { + Array.copy(model.get(wordList(i)).get, 0, wordVectors, i * vectorSize, vectorSize) } - (wordVectors, wordVecNorms) + wordVectors + } + + // wordVectorsNormalized: Array of length numWords * vectorSize, wordVectors after + // Euclidean normalization. + lazy private val wordVectorsNormalized: Array[Float] = { + val wordVectorsNormalized = new Array[Float](vectorSize * numWords) + for (i <- 0 until numWords) { + Array.copy(euclideanNormalize(model.get(wordList(i)).get), 0, wordVectorsNormalized, i * vectorSize, vectorSize) + } + wordVectorsNormalized } private def cosineSimilarity(v1: Array[Float], v2: Array[Float]): Double = { @@ -515,16 +518,9 @@ class Word2VecModel private[mllib] ( val beta: Float = 0 blas.sgemv( - "T", vectorSize, numWords, alpha, wordVectors, vectorSize, fVector, 1, beta, cosineVec, 1) - - // Need not divide with the norm of the given vector since it is constant. - val updatedCosines = new Array[Double](numWords) - var ind = 0 - while (ind < numWords) { - updatedCosines(ind) = if (wordVecNorms(ind) == 0) 0.0 else cosineVec(ind) / wordVecNorms(ind) - ind += 1 - } - wordList.zip(updatedCosines) + "T", vectorSize, numWords, alpha, wordVectorsNormalized, vectorSize, fVector, 1, beta, cosineVec, 1) + + wordList.zip(cosineVec.map(_.toDouble)) .toSeq .sortBy(- _._2) .take(num + 1) From dba6731d5c8ebc62656b690f4594bf1ccd834b28 Mon Sep 17 00:00:00 2001 From: Eric Li Date: Wed, 20 May 2015 15:33:47 -0400 Subject: [PATCH 4/5] Fix Scala style: line wrapping --- .../scala/org/apache/spark/mllib/feature/Word2Vec.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index e43e5dda7765e..aa70291f60b08 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -458,7 +458,8 @@ class Word2VecModel private[mllib] ( lazy private val wordVectorsNormalized: Array[Float] = { val wordVectorsNormalized = new Array[Float](vectorSize * numWords) for (i <- 0 until numWords) { - Array.copy(euclideanNormalize(model.get(wordList(i)).get), 0, wordVectorsNormalized, i * vectorSize, vectorSize) + Array.copy(euclideanNormalize(model.get(wordList(i)).get), 0, wordVectorsNormalized, + i * vectorSize, vectorSize) } wordVectorsNormalized } @@ -517,8 +518,8 @@ class Word2VecModel private[mllib] ( val alpha: Float = 1 val beta: Float = 0 - blas.sgemv( - "T", vectorSize, numWords, alpha, wordVectorsNormalized, vectorSize, fVector, 1, beta, cosineVec, 1) + blas.sgemv("T", vectorSize, numWords, alpha, wordVectorsNormalized, vectorSize, + fVector, 1, beta, cosineVec, 1) wordList.zip(cosineVec.map(_.toDouble)) .toSeq From 1d732d07fb52ba3d19f658fec3ca78074f0e621a Mon Sep 17 00:00:00 2001 From: Eric Li Date: Mon, 20 Jul 2015 12:41:54 -0400 Subject: [PATCH 5/5] Fix scala style --- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 48352c31fd800..60bb0732eacd0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -546,7 +546,7 @@ class Word2VecModel private[spark] ( * @param vector An array to be normalized * @return a new normalized array */ - def euclideanNormalize(vector: Array[Float]):Array[Float] = { + def euclideanNormalize(vector: Array[Float]): Array[Float] = { val norm = blas.snrm2(vector.size, vector, 1) if (norm == 0) {