From 54ae63e30ea699293d47ab05732e774ab3481049 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Mon, 18 May 2015 15:23:18 +0800 Subject: [PATCH 1/2] add requirement for word2vec model --- .../main/scala/org/apache/spark/mllib/feature/Word2Vec.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 731f7576c2335..42c755b814040 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -410,6 +410,9 @@ class Word2Vec extends Serializable with Logging { i += 1 } + require(word2VecMap.size > 0, "The word2vec map should not be empty. You may need to check " + + "the setting of minCount, which could be large enough to remove all your words in sentences.") + new Word2VecModel(word2VecMap.toMap) } From 21770c53c6be0c50b8b5deb73f801d55ff423475 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Mon, 18 May 2015 15:51:22 +0800 Subject: [PATCH 2/2] check the vocab size --- .../scala/org/apache/spark/mllib/feature/Word2Vec.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index 42c755b814040..b7d478fb921ec 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -158,6 +158,9 @@ class Word2Vec extends Serializable with Logging { .sortWith((a, b) => a.cn > b.cn) vocabSize = vocab.length + require(vocabSize > 0, "The vocabulary size should be larger than 0. 
You may need to check " + + "the setting of minCount, which could be large enough to remove all your words in sentences.") + var a = 0 while (a < vocabSize) { vocabHash += vocab(a).word -> a @@ -410,9 +413,6 @@ class Word2Vec extends Serializable with Logging { i += 1 } - require(word2VecMap.size > 0, "The word2vec map should not be empty. You may need to check " + - "the setting of minCount, which could be large enough to remove all your words in sentences.") - new Word2VecModel(word2VecMap.toMap) }