From 9fd8112e703c4ec06b32ebf5609e71885e9ffa14 Mon Sep 17 00:00:00 2001 From: kzhang28 Date: Sun, 17 Jul 2016 19:03:11 -0400 Subject: [PATCH 1/2] Update SparkKMeans.scala Document that the K-means input dataset must not contain duplicate points; otherwise takeSample() may select identical points as initial centers, so fewer than K clusters receive points and the center update fails. --- .../main/scala/org/apache/spark/examples/SparkKMeans.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala index fec3160e9f37b..765123f8bec3c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala @@ -33,7 +33,6 @@ object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } - def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity @@ -75,7 +74,10 @@ object SparkKMeans { val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble - + /* + * A requirement of using takeSample() method: the element in data (line 74) should be unique (No duplicates), otherwise the size of newPoints (line 89) will not + * necessarily be the same as K and an execption will be thrown when executing line 94 + */ val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 From 0c8a42685a82ad769c8d10eb5953bcac56cc4098 Mon Sep 17 00:00:00 2001 From: kzhang28 Date: Sun, 17 Jul 2016 19:10:42 -0400 Subject: [PATCH 2/2] Update SparkKMeans.scala --- .../src/main/scala/org/apache/spark/examples/SparkKMeans.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala index 765123f8bec3c..ce4f1e468b320 
100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala @@ -76,7 +76,7 @@ object SparkKMeans { val convergeDist = args(2).toDouble /* * A requirement of using takeSample() method: the element in data (line 74) should be unique (No duplicates), otherwise the size of newPoints (line 89) will not - * necessarily be the same as K and an execption will be thrown when executing line 94 + * necessarily be the same as K and an exception will be thrown when executing line 94 */ val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0