From 582cd11e5331a8e2704a5603080eec41c9002cf4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 4 Jul 2014 00:27:22 +0800 Subject: [PATCH] Simply add a checker for the number of clusters. --- .../scala/org/apache/spark/mllib/clustering/KMeans.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index de22fbb6ffc10..1398fd24f30fc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -117,6 +117,11 @@ class KMeans private ( * performance, because this is an iterative algorithm. */ def run(data: RDD[Vector]): KMeansModel = { + + if (data.count() < k) { + throw new IllegalArgumentException("Number of clusters must not be greater than data number") + } + // Compute squared norms and cache them. val norms = data.map(v => breezeNorm(v.toBreeze, 2.0)) norms.persist()